linux/fs/btrfs/send.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2012 Alexander Block.  All rights reserved.
   4 */
   5
   6#include <linux/bsearch.h>
   7#include <linux/fs.h>
   8#include <linux/file.h>
   9#include <linux/sort.h>
  10#include <linux/mount.h>
  11#include <linux/xattr.h>
  12#include <linux/posix_acl_xattr.h>
  13#include <linux/radix-tree.h>
  14#include <linux/vmalloc.h>
  15#include <linux/string.h>
  16#include <linux/compat.h>
  17#include <linux/crc32c.h>
  18
  19#include "send.h"
  20#include "backref.h"
  21#include "locking.h"
  22#include "disk-io.h"
  23#include "btrfs_inode.h"
  24#include "transaction.h"
  25#include "compression.h"
  26#include "xattr.h"
  27#include "print-tree.h"
  28
  29/*
  30 * Maximum number of references an extent can have in order for us to attempt to
  31 * issue clone operations instead of write operations. This currently exists to
  32 * avoid hitting limitations of the backreference walking code (taking a lot of
  33 * time and using too much memory for extents with large number of references).
  34 */
  35#define SEND_MAX_EXTENT_REFS    64
  36
  37/*
  38 * A fs_path is a helper to dynamically build path names with unknown size.
  39 * It reallocates the internal buffer on demand.
  40 * It allows fast adding of path elements on the right side (normal path) and
  41 * fast adding to the left side (reversed path). A reversed path can also be
  42 * unreversed if needed.
  43 */
  44struct fs_path {
  45        union {
  46                struct {
  47                        char *start;
  48                        char *end;
  49
  50                        char *buf;
  51                        unsigned short buf_len:15;
  52                        unsigned short reversed:1;
  53                        char inline_buf[];
  54                };
  55                /*
  56                 * Average path length does not exceed 200 bytes, we'll have
  57                 * better packing in the slab and higher chance to satisfy
  58                 * a allocation later during send.
  59                 */
  60                char pad[256];
  61        };
  62};
  63#define FS_PATH_INLINE_SIZE \
  64        (sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
  65
  66
  67/* reused for each extent */
  68struct clone_root {
  69        struct btrfs_root *root;
  70        u64 ino;
  71        u64 offset;
  72
  73        u64 found_refs;
  74};
  75
  76#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
  77#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
  78
  79struct send_ctx {
  80        struct file *send_filp;
  81        loff_t send_off;
  82        char *send_buf;
  83        u32 send_size;
  84        u32 send_max_size;
  85        u64 total_send_size;
  86        u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
  87        u64 flags;      /* 'flags' member of btrfs_ioctl_send_args is u64 */
  88        /* Protocol version compatibility requested */
  89        u32 proto;
  90
  91        struct btrfs_root *send_root;
  92        struct btrfs_root *parent_root;
  93        struct clone_root *clone_roots;
  94        int clone_roots_cnt;
  95
  96        /* current state of the compare_tree call */
  97        struct btrfs_path *left_path;
  98        struct btrfs_path *right_path;
  99        struct btrfs_key *cmp_key;
 100
 101        /*
 102         * Keep track of the generation of the last transaction that was used
 103         * for relocating a block group. This is periodically checked in order
 104         * to detect if a relocation happened since the last check, so that we
 105         * don't operate on stale extent buffers for nodes (level >= 1) or on
 106         * stale disk_bytenr values of file extent items.
 107         */
 108        u64 last_reloc_trans;
 109
 110        /*
 111         * infos of the currently processed inode. In case of deleted inodes,
 112         * these are the values from the deleted inode.
 113         */
 114        u64 cur_ino;
 115        u64 cur_inode_gen;
 116        int cur_inode_new;
 117        int cur_inode_new_gen;
 118        int cur_inode_deleted;
 119        u64 cur_inode_size;
 120        u64 cur_inode_mode;
 121        u64 cur_inode_rdev;
 122        u64 cur_inode_last_extent;
 123        u64 cur_inode_next_write_offset;
 124        bool ignore_cur_inode;
 125
 126        u64 send_progress;
 127
 128        struct list_head new_refs;
 129        struct list_head deleted_refs;
 130
 131        struct radix_tree_root name_cache;
 132        struct list_head name_cache_list;
 133        int name_cache_size;
 134
 135        struct file_ra_state ra;
 136
 137        /*
 138         * We process inodes by their increasing order, so if before an
 139         * incremental send we reverse the parent/child relationship of
 140         * directories such that a directory with a lower inode number was
 141         * the parent of a directory with a higher inode number, and the one
 142         * becoming the new parent got renamed too, we can't rename/move the
 143         * directory with lower inode number when we finish processing it - we
 144         * must process the directory with higher inode number first, then
 145         * rename/move it and then rename/move the directory with lower inode
 146         * number. Example follows.
 147         *
 148         * Tree state when the first send was performed:
 149         *
 150         * .
 151         * |-- a                   (ino 257)
 152         *     |-- b               (ino 258)
 153         *         |
 154         *         |
 155         *         |-- c           (ino 259)
 156         *         |   |-- d       (ino 260)
 157         *         |
 158         *         |-- c2          (ino 261)
 159         *
 160         * Tree state when the second (incremental) send is performed:
 161         *
 162         * .
 163         * |-- a                   (ino 257)
 164         *     |-- b               (ino 258)
 165         *         |-- c2          (ino 261)
 166         *             |-- d2      (ino 260)
 167         *                 |-- cc  (ino 259)
 168         *
 169         * The sequence of steps that lead to the second state was:
 170         *
 171         * mv /a/b/c/d /a/b/c2/d2
 172         * mv /a/b/c /a/b/c2/d2/cc
 173         *
 174         * "c" has lower inode number, but we can't move it (2nd mv operation)
 175         * before we move "d", which has higher inode number.
 176         *
 177         * So we just memorize which move/rename operations must be performed
 178         * later when their respective parent is processed and moved/renamed.
 179         */
 180
 181        /* Indexed by parent directory inode number. */
 182        struct rb_root pending_dir_moves;
 183
 184        /*
 185         * Reverse index, indexed by the inode number of a directory that
 186         * is waiting for the move/rename of its immediate parent before its
 187         * own move/rename can be performed.
 188         */
 189        struct rb_root waiting_dir_moves;
 190
 191        /*
 192         * A directory that is going to be rm'ed might have a child directory
 193         * which is in the pending directory moves index above. In this case,
 194         * the directory can only be removed after the move/rename of its child
 195         * is performed. Example:
 196         *
 197         * Parent snapshot:
 198         *
 199         * .                        (ino 256)
 200         * |-- a/                   (ino 257)
 201         *     |-- b/               (ino 258)
 202         *         |-- c/           (ino 259)
 203         *         |   |-- x/       (ino 260)
 204         *         |
 205         *         |-- y/           (ino 261)
 206         *
 207         * Send snapshot:
 208         *
 209         * .                        (ino 256)
 210         * |-- a/                   (ino 257)
 211         *     |-- b/               (ino 258)
 212         *         |-- YY/          (ino 261)
 213         *              |-- x/      (ino 260)
 214         *
 215         * Sequence of steps that lead to the send snapshot:
 216         * rm -f /a/b/c/foo.txt
 217         * mv /a/b/y /a/b/YY
 218         * mv /a/b/c/x /a/b/YY
 219         * rmdir /a/b/c
 220         *
 221         * When the child is processed, its move/rename is delayed until its
 222         * parent is processed (as explained above), but all other operations
 223         * like update utimes, chown, chgrp, etc, are performed and the paths
 224         * that it uses for those operations must use the orphanized name of
 225         * its parent (the directory we're going to rm later), so we need to
 226         * memorize that name.
 227         *
 228         * Indexed by the inode number of the directory to be deleted.
 229         */
 230        struct rb_root orphan_dirs;
 231};
 232
 233struct pending_dir_move {
 234        struct rb_node node;
 235        struct list_head list;
 236        u64 parent_ino;
 237        u64 ino;
 238        u64 gen;
 239        struct list_head update_refs;
 240};
 241
 242struct waiting_dir_move {
 243        struct rb_node node;
 244        u64 ino;
 245        /*
 246         * There might be some directory that could not be removed because it
 247         * was waiting for this directory inode to be moved first. Therefore
 248         * after this directory is moved, we can try to rmdir the ino rmdir_ino.
 249         */
 250        u64 rmdir_ino;
 251        u64 rmdir_gen;
 252        bool orphanized;
 253};
 254
 255struct orphan_dir_info {
 256        struct rb_node node;
 257        u64 ino;
 258        u64 gen;
 259        u64 last_dir_index_offset;
 260};
 261
 262struct name_cache_entry {
 263        struct list_head list;
 264        /*
 265         * radix_tree has only 32bit entries but we need to handle 64bit inums.
 266         * We use the lower 32bit of the 64bit inum to store it in the tree. If
 267         * more then one inum would fall into the same entry, we use radix_list
 268         * to store the additional entries. radix_list is also used to store
 269         * entries where two entries have the same inum but different
 270         * generations.
 271         */
 272        struct list_head radix_list;
 273        u64 ino;
 274        u64 gen;
 275        u64 parent_ino;
 276        u64 parent_gen;
 277        int ret;
 278        int need_later_update;
 279        int name_len;
 280        char name[];
 281};
 282
 283#define ADVANCE                                                 1
 284#define ADVANCE_ONLY_NEXT                                       -1
 285
 286enum btrfs_compare_tree_result {
 287        BTRFS_COMPARE_TREE_NEW,
 288        BTRFS_COMPARE_TREE_DELETED,
 289        BTRFS_COMPARE_TREE_CHANGED,
 290        BTRFS_COMPARE_TREE_SAME,
 291};
 292
 293__cold
 294static void inconsistent_snapshot_error(struct send_ctx *sctx,
 295                                        enum btrfs_compare_tree_result result,
 296                                        const char *what)
 297{
 298        const char *result_string;
 299
 300        switch (result) {
 301        case BTRFS_COMPARE_TREE_NEW:
 302                result_string = "new";
 303                break;
 304        case BTRFS_COMPARE_TREE_DELETED:
 305                result_string = "deleted";
 306                break;
 307        case BTRFS_COMPARE_TREE_CHANGED:
 308                result_string = "updated";
 309                break;
 310        case BTRFS_COMPARE_TREE_SAME:
 311                ASSERT(0);
 312                result_string = "unchanged";
 313                break;
 314        default:
 315                ASSERT(0);
 316                result_string = "unexpected";
 317        }
 318
 319        btrfs_err(sctx->send_root->fs_info,
 320                  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
 321                  result_string, what, sctx->cmp_key->objectid,
 322                  sctx->send_root->root_key.objectid,
 323                  (sctx->parent_root ?
 324                   sctx->parent_root->root_key.objectid : 0));
 325}
 326
 327__maybe_unused
 328static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
 329{
 330        switch (sctx->proto) {
 331        case 1:  return cmd < __BTRFS_SEND_C_MAX_V1;
 332        case 2:  return cmd < __BTRFS_SEND_C_MAX_V2;
 333        default: return false;
 334        }
 335}
 336
 337static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
 338
 339static struct waiting_dir_move *
 340get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
 341
 342static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
 343
 344static int need_send_hole(struct send_ctx *sctx)
 345{
 346        return (sctx->parent_root && !sctx->cur_inode_new &&
 347                !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
 348                S_ISREG(sctx->cur_inode_mode));
 349}
 350
 351static void fs_path_reset(struct fs_path *p)
 352{
 353        if (p->reversed) {
 354                p->start = p->buf + p->buf_len - 1;
 355                p->end = p->start;
 356                *p->start = 0;
 357        } else {
 358                p->start = p->buf;
 359                p->end = p->start;
 360                *p->start = 0;
 361        }
 362}
 363
 364static struct fs_path *fs_path_alloc(void)
 365{
 366        struct fs_path *p;
 367
 368        p = kmalloc(sizeof(*p), GFP_KERNEL);
 369        if (!p)
 370                return NULL;
 371        p->reversed = 0;
 372        p->buf = p->inline_buf;
 373        p->buf_len = FS_PATH_INLINE_SIZE;
 374        fs_path_reset(p);
 375        return p;
 376}
 377
 378static struct fs_path *fs_path_alloc_reversed(void)
 379{
 380        struct fs_path *p;
 381
 382        p = fs_path_alloc();
 383        if (!p)
 384                return NULL;
 385        p->reversed = 1;
 386        fs_path_reset(p);
 387        return p;
 388}
 389
 390static void fs_path_free(struct fs_path *p)
 391{
 392        if (!p)
 393                return;
 394        if (p->buf != p->inline_buf)
 395                kfree(p->buf);
 396        kfree(p);
 397}
 398
 399static int fs_path_len(struct fs_path *p)
 400{
 401        return p->end - p->start;
 402}
 403
 404static int fs_path_ensure_buf(struct fs_path *p, int len)
 405{
 406        char *tmp_buf;
 407        int path_len;
 408        int old_buf_len;
 409
 410        len++;
 411
 412        if (p->buf_len >= len)
 413                return 0;
 414
 415        if (len > PATH_MAX) {
 416                WARN_ON(1);
 417                return -ENOMEM;
 418        }
 419
 420        path_len = p->end - p->start;
 421        old_buf_len = p->buf_len;
 422
 423        /*
 424         * First time the inline_buf does not suffice
 425         */
 426        if (p->buf == p->inline_buf) {
 427                tmp_buf = kmalloc(len, GFP_KERNEL);
 428                if (tmp_buf)
 429                        memcpy(tmp_buf, p->buf, old_buf_len);
 430        } else {
 431                tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
 432        }
 433        if (!tmp_buf)
 434                return -ENOMEM;
 435        p->buf = tmp_buf;
 436        /*
 437         * The real size of the buffer is bigger, this will let the fast path
 438         * happen most of the time
 439         */
 440        p->buf_len = ksize(p->buf);
 441
 442        if (p->reversed) {
 443                tmp_buf = p->buf + old_buf_len - path_len - 1;
 444                p->end = p->buf + p->buf_len - 1;
 445                p->start = p->end - path_len;
 446                memmove(p->start, tmp_buf, path_len + 1);
 447        } else {
 448                p->start = p->buf;
 449                p->end = p->start + path_len;
 450        }
 451        return 0;
 452}
 453
 454static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
 455                                   char **prepared)
 456{
 457        int ret;
 458        int new_len;
 459
 460        new_len = p->end - p->start + name_len;
 461        if (p->start != p->end)
 462                new_len++;
 463        ret = fs_path_ensure_buf(p, new_len);
 464        if (ret < 0)
 465                goto out;
 466
 467        if (p->reversed) {
 468                if (p->start != p->end)
 469                        *--p->start = '/';
 470                p->start -= name_len;
 471                *prepared = p->start;
 472        } else {
 473                if (p->start != p->end)
 474                        *p->end++ = '/';
 475                *prepared = p->end;
 476                p->end += name_len;
 477                *p->end = 0;
 478        }
 479
 480out:
 481        return ret;
 482}
 483
 484static int fs_path_add(struct fs_path *p, const char *name, int name_len)
 485{
 486        int ret;
 487        char *prepared;
 488
 489        ret = fs_path_prepare_for_add(p, name_len, &prepared);
 490        if (ret < 0)
 491                goto out;
 492        memcpy(prepared, name, name_len);
 493
 494out:
 495        return ret;
 496}
 497
 498static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
 499{
 500        int ret;
 501        char *prepared;
 502
 503        ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
 504        if (ret < 0)
 505                goto out;
 506        memcpy(prepared, p2->start, p2->end - p2->start);
 507
 508out:
 509        return ret;
 510}
 511
 512static int fs_path_add_from_extent_buffer(struct fs_path *p,
 513                                          struct extent_buffer *eb,
 514                                          unsigned long off, int len)
 515{
 516        int ret;
 517        char *prepared;
 518
 519        ret = fs_path_prepare_for_add(p, len, &prepared);
 520        if (ret < 0)
 521                goto out;
 522
 523        read_extent_buffer(eb, prepared, off, len);
 524
 525out:
 526        return ret;
 527}
 528
 529static int fs_path_copy(struct fs_path *p, struct fs_path *from)
 530{
 531        int ret;
 532
 533        p->reversed = from->reversed;
 534        fs_path_reset(p);
 535
 536        ret = fs_path_add_path(p, from);
 537
 538        return ret;
 539}
 540
 541
 542static void fs_path_unreverse(struct fs_path *p)
 543{
 544        char *tmp;
 545        int len;
 546
 547        if (!p->reversed)
 548                return;
 549
 550        tmp = p->start;
 551        len = p->end - p->start;
 552        p->start = p->buf;
 553        p->end = p->start + len;
 554        memmove(p->start, tmp, len + 1);
 555        p->reversed = 0;
 556}
 557
 558static struct btrfs_path *alloc_path_for_send(void)
 559{
 560        struct btrfs_path *path;
 561
 562        path = btrfs_alloc_path();
 563        if (!path)
 564                return NULL;
 565        path->search_commit_root = 1;
 566        path->skip_locking = 1;
 567        path->need_commit_sem = 1;
 568        return path;
 569}
 570
 571static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
 572{
 573        int ret;
 574        u32 pos = 0;
 575
 576        while (pos < len) {
 577                ret = kernel_write(filp, buf + pos, len - pos, off);
 578                /* TODO handle that correctly */
 579                /*if (ret == -ERESTARTSYS) {
 580                        continue;
 581                }*/
 582                if (ret < 0)
 583                        return ret;
 584                if (ret == 0) {
 585                        return -EIO;
 586                }
 587                pos += ret;
 588        }
 589
 590        return 0;
 591}
 592
 593static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
 594{
 595        struct btrfs_tlv_header *hdr;
 596        int total_len = sizeof(*hdr) + len;
 597        int left = sctx->send_max_size - sctx->send_size;
 598
 599        if (unlikely(left < total_len))
 600                return -EOVERFLOW;
 601
 602        hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
 603        put_unaligned_le16(attr, &hdr->tlv_type);
 604        put_unaligned_le16(len, &hdr->tlv_len);
 605        memcpy(hdr + 1, data, len);
 606        sctx->send_size += total_len;
 607
 608        return 0;
 609}
 610
 611#define TLV_PUT_DEFINE_INT(bits) \
 612        static int tlv_put_u##bits(struct send_ctx *sctx,               \
 613                        u##bits attr, u##bits value)                    \
 614        {                                                               \
 615                __le##bits __tmp = cpu_to_le##bits(value);              \
 616                return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));      \
 617        }
 618
 619TLV_PUT_DEFINE_INT(64)
 620
 621static int tlv_put_string(struct send_ctx *sctx, u16 attr,
 622                          const char *str, int len)
 623{
 624        if (len == -1)
 625                len = strlen(str);
 626        return tlv_put(sctx, attr, str, len);
 627}
 628
 629static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
 630                        const u8 *uuid)
 631{
 632        return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
 633}
 634
 635static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
 636                                  struct extent_buffer *eb,
 637                                  struct btrfs_timespec *ts)
 638{
 639        struct btrfs_timespec bts;
 640        read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
 641        return tlv_put(sctx, attr, &bts, sizeof(bts));
 642}
 643
 644
 645#define TLV_PUT(sctx, attrtype, data, attrlen) \
 646        do { \
 647                ret = tlv_put(sctx, attrtype, data, attrlen); \
 648                if (ret < 0) \
 649                        goto tlv_put_failure; \
 650        } while (0)
 651
 652#define TLV_PUT_INT(sctx, attrtype, bits, value) \
 653        do { \
 654                ret = tlv_put_u##bits(sctx, attrtype, value); \
 655                if (ret < 0) \
 656                        goto tlv_put_failure; \
 657        } while (0)
 658
 659#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
 660#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
 661#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
 662#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
 663#define TLV_PUT_STRING(sctx, attrtype, str, len) \
 664        do { \
 665                ret = tlv_put_string(sctx, attrtype, str, len); \
 666                if (ret < 0) \
 667                        goto tlv_put_failure; \
 668        } while (0)
 669#define TLV_PUT_PATH(sctx, attrtype, p) \
 670        do { \
 671                ret = tlv_put_string(sctx, attrtype, p->start, \
 672                        p->end - p->start); \
 673                if (ret < 0) \
 674                        goto tlv_put_failure; \
 675        } while(0)
 676#define TLV_PUT_UUID(sctx, attrtype, uuid) \
 677        do { \
 678                ret = tlv_put_uuid(sctx, attrtype, uuid); \
 679                if (ret < 0) \
 680                        goto tlv_put_failure; \
 681        } while (0)
 682#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
 683        do { \
 684                ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
 685                if (ret < 0) \
 686                        goto tlv_put_failure; \
 687        } while (0)
 688
 689static int send_header(struct send_ctx *sctx)
 690{
 691        struct btrfs_stream_header hdr;
 692
 693        strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
 694        hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
 695
 696        return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
 697                                        &sctx->send_off);
 698}
 699
 700/*
 701 * For each command/item we want to send to userspace, we call this function.
 702 */
 703static int begin_cmd(struct send_ctx *sctx, int cmd)
 704{
 705        struct btrfs_cmd_header *hdr;
 706
 707        if (WARN_ON(!sctx->send_buf))
 708                return -EINVAL;
 709
 710        BUG_ON(sctx->send_size);
 711
 712        sctx->send_size += sizeof(*hdr);
 713        hdr = (struct btrfs_cmd_header *)sctx->send_buf;
 714        put_unaligned_le16(cmd, &hdr->cmd);
 715
 716        return 0;
 717}
 718
 719static int send_cmd(struct send_ctx *sctx)
 720{
 721        int ret;
 722        struct btrfs_cmd_header *hdr;
 723        u32 crc;
 724
 725        hdr = (struct btrfs_cmd_header *)sctx->send_buf;
 726        put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
 727        put_unaligned_le32(0, &hdr->crc);
 728
 729        crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
 730        put_unaligned_le32(crc, &hdr->crc);
 731
 732        ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
 733                                        &sctx->send_off);
 734
 735        sctx->total_send_size += sctx->send_size;
 736        sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
 737        sctx->send_size = 0;
 738
 739        return ret;
 740}
 741
 742/*
 743 * Sends a move instruction to user space
 744 */
 745static int send_rename(struct send_ctx *sctx,
 746                     struct fs_path *from, struct fs_path *to)
 747{
 748        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 749        int ret;
 750
 751        btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
 752
 753        ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
 754        if (ret < 0)
 755                goto out;
 756
 757        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
 758        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
 759
 760        ret = send_cmd(sctx);
 761
 762tlv_put_failure:
 763out:
 764        return ret;
 765}
 766
 767/*
 768 * Sends a link instruction to user space
 769 */
 770static int send_link(struct send_ctx *sctx,
 771                     struct fs_path *path, struct fs_path *lnk)
 772{
 773        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 774        int ret;
 775
 776        btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
 777
 778        ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
 779        if (ret < 0)
 780                goto out;
 781
 782        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
 783        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
 784
 785        ret = send_cmd(sctx);
 786
 787tlv_put_failure:
 788out:
 789        return ret;
 790}
 791
 792/*
 793 * Sends an unlink instruction to user space
 794 */
 795static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
 796{
 797        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 798        int ret;
 799
 800        btrfs_debug(fs_info, "send_unlink %s", path->start);
 801
 802        ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
 803        if (ret < 0)
 804                goto out;
 805
 806        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
 807
 808        ret = send_cmd(sctx);
 809
 810tlv_put_failure:
 811out:
 812        return ret;
 813}
 814
 815/*
 816 * Sends a rmdir instruction to user space
 817 */
 818static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
 819{
 820        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
 821        int ret;
 822
 823        btrfs_debug(fs_info, "send_rmdir %s", path->start);
 824
 825        ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
 826        if (ret < 0)
 827                goto out;
 828
 829        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
 830
 831        ret = send_cmd(sctx);
 832
 833tlv_put_failure:
 834out:
 835        return ret;
 836}
 837
 838/*
 839 * Helper function to retrieve some fields from an inode item.
 840 */
 841static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
 842                          u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
 843                          u64 *gid, u64 *rdev)
 844{
 845        int ret;
 846        struct btrfs_inode_item *ii;
 847        struct btrfs_key key;
 848
 849        key.objectid = ino;
 850        key.type = BTRFS_INODE_ITEM_KEY;
 851        key.offset = 0;
 852        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 853        if (ret) {
 854                if (ret > 0)
 855                        ret = -ENOENT;
 856                return ret;
 857        }
 858
 859        ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
 860                        struct btrfs_inode_item);
 861        if (size)
 862                *size = btrfs_inode_size(path->nodes[0], ii);
 863        if (gen)
 864                *gen = btrfs_inode_generation(path->nodes[0], ii);
 865        if (mode)
 866                *mode = btrfs_inode_mode(path->nodes[0], ii);
 867        if (uid)
 868                *uid = btrfs_inode_uid(path->nodes[0], ii);
 869        if (gid)
 870                *gid = btrfs_inode_gid(path->nodes[0], ii);
 871        if (rdev)
 872                *rdev = btrfs_inode_rdev(path->nodes[0], ii);
 873
 874        return ret;
 875}
 876
 877static int get_inode_info(struct btrfs_root *root,
 878                          u64 ino, u64 *size, u64 *gen,
 879                          u64 *mode, u64 *uid, u64 *gid,
 880                          u64 *rdev)
 881{
 882        struct btrfs_path *path;
 883        int ret;
 884
 885        path = alloc_path_for_send();
 886        if (!path)
 887                return -ENOMEM;
 888        ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
 889                               rdev);
 890        btrfs_free_path(path);
 891        return ret;
 892}
 893
 894typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
 895                                   struct fs_path *p,
 896                                   void *ctx);
 897
 898/*
 899 * Helper function to iterate the entries in ONE btrfs_inode_ref or
 900 * btrfs_inode_extref.
 901 * The iterate callback may return a non zero value to stop iteration. This can
 902 * be a negative value for error codes or 1 to simply stop it.
 903 *
 904 * path must point to the INODE_REF or INODE_EXTREF when called.
 905 */
 906static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
 907                             struct btrfs_key *found_key, int resolve,
 908                             iterate_inode_ref_t iterate, void *ctx)
 909{
 910        struct extent_buffer *eb = path->nodes[0];
 911        struct btrfs_inode_ref *iref;
 912        struct btrfs_inode_extref *extref;
 913        struct btrfs_path *tmp_path;
 914        struct fs_path *p;
 915        u32 cur = 0;
 916        u32 total;
 917        int slot = path->slots[0];
 918        u32 name_len;
 919        char *start;
 920        int ret = 0;
 921        int num = 0;
 922        int index;
 923        u64 dir;
 924        unsigned long name_off;
 925        unsigned long elem_size;
 926        unsigned long ptr;
 927
 928        p = fs_path_alloc_reversed();
 929        if (!p)
 930                return -ENOMEM;
 931
 932        tmp_path = alloc_path_for_send();
 933        if (!tmp_path) {
 934                fs_path_free(p);
 935                return -ENOMEM;
 936        }
 937
 938
 939        if (found_key->type == BTRFS_INODE_REF_KEY) {
 940                ptr = (unsigned long)btrfs_item_ptr(eb, slot,
 941                                                    struct btrfs_inode_ref);
 942                total = btrfs_item_size(eb, slot);
 943                elem_size = sizeof(*iref);
 944        } else {
 945                ptr = btrfs_item_ptr_offset(eb, slot);
 946                total = btrfs_item_size(eb, slot);
 947                elem_size = sizeof(*extref);
 948        }
 949
 950        while (cur < total) {
 951                fs_path_reset(p);
 952
 953                if (found_key->type == BTRFS_INODE_REF_KEY) {
 954                        iref = (struct btrfs_inode_ref *)(ptr + cur);
 955                        name_len = btrfs_inode_ref_name_len(eb, iref);
 956                        name_off = (unsigned long)(iref + 1);
 957                        index = btrfs_inode_ref_index(eb, iref);
 958                        dir = found_key->offset;
 959                } else {
 960                        extref = (struct btrfs_inode_extref *)(ptr + cur);
 961                        name_len = btrfs_inode_extref_name_len(eb, extref);
 962                        name_off = (unsigned long)&extref->name;
 963                        index = btrfs_inode_extref_index(eb, extref);
 964                        dir = btrfs_inode_extref_parent(eb, extref);
 965                }
 966
 967                if (resolve) {
 968                        start = btrfs_ref_to_path(root, tmp_path, name_len,
 969                                                  name_off, eb, dir,
 970                                                  p->buf, p->buf_len);
 971                        if (IS_ERR(start)) {
 972                                ret = PTR_ERR(start);
 973                                goto out;
 974                        }
 975                        if (start < p->buf) {
 976                                /* overflow , try again with larger buffer */
 977                                ret = fs_path_ensure_buf(p,
 978                                                p->buf_len + p->buf - start);
 979                                if (ret < 0)
 980                                        goto out;
 981                                start = btrfs_ref_to_path(root, tmp_path,
 982                                                          name_len, name_off,
 983                                                          eb, dir,
 984                                                          p->buf, p->buf_len);
 985                                if (IS_ERR(start)) {
 986                                        ret = PTR_ERR(start);
 987                                        goto out;
 988                                }
 989                                BUG_ON(start < p->buf);
 990                        }
 991                        p->start = start;
 992                } else {
 993                        ret = fs_path_add_from_extent_buffer(p, eb, name_off,
 994                                                             name_len);
 995                        if (ret < 0)
 996                                goto out;
 997                }
 998
 999                cur += elem_size + name_len;
1000                ret = iterate(num, dir, index, p, ctx);
1001                if (ret)
1002                        goto out;
1003                num++;
1004        }
1005
1006out:
1007        btrfs_free_path(tmp_path);
1008        fs_path_free(p);
1009        return ret;
1010}
1011
1012typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
1013                                  const char *name, int name_len,
1014                                  const char *data, int data_len,
1015                                  void *ctx);
1016
1017/*
1018 * Helper function to iterate the entries in ONE btrfs_dir_item.
1019 * The iterate callback may return a non zero value to stop iteration. This can
1020 * be a negative value for error codes or 1 to simply stop it.
1021 *
1022 * path must point to the dir item when called.
1023 */
1024static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1025                            iterate_dir_item_t iterate, void *ctx)
1026{
1027        int ret = 0;
1028        struct extent_buffer *eb;
1029        struct btrfs_dir_item *di;
1030        struct btrfs_key di_key;
1031        char *buf = NULL;
1032        int buf_len;
1033        u32 name_len;
1034        u32 data_len;
1035        u32 cur;
1036        u32 len;
1037        u32 total;
1038        int slot;
1039        int num;
1040
1041        /*
1042         * Start with a small buffer (1 page). If later we end up needing more
1043         * space, which can happen for xattrs on a fs with a leaf size greater
1044         * then the page size, attempt to increase the buffer. Typically xattr
1045         * values are small.
1046         */
1047        buf_len = PATH_MAX;
1048        buf = kmalloc(buf_len, GFP_KERNEL);
1049        if (!buf) {
1050                ret = -ENOMEM;
1051                goto out;
1052        }
1053
1054        eb = path->nodes[0];
1055        slot = path->slots[0];
1056        di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1057        cur = 0;
1058        len = 0;
1059        total = btrfs_item_size(eb, slot);
1060
1061        num = 0;
1062        while (cur < total) {
1063                name_len = btrfs_dir_name_len(eb, di);
1064                data_len = btrfs_dir_data_len(eb, di);
1065                btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1066
1067                if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
1068                        if (name_len > XATTR_NAME_MAX) {
1069                                ret = -ENAMETOOLONG;
1070                                goto out;
1071                        }
1072                        if (name_len + data_len >
1073                                        BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
1074                                ret = -E2BIG;
1075                                goto out;
1076                        }
1077                } else {
1078                        /*
1079                         * Path too long
1080                         */
1081                        if (name_len + data_len > PATH_MAX) {
1082                                ret = -ENAMETOOLONG;
1083                                goto out;
1084                        }
1085                }
1086
1087                if (name_len + data_len > buf_len) {
1088                        buf_len = name_len + data_len;
1089                        if (is_vmalloc_addr(buf)) {
1090                                vfree(buf);
1091                                buf = NULL;
1092                        } else {
1093                                char *tmp = krealloc(buf, buf_len,
1094                                                GFP_KERNEL | __GFP_NOWARN);
1095
1096                                if (!tmp)
1097                                        kfree(buf);
1098                                buf = tmp;
1099                        }
1100                        if (!buf) {
1101                                buf = kvmalloc(buf_len, GFP_KERNEL);
1102                                if (!buf) {
1103                                        ret = -ENOMEM;
1104                                        goto out;
1105                                }
1106                        }
1107                }
1108
1109                read_extent_buffer(eb, buf, (unsigned long)(di + 1),
1110                                name_len + data_len);
1111
1112                len = sizeof(*di) + name_len + data_len;
1113                di = (struct btrfs_dir_item *)((char *)di + len);
1114                cur += len;
1115
1116                ret = iterate(num, &di_key, buf, name_len, buf + name_len,
1117                              data_len, ctx);
1118                if (ret < 0)
1119                        goto out;
1120                if (ret) {
1121                        ret = 0;
1122                        goto out;
1123                }
1124
1125                num++;
1126        }
1127
1128out:
1129        kvfree(buf);
1130        return ret;
1131}
1132
1133static int __copy_first_ref(int num, u64 dir, int index,
1134                            struct fs_path *p, void *ctx)
1135{
1136        int ret;
1137        struct fs_path *pt = ctx;
1138
1139        ret = fs_path_copy(pt, p);
1140        if (ret < 0)
1141                return ret;
1142
1143        /* we want the first only */
1144        return 1;
1145}
1146
1147/*
1148 * Retrieve the first path of an inode. If an inode has more then one
1149 * ref/hardlink, this is ignored.
1150 */
1151static int get_inode_path(struct btrfs_root *root,
1152                          u64 ino, struct fs_path *path)
1153{
1154        int ret;
1155        struct btrfs_key key, found_key;
1156        struct btrfs_path *p;
1157
1158        p = alloc_path_for_send();
1159        if (!p)
1160                return -ENOMEM;
1161
1162        fs_path_reset(path);
1163
1164        key.objectid = ino;
1165        key.type = BTRFS_INODE_REF_KEY;
1166        key.offset = 0;
1167
1168        ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
1169        if (ret < 0)
1170                goto out;
1171        if (ret) {
1172                ret = 1;
1173                goto out;
1174        }
1175        btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
1176        if (found_key.objectid != ino ||
1177            (found_key.type != BTRFS_INODE_REF_KEY &&
1178             found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1179                ret = -ENOENT;
1180                goto out;
1181        }
1182
1183        ret = iterate_inode_ref(root, p, &found_key, 1,
1184                                __copy_first_ref, path);
1185        if (ret < 0)
1186                goto out;
1187        ret = 0;
1188
1189out:
1190        btrfs_free_path(p);
1191        return ret;
1192}
1193
1194struct backref_ctx {
1195        struct send_ctx *sctx;
1196
1197        /* number of total found references */
1198        u64 found;
1199
1200        /*
1201         * used for clones found in send_root. clones found behind cur_objectid
1202         * and cur_offset are not considered as allowed clones.
1203         */
1204        u64 cur_objectid;
1205        u64 cur_offset;
1206
1207        /* may be truncated in case it's the last extent in a file */
1208        u64 extent_len;
1209
1210        /* Just to check for bugs in backref resolving */
1211        int found_itself;
1212};
1213
1214static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1215{
1216        u64 root = (u64)(uintptr_t)key;
1217        const struct clone_root *cr = elt;
1218
1219        if (root < cr->root->root_key.objectid)
1220                return -1;
1221        if (root > cr->root->root_key.objectid)
1222                return 1;
1223        return 0;
1224}
1225
1226static int __clone_root_cmp_sort(const void *e1, const void *e2)
1227{
1228        const struct clone_root *cr1 = e1;
1229        const struct clone_root *cr2 = e2;
1230
1231        if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
1232                return -1;
1233        if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
1234                return 1;
1235        return 0;
1236}
1237
1238/*
1239 * Called for every backref that is found for the current extent.
1240 * Results are collected in sctx->clone_roots->ino/offset/found_refs
1241 */
1242static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1243{
1244        struct backref_ctx *bctx = ctx_;
1245        struct clone_root *found;
1246
1247        /* First check if the root is in the list of accepted clone sources */
1248        found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1249                        bctx->sctx->clone_roots_cnt,
1250                        sizeof(struct clone_root),
1251                        __clone_root_cmp_bsearch);
1252        if (!found)
1253                return 0;
1254
1255        if (found->root == bctx->sctx->send_root &&
1256            ino == bctx->cur_objectid &&
1257            offset == bctx->cur_offset) {
1258                bctx->found_itself = 1;
1259        }
1260
1261        /*
1262         * Make sure we don't consider clones from send_root that are
1263         * behind the current inode/offset.
1264         */
1265        if (found->root == bctx->sctx->send_root) {
1266                /*
1267                 * If the source inode was not yet processed we can't issue a
1268                 * clone operation, as the source extent does not exist yet at
1269                 * the destination of the stream.
1270                 */
1271                if (ino > bctx->cur_objectid)
1272                        return 0;
1273                /*
1274                 * We clone from the inode currently being sent as long as the
1275                 * source extent is already processed, otherwise we could try
1276                 * to clone from an extent that does not exist yet at the
1277                 * destination of the stream.
1278                 */
1279                if (ino == bctx->cur_objectid &&
1280                    offset + bctx->extent_len >
1281                    bctx->sctx->cur_inode_next_write_offset)
1282                        return 0;
1283        }
1284
1285        bctx->found++;
1286        found->found_refs++;
1287        if (ino < found->ino) {
1288                found->ino = ino;
1289                found->offset = offset;
1290        } else if (found->ino == ino) {
1291                /*
1292                 * same extent found more then once in the same file.
1293                 */
1294                if (found->offset > offset + bctx->extent_len)
1295                        found->offset = offset;
1296        }
1297
1298        return 0;
1299}
1300
1301/*
1302 * Given an inode, offset and extent item, it finds a good clone for a clone
1303 * instruction. Returns -ENOENT when none could be found. The function makes
1304 * sure that the returned clone is usable at the point where sending is at the
1305 * moment. This means, that no clones are accepted which lie behind the current
1306 * inode+offset.
1307 *
1308 * path must point to the extent item when called.
1309 */
1310static int find_extent_clone(struct send_ctx *sctx,
1311                             struct btrfs_path *path,
1312                             u64 ino, u64 data_offset,
1313                             u64 ino_size,
1314                             struct clone_root **found)
1315{
1316        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1317        int ret;
1318        int extent_type;
1319        u64 logical;
1320        u64 disk_byte;
1321        u64 num_bytes;
1322        u64 extent_item_pos;
1323        u64 flags = 0;
1324        struct btrfs_file_extent_item *fi;
1325        struct extent_buffer *eb = path->nodes[0];
1326        struct backref_ctx backref_ctx = {0};
1327        struct clone_root *cur_clone_root;
1328        struct btrfs_key found_key;
1329        struct btrfs_path *tmp_path;
1330        struct btrfs_extent_item *ei;
1331        int compressed;
1332        u32 i;
1333
1334        tmp_path = alloc_path_for_send();
1335        if (!tmp_path)
1336                return -ENOMEM;
1337
1338        /* We only use this path under the commit sem */
1339        tmp_path->need_commit_sem = 0;
1340
1341        if (data_offset >= ino_size) {
1342                /*
1343                 * There may be extents that lie behind the file's size.
1344                 * I at least had this in combination with snapshotting while
1345                 * writing large files.
1346                 */
1347                ret = 0;
1348                goto out;
1349        }
1350
1351        fi = btrfs_item_ptr(eb, path->slots[0],
1352                        struct btrfs_file_extent_item);
1353        extent_type = btrfs_file_extent_type(eb, fi);
1354        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1355                ret = -ENOENT;
1356                goto out;
1357        }
1358        compressed = btrfs_file_extent_compression(eb, fi);
1359
1360        num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1361        disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1362        if (disk_byte == 0) {
1363                ret = -ENOENT;
1364                goto out;
1365        }
1366        logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1367
1368        down_read(&fs_info->commit_root_sem);
1369        ret = extent_from_logical(fs_info, disk_byte, tmp_path,
1370                                  &found_key, &flags);
1371        up_read(&fs_info->commit_root_sem);
1372
1373        if (ret < 0)
1374                goto out;
1375        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1376                ret = -EIO;
1377                goto out;
1378        }
1379
1380        ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
1381                            struct btrfs_extent_item);
1382        /*
1383         * Backreference walking (iterate_extent_inodes() below) is currently
1384         * too expensive when an extent has a large number of references, both
1385         * in time spent and used memory. So for now just fallback to write
1386         * operations instead of clone operations when an extent has more than
1387         * a certain amount of references.
1388         */
1389        if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
1390                ret = -ENOENT;
1391                goto out;
1392        }
1393        btrfs_release_path(tmp_path);
1394
1395        /*
1396         * Setup the clone roots.
1397         */
1398        for (i = 0; i < sctx->clone_roots_cnt; i++) {
1399                cur_clone_root = sctx->clone_roots + i;
1400                cur_clone_root->ino = (u64)-1;
1401                cur_clone_root->offset = 0;
1402                cur_clone_root->found_refs = 0;
1403        }
1404
1405        backref_ctx.sctx = sctx;
1406        backref_ctx.found = 0;
1407        backref_ctx.cur_objectid = ino;
1408        backref_ctx.cur_offset = data_offset;
1409        backref_ctx.found_itself = 0;
1410        backref_ctx.extent_len = num_bytes;
1411
1412        /*
1413         * The last extent of a file may be too large due to page alignment.
1414         * We need to adjust extent_len in this case so that the checks in
1415         * __iterate_backrefs work.
1416         */
1417        if (data_offset + num_bytes >= ino_size)
1418                backref_ctx.extent_len = ino_size - data_offset;
1419
1420        /*
1421         * Now collect all backrefs.
1422         */
1423        if (compressed == BTRFS_COMPRESS_NONE)
1424                extent_item_pos = logical - found_key.objectid;
1425        else
1426                extent_item_pos = 0;
1427        ret = iterate_extent_inodes(fs_info, found_key.objectid,
1428                                    extent_item_pos, 1, __iterate_backrefs,
1429                                    &backref_ctx, false);
1430
1431        if (ret < 0)
1432                goto out;
1433
1434        down_read(&fs_info->commit_root_sem);
1435        if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
1436                /*
1437                 * A transaction commit for a transaction in which block group
1438                 * relocation was done just happened.
1439                 * The disk_bytenr of the file extent item we processed is
1440                 * possibly stale, referring to the extent's location before
1441                 * relocation. So act as if we haven't found any clone sources
1442                 * and fallback to write commands, which will read the correct
1443                 * data from the new extent location. Otherwise we will fail
1444                 * below because we haven't found our own back reference or we
1445                 * could be getting incorrect sources in case the old extent
1446                 * was already reallocated after the relocation.
1447                 */
1448                up_read(&fs_info->commit_root_sem);
1449                ret = -ENOENT;
1450                goto out;
1451        }
1452        up_read(&fs_info->commit_root_sem);
1453
1454        if (!backref_ctx.found_itself) {
1455                /* found a bug in backref code? */
1456                ret = -EIO;
1457                btrfs_err(fs_info,
1458                          "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
1459                          ino, data_offset, disk_byte, found_key.objectid);
1460                goto out;
1461        }
1462
1463        btrfs_debug(fs_info,
1464                    "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
1465                    data_offset, ino, num_bytes, logical);
1466
1467        if (!backref_ctx.found)
1468                btrfs_debug(fs_info, "no clones found");
1469
1470        cur_clone_root = NULL;
1471        for (i = 0; i < sctx->clone_roots_cnt; i++) {
1472                if (sctx->clone_roots[i].found_refs) {
1473                        if (!cur_clone_root)
1474                                cur_clone_root = sctx->clone_roots + i;
1475                        else if (sctx->clone_roots[i].root == sctx->send_root)
1476                                /* prefer clones from send_root over others */
1477                                cur_clone_root = sctx->clone_roots + i;
1478                }
1479
1480        }
1481
1482        if (cur_clone_root) {
1483                *found = cur_clone_root;
1484                ret = 0;
1485        } else {
1486                ret = -ENOENT;
1487        }
1488
1489out:
1490        btrfs_free_path(tmp_path);
1491        return ret;
1492}
1493
1494static int read_symlink(struct btrfs_root *root,
1495                        u64 ino,
1496                        struct fs_path *dest)
1497{
1498        int ret;
1499        struct btrfs_path *path;
1500        struct btrfs_key key;
1501        struct btrfs_file_extent_item *ei;
1502        u8 type;
1503        u8 compression;
1504        unsigned long off;
1505        int len;
1506
1507        path = alloc_path_for_send();
1508        if (!path)
1509                return -ENOMEM;
1510
1511        key.objectid = ino;
1512        key.type = BTRFS_EXTENT_DATA_KEY;
1513        key.offset = 0;
1514        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1515        if (ret < 0)
1516                goto out;
1517        if (ret) {
1518                /*
1519                 * An empty symlink inode. Can happen in rare error paths when
1520                 * creating a symlink (transaction committed before the inode
1521                 * eviction handler removed the symlink inode items and a crash
1522                 * happened in between or the subvol was snapshoted in between).
1523                 * Print an informative message to dmesg/syslog so that the user
1524                 * can delete the symlink.
1525                 */
1526                btrfs_err(root->fs_info,
1527                          "Found empty symlink inode %llu at root %llu",
1528                          ino, root->root_key.objectid);
1529                ret = -EIO;
1530                goto out;
1531        }
1532
1533        ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1534                        struct btrfs_file_extent_item);
1535        type = btrfs_file_extent_type(path->nodes[0], ei);
1536        compression = btrfs_file_extent_compression(path->nodes[0], ei);
1537        BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
1538        BUG_ON(compression);
1539
1540        off = btrfs_file_extent_inline_start(ei);
1541        len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
1542
1543        ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1544
1545out:
1546        btrfs_free_path(path);
1547        return ret;
1548}
1549
1550/*
1551 * Helper function to generate a file name that is unique in the root of
1552 * send_root and parent_root. This is used to generate names for orphan inodes.
1553 */
1554static int gen_unique_name(struct send_ctx *sctx,
1555                           u64 ino, u64 gen,
1556                           struct fs_path *dest)
1557{
1558        int ret = 0;
1559        struct btrfs_path *path;
1560        struct btrfs_dir_item *di;
1561        char tmp[64];
1562        int len;
1563        u64 idx = 0;
1564
1565        path = alloc_path_for_send();
1566        if (!path)
1567                return -ENOMEM;
1568
1569        while (1) {
1570                len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1571                                ino, gen, idx);
1572                ASSERT(len < sizeof(tmp));
1573
1574                di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1575                                path, BTRFS_FIRST_FREE_OBJECTID,
1576                                tmp, strlen(tmp), 0);
1577                btrfs_release_path(path);
1578                if (IS_ERR(di)) {
1579                        ret = PTR_ERR(di);
1580                        goto out;
1581                }
1582                if (di) {
1583                        /* not unique, try again */
1584                        idx++;
1585                        continue;
1586                }
1587
1588                if (!sctx->parent_root) {
1589                        /* unique */
1590                        ret = 0;
1591                        break;
1592                }
1593
1594                di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
1595                                path, BTRFS_FIRST_FREE_OBJECTID,
1596                                tmp, strlen(tmp), 0);
1597                btrfs_release_path(path);
1598                if (IS_ERR(di)) {
1599                        ret = PTR_ERR(di);
1600                        goto out;
1601                }
1602                if (di) {
1603                        /* not unique, try again */
1604                        idx++;
1605                        continue;
1606                }
1607                /* unique */
1608                break;
1609        }
1610
1611        ret = fs_path_add(dest, tmp, strlen(tmp));
1612
1613out:
1614        btrfs_free_path(path);
1615        return ret;
1616}
1617
1618enum inode_state {
1619        inode_state_no_change,
1620        inode_state_will_create,
1621        inode_state_did_create,
1622        inode_state_will_delete,
1623        inode_state_did_delete,
1624};
1625
1626static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1627{
1628        int ret;
1629        int left_ret;
1630        int right_ret;
1631        u64 left_gen;
1632        u64 right_gen;
1633
1634        ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
1635                        NULL, NULL);
1636        if (ret < 0 && ret != -ENOENT)
1637                goto out;
1638        left_ret = ret;
1639
1640        if (!sctx->parent_root) {
1641                right_ret = -ENOENT;
1642        } else {
1643                ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
1644                                NULL, NULL, NULL, NULL);
1645                if (ret < 0 && ret != -ENOENT)
1646                        goto out;
1647                right_ret = ret;
1648        }
1649
1650        if (!left_ret && !right_ret) {
1651                if (left_gen == gen && right_gen == gen) {
1652                        ret = inode_state_no_change;
1653                } else if (left_gen == gen) {
1654                        if (ino < sctx->send_progress)
1655                                ret = inode_state_did_create;
1656                        else
1657                                ret = inode_state_will_create;
1658                } else if (right_gen == gen) {
1659                        if (ino < sctx->send_progress)
1660                                ret = inode_state_did_delete;
1661                        else
1662                                ret = inode_state_will_delete;
1663                } else  {
1664                        ret = -ENOENT;
1665                }
1666        } else if (!left_ret) {
1667                if (left_gen == gen) {
1668                        if (ino < sctx->send_progress)
1669                                ret = inode_state_did_create;
1670                        else
1671                                ret = inode_state_will_create;
1672                } else {
1673                        ret = -ENOENT;
1674                }
1675        } else if (!right_ret) {
1676                if (right_gen == gen) {
1677                        if (ino < sctx->send_progress)
1678                                ret = inode_state_did_delete;
1679                        else
1680                                ret = inode_state_will_delete;
1681                } else {
1682                        ret = -ENOENT;
1683                }
1684        } else {
1685                ret = -ENOENT;
1686        }
1687
1688out:
1689        return ret;
1690}
1691
1692static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
1693{
1694        int ret;
1695
1696        if (ino == BTRFS_FIRST_FREE_OBJECTID)
1697                return 1;
1698
1699        ret = get_cur_inode_state(sctx, ino, gen);
1700        if (ret < 0)
1701                goto out;
1702
1703        if (ret == inode_state_no_change ||
1704            ret == inode_state_did_create ||
1705            ret == inode_state_will_delete)
1706                ret = 1;
1707        else
1708                ret = 0;
1709
1710out:
1711        return ret;
1712}
1713
1714/*
1715 * Helper function to lookup a dir item in a dir.
1716 */
1717static int lookup_dir_item_inode(struct btrfs_root *root,
1718                                 u64 dir, const char *name, int name_len,
1719                                 u64 *found_inode)
1720{
1721        int ret = 0;
1722        struct btrfs_dir_item *di;
1723        struct btrfs_key key;
1724        struct btrfs_path *path;
1725
1726        path = alloc_path_for_send();
1727        if (!path)
1728                return -ENOMEM;
1729
1730        di = btrfs_lookup_dir_item(NULL, root, path,
1731                        dir, name, name_len, 0);
1732        if (IS_ERR_OR_NULL(di)) {
1733                ret = di ? PTR_ERR(di) : -ENOENT;
1734                goto out;
1735        }
1736        btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1737        if (key.type == BTRFS_ROOT_ITEM_KEY) {
1738                ret = -ENOENT;
1739                goto out;
1740        }
1741        *found_inode = key.objectid;
1742
1743out:
1744        btrfs_free_path(path);
1745        return ret;
1746}
1747
1748/*
1749 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1750 * generation of the parent dir and the name of the dir entry.
1751 */
1752static int get_first_ref(struct btrfs_root *root, u64 ino,
1753                         u64 *dir, u64 *dir_gen, struct fs_path *name)
1754{
1755        int ret;
1756        struct btrfs_key key;
1757        struct btrfs_key found_key;
1758        struct btrfs_path *path;
1759        int len;
1760        u64 parent_dir;
1761
1762        path = alloc_path_for_send();
1763        if (!path)
1764                return -ENOMEM;
1765
1766        key.objectid = ino;
1767        key.type = BTRFS_INODE_REF_KEY;
1768        key.offset = 0;
1769
1770        ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
1771        if (ret < 0)
1772                goto out;
1773        if (!ret)
1774                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1775                                path->slots[0]);
1776        if (ret || found_key.objectid != ino ||
1777            (found_key.type != BTRFS_INODE_REF_KEY &&
1778             found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1779                ret = -ENOENT;
1780                goto out;
1781        }
1782
1783        if (found_key.type == BTRFS_INODE_REF_KEY) {
1784                struct btrfs_inode_ref *iref;
1785                iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1786                                      struct btrfs_inode_ref);
1787                len = btrfs_inode_ref_name_len(path->nodes[0], iref);
1788                ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1789                                                     (unsigned long)(iref + 1),
1790                                                     len);
1791                parent_dir = found_key.offset;
1792        } else {
1793                struct btrfs_inode_extref *extref;
1794                extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1795                                        struct btrfs_inode_extref);
1796                len = btrfs_inode_extref_name_len(path->nodes[0], extref);
1797                ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1798                                        (unsigned long)&extref->name, len);
1799                parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
1800        }
1801        if (ret < 0)
1802                goto out;
1803        btrfs_release_path(path);
1804
1805        if (dir_gen) {
1806                ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
1807                                     NULL, NULL, NULL);
1808                if (ret < 0)
1809                        goto out;
1810        }
1811
1812        *dir = parent_dir;
1813
1814out:
1815        btrfs_free_path(path);
1816        return ret;
1817}
1818
1819static int is_first_ref(struct btrfs_root *root,
1820                        u64 ino, u64 dir,
1821                        const char *name, int name_len)
1822{
1823        int ret;
1824        struct fs_path *tmp_name;
1825        u64 tmp_dir;
1826
1827        tmp_name = fs_path_alloc();
1828        if (!tmp_name)
1829                return -ENOMEM;
1830
1831        ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
1832        if (ret < 0)
1833                goto out;
1834
1835        if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
1836                ret = 0;
1837                goto out;
1838        }
1839
1840        ret = !memcmp(tmp_name->start, name, name_len);
1841
1842out:
1843        fs_path_free(tmp_name);
1844        return ret;
1845}
1846
1847/*
1848 * Used by process_recorded_refs to determine if a new ref would overwrite an
1849 * already existing ref. In case it detects an overwrite, it returns the
1850 * inode/gen in who_ino/who_gen.
1851 * When an overwrite is detected, process_recorded_refs does proper orphanizing
1852 * to make sure later references to the overwritten inode are possible.
1853 * Orphanizing is however only required for the first ref of an inode.
1854 * process_recorded_refs does an additional is_first_ref check to see if
1855 * orphanizing is really required.
1856 */
1857static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1858                              const char *name, int name_len,
1859                              u64 *who_ino, u64 *who_gen, u64 *who_mode)
1860{
1861        int ret = 0;
1862        u64 gen;
1863        u64 other_inode = 0;
1864
1865        if (!sctx->parent_root)
1866                goto out;
1867
1868        ret = is_inode_existent(sctx, dir, dir_gen);
1869        if (ret <= 0)
1870                goto out;
1871
1872        /*
1873         * If we have a parent root we need to verify that the parent dir was
1874         * not deleted and then re-created, if it was then we have no overwrite
1875         * and we can just unlink this entry.
1876         */
1877        if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
1878                ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
1879                                     NULL, NULL, NULL);
1880                if (ret < 0 && ret != -ENOENT)
1881                        goto out;
1882                if (ret) {
1883                        ret = 0;
1884                        goto out;
1885                }
1886                if (gen != dir_gen)
1887                        goto out;
1888        }
1889
1890        ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
1891                                    &other_inode);
1892        if (ret < 0 && ret != -ENOENT)
1893                goto out;
1894        if (ret) {
1895                ret = 0;
1896                goto out;
1897        }
1898
1899        /*
1900         * Check if the overwritten ref was already processed. If yes, the ref
1901         * was already unlinked/moved, so we can safely assume that we will not
1902         * overwrite anything at this point in time.
1903         */
1904        if (other_inode > sctx->send_progress ||
1905            is_waiting_for_move(sctx, other_inode)) {
1906                ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1907                                who_gen, who_mode, NULL, NULL, NULL);
1908                if (ret < 0)
1909                        goto out;
1910
1911                ret = 1;
1912                *who_ino = other_inode;
1913        } else {
1914                ret = 0;
1915        }
1916
1917out:
1918        return ret;
1919}
1920
1921/*
1922 * Checks if the ref was overwritten by an already processed inode. This is
1923 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
1924 * thus the orphan name needs be used.
1925 * process_recorded_refs also uses it to avoid unlinking of refs that were
1926 * overwritten.
1927 */
1928static int did_overwrite_ref(struct send_ctx *sctx,
1929                            u64 dir, u64 dir_gen,
1930                            u64 ino, u64 ino_gen,
1931                            const char *name, int name_len)
1932{
1933        int ret = 0;
1934        u64 gen;
1935        u64 ow_inode;
1936
1937        if (!sctx->parent_root)
1938                goto out;
1939
1940        ret = is_inode_existent(sctx, dir, dir_gen);
1941        if (ret <= 0)
1942                goto out;
1943
1944        if (dir != BTRFS_FIRST_FREE_OBJECTID) {
1945                ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
1946                                     NULL, NULL, NULL);
1947                if (ret < 0 && ret != -ENOENT)
1948                        goto out;
1949                if (ret) {
1950                        ret = 0;
1951                        goto out;
1952                }
1953                if (gen != dir_gen)
1954                        goto out;
1955        }
1956
1957        /* check if the ref was overwritten by another ref */
1958        ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
1959                                    &ow_inode);
1960        if (ret < 0 && ret != -ENOENT)
1961                goto out;
1962        if (ret) {
1963                /* was never and will never be overwritten */
1964                ret = 0;
1965                goto out;
1966        }
1967
1968        ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1969                        NULL, NULL);
1970        if (ret < 0)
1971                goto out;
1972
1973        if (ow_inode == ino && gen == ino_gen) {
1974                ret = 0;
1975                goto out;
1976        }
1977
1978        /*
1979         * We know that it is or will be overwritten. Check this now.
1980         * The current inode being processed might have been the one that caused
1981         * inode 'ino' to be orphanized, therefore check if ow_inode matches
1982         * the current inode being processed.
1983         */
1984        if ((ow_inode < sctx->send_progress) ||
1985            (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
1986             gen == sctx->cur_inode_gen))
1987                ret = 1;
1988        else
1989                ret = 0;
1990
1991out:
1992        return ret;
1993}
1994
1995/*
1996 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
1997 * that got overwritten. This is used by process_recorded_refs to determine
1998 * if it has to use the path as returned by get_cur_path or the orphan name.
1999 */
2000static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
2001{
2002        int ret = 0;
2003        struct fs_path *name = NULL;
2004        u64 dir;
2005        u64 dir_gen;
2006
2007        if (!sctx->parent_root)
2008                goto out;
2009
2010        name = fs_path_alloc();
2011        if (!name)
2012                return -ENOMEM;
2013
2014        ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
2015        if (ret < 0)
2016                goto out;
2017
2018        ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
2019                        name->start, fs_path_len(name));
2020
2021out:
2022        fs_path_free(name);
2023        return ret;
2024}
2025
2026/*
2027 * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
2028 * so we need to do some special handling in case we have clashes. This function
2029 * takes care of this with the help of name_cache_entry::radix_list.
2030 * In case of error, nce is kfreed.
2031 */
2032static int name_cache_insert(struct send_ctx *sctx,
2033                             struct name_cache_entry *nce)
2034{
2035        int ret = 0;
2036        struct list_head *nce_head;
2037
2038        nce_head = radix_tree_lookup(&sctx->name_cache,
2039                        (unsigned long)nce->ino);
2040        if (!nce_head) {
2041                nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
2042                if (!nce_head) {
2043                        kfree(nce);
2044                        return -ENOMEM;
2045                }
2046                INIT_LIST_HEAD(nce_head);
2047
2048                ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
2049                if (ret < 0) {
2050                        kfree(nce_head);
2051                        kfree(nce);
2052                        return ret;
2053                }
2054        }
2055        list_add_tail(&nce->radix_list, nce_head);
2056        list_add_tail(&nce->list, &sctx->name_cache_list);
2057        sctx->name_cache_size++;
2058
2059        return ret;
2060}
2061
2062static void name_cache_delete(struct send_ctx *sctx,
2063                              struct name_cache_entry *nce)
2064{
2065        struct list_head *nce_head;
2066
2067        nce_head = radix_tree_lookup(&sctx->name_cache,
2068                        (unsigned long)nce->ino);
2069        if (!nce_head) {
2070                btrfs_err(sctx->send_root->fs_info,
2071              "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
2072                        nce->ino, sctx->name_cache_size);
2073        }
2074
2075        list_del(&nce->radix_list);
2076        list_del(&nce->list);
2077        sctx->name_cache_size--;
2078
2079        /*
2080         * We may not get to the final release of nce_head if the lookup fails
2081         */
2082        if (nce_head && list_empty(nce_head)) {
2083                radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
2084                kfree(nce_head);
2085        }
2086}
2087
2088static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
2089                                                    u64 ino, u64 gen)
2090{
2091        struct list_head *nce_head;
2092        struct name_cache_entry *cur;
2093
2094        nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
2095        if (!nce_head)
2096                return NULL;
2097
2098        list_for_each_entry(cur, nce_head, radix_list) {
2099                if (cur->ino == ino && cur->gen == gen)
2100                        return cur;
2101        }
2102        return NULL;
2103}
2104
2105/*
2106 * Remove some entries from the beginning of name_cache_list.
2107 */
2108static void name_cache_clean_unused(struct send_ctx *sctx)
2109{
2110        struct name_cache_entry *nce;
2111
2112        if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
2113                return;
2114
2115        while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
2116                nce = list_entry(sctx->name_cache_list.next,
2117                                struct name_cache_entry, list);
2118                name_cache_delete(sctx, nce);
2119                kfree(nce);
2120        }
2121}
2122
2123static void name_cache_free(struct send_ctx *sctx)
2124{
2125        struct name_cache_entry *nce;
2126
2127        while (!list_empty(&sctx->name_cache_list)) {
2128                nce = list_entry(sctx->name_cache_list.next,
2129                                struct name_cache_entry, list);
2130                name_cache_delete(sctx, nce);
2131                kfree(nce);
2132        }
2133}
2134
2135/*
2136 * Used by get_cur_path for each ref up to the root.
2137 * Returns 0 if it succeeded.
2138 * Returns 1 if the inode is not existent or got overwritten. In that case, the
2139 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2140 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
2141 * Returns <0 in case of error.
2142 */
2143static int __get_cur_name_and_parent(struct send_ctx *sctx,
2144                                     u64 ino, u64 gen,
2145                                     u64 *parent_ino,
2146                                     u64 *parent_gen,
2147                                     struct fs_path *dest)
2148{
2149        int ret;
2150        int nce_ret;
2151        struct name_cache_entry *nce = NULL;
2152
2153        /*
2154         * First check if we already did a call to this function with the same
2155         * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2156         * return the cached result.
2157         */
2158        nce = name_cache_search(sctx, ino, gen);
2159        if (nce) {
2160                if (ino < sctx->send_progress && nce->need_later_update) {
2161                        name_cache_delete(sctx, nce);
2162                        kfree(nce);
2163                        nce = NULL;
2164                } else {
2165                        /*
2166                         * Removes the entry from the list and adds it back to
2167                         * the end.  This marks the entry as recently used so
2168                         * that name_cache_clean_unused does not remove it.
2169                         */
2170                        list_move_tail(&nce->list, &sctx->name_cache_list);
2171
2172                        *parent_ino = nce->parent_ino;
2173                        *parent_gen = nce->parent_gen;
2174                        ret = fs_path_add(dest, nce->name, nce->name_len);
2175                        if (ret < 0)
2176                                goto out;
2177                        ret = nce->ret;
2178                        goto out;
2179                }
2180        }
2181
2182        /*
2183         * If the inode is not existent yet, add the orphan name and return 1.
2184         * This should only happen for the parent dir that we determine in
2185         * __record_new_ref
2186         */
2187        ret = is_inode_existent(sctx, ino, gen);
2188        if (ret < 0)
2189                goto out;
2190
2191        if (!ret) {
2192                ret = gen_unique_name(sctx, ino, gen, dest);
2193                if (ret < 0)
2194                        goto out;
2195                ret = 1;
2196                goto out_cache;
2197        }
2198
2199        /*
2200         * Depending on whether the inode was already processed or not, use
2201         * send_root or parent_root for ref lookup.
2202         */
2203        if (ino < sctx->send_progress)
2204                ret = get_first_ref(sctx->send_root, ino,
2205                                    parent_ino, parent_gen, dest);
2206        else
2207                ret = get_first_ref(sctx->parent_root, ino,
2208                                    parent_ino, parent_gen, dest);
2209        if (ret < 0)
2210                goto out;
2211
2212        /*
2213         * Check if the ref was overwritten by an inode's ref that was processed
2214         * earlier. If yes, treat as orphan and return 1.
2215         */
2216        ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
2217                        dest->start, dest->end - dest->start);
2218        if (ret < 0)
2219                goto out;
2220        if (ret) {
2221                fs_path_reset(dest);
2222                ret = gen_unique_name(sctx, ino, gen, dest);
2223                if (ret < 0)
2224                        goto out;
2225                ret = 1;
2226        }
2227
2228out_cache:
2229        /*
2230         * Store the result of the lookup in the name cache.
2231         */
2232        nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
2233        if (!nce) {
2234                ret = -ENOMEM;
2235                goto out;
2236        }
2237
2238        nce->ino = ino;
2239        nce->gen = gen;
2240        nce->parent_ino = *parent_ino;
2241        nce->parent_gen = *parent_gen;
2242        nce->name_len = fs_path_len(dest);
2243        nce->ret = ret;
2244        strcpy(nce->name, dest->start);
2245
2246        if (ino < sctx->send_progress)
2247                nce->need_later_update = 0;
2248        else
2249                nce->need_later_update = 1;
2250
2251        nce_ret = name_cache_insert(sctx, nce);
2252        if (nce_ret < 0)
2253                ret = nce_ret;
2254        name_cache_clean_unused(sctx);
2255
2256out:
2257        return ret;
2258}
2259
2260/*
2261 * Magic happens here. This function returns the first ref to an inode as it
2262 * would look like while receiving the stream at this point in time.
2263 * We walk the path up to the root. For every inode in between, we check if it
2264 * was already processed/sent. If yes, we continue with the parent as found
2265 * in send_root. If not, we continue with the parent as found in parent_root.
2266 * If we encounter an inode that was deleted at this point in time, we use the
2267 * inodes "orphan" name instead of the real name and stop. Same with new inodes
2268 * that were not created yet and overwritten inodes/refs.
2269 *
2270 * When do we have orphan inodes:
2271 * 1. When an inode is freshly created and thus no valid refs are available yet
2272 * 2. When a directory lost all it's refs (deleted) but still has dir items
2273 *    inside which were not processed yet (pending for move/delete). If anyone
2274 *    tried to get the path to the dir items, it would get a path inside that
2275 *    orphan directory.
2276 * 3. When an inode is moved around or gets new links, it may overwrite the ref
2277 *    of an unprocessed inode. If in that case the first ref would be
2278 *    overwritten, the overwritten inode gets "orphanized". Later when we
2279 *    process this overwritten inode, it is restored at a new place by moving
2280 *    the orphan inode.
2281 *
2282 * sctx->send_progress tells this function at which point in time receiving
2283 * would be.
2284 */
2285static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2286                        struct fs_path *dest)
2287{
2288        int ret = 0;
2289        struct fs_path *name = NULL;
2290        u64 parent_inode = 0;
2291        u64 parent_gen = 0;
2292        int stop = 0;
2293
2294        name = fs_path_alloc();
2295        if (!name) {
2296                ret = -ENOMEM;
2297                goto out;
2298        }
2299
2300        dest->reversed = 1;
2301        fs_path_reset(dest);
2302
2303        while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2304                struct waiting_dir_move *wdm;
2305
2306                fs_path_reset(name);
2307
2308                if (is_waiting_for_rm(sctx, ino, gen)) {
2309                        ret = gen_unique_name(sctx, ino, gen, name);
2310                        if (ret < 0)
2311                                goto out;
2312                        ret = fs_path_add_path(dest, name);
2313                        break;
2314                }
2315
2316                wdm = get_waiting_dir_move(sctx, ino);
2317                if (wdm && wdm->orphanized) {
2318                        ret = gen_unique_name(sctx, ino, gen, name);
2319                        stop = 1;
2320                } else if (wdm) {
2321                        ret = get_first_ref(sctx->parent_root, ino,
2322                                            &parent_inode, &parent_gen, name);
2323                } else {
2324                        ret = __get_cur_name_and_parent(sctx, ino, gen,
2325                                                        &parent_inode,
2326                                                        &parent_gen, name);
2327                        if (ret)
2328                                stop = 1;
2329                }
2330
2331                if (ret < 0)
2332                        goto out;
2333
2334                ret = fs_path_add_path(dest, name);
2335                if (ret < 0)
2336                        goto out;
2337
2338                ino = parent_inode;
2339                gen = parent_gen;
2340        }
2341
2342out:
2343        fs_path_free(name);
2344        if (!ret)
2345                fs_path_unreverse(dest);
2346        return ret;
2347}
2348
2349/*
2350 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2351 */
2352static int send_subvol_begin(struct send_ctx *sctx)
2353{
2354        int ret;
2355        struct btrfs_root *send_root = sctx->send_root;
2356        struct btrfs_root *parent_root = sctx->parent_root;
2357        struct btrfs_path *path;
2358        struct btrfs_key key;
2359        struct btrfs_root_ref *ref;
2360        struct extent_buffer *leaf;
2361        char *name = NULL;
2362        int namelen;
2363
2364        path = btrfs_alloc_path();
2365        if (!path)
2366                return -ENOMEM;
2367
2368        name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2369        if (!name) {
2370                btrfs_free_path(path);
2371                return -ENOMEM;
2372        }
2373
2374        key.objectid = send_root->root_key.objectid;
2375        key.type = BTRFS_ROOT_BACKREF_KEY;
2376        key.offset = 0;
2377
2378        ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
2379                                &key, path, 1, 0);
2380        if (ret < 0)
2381                goto out;
2382        if (ret) {
2383                ret = -ENOENT;
2384                goto out;
2385        }
2386
2387        leaf = path->nodes[0];
2388        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2389        if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2390            key.objectid != send_root->root_key.objectid) {
2391                ret = -ENOENT;
2392                goto out;
2393        }
2394        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
2395        namelen = btrfs_root_ref_name_len(leaf, ref);
2396        read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2397        btrfs_release_path(path);
2398
2399        if (parent_root) {
2400                ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2401                if (ret < 0)
2402                        goto out;
2403        } else {
2404                ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
2405                if (ret < 0)
2406                        goto out;
2407        }
2408
2409        TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2410
2411        if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
2412                TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2413                            sctx->send_root->root_item.received_uuid);
2414        else
2415                TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2416                            sctx->send_root->root_item.uuid);
2417
2418        TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2419                    btrfs_root_ctransid(&sctx->send_root->root_item));
2420        if (parent_root) {
2421                if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
2422                        TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2423                                     parent_root->root_item.received_uuid);
2424                else
2425                        TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2426                                     parent_root->root_item.uuid);
2427                TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2428                            btrfs_root_ctransid(&sctx->parent_root->root_item));
2429        }
2430
2431        ret = send_cmd(sctx);
2432
2433tlv_put_failure:
2434out:
2435        btrfs_free_path(path);
2436        kfree(name);
2437        return ret;
2438}
2439
2440static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2441{
2442        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2443        int ret = 0;
2444        struct fs_path *p;
2445
2446        btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
2447
2448        p = fs_path_alloc();
2449        if (!p)
2450                return -ENOMEM;
2451
2452        ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
2453        if (ret < 0)
2454                goto out;
2455
2456        ret = get_cur_path(sctx, ino, gen, p);
2457        if (ret < 0)
2458                goto out;
2459        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2460        TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2461
2462        ret = send_cmd(sctx);
2463
2464tlv_put_failure:
2465out:
2466        fs_path_free(p);
2467        return ret;
2468}
2469
2470static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2471{
2472        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2473        int ret = 0;
2474        struct fs_path *p;
2475
2476        btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
2477
2478        p = fs_path_alloc();
2479        if (!p)
2480                return -ENOMEM;
2481
2482        ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
2483        if (ret < 0)
2484                goto out;
2485
2486        ret = get_cur_path(sctx, ino, gen, p);
2487        if (ret < 0)
2488                goto out;
2489        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2490        TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
2491
2492        ret = send_cmd(sctx);
2493
2494tlv_put_failure:
2495out:
2496        fs_path_free(p);
2497        return ret;
2498}
2499
2500static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2501{
2502        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2503        int ret = 0;
2504        struct fs_path *p;
2505
2506        btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
2507                    ino, uid, gid);
2508
2509        p = fs_path_alloc();
2510        if (!p)
2511                return -ENOMEM;
2512
2513        ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
2514        if (ret < 0)
2515                goto out;
2516
2517        ret = get_cur_path(sctx, ino, gen, p);
2518        if (ret < 0)
2519                goto out;
2520        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2521        TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2522        TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2523
2524        ret = send_cmd(sctx);
2525
2526tlv_put_failure:
2527out:
2528        fs_path_free(p);
2529        return ret;
2530}
2531
2532static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2533{
2534        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2535        int ret = 0;
2536        struct fs_path *p = NULL;
2537        struct btrfs_inode_item *ii;
2538        struct btrfs_path *path = NULL;
2539        struct extent_buffer *eb;
2540        struct btrfs_key key;
2541        int slot;
2542
2543        btrfs_debug(fs_info, "send_utimes %llu", ino);
2544
2545        p = fs_path_alloc();
2546        if (!p)
2547                return -ENOMEM;
2548
2549        path = alloc_path_for_send();
2550        if (!path) {
2551                ret = -ENOMEM;
2552                goto out;
2553        }
2554
2555        key.objectid = ino;
2556        key.type = BTRFS_INODE_ITEM_KEY;
2557        key.offset = 0;
2558        ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2559        if (ret > 0)
2560                ret = -ENOENT;
2561        if (ret < 0)
2562                goto out;
2563
2564        eb = path->nodes[0];
2565        slot = path->slots[0];
2566        ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2567
2568        ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
2569        if (ret < 0)
2570                goto out;
2571
2572        ret = get_cur_path(sctx, ino, gen, p);
2573        if (ret < 0)
2574                goto out;
2575        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2576        TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2577        TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2578        TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2579        /* TODO Add otime support when the otime patches get into upstream */
2580
2581        ret = send_cmd(sctx);
2582
2583tlv_put_failure:
2584out:
2585        fs_path_free(p);
2586        btrfs_free_path(path);
2587        return ret;
2588}
2589
2590/*
2591 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2592 * a valid path yet because we did not process the refs yet. So, the inode
2593 * is created as orphan.
2594 */
2595static int send_create_inode(struct send_ctx *sctx, u64 ino)
2596{
2597        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2598        int ret = 0;
2599        struct fs_path *p;
2600        int cmd;
2601        u64 gen;
2602        u64 mode;
2603        u64 rdev;
2604
2605        btrfs_debug(fs_info, "send_create_inode %llu", ino);
2606
2607        p = fs_path_alloc();
2608        if (!p)
2609                return -ENOMEM;
2610
2611        if (ino != sctx->cur_ino) {
2612                ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2613                                     NULL, NULL, &rdev);
2614                if (ret < 0)
2615                        goto out;
2616        } else {
2617                gen = sctx->cur_inode_gen;
2618                mode = sctx->cur_inode_mode;
2619                rdev = sctx->cur_inode_rdev;
2620        }
2621
2622        if (S_ISREG(mode)) {
2623                cmd = BTRFS_SEND_C_MKFILE;
2624        } else if (S_ISDIR(mode)) {
2625                cmd = BTRFS_SEND_C_MKDIR;
2626        } else if (S_ISLNK(mode)) {
2627                cmd = BTRFS_SEND_C_SYMLINK;
2628        } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
2629                cmd = BTRFS_SEND_C_MKNOD;
2630        } else if (S_ISFIFO(mode)) {
2631                cmd = BTRFS_SEND_C_MKFIFO;
2632        } else if (S_ISSOCK(mode)) {
2633                cmd = BTRFS_SEND_C_MKSOCK;
2634        } else {
2635                btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2636                                (int)(mode & S_IFMT));
2637                ret = -EOPNOTSUPP;
2638                goto out;
2639        }
2640
2641        ret = begin_cmd(sctx, cmd);
2642        if (ret < 0)
2643                goto out;
2644
2645        ret = gen_unique_name(sctx, ino, gen, p);
2646        if (ret < 0)
2647                goto out;
2648
2649        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2650        TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2651
2652        if (S_ISLNK(mode)) {
2653                fs_path_reset(p);
2654                ret = read_symlink(sctx->send_root, ino, p);
2655                if (ret < 0)
2656                        goto out;
2657                TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2658        } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2659                   S_ISFIFO(mode) || S_ISSOCK(mode)) {
2660                TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2661                TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2662        }
2663
2664        ret = send_cmd(sctx);
2665        if (ret < 0)
2666                goto out;
2667
2668
2669tlv_put_failure:
2670out:
2671        fs_path_free(p);
2672        return ret;
2673}
2674
2675/*
2676 * We need some special handling for inodes that get processed before the parent
2677 * directory got created. See process_recorded_refs for details.
2678 * This function does the check if we already created the dir out of order.
2679 */
2680static int did_create_dir(struct send_ctx *sctx, u64 dir)
2681{
2682        int ret = 0;
2683        struct btrfs_path *path = NULL;
2684        struct btrfs_key key;
2685        struct btrfs_key found_key;
2686        struct btrfs_key di_key;
2687        struct extent_buffer *eb;
2688        struct btrfs_dir_item *di;
2689        int slot;
2690
2691        path = alloc_path_for_send();
2692        if (!path) {
2693                ret = -ENOMEM;
2694                goto out;
2695        }
2696
2697        key.objectid = dir;
2698        key.type = BTRFS_DIR_INDEX_KEY;
2699        key.offset = 0;
2700        ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2701        if (ret < 0)
2702                goto out;
2703
2704        while (1) {
2705                eb = path->nodes[0];
2706                slot = path->slots[0];
2707                if (slot >= btrfs_header_nritems(eb)) {
2708                        ret = btrfs_next_leaf(sctx->send_root, path);
2709                        if (ret < 0) {
2710                                goto out;
2711                        } else if (ret > 0) {
2712                                ret = 0;
2713                                break;
2714                        }
2715                        continue;
2716                }
2717
2718                btrfs_item_key_to_cpu(eb, &found_key, slot);
2719                if (found_key.objectid != key.objectid ||
2720                    found_key.type != key.type) {
2721                        ret = 0;
2722                        goto out;
2723                }
2724
2725                di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2726                btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2727
2728                if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
2729                    di_key.objectid < sctx->send_progress) {
2730                        ret = 1;
2731                        goto out;
2732                }
2733
2734                path->slots[0]++;
2735        }
2736
2737out:
2738        btrfs_free_path(path);
2739        return ret;
2740}
2741
2742/*
2743 * Only creates the inode if it is:
2744 * 1. Not a directory
2745 * 2. Or a directory which was not created already due to out of order
2746 *    directories. See did_create_dir and process_recorded_refs for details.
2747 */
2748static int send_create_inode_if_needed(struct send_ctx *sctx)
2749{
2750        int ret;
2751
2752        if (S_ISDIR(sctx->cur_inode_mode)) {
2753                ret = did_create_dir(sctx, sctx->cur_ino);
2754                if (ret < 0)
2755                        return ret;
2756                else if (ret > 0)
2757                        return 0;
2758        }
2759
2760        return send_create_inode(sctx, sctx->cur_ino);
2761}
2762
2763struct recorded_ref {
2764        struct list_head list;
2765        char *name;
2766        struct fs_path *full_path;
2767        u64 dir;
2768        u64 dir_gen;
2769        int name_len;
2770};
2771
2772static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
2773{
2774        ref->full_path = path;
2775        ref->name = (char *)kbasename(ref->full_path->start);
2776        ref->name_len = ref->full_path->end - ref->name;
2777}
2778
2779/*
2780 * We need to process new refs before deleted refs, but compare_tree gives us
2781 * everything mixed. So we first record all refs and later process them.
2782 * This function is a helper to record one ref.
2783 */
2784static int __record_ref(struct list_head *head, u64 dir,
2785                      u64 dir_gen, struct fs_path *path)
2786{
2787        struct recorded_ref *ref;
2788
2789        ref = kmalloc(sizeof(*ref), GFP_KERNEL);
2790        if (!ref)
2791                return -ENOMEM;
2792
2793        ref->dir = dir;
2794        ref->dir_gen = dir_gen;
2795        set_ref_path(ref, path);
2796        list_add_tail(&ref->list, head);
2797        return 0;
2798}
2799
2800static int dup_ref(struct recorded_ref *ref, struct list_head *list)
2801{
2802        struct recorded_ref *new;
2803
2804        new = kmalloc(sizeof(*ref), GFP_KERNEL);
2805        if (!new)
2806                return -ENOMEM;
2807
2808        new->dir = ref->dir;
2809        new->dir_gen = ref->dir_gen;
2810        new->full_path = NULL;
2811        INIT_LIST_HEAD(&new->list);
2812        list_add_tail(&new->list, list);
2813        return 0;
2814}
2815
2816static void __free_recorded_refs(struct list_head *head)
2817{
2818        struct recorded_ref *cur;
2819
2820        while (!list_empty(head)) {
2821                cur = list_entry(head->next, struct recorded_ref, list);
2822                fs_path_free(cur->full_path);
2823                list_del(&cur->list);
2824                kfree(cur);
2825        }
2826}
2827
2828static void free_recorded_refs(struct send_ctx *sctx)
2829{
2830        __free_recorded_refs(&sctx->new_refs);
2831        __free_recorded_refs(&sctx->deleted_refs);
2832}
2833
2834/*
2835 * Renames/moves a file/dir to its orphan name. Used when the first
2836 * ref of an unprocessed inode gets overwritten and for all non empty
2837 * directories.
2838 */
2839static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2840                          struct fs_path *path)
2841{
2842        int ret;
2843        struct fs_path *orphan;
2844
2845        orphan = fs_path_alloc();
2846        if (!orphan)
2847                return -ENOMEM;
2848
2849        ret = gen_unique_name(sctx, ino, gen, orphan);
2850        if (ret < 0)
2851                goto out;
2852
2853        ret = send_rename(sctx, path, orphan);
2854
2855out:
2856        fs_path_free(orphan);
2857        return ret;
2858}
2859
2860static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
2861                                                   u64 dir_ino, u64 dir_gen)
2862{
2863        struct rb_node **p = &sctx->orphan_dirs.rb_node;
2864        struct rb_node *parent = NULL;
2865        struct orphan_dir_info *entry, *odi;
2866
2867        while (*p) {
2868                parent = *p;
2869                entry = rb_entry(parent, struct orphan_dir_info, node);
2870                if (dir_ino < entry->ino)
2871                        p = &(*p)->rb_left;
2872                else if (dir_ino > entry->ino)
2873                        p = &(*p)->rb_right;
2874                else if (dir_gen < entry->gen)
2875                        p = &(*p)->rb_left;
2876                else if (dir_gen > entry->gen)
2877                        p = &(*p)->rb_right;
2878                else
2879                        return entry;
2880        }
2881
2882        odi = kmalloc(sizeof(*odi), GFP_KERNEL);
2883        if (!odi)
2884                return ERR_PTR(-ENOMEM);
2885        odi->ino = dir_ino;
2886        odi->gen = dir_gen;
2887        odi->last_dir_index_offset = 0;
2888
2889        rb_link_node(&odi->node, parent, p);
2890        rb_insert_color(&odi->node, &sctx->orphan_dirs);
2891        return odi;
2892}
2893
2894static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
2895                                                   u64 dir_ino, u64 gen)
2896{
2897        struct rb_node *n = sctx->orphan_dirs.rb_node;
2898        struct orphan_dir_info *entry;
2899
2900        while (n) {
2901                entry = rb_entry(n, struct orphan_dir_info, node);
2902                if (dir_ino < entry->ino)
2903                        n = n->rb_left;
2904                else if (dir_ino > entry->ino)
2905                        n = n->rb_right;
2906                else if (gen < entry->gen)
2907                        n = n->rb_left;
2908                else if (gen > entry->gen)
2909                        n = n->rb_right;
2910                else
2911                        return entry;
2912        }
2913        return NULL;
2914}
2915
2916static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
2917{
2918        struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
2919
2920        return odi != NULL;
2921}
2922
2923static void free_orphan_dir_info(struct send_ctx *sctx,
2924                                 struct orphan_dir_info *odi)
2925{
2926        if (!odi)
2927                return;
2928        rb_erase(&odi->node, &sctx->orphan_dirs);
2929        kfree(odi);
2930}
2931
2932/*
2933 * Returns 1 if a directory can be removed at this point in time.
2934 * We check this by iterating all dir items and checking if the inode behind
2935 * the dir item was already processed.
2936 */
2937static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2938                     u64 send_progress)
2939{
2940        int ret = 0;
2941        struct btrfs_root *root = sctx->parent_root;
2942        struct btrfs_path *path;
2943        struct btrfs_key key;
2944        struct btrfs_key found_key;
2945        struct btrfs_key loc;
2946        struct btrfs_dir_item *di;
2947        struct orphan_dir_info *odi = NULL;
2948
2949        /*
2950         * Don't try to rmdir the top/root subvolume dir.
2951         */
2952        if (dir == BTRFS_FIRST_FREE_OBJECTID)
2953                return 0;
2954
2955        path = alloc_path_for_send();
2956        if (!path)
2957                return -ENOMEM;
2958
2959        key.objectid = dir;
2960        key.type = BTRFS_DIR_INDEX_KEY;
2961        key.offset = 0;
2962
2963        odi = get_orphan_dir_info(sctx, dir, dir_gen);
2964        if (odi)
2965                key.offset = odi->last_dir_index_offset;
2966
2967        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2968        if (ret < 0)
2969                goto out;
2970
2971        while (1) {
2972                struct waiting_dir_move *dm;
2973
2974                if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2975                        ret = btrfs_next_leaf(root, path);
2976                        if (ret < 0)
2977                                goto out;
2978                        else if (ret > 0)
2979                                break;
2980                        continue;
2981                }
2982                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2983                                      path->slots[0]);
2984                if (found_key.objectid != key.objectid ||
2985                    found_key.type != key.type)
2986                        break;
2987
2988                di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2989                                struct btrfs_dir_item);
2990                btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2991
2992                dm = get_waiting_dir_move(sctx, loc.objectid);
2993                if (dm) {
2994                        odi = add_orphan_dir_info(sctx, dir, dir_gen);
2995                        if (IS_ERR(odi)) {
2996                                ret = PTR_ERR(odi);
2997                                goto out;
2998                        }
2999                        odi->gen = dir_gen;
3000                        odi->last_dir_index_offset = found_key.offset;
3001                        dm->rmdir_ino = dir;
3002                        dm->rmdir_gen = dir_gen;
3003                        ret = 0;
3004                        goto out;
3005                }
3006
3007                if (loc.objectid > send_progress) {
3008                        odi = add_orphan_dir_info(sctx, dir, dir_gen);
3009                        if (IS_ERR(odi)) {
3010                                ret = PTR_ERR(odi);
3011                                goto out;
3012                        }
3013                        odi->gen = dir_gen;
3014                        odi->last_dir_index_offset = found_key.offset;
3015                        ret = 0;
3016                        goto out;
3017                }
3018
3019                path->slots[0]++;
3020        }
3021        free_orphan_dir_info(sctx, odi);
3022
3023        ret = 1;
3024
3025out:
3026        btrfs_free_path(path);
3027        return ret;
3028}
3029
3030static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
3031{
3032        struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
3033
3034        return entry != NULL;
3035}
3036
3037static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
3038{
3039        struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
3040        struct rb_node *parent = NULL;
3041        struct waiting_dir_move *entry, *dm;
3042
3043        dm = kmalloc(sizeof(*dm), GFP_KERNEL);
3044        if (!dm)
3045                return -ENOMEM;
3046        dm->ino = ino;
3047        dm->rmdir_ino = 0;
3048        dm->rmdir_gen = 0;
3049        dm->orphanized = orphanized;
3050
3051        while (*p) {
3052                parent = *p;
3053                entry = rb_entry(parent, struct waiting_dir_move, node);
3054                if (ino < entry->ino) {
3055                        p = &(*p)->rb_left;
3056                } else if (ino > entry->ino) {
3057                        p = &(*p)->rb_right;
3058                } else {
3059                        kfree(dm);
3060                        return -EEXIST;
3061                }
3062        }
3063
3064        rb_link_node(&dm->node, parent, p);
3065        rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
3066        return 0;
3067}
3068
3069static struct waiting_dir_move *
3070get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
3071{
3072        struct rb_node *n = sctx->waiting_dir_moves.rb_node;
3073        struct waiting_dir_move *entry;
3074
3075        while (n) {
3076                entry = rb_entry(n, struct waiting_dir_move, node);
3077                if (ino < entry->ino)
3078                        n = n->rb_left;
3079                else if (ino > entry->ino)
3080                        n = n->rb_right;
3081                else
3082                        return entry;
3083        }
3084        return NULL;
3085}
3086
3087static void free_waiting_dir_move(struct send_ctx *sctx,
3088                                  struct waiting_dir_move *dm)
3089{
3090        if (!dm)
3091                return;
3092        rb_erase(&dm->node, &sctx->waiting_dir_moves);
3093        kfree(dm);
3094}
3095
3096static int add_pending_dir_move(struct send_ctx *sctx,
3097                                u64 ino,
3098                                u64 ino_gen,
3099                                u64 parent_ino,
3100                                struct list_head *new_refs,
3101                                struct list_head *deleted_refs,
3102                                const bool is_orphan)
3103{
3104        struct rb_node **p = &sctx->pending_dir_moves.rb_node;
3105        struct rb_node *parent = NULL;
3106        struct pending_dir_move *entry = NULL, *pm;
3107        struct recorded_ref *cur;
3108        int exists = 0;
3109        int ret;
3110
3111        pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3112        if (!pm)
3113                return -ENOMEM;
3114        pm->parent_ino = parent_ino;
3115        pm->ino = ino;
3116        pm->gen = ino_gen;
3117        INIT_LIST_HEAD(&pm->list);
3118        INIT_LIST_HEAD(&pm->update_refs);
3119        RB_CLEAR_NODE(&pm->node);
3120
3121        while (*p) {
3122                parent = *p;
3123                entry = rb_entry(parent, struct pending_dir_move, node);
3124                if (parent_ino < entry->parent_ino) {
3125                        p = &(*p)->rb_left;
3126                } else if (parent_ino > entry->parent_ino) {
3127                        p = &(*p)->rb_right;
3128                } else {
3129                        exists = 1;
3130                        break;
3131                }
3132        }
3133
3134        list_for_each_entry(cur, deleted_refs, list) {
3135                ret = dup_ref(cur, &pm->update_refs);
3136                if (ret < 0)
3137                        goto out;
3138        }
3139        list_for_each_entry(cur, new_refs, list) {
3140                ret = dup_ref(cur, &pm->update_refs);
3141                if (ret < 0)
3142                        goto out;
3143        }
3144
3145        ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
3146        if (ret)
3147                goto out;
3148
3149        if (exists) {
3150                list_add_tail(&pm->list, &entry->list);
3151        } else {
3152                rb_link_node(&pm->node, parent, p);
3153                rb_insert_color(&pm->node, &sctx->pending_dir_moves);
3154        }
3155        ret = 0;
3156out:
3157        if (ret) {
3158                __free_recorded_refs(&pm->update_refs);
3159                kfree(pm);
3160        }
3161        return ret;
3162}
3163
3164static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3165                                                      u64 parent_ino)
3166{
3167        struct rb_node *n = sctx->pending_dir_moves.rb_node;
3168        struct pending_dir_move *entry;
3169
3170        while (n) {
3171                entry = rb_entry(n, struct pending_dir_move, node);
3172                if (parent_ino < entry->parent_ino)
3173                        n = n->rb_left;
3174                else if (parent_ino > entry->parent_ino)
3175                        n = n->rb_right;
3176                else
3177                        return entry;
3178        }
3179        return NULL;
3180}
3181
3182static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3183                     u64 ino, u64 gen, u64 *ancestor_ino)
3184{
3185        int ret = 0;
3186        u64 parent_inode = 0;
3187        u64 parent_gen = 0;
3188        u64 start_ino = ino;
3189
3190        *ancestor_ino = 0;
3191        while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3192                fs_path_reset(name);
3193
3194                if (is_waiting_for_rm(sctx, ino, gen))
3195                        break;
3196                if (is_waiting_for_move(sctx, ino)) {
3197                        if (*ancestor_ino == 0)
3198                                *ancestor_ino = ino;
3199                        ret = get_first_ref(sctx->parent_root, ino,
3200                                            &parent_inode, &parent_gen, name);
3201                } else {
3202                        ret = __get_cur_name_and_parent(sctx, ino, gen,
3203                                                        &parent_inode,
3204                                                        &parent_gen, name);
3205                        if (ret > 0) {
3206                                ret = 0;
3207                                break;
3208                        }
3209                }
3210                if (ret < 0)
3211                        break;
3212                if (parent_inode == start_ino) {
3213                        ret = 1;
3214                        if (*ancestor_ino == 0)
3215                                *ancestor_ino = ino;
3216                        break;
3217                }
3218                ino = parent_inode;
3219                gen = parent_gen;
3220        }
3221        return ret;
3222}
3223
3224static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3225{
3226        struct fs_path *from_path = NULL;
3227        struct fs_path *to_path = NULL;
3228        struct fs_path *name = NULL;
3229        u64 orig_progress = sctx->send_progress;
3230        struct recorded_ref *cur;
3231        u64 parent_ino, parent_gen;
3232        struct waiting_dir_move *dm = NULL;
3233        u64 rmdir_ino = 0;
3234        u64 rmdir_gen;
3235        u64 ancestor;
3236        bool is_orphan;
3237        int ret;
3238
3239        name = fs_path_alloc();
3240        from_path = fs_path_alloc();
3241        if (!name || !from_path) {
3242                ret = -ENOMEM;
3243                goto out;
3244        }
3245
3246        dm = get_waiting_dir_move(sctx, pm->ino);
3247        ASSERT(dm);
3248        rmdir_ino = dm->rmdir_ino;
3249        rmdir_gen = dm->rmdir_gen;
3250        is_orphan = dm->orphanized;
3251        free_waiting_dir_move(sctx, dm);
3252
3253        if (is_orphan) {
3254                ret = gen_unique_name(sctx, pm->ino,
3255                                      pm->gen, from_path);
3256        } else {
3257                ret = get_first_ref(sctx->parent_root, pm->ino,
3258                                    &parent_ino, &parent_gen, name);
3259                if (ret < 0)
3260                        goto out;
3261                ret = get_cur_path(sctx, parent_ino, parent_gen,
3262                                   from_path);
3263                if (ret < 0)
3264                        goto out;
3265                ret = fs_path_add_path(from_path, name);
3266        }
3267        if (ret < 0)
3268                goto out;
3269
3270        sctx->send_progress = sctx->cur_ino + 1;
3271        ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3272        if (ret < 0)
3273                goto out;
3274        if (ret) {
3275                LIST_HEAD(deleted_refs);
3276                ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3277                ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3278                                           &pm->update_refs, &deleted_refs,
3279                                           is_orphan);
3280                if (ret < 0)
3281                        goto out;
3282                if (rmdir_ino) {
3283                        dm = get_waiting_dir_move(sctx, pm->ino);
3284                        ASSERT(dm);
3285                        dm->rmdir_ino = rmdir_ino;
3286                        dm->rmdir_gen = rmdir_gen;
3287                }
3288                goto out;
3289        }
3290        fs_path_reset(name);
3291        to_path = name;
3292        name = NULL;
3293        ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
3294        if (ret < 0)
3295                goto out;
3296
3297        ret = send_rename(sctx, from_path, to_path);
3298        if (ret < 0)
3299                goto out;
3300
3301        if (rmdir_ino) {
3302                struct orphan_dir_info *odi;
3303                u64 gen;
3304
3305                odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
3306                if (!odi) {
3307                        /* already deleted */
3308                        goto finish;
3309                }
3310                gen = odi->gen;
3311
3312                ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
3313                if (ret < 0)
3314                        goto out;
3315                if (!ret)
3316                        goto finish;
3317
3318                name = fs_path_alloc();
3319                if (!name) {
3320                        ret = -ENOMEM;
3321                        goto out;
3322                }
3323                ret = get_cur_path(sctx, rmdir_ino, gen, name);
3324                if (ret < 0)
3325                        goto out;
3326                ret = send_rmdir(sctx, name);
3327                if (ret < 0)
3328                        goto out;
3329        }
3330
3331finish:
3332        ret = send_utimes(sctx, pm->ino, pm->gen);
3333        if (ret < 0)
3334                goto out;
3335
3336        /*
3337         * After rename/move, need to update the utimes of both new parent(s)
3338         * and old parent(s).
3339         */
3340        list_for_each_entry(cur, &pm->update_refs, list) {
3341                /*
3342                 * The parent inode might have been deleted in the send snapshot
3343                 */
3344                ret = get_inode_info(sctx->send_root, cur->dir, NULL,
3345                                     NULL, NULL, NULL, NULL, NULL);
3346                if (ret == -ENOENT) {
3347                        ret = 0;
3348                        continue;
3349                }
3350                if (ret < 0)
3351                        goto out;
3352
3353                ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3354                if (ret < 0)
3355                        goto out;
3356        }
3357
3358out:
3359        fs_path_free(name);
3360        fs_path_free(from_path);
3361        fs_path_free(to_path);
3362        sctx->send_progress = orig_progress;
3363
3364        return ret;
3365}
3366
3367static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
3368{
3369        if (!list_empty(&m->list))
3370                list_del(&m->list);
3371        if (!RB_EMPTY_NODE(&m->node))
3372                rb_erase(&m->node, &sctx->pending_dir_moves);
3373        __free_recorded_refs(&m->update_refs);
3374        kfree(m);
3375}
3376
3377static void tail_append_pending_moves(struct send_ctx *sctx,
3378                                      struct pending_dir_move *moves,
3379                                      struct list_head *stack)
3380{
3381        if (list_empty(&moves->list)) {
3382                list_add_tail(&moves->list, stack);
3383        } else {
3384                LIST_HEAD(list);
3385                list_splice_init(&moves->list, &list);
3386                list_add_tail(&moves->list, stack);
3387                list_splice_tail(&list, stack);
3388        }
3389        if (!RB_EMPTY_NODE(&moves->node)) {
3390                rb_erase(&moves->node, &sctx->pending_dir_moves);
3391                RB_CLEAR_NODE(&moves->node);
3392        }
3393}
3394
3395static int apply_children_dir_moves(struct send_ctx *sctx)
3396{
3397        struct pending_dir_move *pm;
3398        struct list_head stack;
3399        u64 parent_ino = sctx->cur_ino;
3400        int ret = 0;
3401
3402        pm = get_pending_dir_moves(sctx, parent_ino);
3403        if (!pm)
3404                return 0;
3405
3406        INIT_LIST_HEAD(&stack);
3407        tail_append_pending_moves(sctx, pm, &stack);
3408
3409        while (!list_empty(&stack)) {
3410                pm = list_first_entry(&stack, struct pending_dir_move, list);
3411                parent_ino = pm->ino;
3412                ret = apply_dir_move(sctx, pm);
3413                free_pending_move(sctx, pm);
3414                if (ret)
3415                        goto out;
3416                pm = get_pending_dir_moves(sctx, parent_ino);
3417                if (pm)
3418                        tail_append_pending_moves(sctx, pm, &stack);
3419        }
3420        return 0;
3421
3422out:
3423        while (!list_empty(&stack)) {
3424                pm = list_first_entry(&stack, struct pending_dir_move, list);
3425                free_pending_move(sctx, pm);
3426        }
3427        return ret;
3428}
3429
3430/*
3431 * We might need to delay a directory rename even when no ancestor directory
3432 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3433 * renamed. This happens when we rename a directory to the old name (the name
3434 * in the parent root) of some other unrelated directory that got its rename
3435 * delayed due to some ancestor with higher number that got renamed.
3436 *
3437 * Example:
3438 *
3439 * Parent snapshot:
3440 * .                                       (ino 256)
3441 * |---- a/                                (ino 257)
3442 * |     |---- file                        (ino 260)
3443 * |
3444 * |---- b/                                (ino 258)
3445 * |---- c/                                (ino 259)
3446 *
3447 * Send snapshot:
3448 * .                                       (ino 256)
3449 * |---- a/                                (ino 258)
3450 * |---- x/                                (ino 259)
3451 *       |---- y/                          (ino 257)
3452 *             |----- file                 (ino 260)
3453 *
3454 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3455 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
3456 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3457 * must issue is:
3458 *
3459 * 1 - rename 259 from 'c' to 'x'
3460 * 2 - rename 257 from 'a' to 'x/y'
3461 * 3 - rename 258 from 'b' to 'a'
3462 *
3463 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3464 * be done right away and < 0 on error.
3465 */
3466static int wait_for_dest_dir_move(struct send_ctx *sctx,
3467                                  struct recorded_ref *parent_ref,
3468                                  const bool is_orphan)
3469{
3470        struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
3471        struct btrfs_path *path;
3472        struct btrfs_key key;
3473        struct btrfs_key di_key;
3474        struct btrfs_dir_item *di;
3475        u64 left_gen;
3476        u64 right_gen;
3477        int ret = 0;
3478        struct waiting_dir_move *wdm;
3479
3480        if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3481                return 0;
3482
3483        path = alloc_path_for_send();
3484        if (!path)
3485                return -ENOMEM;
3486
3487        key.objectid = parent_ref->dir;
3488        key.type = BTRFS_DIR_ITEM_KEY;
3489        key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
3490
3491        ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
3492        if (ret < 0) {
3493                goto out;
3494        } else if (ret > 0) {
3495                ret = 0;
3496                goto out;
3497        }
3498
3499        di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
3500                                       parent_ref->name_len);
3501        if (!di) {
3502                ret = 0;
3503                goto out;
3504        }
3505        /*
3506         * di_key.objectid has the number of the inode that has a dentry in the
3507         * parent directory with the same name that sctx->cur_ino is being
3508         * renamed to. We need to check if that inode is in the send root as
3509         * well and if it is currently marked as an inode with a pending rename,
3510         * if it is, we need to delay the rename of sctx->cur_ino as well, so
3511         * that it happens after that other inode is renamed.
3512         */
3513        btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
3514        if (di_key.type != BTRFS_INODE_ITEM_KEY) {
3515                ret = 0;
3516                goto out;
3517        }
3518
3519        ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
3520                             &left_gen, NULL, NULL, NULL, NULL);
3521        if (ret < 0)
3522                goto out;
3523        ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
3524                             &right_gen, NULL, NULL, NULL, NULL);
3525        if (ret < 0) {
3526                if (ret == -ENOENT)
3527                        ret = 0;
3528                goto out;
3529        }
3530
3531        /* Different inode, no need to delay the rename of sctx->cur_ino */
3532        if (right_gen != left_gen) {
3533                ret = 0;
3534                goto out;
3535        }
3536
3537        wdm = get_waiting_dir_move(sctx, di_key.objectid);
3538        if (wdm && !wdm->orphanized) {
3539                ret = add_pending_dir_move(sctx,
3540                                           sctx->cur_ino,
3541                                           sctx->cur_inode_gen,
3542                                           di_key.objectid,
3543                                           &sctx->new_refs,
3544                                           &sctx->deleted_refs,
3545                                           is_orphan);
3546                if (!ret)
3547                        ret = 1;
3548        }
3549out:
3550        btrfs_free_path(path);
3551        return ret;
3552}
3553
3554/*
3555 * Check if inode ino2, or any of its ancestors, is inode ino1.
3556 * Return 1 if true, 0 if false and < 0 on error.
3557 */
3558static int check_ino_in_path(struct btrfs_root *root,
3559                             const u64 ino1,
3560                             const u64 ino1_gen,
3561                             const u64 ino2,
3562                             const u64 ino2_gen,
3563                             struct fs_path *fs_path)
3564{
3565        u64 ino = ino2;
3566
3567        if (ino1 == ino2)
3568                return ino1_gen == ino2_gen;
3569
3570        while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3571                u64 parent;
3572                u64 parent_gen;
3573                int ret;
3574
3575                fs_path_reset(fs_path);
3576                ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
3577                if (ret < 0)
3578                        return ret;
3579                if (parent == ino1)
3580                        return parent_gen == ino1_gen;
3581                ino = parent;
3582        }
3583        return 0;
3584}
3585
3586/*
3587 * Check if ino ino1 is an ancestor of inode ino2 in the given root for any
3588 * possible path (in case ino2 is not a directory and has multiple hard links).
3589 * Return 1 if true, 0 if false and < 0 on error.
3590 */
3591static int is_ancestor(struct btrfs_root *root,
3592                       const u64 ino1,
3593                       const u64 ino1_gen,
3594                       const u64 ino2,
3595                       struct fs_path *fs_path)
3596{
3597        bool free_fs_path = false;
3598        int ret = 0;
3599        struct btrfs_path *path = NULL;
3600        struct btrfs_key key;
3601
3602        if (!fs_path) {
3603                fs_path = fs_path_alloc();
3604                if (!fs_path)
3605                        return -ENOMEM;
3606                free_fs_path = true;
3607        }
3608
3609        path = alloc_path_for_send();
3610        if (!path) {
3611                ret = -ENOMEM;
3612                goto out;
3613        }
3614
3615        key.objectid = ino2;
3616        key.type = BTRFS_INODE_REF_KEY;
3617        key.offset = 0;
3618
3619        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3620        if (ret < 0)
3621                goto out;
3622
3623        while (true) {
3624                struct extent_buffer *leaf = path->nodes[0];
3625                int slot = path->slots[0];
3626                u32 cur_offset = 0;
3627                u32 item_size;
3628
3629                if (slot >= btrfs_header_nritems(leaf)) {
3630                        ret = btrfs_next_leaf(root, path);
3631                        if (ret < 0)
3632                                goto out;
3633                        if (ret > 0)
3634                                break;
3635                        continue;
3636                }
3637
3638                btrfs_item_key_to_cpu(leaf, &key, slot);
3639                if (key.objectid != ino2)
3640                        break;
3641                if (key.type != BTRFS_INODE_REF_KEY &&
3642                    key.type != BTRFS_INODE_EXTREF_KEY)
3643                        break;
3644
3645                item_size = btrfs_item_size(leaf, slot);
3646                while (cur_offset < item_size) {
3647                        u64 parent;
3648                        u64 parent_gen;
3649
3650                        if (key.type == BTRFS_INODE_EXTREF_KEY) {
3651                                unsigned long ptr;
3652                                struct btrfs_inode_extref *extref;
3653
3654                                ptr = btrfs_item_ptr_offset(leaf, slot);
3655                                extref = (struct btrfs_inode_extref *)
3656                                        (ptr + cur_offset);
3657                                parent = btrfs_inode_extref_parent(leaf,
3658                                                                   extref);
3659                                cur_offset += sizeof(*extref);
3660                                cur_offset += btrfs_inode_extref_name_len(leaf,
3661                                                                  extref);
3662                        } else {
3663                                parent = key.offset;
3664                                cur_offset = item_size;
3665                        }
3666
3667                        ret = get_inode_info(root, parent, NULL, &parent_gen,
3668                                             NULL, NULL, NULL, NULL);
3669                        if (ret < 0)
3670                                goto out;
3671                        ret = check_ino_in_path(root, ino1, ino1_gen,
3672                                                parent, parent_gen, fs_path);
3673                        if (ret)
3674                                goto out;
3675                }
3676                path->slots[0]++;
3677        }
3678        ret = 0;
3679 out:
3680        btrfs_free_path(path);
3681        if (free_fs_path)
3682                fs_path_free(fs_path);
3683        return ret;
3684}
3685
3686static int wait_for_parent_move(struct send_ctx *sctx,
3687                                struct recorded_ref *parent_ref,
3688                                const bool is_orphan)
3689{
3690        int ret = 0;
3691        u64 ino = parent_ref->dir;
3692        u64 ino_gen = parent_ref->dir_gen;
3693        u64 parent_ino_before, parent_ino_after;
3694        struct fs_path *path_before = NULL;
3695        struct fs_path *path_after = NULL;
3696        int len1, len2;
3697
3698        path_after = fs_path_alloc();
3699        path_before = fs_path_alloc();
3700        if (!path_after || !path_before) {
3701                ret = -ENOMEM;
3702                goto out;
3703        }
3704
3705        /*
3706         * Our current directory inode may not yet be renamed/moved because some
3707         * ancestor (immediate or not) has to be renamed/moved first. So find if
3708         * such ancestor exists and make sure our own rename/move happens after
3709         * that ancestor is processed to avoid path build infinite loops (done
3710         * at get_cur_path()).
3711         */
3712        while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3713                u64 parent_ino_after_gen;
3714
3715                if (is_waiting_for_move(sctx, ino)) {
3716                        /*
3717                         * If the current inode is an ancestor of ino in the
3718                         * parent root, we need to delay the rename of the
3719                         * current inode, otherwise don't delayed the rename
3720                         * because we can end up with a circular dependency
3721                         * of renames, resulting in some directories never
3722                         * getting the respective rename operations issued in
3723                         * the send stream or getting into infinite path build
3724                         * loops.
3725                         */
3726                        ret = is_ancestor(sctx->parent_root,
3727                                          sctx->cur_ino, sctx->cur_inode_gen,
3728                                          ino, path_before);
3729                        if (ret)
3730                                break;
3731                }
3732
3733                fs_path_reset(path_before);
3734                fs_path_reset(path_after);
3735
3736                ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3737                                    &parent_ino_after_gen, path_after);
3738                if (ret < 0)
3739                        goto out;
3740                ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3741                                    NULL, path_before);
3742                if (ret < 0 && ret != -ENOENT) {
3743                        goto out;
3744                } else if (ret == -ENOENT) {
3745                        ret = 0;
3746                        break;
3747                }
3748
3749                len1 = fs_path_len(path_before);
3750                len2 = fs_path_len(path_after);
3751                if (ino > sctx->cur_ino &&
3752                    (parent_ino_before != parent_ino_after || len1 != len2 ||
3753                     memcmp(path_before->start, path_after->start, len1))) {
3754                        u64 parent_ino_gen;
3755
3756                        ret = get_inode_info(sctx->parent_root, ino, NULL,
3757                                             &parent_ino_gen, NULL, NULL, NULL,
3758                                             NULL);
3759                        if (ret < 0)
3760                                goto out;
3761                        if (ino_gen == parent_ino_gen) {
3762                                ret = 1;
3763                                break;
3764                        }
3765                }
3766                ino = parent_ino_after;
3767                ino_gen = parent_ino_after_gen;
3768        }
3769
3770out:
3771        fs_path_free(path_before);
3772        fs_path_free(path_after);
3773
3774        if (ret == 1) {
3775                ret = add_pending_dir_move(sctx,
3776                                           sctx->cur_ino,
3777                                           sctx->cur_inode_gen,
3778                                           ino,
3779                                           &sctx->new_refs,
3780                                           &sctx->deleted_refs,
3781                                           is_orphan);
3782                if (!ret)
3783                        ret = 1;
3784        }
3785
3786        return ret;
3787}
3788
3789static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
3790{
3791        int ret;
3792        struct fs_path *new_path;
3793
3794        /*
3795         * Our reference's name member points to its full_path member string, so
3796         * we use here a new path.
3797         */
3798        new_path = fs_path_alloc();
3799        if (!new_path)
3800                return -ENOMEM;
3801
3802        ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
3803        if (ret < 0) {
3804                fs_path_free(new_path);
3805                return ret;
3806        }
3807        ret = fs_path_add(new_path, ref->name, ref->name_len);
3808        if (ret < 0) {
3809                fs_path_free(new_path);
3810                return ret;
3811        }
3812
3813        fs_path_free(ref->full_path);
3814        set_ref_path(ref, new_path);
3815
3816        return 0;
3817}
3818
3819/*
3820 * When processing the new references for an inode we may orphanize an existing
3821 * directory inode because its old name conflicts with one of the new references
3822 * of the current inode. Later, when processing another new reference of our
3823 * inode, we might need to orphanize another inode, but the path we have in the
3824 * reference reflects the pre-orphanization name of the directory we previously
3825 * orphanized. For example:
3826 *
3827 * parent snapshot looks like:
3828 *
3829 * .                                     (ino 256)
3830 * |----- f1                             (ino 257)
3831 * |----- f2                             (ino 258)
3832 * |----- d1/                            (ino 259)
3833 *        |----- d2/                     (ino 260)
3834 *
3835 * send snapshot looks like:
3836 *
3837 * .                                     (ino 256)
3838 * |----- d1                             (ino 258)
3839 * |----- f2/                            (ino 259)
3840 *        |----- f2_link/                (ino 260)
3841 *        |       |----- f1              (ino 257)
3842 *        |
3843 *        |----- d2                      (ino 258)
3844 *
3845 * When processing inode 257 we compute the name for inode 259 as "d1", and we
3846 * cache it in the name cache. Later when we start processing inode 258, when
3847 * collecting all its new references we set a full path of "d1/d2" for its new
3848 * reference with name "d2". When we start processing the new references we
3849 * start by processing the new reference with name "d1", and this results in
3850 * orphanizing inode 259, since its old reference causes a conflict. Then we
3851 * move on the next new reference, with name "d2", and we find out we must
3852 * orphanize inode 260, as its old reference conflicts with ours - but for the
3853 * orphanization we use a source path corresponding to the path we stored in the
3854 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
3855 * receiver fail since the path component "d1/" no longer exists, it was renamed
3856 * to "o259-6-0/" when processing the previous new reference. So in this case we
3857 * must recompute the path in the new reference and use it for the new
3858 * orphanization operation.
3859 */
3860static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
3861{
3862        char *name;
3863        int ret;
3864
3865        name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
3866        if (!name)
3867                return -ENOMEM;
3868
3869        fs_path_reset(ref->full_path);
3870        ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
3871        if (ret < 0)
3872                goto out;
3873
3874        ret = fs_path_add(ref->full_path, name, ref->name_len);
3875        if (ret < 0)
3876                goto out;
3877
3878        /* Update the reference's base name pointer. */
3879        set_ref_path(ref, ref->full_path);
3880out:
3881        kfree(name);
3882        return ret;
3883}
3884
3885/*
3886 * This does all the move/link/unlink/rmdir magic.
3887 */
3888static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3889{
3890        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
3891        int ret = 0;
3892        struct recorded_ref *cur;
3893        struct recorded_ref *cur2;
3894        struct list_head check_dirs;
3895        struct fs_path *valid_path = NULL;
3896        u64 ow_inode = 0;
3897        u64 ow_gen;
3898        u64 ow_mode;
3899        int did_overwrite = 0;
3900        int is_orphan = 0;
3901        u64 last_dir_ino_rm = 0;
3902        bool can_rename = true;
3903        bool orphanized_dir = false;
3904        bool orphanized_ancestor = false;
3905
3906        btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
3907
3908        /*
3909         * This should never happen as the root dir always has the same ref
3910         * which is always '..'
3911         */
3912        BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
3913        INIT_LIST_HEAD(&check_dirs);
3914
3915        valid_path = fs_path_alloc();
3916        if (!valid_path) {
3917                ret = -ENOMEM;
3918                goto out;
3919        }
3920
3921        /*
3922         * First, check if the first ref of the current inode was overwritten
3923         * before. If yes, we know that the current inode was already orphanized
3924         * and thus use the orphan name. If not, we can use get_cur_path to
3925         * get the path of the first ref as it would like while receiving at
3926         * this point in time.
3927         * New inodes are always orphan at the beginning, so force to use the
3928         * orphan name in this case.
3929         * The first ref is stored in valid_path and will be updated if it
3930         * gets moved around.
3931         */
3932        if (!sctx->cur_inode_new) {
3933                ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
3934                                sctx->cur_inode_gen);
3935                if (ret < 0)
3936                        goto out;
3937                if (ret)
3938                        did_overwrite = 1;
3939        }
3940        if (sctx->cur_inode_new || did_overwrite) {
3941                ret = gen_unique_name(sctx, sctx->cur_ino,
3942                                sctx->cur_inode_gen, valid_path);
3943                if (ret < 0)
3944                        goto out;
3945                is_orphan = 1;
3946        } else {
3947                ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3948                                valid_path);
3949                if (ret < 0)
3950                        goto out;
3951        }
3952
3953        /*
3954         * Before doing any rename and link operations, do a first pass on the
3955         * new references to orphanize any unprocessed inodes that may have a
3956         * reference that conflicts with one of the new references of the current
3957         * inode. This needs to happen first because a new reference may conflict
3958         * with the old reference of a parent directory, so we must make sure
3959         * that the path used for link and rename commands don't use an
3960         * orphanized name when an ancestor was not yet orphanized.
3961         *
3962         * Example:
3963         *
3964         * Parent snapshot:
3965         *
3966         * .                                                      (ino 256)
3967         * |----- testdir/                                        (ino 259)
3968         * |          |----- a                                    (ino 257)
3969         * |
3970         * |----- b                                               (ino 258)
3971         *
3972         * Send snapshot:
3973         *
3974         * .                                                      (ino 256)
3975         * |----- testdir_2/                                      (ino 259)
3976         * |          |----- a                                    (ino 260)
3977         * |
3978         * |----- testdir                                         (ino 257)
3979         * |----- b                                               (ino 257)
3980         * |----- b2                                              (ino 258)
3981         *
3982         * Processing the new reference for inode 257 with name "b" may happen
3983         * before processing the new reference with name "testdir". If so, we
3984         * must make sure that by the time we send a link command to create the
3985         * hard link "b", inode 259 was already orphanized, since the generated
3986         * path in "valid_path" already contains the orphanized name for 259.
3987         * We are processing inode 257, so only later when processing 259 we do
3988         * the rename operation to change its temporary (orphanized) name to
3989         * "testdir_2".
3990         */
3991        list_for_each_entry(cur, &sctx->new_refs, list) {
3992                ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
3993                if (ret < 0)
3994                        goto out;
3995                if (ret == inode_state_will_create)
3996                        continue;
3997
3998                /*
3999                 * Check if this new ref would overwrite the first ref of another
4000                 * unprocessed inode. If yes, orphanize the overwritten inode.
4001                 * If we find an overwritten ref that is not the first ref,
4002                 * simply unlink it.
4003                 */
4004                ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
4005                                cur->name, cur->name_len,
4006                                &ow_inode, &ow_gen, &ow_mode);
4007                if (ret < 0)
4008                        goto out;
4009                if (ret) {
4010                        ret = is_first_ref(sctx->parent_root,
4011                                           ow_inode, cur->dir, cur->name,
4012                                           cur->name_len);
4013                        if (ret < 0)
4014                                goto out;
4015                        if (ret) {
4016                                struct name_cache_entry *nce;
4017                                struct waiting_dir_move *wdm;
4018
4019                                if (orphanized_dir) {
4020                                        ret = refresh_ref_path(sctx, cur);
4021                                        if (ret < 0)
4022                                                goto out;
4023                                }
4024
4025                                ret = orphanize_inode(sctx, ow_inode, ow_gen,
4026                                                cur->full_path);
4027                                if (ret < 0)
4028                                        goto out;
4029                                if (S_ISDIR(ow_mode))
4030                                        orphanized_dir = true;
4031
4032                                /*
4033                                 * If ow_inode has its rename operation delayed
4034                                 * make sure that its orphanized name is used in
4035                                 * the source path when performing its rename
4036                                 * operation.
4037                                 */
4038                                if (is_waiting_for_move(sctx, ow_inode)) {
4039                                        wdm = get_waiting_dir_move(sctx,
4040                                                                   ow_inode);
4041                                        ASSERT(wdm);
4042                                        wdm->orphanized = true;
4043                                }
4044
4045                                /*
4046                                 * Make sure we clear our orphanized inode's
4047                                 * name from the name cache. This is because the
4048                                 * inode ow_inode might be an ancestor of some
4049                                 * other inode that will be orphanized as well
4050                                 * later and has an inode number greater than
4051                                 * sctx->send_progress. We need to prevent
4052                                 * future name lookups from using the old name
4053                                 * and get instead the orphan name.
4054                                 */
4055                                nce = name_cache_search(sctx, ow_inode, ow_gen);
4056                                if (nce) {
4057                                        name_cache_delete(sctx, nce);
4058                                        kfree(nce);
4059                                }
4060
4061                                /*
4062                                 * ow_inode might currently be an ancestor of
4063                                 * cur_ino, therefore compute valid_path (the
4064                                 * current path of cur_ino) again because it
4065                                 * might contain the pre-orphanization name of
4066                                 * ow_inode, which is no longer valid.
4067                                 */
4068                                ret = is_ancestor(sctx->parent_root,
4069                                                  ow_inode, ow_gen,
4070                                                  sctx->cur_ino, NULL);
4071                                if (ret > 0) {
4072                                        orphanized_ancestor = true;
4073                                        fs_path_reset(valid_path);
4074                                        ret = get_cur_path(sctx, sctx->cur_ino,
4075                                                           sctx->cur_inode_gen,
4076                                                           valid_path);
4077                                }
4078                                if (ret < 0)
4079                                        goto out;
4080                        } else {
4081                                /*
4082                                 * If we previously orphanized a directory that
4083                                 * collided with a new reference that we already
4084                                 * processed, recompute the current path because
4085                                 * that directory may be part of the path.
4086                                 */
4087                                if (orphanized_dir) {
4088                                        ret = refresh_ref_path(sctx, cur);
4089                                        if (ret < 0)
4090                                                goto out;
4091                                }
4092                                ret = send_unlink(sctx, cur->full_path);
4093                                if (ret < 0)
4094                                        goto out;
4095                        }
4096                }
4097
4098        }
4099
4100        list_for_each_entry(cur, &sctx->new_refs, list) {
4101                /*
4102                 * We may have refs where the parent directory does not exist
4103                 * yet. This happens if the parent directories inum is higher
4104                 * than the current inum. To handle this case, we create the
4105                 * parent directory out of order. But we need to check if this
4106                 * did already happen before due to other refs in the same dir.
4107                 */
4108                ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
4109                if (ret < 0)
4110                        goto out;
4111                if (ret == inode_state_will_create) {
4112                        ret = 0;
4113                        /*
4114                         * First check if any of the current inodes refs did
4115                         * already create the dir.
4116                         */
4117                        list_for_each_entry(cur2, &sctx->new_refs, list) {
4118                                if (cur == cur2)
4119                                        break;
4120                                if (cur2->dir == cur->dir) {
4121                                        ret = 1;
4122                                        break;
4123                                }
4124                        }
4125
4126                        /*
4127                         * If that did not happen, check if a previous inode
4128                         * did already create the dir.
4129                         */
4130                        if (!ret)
4131                                ret = did_create_dir(sctx, cur->dir);
4132                        if (ret < 0)
4133                                goto out;
4134                        if (!ret) {
4135                                ret = send_create_inode(sctx, cur->dir);
4136                                if (ret < 0)
4137                                        goto out;
4138                        }
4139                }
4140
4141                if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
4142                        ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
4143                        if (ret < 0)
4144                                goto out;
4145                        if (ret == 1) {
4146                                can_rename = false;
4147                                *pending_move = 1;
4148                        }
4149                }
4150
4151                if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
4152                    can_rename) {
4153                        ret = wait_for_parent_move(sctx, cur, is_orphan);
4154                        if (ret < 0)
4155                                goto out;
4156                        if (ret == 1) {
4157                                can_rename = false;
4158                                *pending_move = 1;
4159                        }
4160                }
4161
4162                /*
4163                 * link/move the ref to the new place. If we have an orphan
4164                 * inode, move it and update valid_path. If not, link or move
4165                 * it depending on the inode mode.
4166                 */
4167                if (is_orphan && can_rename) {
4168                        ret = send_rename(sctx, valid_path, cur->full_path);
4169                        if (ret < 0)
4170                                goto out;
4171                        is_orphan = 0;
4172                        ret = fs_path_copy(valid_path, cur->full_path);
4173                        if (ret < 0)
4174                                goto out;
4175                } else if (can_rename) {
4176                        if (S_ISDIR(sctx->cur_inode_mode)) {
4177                                /*
4178                                 * Dirs can't be linked, so move it. For moved
4179                                 * dirs, we always have one new and one deleted
4180                                 * ref. The deleted ref is ignored later.
4181                                 */
4182                                ret = send_rename(sctx, valid_path,
4183                                                  cur->full_path);
4184                                if (!ret)
4185                                        ret = fs_path_copy(valid_path,
4186                                                           cur->full_path);
4187                                if (ret < 0)
4188                                        goto out;
4189                        } else {
4190                                /*
4191                                 * We might have previously orphanized an inode
4192                                 * which is an ancestor of our current inode,
4193                                 * so our reference's full path, which was
4194                                 * computed before any such orphanizations, must
4195                                 * be updated.
4196                                 */
4197                                if (orphanized_dir) {
4198                                        ret = update_ref_path(sctx, cur);
4199                                        if (ret < 0)
4200                                                goto out;
4201                                }
4202                                ret = send_link(sctx, cur->full_path,
4203                                                valid_path);
4204                                if (ret < 0)
4205                                        goto out;
4206                        }
4207                }
4208                ret = dup_ref(cur, &check_dirs);
4209                if (ret < 0)
4210                        goto out;
4211        }
4212
4213        if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
4214                /*
4215                 * Check if we can already rmdir the directory. If not,
4216                 * orphanize it. For every dir item inside that gets deleted
4217                 * later, we do this check again and rmdir it then if possible.
4218                 * See the use of check_dirs for more details.
4219                 */
4220                ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4221                                sctx->cur_ino);
4222                if (ret < 0)
4223                        goto out;
4224                if (ret) {
4225                        ret = send_rmdir(sctx, valid_path);
4226                        if (ret < 0)
4227                                goto out;
4228                } else if (!is_orphan) {
4229                        ret = orphanize_inode(sctx, sctx->cur_ino,
4230                                        sctx->cur_inode_gen, valid_path);
4231                        if (ret < 0)
4232                                goto out;
4233                        is_orphan = 1;
4234                }
4235
4236                list_for_each_entry(cur, &sctx->deleted_refs, list) {
4237                        ret = dup_ref(cur, &check_dirs);
4238                        if (ret < 0)
4239                                goto out;
4240                }
4241        } else if (S_ISDIR(sctx->cur_inode_mode) &&
4242                   !list_empty(&sctx->deleted_refs)) {
4243                /*
4244                 * We have a moved dir. Add the old parent to check_dirs
4245                 */
4246                cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
4247                                list);
4248                ret = dup_ref(cur, &check_dirs);
4249                if (ret < 0)
4250                        goto out;
4251        } else if (!S_ISDIR(sctx->cur_inode_mode)) {
4252                /*
4253                 * We have a non dir inode. Go through all deleted refs and
4254                 * unlink them if they were not already overwritten by other
4255                 * inodes.
4256                 */
4257                list_for_each_entry(cur, &sctx->deleted_refs, list) {
4258                        ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
4259                                        sctx->cur_ino, sctx->cur_inode_gen,
4260                                        cur->name, cur->name_len);
4261                        if (ret < 0)
4262                                goto out;
4263                        if (!ret) {
4264                                /*
4265                                 * If we orphanized any ancestor before, we need
4266                                 * to recompute the full path for deleted names,
4267                                 * since any such path was computed before we
4268                                 * processed any references and orphanized any
4269                                 * ancestor inode.
4270                                 */
4271                                if (orphanized_ancestor) {
4272                                        ret = update_ref_path(sctx, cur);
4273                                        if (ret < 0)
4274                                                goto out;
4275                                }
4276                                ret = send_unlink(sctx, cur->full_path);
4277                                if (ret < 0)
4278                                        goto out;
4279                        }
4280                        ret = dup_ref(cur, &check_dirs);
4281                        if (ret < 0)
4282                                goto out;
4283                }
4284                /*
4285                 * If the inode is still orphan, unlink the orphan. This may
4286                 * happen when a previous inode did overwrite the first ref
4287                 * of this inode and no new refs were added for the current
4288                 * inode. Unlinking does not mean that the inode is deleted in
4289                 * all cases. There may still be links to this inode in other
4290                 * places.
4291                 */
4292                if (is_orphan) {
4293                        ret = send_unlink(sctx, valid_path);
4294                        if (ret < 0)
4295                                goto out;
4296                }
4297        }
4298
4299        /*
4300         * We did collect all parent dirs where cur_inode was once located. We
4301         * now go through all these dirs and check if they are pending for
4302         * deletion and if it's finally possible to perform the rmdir now.
4303         * We also update the inode stats of the parent dirs here.
4304         */
4305        list_for_each_entry(cur, &check_dirs, list) {
4306                /*
4307                 * In case we had refs into dirs that were not processed yet,
4308                 * we don't need to do the utime and rmdir logic for these dirs.
4309                 * The dir will be processed later.
4310                 */
4311                if (cur->dir > sctx->cur_ino)
4312                        continue;
4313
4314                ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
4315                if (ret < 0)
4316                        goto out;
4317
4318                if (ret == inode_state_did_create ||
4319                    ret == inode_state_no_change) {
4320                        /* TODO delayed utimes */
4321                        ret = send_utimes(sctx, cur->dir, cur->dir_gen);
4322                        if (ret < 0)
4323                                goto out;
4324                } else if (ret == inode_state_did_delete &&
4325                           cur->dir != last_dir_ino_rm) {
4326                        ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
4327                                        sctx->cur_ino);
4328                        if (ret < 0)
4329                                goto out;
4330                        if (ret) {
4331                                ret = get_cur_path(sctx, cur->dir,
4332                                                   cur->dir_gen, valid_path);
4333                                if (ret < 0)
4334                                        goto out;
4335                                ret = send_rmdir(sctx, valid_path);
4336                                if (ret < 0)
4337                                        goto out;
4338                                last_dir_ino_rm = cur->dir;
4339                        }
4340                }
4341        }
4342
4343        ret = 0;
4344
4345out:
4346        __free_recorded_refs(&check_dirs);
4347        free_recorded_refs(sctx);
4348        fs_path_free(valid_path);
4349        return ret;
4350}
4351
4352static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
4353                      void *ctx, struct list_head *refs)
4354{
4355        int ret = 0;
4356        struct send_ctx *sctx = ctx;
4357        struct fs_path *p;
4358        u64 gen;
4359
4360        p = fs_path_alloc();
4361        if (!p)
4362                return -ENOMEM;
4363
4364        ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
4365                        NULL, NULL);
4366        if (ret < 0)
4367                goto out;
4368
4369        ret = get_cur_path(sctx, dir, gen, p);
4370        if (ret < 0)
4371                goto out;
4372        ret = fs_path_add_path(p, name);
4373        if (ret < 0)
4374                goto out;
4375
4376        ret = __record_ref(refs, dir, gen, p);
4377
4378out:
4379        if (ret)
4380                fs_path_free(p);
4381        return ret;
4382}
4383
4384static int __record_new_ref(int num, u64 dir, int index,
4385                            struct fs_path *name,
4386                            void *ctx)
4387{
4388        struct send_ctx *sctx = ctx;
4389        return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
4390}
4391
4392
4393static int __record_deleted_ref(int num, u64 dir, int index,
4394                                struct fs_path *name,
4395                                void *ctx)
4396{
4397        struct send_ctx *sctx = ctx;
4398        return record_ref(sctx->parent_root, dir, name, ctx,
4399                          &sctx->deleted_refs);
4400}
4401
4402static int record_new_ref(struct send_ctx *sctx)
4403{
4404        int ret;
4405
4406        ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4407                                sctx->cmp_key, 0, __record_new_ref, sctx);
4408        if (ret < 0)
4409                goto out;
4410        ret = 0;
4411
4412out:
4413        return ret;
4414}
4415
4416static int record_deleted_ref(struct send_ctx *sctx)
4417{
4418        int ret;
4419
4420        ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4421                                sctx->cmp_key, 0, __record_deleted_ref, sctx);
4422        if (ret < 0)
4423                goto out;
4424        ret = 0;
4425
4426out:
4427        return ret;
4428}
4429
4430struct find_ref_ctx {
4431        u64 dir;
4432        u64 dir_gen;
4433        struct btrfs_root *root;
4434        struct fs_path *name;
4435        int found_idx;
4436};
4437
4438static int __find_iref(int num, u64 dir, int index,
4439                       struct fs_path *name,
4440                       void *ctx_)
4441{
4442        struct find_ref_ctx *ctx = ctx_;
4443        u64 dir_gen;
4444        int ret;
4445
4446        if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
4447            strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
4448                /*
4449                 * To avoid doing extra lookups we'll only do this if everything
4450                 * else matches.
4451                 */
4452                ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
4453                                     NULL, NULL, NULL);
4454                if (ret)
4455                        return ret;
4456                if (dir_gen != ctx->dir_gen)
4457                        return 0;
4458                ctx->found_idx = num;
4459                return 1;
4460        }
4461        return 0;
4462}
4463
4464static int find_iref(struct btrfs_root *root,
4465                     struct btrfs_path *path,
4466                     struct btrfs_key *key,
4467                     u64 dir, u64 dir_gen, struct fs_path *name)
4468{
4469        int ret;
4470        struct find_ref_ctx ctx;
4471
4472        ctx.dir = dir;
4473        ctx.name = name;
4474        ctx.dir_gen = dir_gen;
4475        ctx.found_idx = -1;
4476        ctx.root = root;
4477
4478        ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
4479        if (ret < 0)
4480                return ret;
4481
4482        if (ctx.found_idx == -1)
4483                return -ENOENT;
4484
4485        return ctx.found_idx;
4486}
4487
4488static int __record_changed_new_ref(int num, u64 dir, int index,
4489                                    struct fs_path *name,
4490                                    void *ctx)
4491{
4492        u64 dir_gen;
4493        int ret;
4494        struct send_ctx *sctx = ctx;
4495
4496        ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
4497                             NULL, NULL, NULL);
4498        if (ret)
4499                return ret;
4500
4501        ret = find_iref(sctx->parent_root, sctx->right_path,
4502                        sctx->cmp_key, dir, dir_gen, name);
4503        if (ret == -ENOENT)
4504                ret = __record_new_ref(num, dir, index, name, sctx);
4505        else if (ret > 0)
4506                ret = 0;
4507
4508        return ret;
4509}
4510
4511static int __record_changed_deleted_ref(int num, u64 dir, int index,
4512                                        struct fs_path *name,
4513                                        void *ctx)
4514{
4515        u64 dir_gen;
4516        int ret;
4517        struct send_ctx *sctx = ctx;
4518
4519        ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
4520                             NULL, NULL, NULL);
4521        if (ret)
4522                return ret;
4523
4524        ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
4525                        dir, dir_gen, name);
4526        if (ret == -ENOENT)
4527                ret = __record_deleted_ref(num, dir, index, name, sctx);
4528        else if (ret > 0)
4529                ret = 0;
4530
4531        return ret;
4532}
4533
4534static int record_changed_ref(struct send_ctx *sctx)
4535{
4536        int ret = 0;
4537
4538        ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4539                        sctx->cmp_key, 0, __record_changed_new_ref, sctx);
4540        if (ret < 0)
4541                goto out;
4542        ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4543                        sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
4544        if (ret < 0)
4545                goto out;
4546        ret = 0;
4547
4548out:
4549        return ret;
4550}
4551
4552/*
4553 * Record and process all refs at once. Needed when an inode changes the
4554 * generation number, which means that it was deleted and recreated.
4555 */
4556static int process_all_refs(struct send_ctx *sctx,
4557                            enum btrfs_compare_tree_result cmd)
4558{
4559        int ret;
4560        struct btrfs_root *root;
4561        struct btrfs_path *path;
4562        struct btrfs_key key;
4563        struct btrfs_key found_key;
4564        struct extent_buffer *eb;
4565        int slot;
4566        iterate_inode_ref_t cb;
4567        int pending_move = 0;
4568
4569        path = alloc_path_for_send();
4570        if (!path)
4571                return -ENOMEM;
4572
4573        if (cmd == BTRFS_COMPARE_TREE_NEW) {
4574                root = sctx->send_root;
4575                cb = __record_new_ref;
4576        } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
4577                root = sctx->parent_root;
4578                cb = __record_deleted_ref;
4579        } else {
4580                btrfs_err(sctx->send_root->fs_info,
4581                                "Wrong command %d in process_all_refs", cmd);
4582                ret = -EINVAL;
4583                goto out;
4584        }
4585
4586        key.objectid = sctx->cmp_key->objectid;
4587        key.type = BTRFS_INODE_REF_KEY;
4588        key.offset = 0;
4589        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4590        if (ret < 0)
4591                goto out;
4592
4593        while (1) {
4594                eb = path->nodes[0];
4595                slot = path->slots[0];
4596                if (slot >= btrfs_header_nritems(eb)) {
4597                        ret = btrfs_next_leaf(root, path);
4598                        if (ret < 0)
4599                                goto out;
4600                        else if (ret > 0)
4601                                break;
4602                        continue;
4603                }
4604
4605                btrfs_item_key_to_cpu(eb, &found_key, slot);
4606
4607                if (found_key.objectid != key.objectid ||
4608                    (found_key.type != BTRFS_INODE_REF_KEY &&
4609                     found_key.type != BTRFS_INODE_EXTREF_KEY))
4610                        break;
4611
4612                ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
4613                if (ret < 0)
4614                        goto out;
4615
4616                path->slots[0]++;
4617        }
4618        btrfs_release_path(path);
4619
4620        /*
4621         * We don't actually care about pending_move as we are simply
4622         * re-creating this inode and will be rename'ing it into place once we
4623         * rename the parent directory.
4624         */
4625        ret = process_recorded_refs(sctx, &pending_move);
4626out:
4627        btrfs_free_path(path);
4628        return ret;
4629}
4630
4631static int send_set_xattr(struct send_ctx *sctx,
4632                          struct fs_path *path,
4633                          const char *name, int name_len,
4634                          const char *data, int data_len)
4635{
4636        int ret = 0;
4637
4638        ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
4639        if (ret < 0)
4640                goto out;
4641
4642        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4643        TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4644        TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4645
4646        ret = send_cmd(sctx);
4647
4648tlv_put_failure:
4649out:
4650        return ret;
4651}
4652
4653static int send_remove_xattr(struct send_ctx *sctx,
4654                          struct fs_path *path,
4655                          const char *name, int name_len)
4656{
4657        int ret = 0;
4658
4659        ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
4660        if (ret < 0)
4661                goto out;
4662
4663        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4664        TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4665
4666        ret = send_cmd(sctx);
4667
4668tlv_put_failure:
4669out:
4670        return ret;
4671}
4672
4673static int __process_new_xattr(int num, struct btrfs_key *di_key,
4674                               const char *name, int name_len, const char *data,
4675                               int data_len, void *ctx)
4676{
4677        int ret;
4678        struct send_ctx *sctx = ctx;
4679        struct fs_path *p;
4680        struct posix_acl_xattr_header dummy_acl;
4681
4682        /* Capabilities are emitted by finish_inode_if_needed */
4683        if (!strncmp(name, XATTR_NAME_CAPS, name_len))
4684                return 0;
4685
4686        p = fs_path_alloc();
4687        if (!p)
4688                return -ENOMEM;
4689
4690        /*
4691         * This hack is needed because empty acls are stored as zero byte
4692         * data in xattrs. Problem with that is, that receiving these zero byte
4693         * acls will fail later. To fix this, we send a dummy acl list that
4694         * only contains the version number and no entries.
4695         */
4696        if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
4697            !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
4698                if (data_len == 0) {
4699                        dummy_acl.a_version =
4700                                        cpu_to_le32(POSIX_ACL_XATTR_VERSION);
4701                        data = (char *)&dummy_acl;
4702                        data_len = sizeof(dummy_acl);
4703                }
4704        }
4705
4706        ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4707        if (ret < 0)
4708                goto out;
4709
4710        ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
4711
4712out:
4713        fs_path_free(p);
4714        return ret;
4715}
4716
4717static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
4718                                   const char *name, int name_len,
4719                                   const char *data, int data_len, void *ctx)
4720{
4721        int ret;
4722        struct send_ctx *sctx = ctx;
4723        struct fs_path *p;
4724
4725        p = fs_path_alloc();
4726        if (!p)
4727                return -ENOMEM;
4728
4729        ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4730        if (ret < 0)
4731                goto out;
4732
4733        ret = send_remove_xattr(sctx, p, name, name_len);
4734
4735out:
4736        fs_path_free(p);
4737        return ret;
4738}
4739
4740static int process_new_xattr(struct send_ctx *sctx)
4741{
4742        int ret = 0;
4743
4744        ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4745                               __process_new_xattr, sctx);
4746
4747        return ret;
4748}
4749
4750static int process_deleted_xattr(struct send_ctx *sctx)
4751{
4752        return iterate_dir_item(sctx->parent_root, sctx->right_path,
4753                                __process_deleted_xattr, sctx);
4754}
4755
4756struct find_xattr_ctx {
4757        const char *name;
4758        int name_len;
4759        int found_idx;
4760        char *found_data;
4761        int found_data_len;
4762};
4763
4764static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
4765                        int name_len, const char *data, int data_len, void *vctx)
4766{
4767        struct find_xattr_ctx *ctx = vctx;
4768
4769        if (name_len == ctx->name_len &&
4770            strncmp(name, ctx->name, name_len) == 0) {
4771                ctx->found_idx = num;
4772                ctx->found_data_len = data_len;
4773                ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
4774                if (!ctx->found_data)
4775                        return -ENOMEM;
4776                return 1;
4777        }
4778        return 0;
4779}
4780
4781static int find_xattr(struct btrfs_root *root,
4782                      struct btrfs_path *path,
4783                      struct btrfs_key *key,
4784                      const char *name, int name_len,
4785                      char **data, int *data_len)
4786{
4787        int ret;
4788        struct find_xattr_ctx ctx;
4789
4790        ctx.name = name;
4791        ctx.name_len = name_len;
4792        ctx.found_idx = -1;
4793        ctx.found_data = NULL;
4794        ctx.found_data_len = 0;
4795
4796        ret = iterate_dir_item(root, path, __find_xattr, &ctx);
4797        if (ret < 0)
4798                return ret;
4799
4800        if (ctx.found_idx == -1)
4801                return -ENOENT;
4802        if (data) {
4803                *data = ctx.found_data;
4804                *data_len = ctx.found_data_len;
4805        } else {
4806                kfree(ctx.found_data);
4807        }
4808        return ctx.found_idx;
4809}
4810
4811
4812static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
4813                                       const char *name, int name_len,
4814                                       const char *data, int data_len,
4815                                       void *ctx)
4816{
4817        int ret;
4818        struct send_ctx *sctx = ctx;
4819        char *found_data = NULL;
4820        int found_data_len  = 0;
4821
4822        ret = find_xattr(sctx->parent_root, sctx->right_path,
4823                         sctx->cmp_key, name, name_len, &found_data,
4824                         &found_data_len);
4825        if (ret == -ENOENT) {
4826                ret = __process_new_xattr(num, di_key, name, name_len, data,
4827                                          data_len, ctx);
4828        } else if (ret >= 0) {
4829                if (data_len != found_data_len ||
4830                    memcmp(data, found_data, data_len)) {
4831                        ret = __process_new_xattr(num, di_key, name, name_len,
4832                                                  data, data_len, ctx);
4833                } else {
4834                        ret = 0;
4835                }
4836        }
4837
4838        kfree(found_data);
4839        return ret;
4840}
4841
4842static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
4843                                           const char *name, int name_len,
4844                                           const char *data, int data_len,
4845                                           void *ctx)
4846{
4847        int ret;
4848        struct send_ctx *sctx = ctx;
4849
4850        ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
4851                         name, name_len, NULL, NULL);
4852        if (ret == -ENOENT)
4853                ret = __process_deleted_xattr(num, di_key, name, name_len, data,
4854                                              data_len, ctx);
4855        else if (ret >= 0)
4856                ret = 0;
4857
4858        return ret;
4859}
4860
4861static int process_changed_xattr(struct send_ctx *sctx)
4862{
4863        int ret = 0;
4864
4865        ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4866                        __process_changed_new_xattr, sctx);
4867        if (ret < 0)
4868                goto out;
4869        ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
4870                        __process_changed_deleted_xattr, sctx);
4871
4872out:
4873        return ret;
4874}
4875
4876static int process_all_new_xattrs(struct send_ctx *sctx)
4877{
4878        int ret;
4879        struct btrfs_root *root;
4880        struct btrfs_path *path;
4881        struct btrfs_key key;
4882        struct btrfs_key found_key;
4883        struct extent_buffer *eb;
4884        int slot;
4885
4886        path = alloc_path_for_send();
4887        if (!path)
4888                return -ENOMEM;
4889
4890        root = sctx->send_root;
4891
4892        key.objectid = sctx->cmp_key->objectid;
4893        key.type = BTRFS_XATTR_ITEM_KEY;
4894        key.offset = 0;
4895        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4896        if (ret < 0)
4897                goto out;
4898
4899        while (1) {
4900                eb = path->nodes[0];
4901                slot = path->slots[0];
4902                if (slot >= btrfs_header_nritems(eb)) {
4903                        ret = btrfs_next_leaf(root, path);
4904                        if (ret < 0) {
4905                                goto out;
4906                        } else if (ret > 0) {
4907                                ret = 0;
4908                                break;
4909                        }
4910                        continue;
4911                }
4912
4913                btrfs_item_key_to_cpu(eb, &found_key, slot);
4914                if (found_key.objectid != key.objectid ||
4915                    found_key.type != key.type) {
4916                        ret = 0;
4917                        goto out;
4918                }
4919
4920                ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
4921                if (ret < 0)
4922                        goto out;
4923
4924                path->slots[0]++;
4925        }
4926
4927out:
4928        btrfs_free_path(path);
4929        return ret;
4930}
4931
4932static inline u64 max_send_read_size(const struct send_ctx *sctx)
4933{
4934        return sctx->send_max_size - SZ_16K;
4935}
4936
4937static int put_data_header(struct send_ctx *sctx, u32 len)
4938{
4939        struct btrfs_tlv_header *hdr;
4940
4941        if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
4942                return -EOVERFLOW;
4943        hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
4944        put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
4945        put_unaligned_le16(len, &hdr->tlv_len);
4946        sctx->send_size += sizeof(*hdr);
4947        return 0;
4948}
4949
4950static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
4951{
4952        struct btrfs_root *root = sctx->send_root;
4953        struct btrfs_fs_info *fs_info = root->fs_info;
4954        struct inode *inode;
4955        struct page *page;
4956        pgoff_t index = offset >> PAGE_SHIFT;
4957        pgoff_t last_index;
4958        unsigned pg_offset = offset_in_page(offset);
4959        int ret;
4960
4961        ret = put_data_header(sctx, len);
4962        if (ret)
4963                return ret;
4964
4965        inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
4966        if (IS_ERR(inode))
4967                return PTR_ERR(inode);
4968
4969        last_index = (offset + len - 1) >> PAGE_SHIFT;
4970
4971        /* initial readahead */
4972        memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4973        file_ra_state_init(&sctx->ra, inode->i_mapping);
4974
4975        while (index <= last_index) {
4976                unsigned cur_len = min_t(unsigned, len,
4977                                         PAGE_SIZE - pg_offset);
4978
4979                page = find_lock_page(inode->i_mapping, index);
4980                if (!page) {
4981                        page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
4982                                NULL, index, last_index + 1 - index);
4983
4984                        page = find_or_create_page(inode->i_mapping, index,
4985                                        GFP_KERNEL);
4986                        if (!page) {
4987                                ret = -ENOMEM;
4988                                break;
4989                        }
4990                }
4991
4992                if (PageReadahead(page)) {
4993                        page_cache_async_readahead(inode->i_mapping, &sctx->ra,
4994                                NULL, page, index, last_index + 1 - index);
4995                }
4996
4997                if (!PageUptodate(page)) {
4998                        btrfs_readpage(NULL, page);
4999                        lock_page(page);
5000                        if (!PageUptodate(page)) {
5001                                unlock_page(page);
5002                                btrfs_err(fs_info,
5003                        "send: IO error at offset %llu for inode %llu root %llu",
5004                                        page_offset(page), sctx->cur_ino,
5005                                        sctx->send_root->root_key.objectid);
5006                                put_page(page);
5007                                ret = -EIO;
5008                                break;
5009                        }
5010                }
5011
5012                memcpy_from_page(sctx->send_buf + sctx->send_size, page,
5013                                 pg_offset, cur_len);
5014                unlock_page(page);
5015                put_page(page);
5016                index++;
5017                pg_offset = 0;
5018                len -= cur_len;
5019                sctx->send_size += cur_len;
5020        }
5021        iput(inode);
5022        return ret;
5023}
5024
5025/*
5026 * Read some bytes from the current inode/file and send a write command to
5027 * user space.
5028 */
5029static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
5030{
5031        struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
5032        int ret = 0;
5033        struct fs_path *p;
5034
5035        p = fs_path_alloc();
5036        if (!p)
5037                return -ENOMEM;
5038
5039        btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
5040
5041        ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
5042        if (ret < 0)
5043                goto out;
5044
5045        ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5046        if (ret < 0)
5047                goto out;
5048
5049        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5050        TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5051        ret = put_file_data(sctx, offset, len);
5052        if (ret < 0)
5053                goto out;
5054
5055        ret = send_cmd(sctx);
5056
5057tlv_put_failure:
5058out:
5059        fs_path_free(p);
5060        return ret;
5061}
5062
5063/*
5064 * Send a clone command to user space.
5065 */
5066static int send_clone(struct send_ctx *sctx,
5067                      u64 offset, u32 len,
5068                      struct clone_root *clone_root)
5069{
5070        int ret = 0;
5071        struct fs_path *p;
5072        u64 gen;
5073
5074        btrfs_debug(sctx->send_root->fs_info,
5075                    "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
5076                    offset, len, clone_root->root->root_key.objectid,
5077                    clone_root->ino, clone_root->offset);
5078
5079        p = fs_path_alloc();
5080        if (!p)
5081                return -ENOMEM;
5082
5083        ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
5084        if (ret < 0)
5085                goto out;
5086
5087        ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5088        if (ret < 0)
5089                goto out;
5090
5091        TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5092        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
5093        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5094
5095        if (clone_root->root == sctx->send_root) {
5096                ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
5097                                &gen, NULL, NULL, NULL, NULL);
5098                if (ret < 0)
5099                        goto out;
5100                ret = get_cur_path(sctx, clone_root->ino, gen, p);
5101        } else {
5102                ret = get_inode_path(clone_root->root, clone_root->ino, p);
5103        }
5104        if (ret < 0)
5105                goto out;
5106
5107        /*
5108         * If the parent we're using has a received_uuid set then use that as
5109         * our clone source as that is what we will look for when doing a
5110         * receive.
5111         *
5112         * This covers the case that we create a snapshot off of a received
5113         * subvolume and then use that as the parent and try to receive on a
5114         * different host.
5115         */
5116        if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
5117                TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5118                             clone_root->root->root_item.received_uuid);
5119        else
5120                TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
5121                             clone_root->root->root_item.uuid);
5122        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
5123                    btrfs_root_ctransid(&clone_root->root->root_item));
5124        TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
5125        TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
5126                        clone_root->offset);
5127
5128        ret = send_cmd(sctx);
5129
5130tlv_put_failure:
5131out:
5132        fs_path_free(p);
5133        return ret;
5134}
5135
5136/*
5137 * Send an update extent command to user space.
5138 */
5139static int send_update_extent(struct send_ctx *sctx,
5140                              u64 offset, u32 len)
5141{
5142        int ret = 0;
5143        struct fs_path *p;
5144
5145        p = fs_path_alloc();
5146        if (!p)
5147                return -ENOMEM;
5148
5149        ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
5150        if (ret < 0)
5151                goto out;
5152
5153        ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5154        if (ret < 0)
5155                goto out;
5156
5157        TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5158        TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5159        TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5160
5161        ret = send_cmd(sctx);
5162
5163tlv_put_failure:
5164out:
5165        fs_path_free(p);
5166        return ret;
5167}
5168
5169static int send_hole(struct send_ctx *sctx, u64 end)
5170{
5171        struct fs_path *p = NULL;
5172        u64 read_size = max_send_read_size(sctx);
5173        u64 offset = sctx->cur_inode_last_extent;
5174        int ret = 0;
5175
5176        /*
5177         * A hole that starts at EOF or beyond it. Since we do not yet support
5178         * fallocate (for extent preallocation and hole punching), sending a
5179         * write of zeroes starting at EOF or beyond would later require issuing
5180         * a truncate operation which would undo the write and achieve nothing.
5181         */
5182        if (offset >= sctx->cur_inode_size)
5183                return 0;
5184
5185        /*
5186         * Don't go beyond the inode's i_size due to prealloc extents that start
5187         * after the i_size.
5188         */
5189        end = min_t(u64, end, sctx->cur_inode_size);
5190
5191        if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5192                return send_update_extent(sctx, offset, end - offset);
5193
5194        p = fs_path_alloc();
5195        if (!p)
5196                return -ENOMEM;
5197        ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5198        if (ret < 0)
5199                goto tlv_put_failure;
5200        while (offset < end) {
5201                u64 len = min(end - offset, read_size);
5202
5203                ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
5204                if (ret < 0)
5205                        break;
5206                TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5207                TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5208                ret = put_data_header(sctx, len);
5209                if (ret < 0)
5210                        break;
5211                memset(sctx->send_buf + sctx->send_size, 0, len);
5212                sctx->send_size += len;
5213                ret = send_cmd(sctx);
5214                if (ret < 0)
5215                        break;
5216                offset += len;
5217        }
5218        sctx->cur_inode_next_write_offset = offset;
5219tlv_put_failure:
5220        fs_path_free(p);
5221        return ret;
5222}
5223
5224static int send_extent_data(struct send_ctx *sctx,
5225                            const u64 offset,
5226                            const u64 len)
5227{
5228        u64 read_size = max_send_read_size(sctx);
5229        u64 sent = 0;
5230
5231        if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5232                return send_update_extent(sctx, offset, len);
5233
5234        while (sent < len) {
5235                u64 size = min(len - sent, read_size);
5236                int ret;
5237
5238                ret = send_write(sctx, offset + sent, size);
5239                if (ret < 0)
5240                        return ret;
5241                sent += size;
5242        }
5243        return 0;
5244}
5245
5246/*
5247 * Search for a capability xattr related to sctx->cur_ino. If the capability is
5248 * found, call send_set_xattr function to emit it.
5249 *
5250 * Return 0 if there isn't a capability, or when the capability was emitted
5251 * successfully, or < 0 if an error occurred.
5252 */
5253static int send_capabilities(struct send_ctx *sctx)
5254{
5255        struct fs_path *fspath = NULL;
5256        struct btrfs_path *path;
5257        struct btrfs_dir_item *di;
5258        struct extent_buffer *leaf;
5259        unsigned long data_ptr;
5260        char *buf = NULL;
5261        int buf_len;
5262        int ret = 0;
5263
5264        path = alloc_path_for_send();
5265        if (!path)
5266                return -ENOMEM;
5267
5268        di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
5269                                XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
5270        if (!di) {
5271                /* There is no xattr for this inode */
5272                goto out;
5273        } else if (IS_ERR(di)) {
5274                ret = PTR_ERR(di);
5275                goto out;
5276        }
5277
5278        leaf = path->nodes[0];
5279        buf_len = btrfs_dir_data_len(leaf, di);
5280
5281        fspath = fs_path_alloc();
5282        buf = kmalloc(buf_len, GFP_KERNEL);
5283        if (!fspath || !buf) {
5284                ret = -ENOMEM;
5285                goto out;
5286        }
5287
5288        ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
5289        if (ret < 0)
5290                goto out;
5291
5292        data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
5293        read_extent_buffer(leaf, buf, data_ptr, buf_len);
5294
5295        ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
5296                        strlen(XATTR_NAME_CAPS), buf, buf_len);
5297out:
5298        kfree(buf);
5299        fs_path_free(fspath);
5300        btrfs_free_path(path);
5301        return ret;
5302}
5303
5304static int clone_range(struct send_ctx *sctx,
5305                       struct clone_root *clone_root,
5306                       const u64 disk_byte,
5307                       u64 data_offset,
5308                       u64 offset,
5309                       u64 len)
5310{
5311        struct btrfs_path *path;
5312        struct btrfs_key key;
5313        int ret;
5314        u64 clone_src_i_size = 0;
5315
5316        /*
5317         * Prevent cloning from a zero offset with a length matching the sector
5318         * size because in some scenarios this will make the receiver fail.
5319         *
5320         * For example, if in the source filesystem the extent at offset 0
5321         * has a length of sectorsize and it was written using direct IO, then
5322         * it can never be an inline extent (even if compression is enabled).
5323         * Then this extent can be cloned in the original filesystem to a non
5324         * zero file offset, but it may not be possible to clone in the
5325         * destination filesystem because it can be inlined due to compression
5326         * on the destination filesystem (as the receiver's write operations are
5327         * always done using buffered IO). The same happens when the original
5328         * filesystem does not have compression enabled but the destination
5329         * filesystem has.
5330         */
5331        if (clone_root->offset == 0 &&
5332            len == sctx->send_root->fs_info->sectorsize)
5333                return send_extent_data(sctx, offset, len);
5334
5335        path = alloc_path_for_send();
5336        if (!path)
5337                return -ENOMEM;
5338
5339        /*
5340         * There are inodes that have extents that lie behind its i_size. Don't
5341         * accept clones from these extents.
5342         */
5343        ret = __get_inode_info(clone_root->root, path, clone_root->ino,
5344                               &clone_src_i_size, NULL, NULL, NULL, NULL, NULL);
5345        btrfs_release_path(path);
5346        if (ret < 0)
5347                goto out;
5348
5349        /*
5350         * We can't send a clone operation for the entire range if we find
5351         * extent items in the respective range in the source file that
5352         * refer to different extents or if we find holes.
5353         * So check for that and do a mix of clone and regular write/copy
5354         * operations if needed.
5355         *
5356         * Example:
5357         *
5358         * mkfs.btrfs -f /dev/sda
5359         * mount /dev/sda /mnt
5360         * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
5361         * cp --reflink=always /mnt/foo /mnt/bar
5362         * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
5363         * btrfs subvolume snapshot -r /mnt /mnt/snap
5364         *
5365         * If when we send the snapshot and we are processing file bar (which
5366         * has a higher inode number than foo) we blindly send a clone operation
5367         * for the [0, 100K[ range from foo to bar, the receiver ends up getting
5368         * a file bar that matches the content of file foo - iow, doesn't match
5369         * the content from bar in the original filesystem.
5370         */
5371        key.objectid = clone_root->ino;
5372        key.type = BTRFS_EXTENT_DATA_KEY;
5373        key.offset = clone_root->offset;
5374        ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
5375        if (ret < 0)
5376                goto out;
5377        if (ret > 0 && path->slots[0] > 0) {
5378                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
5379                if (key.objectid == clone_root->ino &&
5380                    key.type == BTRFS_EXTENT_DATA_KEY)
5381                        path->slots[0]--;
5382        }
5383
5384        while (true) {
5385                struct extent_buffer *leaf = path->nodes[0];
5386                int slot = path->slots[0];
5387                struct btrfs_file_extent_item *ei;
5388                u8 type;
5389                u64 ext_len;
5390                u64 clone_len;
5391                u64 clone_data_offset;
5392
5393                if (slot >= btrfs_header_nritems(leaf)) {
5394                        ret = btrfs_next_leaf(clone_root->root, path);
5395                        if (ret < 0)
5396                                goto out;
5397                        else if (ret > 0)
5398                                break;
5399                        continue;
5400                }
5401
5402                btrfs_item_key_to_cpu(leaf, &key, slot);
5403
5404                /*
5405                 * We might have an implicit trailing hole (NO_HOLES feature
5406                 * enabled). We deal with it after leaving this loop.
5407                 */
5408                if (key.objectid != clone_root->ino ||
5409                    key.type != BTRFS_EXTENT_DATA_KEY)
5410                        break;
5411
5412                ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5413                type = btrfs_file_extent_type(leaf, ei);
5414                if (type == BTRFS_FILE_EXTENT_INLINE) {
5415                        ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
5416                        ext_len = PAGE_ALIGN(ext_len);
5417                } else {
5418                        ext_len = btrfs_file_extent_num_bytes(leaf, ei);
5419                }
5420
5421                if (key.offset + ext_len <= clone_root->offset)
5422                        goto next;
5423
5424                if (key.offset > clone_root->offset) {
5425                        /* Implicit hole, NO_HOLES feature enabled. */
5426                        u64 hole_len = key.offset - clone_root->offset;
5427
5428                        if (hole_len > len)
5429                                hole_len = len;
5430                        ret = send_extent_data(sctx, offset, hole_len);
5431                        if (ret < 0)
5432                                goto out;
5433
5434                        len -= hole_len;
5435                        if (len == 0)
5436                                break;
5437                        offset += hole_len;
5438                        clone_root->offset += hole_len;
5439                        data_offset += hole_len;
5440                }
5441
5442                if (key.offset >= clone_root->offset + len)
5443                        break;
5444
5445                if (key.offset >= clone_src_i_size)
5446                        break;
5447
5448                if (key.offset + ext_len > clone_src_i_size)
5449                        ext_len = clone_src_i_size - key.offset;
5450
5451                clone_data_offset = btrfs_file_extent_offset(leaf, ei);
5452                if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
5453                        clone_root->offset = key.offset;
5454                        if (clone_data_offset < data_offset &&
5455                                clone_data_offset + ext_len > data_offset) {
5456                                u64 extent_offset;
5457
5458                                extent_offset = data_offset - clone_data_offset;
5459                                ext_len -= extent_offset;
5460                                clone_data_offset += extent_offset;
5461                                clone_root->offset += extent_offset;
5462                        }
5463                }
5464
5465                clone_len = min_t(u64, ext_len, len);
5466
5467                if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
5468                    clone_data_offset == data_offset) {
5469                        const u64 src_end = clone_root->offset + clone_len;
5470                        const u64 sectorsize = SZ_64K;
5471
5472                        /*
5473                         * We can't clone the last block, when its size is not
5474                         * sector size aligned, into the middle of a file. If we
5475                         * do so, the receiver will get a failure (-EINVAL) when
5476                         * trying to clone or will silently corrupt the data in
5477                         * the destination file if it's on a kernel without the
5478                         * fix introduced by commit ac765f83f1397646
5479                         * ("Btrfs: fix data corruption due to cloning of eof
5480                         * block).
5481                         *
5482                         * So issue a clone of the aligned down range plus a
5483                         * regular write for the eof block, if we hit that case.
5484                         *
5485                         * Also, we use the maximum possible sector size, 64K,
5486                         * because we don't know what's the sector size of the
5487                         * filesystem that receives the stream, so we have to
5488                         * assume the largest possible sector size.
5489                         */
5490                        if (src_end == clone_src_i_size &&
5491                            !IS_ALIGNED(src_end, sectorsize) &&
5492                            offset + clone_len < sctx->cur_inode_size) {
5493                                u64 slen;
5494
5495                                slen = ALIGN_DOWN(src_end - clone_root->offset,
5496                                                  sectorsize);
5497                                if (slen > 0) {
5498                                        ret = send_clone(sctx, offset, slen,
5499                                                         clone_root);
5500                                        if (ret < 0)
5501                                                goto out;
5502                                }
5503                                ret = send_extent_data(sctx, offset + slen,
5504                                                       clone_len - slen);
5505                        } else {
5506                                ret = send_clone(sctx, offset, clone_len,
5507                                                 clone_root);
5508                        }
5509                } else {
5510                        ret = send_extent_data(sctx, offset, clone_len);
5511                }
5512
5513                if (ret < 0)
5514                        goto out;
5515
5516                len -= clone_len;
5517                if (len == 0)
5518                        break;
5519                offset += clone_len;
5520                clone_root->offset += clone_len;
5521
5522                /*
5523                 * If we are cloning from the file we are currently processing,
5524                 * and using the send root as the clone root, we must stop once
5525                 * the current clone offset reaches the current eof of the file
5526                 * at the receiver, otherwise we would issue an invalid clone
5527                 * operation (source range going beyond eof) and cause the
5528                 * receiver to fail. So if we reach the current eof, bail out
5529                 * and fallback to a regular write.
5530                 */
5531                if (clone_root->root == sctx->send_root &&
5532                    clone_root->ino == sctx->cur_ino &&
5533                    clone_root->offset >= sctx->cur_inode_next_write_offset)
5534                        break;
5535
5536                data_offset += clone_len;
5537next:
5538                path->slots[0]++;
5539        }
5540
5541        if (len > 0)
5542                ret = send_extent_data(sctx, offset, len);
5543        else
5544                ret = 0;
5545out:
5546        btrfs_free_path(path);
5547        return ret;
5548}
5549
5550static int send_write_or_clone(struct send_ctx *sctx,
5551                               struct btrfs_path *path,
5552                               struct btrfs_key *key,
5553                               struct clone_root *clone_root)
5554{
5555        int ret = 0;
5556        u64 offset = key->offset;
5557        u64 end;
5558        u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
5559
5560        end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
5561        if (offset >= end)
5562                return 0;
5563
5564        if (clone_root && IS_ALIGNED(end, bs)) {
5565                struct btrfs_file_extent_item *ei;
5566                u64 disk_byte;
5567                u64 data_offset;
5568
5569                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
5570                                    struct btrfs_file_extent_item);
5571                disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
5572                data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
5573                ret = clone_range(sctx, clone_root, disk_byte, data_offset,
5574                                  offset, end - offset);
5575        } else {
5576                ret = send_extent_data(sctx, offset, end - offset);
5577        }
5578        sctx->cur_inode_next_write_offset = end;
5579        return ret;
5580}
5581
5582static int is_extent_unchanged(struct send_ctx *sctx,
5583                               struct btrfs_path *left_path,
5584                               struct btrfs_key *ekey)
5585{
5586        int ret = 0;
5587        struct btrfs_key key;
5588        struct btrfs_path *path = NULL;
5589        struct extent_buffer *eb;
5590        int slot;
5591        struct btrfs_key found_key;
5592        struct btrfs_file_extent_item *ei;
5593        u64 left_disknr;
5594        u64 right_disknr;
5595        u64 left_offset;
5596        u64 right_offset;
5597        u64 left_offset_fixed;
5598        u64 left_len;
5599        u64 right_len;
5600        u64 left_gen;
5601        u64 right_gen;
5602        u8 left_type;
5603        u8 right_type;
5604
5605        path = alloc_path_for_send();
5606        if (!path)
5607                return -ENOMEM;
5608
5609        eb = left_path->nodes[0];
5610        slot = left_path->slots[0];
5611        ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
5612        left_type = btrfs_file_extent_type(eb, ei);
5613
5614        if (left_type != BTRFS_FILE_EXTENT_REG) {
5615                ret = 0;
5616                goto out;
5617        }
5618        left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
5619        left_len = btrfs_file_extent_num_bytes(eb, ei);
5620        left_offset = btrfs_file_extent_offset(eb, ei);
5621        left_gen = btrfs_file_extent_generation(eb, ei);
5622
5623        /*
5624         * Following comments will refer to these graphics. L is the left
5625         * extents which we are checking at the moment. 1-8 are the right
5626         * extents that we iterate.
5627         *
5628         *       |-----L-----|
5629         * |-1-|-2a-|-3-|-4-|-5-|-6-|
5630         *
5631         *       |-----L-----|
5632         * |--1--|-2b-|...(same as above)
5633         *
5634         * Alternative situation. Happens on files where extents got split.
5635         *       |-----L-----|
5636         * |-----------7-----------|-6-|
5637         *
5638         * Alternative situation. Happens on files which got larger.
5639         *       |-----L-----|
5640         * |-8-|
5641         * Nothing follows after 8.
5642         */
5643
5644        key.objectid = ekey->objectid;
5645        key.type = BTRFS_EXTENT_DATA_KEY;
5646        key.offset = ekey->offset;
5647        ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
5648        if (ret < 0)
5649                goto out;
5650        if (ret) {
5651                ret = 0;
5652                goto out;
5653        }
5654
5655        /*
5656         * Handle special case where the right side has no extents at all.
5657         */
5658        eb = path->nodes[0];
5659        slot = path->slots[0];
5660        btrfs_item_key_to_cpu(eb, &found_key, slot);
5661        if (found_key.objectid != key.objectid ||
5662            found_key.type != key.type) {
5663                /* If we're a hole then just pretend nothing changed */
5664                ret = (left_disknr) ? 0 : 1;
5665                goto out;
5666        }
5667
5668        /*
5669         * We're now on 2a, 2b or 7.
5670         */
5671        key = found_key;
5672        while (key.offset < ekey->offset + left_len) {
5673                ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
5674                right_type = btrfs_file_extent_type(eb, ei);
5675                if (right_type != BTRFS_FILE_EXTENT_REG &&
5676                    right_type != BTRFS_FILE_EXTENT_INLINE) {
5677                        ret = 0;
5678                        goto out;
5679                }
5680
5681                if (right_type == BTRFS_FILE_EXTENT_INLINE) {
5682                        right_len = btrfs_file_extent_ram_bytes(eb, ei);
5683                        right_len = PAGE_ALIGN(right_len);
5684                } else {
5685                        right_len = btrfs_file_extent_num_bytes(eb, ei);
5686                }
5687
5688                /*
5689                 * Are we at extent 8? If yes, we know the extent is changed.
5690                 * This may only happen on the first iteration.
5691                 */
5692                if (found_key.offset + right_len <= ekey->offset) {
5693                        /* If we're a hole just pretend nothing changed */
5694                        ret = (left_disknr) ? 0 : 1;
5695                        goto out;
5696                }
5697
5698                /*
5699                 * We just wanted to see if when we have an inline extent, what
5700                 * follows it is a regular extent (wanted to check the above
5701                 * condition for inline extents too). This should normally not
5702                 * happen but it's possible for example when we have an inline
5703                 * compressed extent representing data with a size matching
5704                 * the page size (currently the same as sector size).
5705                 */
5706                if (right_type == BTRFS_FILE_EXTENT_INLINE) {
5707                        ret = 0;
5708                        goto out;
5709                }
5710
5711                right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
5712                right_offset = btrfs_file_extent_offset(eb, ei);
5713                right_gen = btrfs_file_extent_generation(eb, ei);
5714
5715                left_offset_fixed = left_offset;
5716                if (key.offset < ekey->offset) {
5717                        /* Fix the right offset for 2a and 7. */
5718                        right_offset += ekey->offset - key.offset;
5719                } else {
5720                        /* Fix the left offset for all behind 2a and 2b */
5721                        left_offset_fixed += key.offset - ekey->offset;
5722                }
5723
5724                /*
5725                 * Check if we have the same extent.
5726                 */
5727                if (left_disknr != right_disknr ||
5728                    left_offset_fixed != right_offset ||
5729                    left_gen != right_gen) {
5730                        ret = 0;
5731                        goto out;
5732                }
5733
5734                /*
5735                 * Go to the next extent.
5736                 */
5737                ret = btrfs_next_item(sctx->parent_root, path);
5738                if (ret < 0)
5739                        goto out;
5740                if (!ret) {
5741                        eb = path->nodes[0];
5742                        slot = path->slots[0];
5743                        btrfs_item_key_to_cpu(eb, &found_key, slot);
5744                }
5745                if (ret || found_key.objectid != key.objectid ||
5746                    found_key.type != key.type) {
5747                        key.offset += right_len;
5748                        break;
5749                }
5750                if (found_key.offset != key.offset + right_len) {
5751                        ret = 0;
5752                        goto out;
5753                }
5754                key = found_key;
5755        }
5756
5757        /*
5758         * We're now behind the left extent (treat as unchanged) or at the end
5759         * of the right side (treat as changed).
5760         */
5761        if (key.offset >= ekey->offset + left_len)
5762                ret = 1;
5763        else
5764                ret = 0;
5765
5766
5767out:
5768        btrfs_free_path(path);
5769        return ret;
5770}
5771
5772static int get_last_extent(struct send_ctx *sctx, u64 offset)
5773{
5774        struct btrfs_path *path;
5775        struct btrfs_root *root = sctx->send_root;
5776        struct btrfs_key key;
5777        int ret;
5778
5779        path = alloc_path_for_send();
5780        if (!path)
5781                return -ENOMEM;
5782
5783        sctx->cur_inode_last_extent = 0;
5784
5785        key.objectid = sctx->cur_ino;
5786        key.type = BTRFS_EXTENT_DATA_KEY;
5787        key.offset = offset;
5788        ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
5789        if (ret < 0)
5790                goto out;
5791        ret = 0;
5792        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
5793        if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
5794                goto out;
5795
5796        sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
5797out:
5798        btrfs_free_path(path);
5799        return ret;
5800}
5801
5802static int range_is_hole_in_parent(struct send_ctx *sctx,
5803                                   const u64 start,
5804                                   const u64 end)
5805{
5806        struct btrfs_path *path;
5807        struct btrfs_key key;
5808        struct btrfs_root *root = sctx->parent_root;
5809        u64 search_start = start;
5810        int ret;
5811
5812        path = alloc_path_for_send();
5813        if (!path)
5814                return -ENOMEM;
5815
5816        key.objectid = sctx->cur_ino;
5817        key.type = BTRFS_EXTENT_DATA_KEY;
5818        key.offset = search_start;
5819        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5820        if (ret < 0)
5821                goto out;
5822        if (ret > 0 && path->slots[0] > 0)
5823                path->slots[0]--;
5824
5825        while (search_start < end) {
5826                struct extent_buffer *leaf = path->nodes[0];
5827                int slot = path->slots[0];
5828                struct btrfs_file_extent_item *fi;
5829                u64 extent_end;
5830
5831                if (slot >= btrfs_header_nritems(leaf)) {
5832                        ret = btrfs_next_leaf(root, path);
5833                        if (ret < 0)
5834                                goto out;
5835                        else if (ret > 0)
5836                                break;
5837                        continue;
5838                }
5839
5840                btrfs_item_key_to_cpu(leaf, &key, slot);
5841                if (key.objectid < sctx->cur_ino ||
5842                    key.type < BTRFS_EXTENT_DATA_KEY)
5843                        goto next;
5844                if (key.objectid > sctx->cur_ino ||
5845                    key.type > BTRFS_EXTENT_DATA_KEY ||
5846                    key.offset >= end)
5847                        break;
5848
5849                fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5850                extent_end = btrfs_file_extent_end(path);
5851                if (extent_end <= start)
5852                        goto next;
5853                if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
5854                        search_start = extent_end;
5855                        goto next;
5856                }
5857                ret = 0;
5858                goto out;
5859next:
5860                path->slots[0]++;
5861        }
5862        ret = 1;
5863out:
5864        btrfs_free_path(path);
5865        return ret;
5866}
5867
5868static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
5869                           struct btrfs_key *key)
5870{
5871        int ret = 0;
5872
5873        if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
5874                return 0;
5875
5876        if (sctx->cur_inode_last_extent == (u64)-1) {
5877                ret = get_last_extent(sctx, key->offset - 1);
5878                if (ret)
5879                        return ret;
5880        }
5881
5882        if (path->slots[0] == 0 &&
5883            sctx->cur_inode_last_extent < key->offset) {
5884                /*
5885                 * We might have skipped entire leafs that contained only
5886                 * file extent items for our current inode. These leafs have
5887                 * a generation number smaller (older) than the one in the
5888                 * current leaf and the leaf our last extent came from, and
5889                 * are located between these 2 leafs.
5890                 */
5891                ret = get_last_extent(sctx, key->offset - 1);
5892                if (ret)
5893                        return ret;
5894        }
5895
5896        if (sctx->cur_inode_last_extent < key->offset) {
5897                ret = range_is_hole_in_parent(sctx,
5898                                              sctx->cur_inode_last_extent,
5899                                              key->offset);
5900                if (ret < 0)
5901                        return ret;
5902                else if (ret == 0)
5903                        ret = send_hole(sctx, key->offset);
5904                else
5905                        ret = 0;
5906        }
5907        sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
5908        return ret;
5909}
5910
5911static int process_extent(struct send_ctx *sctx,
5912                          struct btrfs_path *path,
5913                          struct btrfs_key *key)
5914{
5915        struct clone_root *found_clone = NULL;
5916        int ret = 0;
5917
5918        if (S_ISLNK(sctx->cur_inode_mode))
5919                return 0;
5920
5921        if (sctx->parent_root && !sctx->cur_inode_new) {
5922                ret = is_extent_unchanged(sctx, path, key);
5923                if (ret < 0)
5924                        goto out;
5925                if (ret) {
5926                        ret = 0;
5927                        goto out_hole;
5928                }
5929        } else {
5930                struct btrfs_file_extent_item *ei;
5931                u8 type;
5932
5933                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
5934                                    struct btrfs_file_extent_item);
5935                type = btrfs_file_extent_type(path->nodes[0], ei);
5936                if (type == BTRFS_FILE_EXTENT_PREALLOC ||
5937                    type == BTRFS_FILE_EXTENT_REG) {
5938                        /*
5939                         * The send spec does not have a prealloc command yet,
5940                         * so just leave a hole for prealloc'ed extents until
5941                         * we have enough commands queued up to justify rev'ing
5942                         * the send spec.
5943                         */
5944                        if (type == BTRFS_FILE_EXTENT_PREALLOC) {
5945                                ret = 0;
5946                                goto out;
5947                        }
5948
5949                        /* Have a hole, just skip it. */
5950                        if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
5951                                ret = 0;
5952                                goto out;
5953                        }
5954                }
5955        }
5956
5957        ret = find_extent_clone(sctx, path, key->objectid, key->offset,
5958                        sctx->cur_inode_size, &found_clone);
5959        if (ret != -ENOENT && ret < 0)
5960                goto out;
5961
5962        ret = send_write_or_clone(sctx, path, key, found_clone);
5963        if (ret)
5964                goto out;
5965out_hole:
5966        ret = maybe_send_hole(sctx, path, key);
5967out:
5968        return ret;
5969}
5970
5971static int process_all_extents(struct send_ctx *sctx)
5972{
5973        int ret;
5974        struct btrfs_root *root;
5975        struct btrfs_path *path;
5976        struct btrfs_key key;
5977        struct btrfs_key found_key;
5978        struct extent_buffer *eb;
5979        int slot;
5980
5981        root = sctx->send_root;
5982        path = alloc_path_for_send();
5983        if (!path)
5984                return -ENOMEM;
5985
5986        key.objectid = sctx->cmp_key->objectid;
5987        key.type = BTRFS_EXTENT_DATA_KEY;
5988        key.offset = 0;
5989        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5990        if (ret < 0)
5991                goto out;
5992
5993        while (1) {
5994                eb = path->nodes[0];
5995                slot = path->slots[0];
5996
5997                if (slot >= btrfs_header_nritems(eb)) {
5998                        ret = btrfs_next_leaf(root, path);
5999                        if (ret < 0) {
6000                                goto out;
6001                        } else if (ret > 0) {
6002                                ret = 0;
6003                                break;
6004                        }
6005                        continue;
6006                }
6007
6008                btrfs_item_key_to_cpu(eb, &found_key, slot);
6009
6010                if (found_key.objectid != key.objectid ||
6011                    found_key.type != key.type) {
6012                        ret = 0;
6013                        goto out;
6014                }
6015
6016                ret = process_extent(sctx, path, &found_key);
6017                if (ret < 0)
6018                        goto out;
6019
6020                path->slots[0]++;
6021        }
6022
6023out:
6024        btrfs_free_path(path);
6025        return ret;
6026}
6027
6028static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
6029                                           int *pending_move,
6030                                           int *refs_processed)
6031{
6032        int ret = 0;
6033
6034        if (sctx->cur_ino == 0)
6035                goto out;
6036        if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
6037            sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
6038                goto out;
6039        if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
6040                goto out;
6041
6042        ret = process_recorded_refs(sctx, pending_move);
6043        if (ret < 0)
6044                goto out;
6045
6046        *refs_processed = 1;
6047out:
6048        return ret;
6049}
6050
6051static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
6052{
6053        int ret = 0;
6054        u64 left_mode;
6055        u64 left_uid;
6056        u64 left_gid;
6057        u64 right_mode;
6058        u64 right_uid;
6059        u64 right_gid;
6060        int need_chmod = 0;
6061        int need_chown = 0;
6062        int need_truncate = 1;
6063        int pending_move = 0;
6064        int refs_processed = 0;
6065
6066        if (sctx->ignore_cur_inode)
6067                return 0;
6068
6069        ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
6070                                              &refs_processed);
6071        if (ret < 0)
6072                goto out;
6073
6074        /*
6075         * We have processed the refs and thus need to advance send_progress.
6076         * Now, calls to get_cur_xxx will take the updated refs of the current
6077         * inode into account.
6078         *
6079         * On the other hand, if our current inode is a directory and couldn't
6080         * be moved/renamed because its parent was renamed/moved too and it has
6081         * a higher inode number, we can only move/rename our current inode
6082         * after we moved/renamed its parent. Therefore in this case operate on
6083         * the old path (pre move/rename) of our current inode, and the
6084         * move/rename will be performed later.
6085         */
6086        if (refs_processed && !pending_move)
6087                sctx->send_progress = sctx->cur_ino + 1;
6088
6089        if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
6090                goto out;
6091        if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
6092                goto out;
6093
6094        ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
6095                        &left_mode, &left_uid, &left_gid, NULL);
6096        if (ret < 0)
6097                goto out;
6098
6099        if (!sctx->parent_root || sctx->cur_inode_new) {
6100                need_chown = 1;
6101                if (!S_ISLNK(sctx->cur_inode_mode))
6102                        need_chmod = 1;
6103                if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
6104                        need_truncate = 0;
6105        } else {
6106                u64 old_size;
6107
6108                ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
6109                                &old_size, NULL, &right_mode, &right_uid,
6110                                &right_gid, NULL);
6111                if (ret < 0)
6112                        goto out;
6113
6114                if (left_uid != right_uid || left_gid != right_gid)
6115                        need_chown = 1;
6116                if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
6117                        need_chmod = 1;
6118                if ((old_size == sctx->cur_inode_size) ||
6119                    (sctx->cur_inode_size > old_size &&
6120                     sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
6121                        need_truncate = 0;
6122        }
6123
6124        if (S_ISREG(sctx->cur_inode_mode)) {
6125                if (need_send_hole(sctx)) {
6126                        if (sctx->cur_inode_last_extent == (u64)-1 ||
6127                            sctx->cur_inode_last_extent <
6128                            sctx->cur_inode_size) {
6129                                ret = get_last_extent(sctx, (u64)-1);
6130                                if (ret)
6131                                        goto out;
6132                        }
6133                        if (sctx->cur_inode_last_extent <
6134                            sctx->cur_inode_size) {
6135                                ret = send_hole(sctx, sctx->cur_inode_size);
6136                                if (ret)
6137                                        goto out;
6138                        }
6139                }
6140                if (need_truncate) {
6141                        ret = send_truncate(sctx, sctx->cur_ino,
6142                                            sctx->cur_inode_gen,
6143                                            sctx->cur_inode_size);
6144                        if (ret < 0)
6145                                goto out;
6146                }
6147        }
6148
6149        if (need_chown) {
6150                ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6151                                left_uid, left_gid);
6152                if (ret < 0)
6153                        goto out;
6154        }
6155        if (need_chmod) {
6156                ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
6157                                left_mode);
6158                if (ret < 0)
6159                        goto out;
6160        }
6161
6162        ret = send_capabilities(sctx);
6163        if (ret < 0)
6164                goto out;
6165
6166        /*
6167         * If other directory inodes depended on our current directory
6168         * inode's move/rename, now do their move/rename operations.
6169         */
6170        if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
6171                ret = apply_children_dir_moves(sctx);
6172                if (ret)
6173                        goto out;
6174                /*
6175                 * Need to send that every time, no matter if it actually
6176                 * changed between the two trees as we have done changes to
6177                 * the inode before. If our inode is a directory and it's
6178                 * waiting to be moved/renamed, we will send its utimes when
6179                 * it's moved/renamed, therefore we don't need to do it here.
6180                 */
6181                sctx->send_progress = sctx->cur_ino + 1;
6182                ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
6183                if (ret < 0)
6184                        goto out;
6185        }
6186
6187out:
6188        return ret;
6189}
6190
6191struct parent_paths_ctx {
6192        struct list_head *refs;
6193        struct send_ctx *sctx;
6194};
6195
6196static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
6197                             void *ctx)
6198{
6199        struct parent_paths_ctx *ppctx = ctx;
6200
6201        return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
6202                          ppctx->refs);
6203}
6204
6205/*
6206 * Issue unlink operations for all paths of the current inode found in the
6207 * parent snapshot.
6208 */
6209static int btrfs_unlink_all_paths(struct send_ctx *sctx)
6210{
6211        LIST_HEAD(deleted_refs);
6212        struct btrfs_path *path;
6213        struct btrfs_key key;
6214        struct parent_paths_ctx ctx;
6215        int ret;
6216
6217        path = alloc_path_for_send();
6218        if (!path)
6219                return -ENOMEM;
6220
6221        key.objectid = sctx->cur_ino;
6222        key.type = BTRFS_INODE_REF_KEY;
6223        key.offset = 0;
6224        ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
6225        if (ret < 0)
6226                goto out;
6227
6228        ctx.refs = &deleted_refs;
6229        ctx.sctx = sctx;
6230
6231        while (true) {
6232                struct extent_buffer *eb = path->nodes[0];
6233                int slot = path->slots[0];
6234
6235                if (slot >= btrfs_header_nritems(eb)) {
6236                        ret = btrfs_next_leaf(sctx->parent_root, path);
6237                        if (ret < 0)
6238                                goto out;
6239                        else if (ret > 0)
6240                                break;
6241                        continue;
6242                }
6243
6244                btrfs_item_key_to_cpu(eb, &key, slot);
6245                if (key.objectid != sctx->cur_ino)
6246                        break;
6247                if (key.type != BTRFS_INODE_REF_KEY &&
6248                    key.type != BTRFS_INODE_EXTREF_KEY)
6249                        break;
6250
6251                ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
6252                                        record_parent_ref, &ctx);
6253                if (ret < 0)
6254                        goto out;
6255
6256                path->slots[0]++;
6257        }
6258
6259        while (!list_empty(&deleted_refs)) {
6260                struct recorded_ref *ref;
6261
6262                ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
6263                ret = send_unlink(sctx, ref->full_path);
6264                if (ret < 0)
6265                        goto out;
6266                fs_path_free(ref->full_path);
6267                list_del(&ref->list);
6268                kfree(ref);
6269        }
6270        ret = 0;
6271out:
6272        btrfs_free_path(path);
6273        if (ret)
6274                __free_recorded_refs(&deleted_refs);
6275        return ret;
6276}
6277
6278static int changed_inode(struct send_ctx *sctx,
6279                         enum btrfs_compare_tree_result result)
6280{
6281        int ret = 0;
6282        struct btrfs_key *key = sctx->cmp_key;
6283        struct btrfs_inode_item *left_ii = NULL;
6284        struct btrfs_inode_item *right_ii = NULL;
6285        u64 left_gen = 0;
6286        u64 right_gen = 0;
6287
6288        sctx->cur_ino = key->objectid;
6289        sctx->cur_inode_new_gen = 0;
6290        sctx->cur_inode_last_extent = (u64)-1;
6291        sctx->cur_inode_next_write_offset = 0;
6292        sctx->ignore_cur_inode = false;
6293
6294        /*
6295         * Set send_progress to current inode. This will tell all get_cur_xxx
6296         * functions that the current inode's refs are not updated yet. Later,
6297         * when process_recorded_refs is finished, it is set to cur_ino + 1.
6298         */
6299        sctx->send_progress = sctx->cur_ino;
6300
6301        if (result == BTRFS_COMPARE_TREE_NEW ||
6302            result == BTRFS_COMPARE_TREE_CHANGED) {
6303                left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
6304                                sctx->left_path->slots[0],
6305                                struct btrfs_inode_item);
6306                left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
6307                                left_ii);
6308        } else {
6309                right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6310                                sctx->right_path->slots[0],
6311                                struct btrfs_inode_item);
6312                right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6313                                right_ii);
6314        }
6315        if (result == BTRFS_COMPARE_TREE_CHANGED) {
6316                right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6317                                sctx->right_path->slots[0],
6318                                struct btrfs_inode_item);
6319
6320                right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6321                                right_ii);
6322
6323                /*
6324                 * The cur_ino = root dir case is special here. We can't treat
6325                 * the inode as deleted+reused because it would generate a
6326                 * stream that tries to delete/mkdir the root dir.
6327                 */
6328                if (left_gen != right_gen &&
6329                    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6330                        sctx->cur_inode_new_gen = 1;
6331        }
6332
6333        /*
6334         * Normally we do not find inodes with a link count of zero (orphans)
6335         * because the most common case is to create a snapshot and use it
6336         * for a send operation. However other less common use cases involve
6337         * using a subvolume and send it after turning it to RO mode just
6338         * after deleting all hard links of a file while holding an open
6339         * file descriptor against it or turning a RO snapshot into RW mode,
6340         * keep an open file descriptor against a file, delete it and then
6341         * turn the snapshot back to RO mode before using it for a send
6342         * operation. So if we find such cases, ignore the inode and all its
6343         * items completely if it's a new inode, or if it's a changed inode
6344         * make sure all its previous paths (from the parent snapshot) are all
6345         * unlinked and all other the inode items are ignored.
6346         */
6347        if (result == BTRFS_COMPARE_TREE_NEW ||
6348            result == BTRFS_COMPARE_TREE_CHANGED) {
6349                u32 nlinks;
6350
6351                nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
6352                if (nlinks == 0) {
6353                        sctx->ignore_cur_inode = true;
6354                        if (result == BTRFS_COMPARE_TREE_CHANGED)
6355                                ret = btrfs_unlink_all_paths(sctx);
6356                        goto out;
6357                }
6358        }
6359
6360        if (result == BTRFS_COMPARE_TREE_NEW) {
6361                sctx->cur_inode_gen = left_gen;
6362                sctx->cur_inode_new = 1;
6363                sctx->cur_inode_deleted = 0;
6364                sctx->cur_inode_size = btrfs_inode_size(
6365                                sctx->left_path->nodes[0], left_ii);
6366                sctx->cur_inode_mode = btrfs_inode_mode(
6367                                sctx->left_path->nodes[0], left_ii);
6368                sctx->cur_inode_rdev = btrfs_inode_rdev(
6369                                sctx->left_path->nodes[0], left_ii);
6370                if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6371                        ret = send_create_inode_if_needed(sctx);
6372        } else if (result == BTRFS_COMPARE_TREE_DELETED) {
6373                sctx->cur_inode_gen = right_gen;
6374                sctx->cur_inode_new = 0;
6375                sctx->cur_inode_deleted = 1;
6376                sctx->cur_inode_size = btrfs_inode_size(
6377                                sctx->right_path->nodes[0], right_ii);
6378                sctx->cur_inode_mode = btrfs_inode_mode(
6379                                sctx->right_path->nodes[0], right_ii);
6380        } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
6381                /*
6382                 * We need to do some special handling in case the inode was
6383                 * reported as changed with a changed generation number. This
6384                 * means that the original inode was deleted and new inode
6385                 * reused the same inum. So we have to treat the old inode as
6386                 * deleted and the new one as new.
6387                 */
6388                if (sctx->cur_inode_new_gen) {
6389                        /*
6390                         * First, process the inode as if it was deleted.
6391                         */
6392                        sctx->cur_inode_gen = right_gen;
6393                        sctx->cur_inode_new = 0;
6394                        sctx->cur_inode_deleted = 1;
6395                        sctx->cur_inode_size = btrfs_inode_size(
6396                                        sctx->right_path->nodes[0], right_ii);
6397                        sctx->cur_inode_mode = btrfs_inode_mode(
6398                                        sctx->right_path->nodes[0], right_ii);
6399                        ret = process_all_refs(sctx,
6400                                        BTRFS_COMPARE_TREE_DELETED);
6401                        if (ret < 0)
6402                                goto out;
6403
6404                        /*
6405                         * Now process the inode as if it was new.
6406                         */
6407                        sctx->cur_inode_gen = left_gen;
6408                        sctx->cur_inode_new = 1;
6409                        sctx->cur_inode_deleted = 0;
6410                        sctx->cur_inode_size = btrfs_inode_size(
6411                                        sctx->left_path->nodes[0], left_ii);
6412                        sctx->cur_inode_mode = btrfs_inode_mode(
6413                                        sctx->left_path->nodes[0], left_ii);
6414                        sctx->cur_inode_rdev = btrfs_inode_rdev(
6415                                        sctx->left_path->nodes[0], left_ii);
6416                        ret = send_create_inode_if_needed(sctx);
6417                        if (ret < 0)
6418                                goto out;
6419
6420                        ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
6421                        if (ret < 0)
6422                                goto out;
6423                        /*
6424                         * Advance send_progress now as we did not get into
6425                         * process_recorded_refs_if_needed in the new_gen case.
6426                         */
6427                        sctx->send_progress = sctx->cur_ino + 1;
6428
6429                        /*
6430                         * Now process all extents and xattrs of the inode as if
6431                         * they were all new.
6432                         */
6433                        ret = process_all_extents(sctx);
6434                        if (ret < 0)
6435                                goto out;
6436                        ret = process_all_new_xattrs(sctx);
6437                        if (ret < 0)
6438                                goto out;
6439                } else {
6440                        sctx->cur_inode_gen = left_gen;
6441                        sctx->cur_inode_new = 0;
6442                        sctx->cur_inode_new_gen = 0;
6443                        sctx->cur_inode_deleted = 0;
6444                        sctx->cur_inode_size = btrfs_inode_size(
6445                                        sctx->left_path->nodes[0], left_ii);
6446                        sctx->cur_inode_mode = btrfs_inode_mode(
6447                                        sctx->left_path->nodes[0], left_ii);
6448                }
6449        }
6450
6451out:
6452        return ret;
6453}
6454
6455/*
6456 * We have to process new refs before deleted refs, but compare_trees gives us
6457 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
6458 * first and later process them in process_recorded_refs.
6459 * For the cur_inode_new_gen case, we skip recording completely because
6460 * changed_inode did already initiate processing of refs. The reason for this is
6461 * that in this case, compare_tree actually compares the refs of 2 different
6462 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
6463 * refs of the right tree as deleted and all refs of the left tree as new.
6464 */
6465static int changed_ref(struct send_ctx *sctx,
6466                       enum btrfs_compare_tree_result result)
6467{
6468        int ret = 0;
6469
6470        if (sctx->cur_ino != sctx->cmp_key->objectid) {
6471                inconsistent_snapshot_error(sctx, result, "reference");
6472                return -EIO;
6473        }
6474
6475        if (!sctx->cur_inode_new_gen &&
6476            sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
6477                if (result == BTRFS_COMPARE_TREE_NEW)
6478                        ret = record_new_ref(sctx);
6479                else if (result == BTRFS_COMPARE_TREE_DELETED)
6480                        ret = record_deleted_ref(sctx);
6481                else if (result == BTRFS_COMPARE_TREE_CHANGED)
6482                        ret = record_changed_ref(sctx);
6483        }
6484
6485        return ret;
6486}
6487
6488/*
6489 * Process new/deleted/changed xattrs. We skip processing in the
6490 * cur_inode_new_gen case because changed_inode did already initiate processing
6491 * of xattrs. The reason is the same as in changed_ref
6492 */
6493static int changed_xattr(struct send_ctx *sctx,
6494                         enum btrfs_compare_tree_result result)
6495{
6496        int ret = 0;
6497
6498        if (sctx->cur_ino != sctx->cmp_key->objectid) {
6499                inconsistent_snapshot_error(sctx, result, "xattr");
6500                return -EIO;
6501        }
6502
6503        if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
6504                if (result == BTRFS_COMPARE_TREE_NEW)
6505                        ret = process_new_xattr(sctx);
6506                else if (result == BTRFS_COMPARE_TREE_DELETED)
6507                        ret = process_deleted_xattr(sctx);
6508                else if (result == BTRFS_COMPARE_TREE_CHANGED)
6509                        ret = process_changed_xattr(sctx);
6510        }
6511
6512        return ret;
6513}
6514
6515/*
6516 * Process new/deleted/changed extents. We skip processing in the
6517 * cur_inode_new_gen case because changed_inode did already initiate processing
6518 * of extents. The reason is the same as in changed_ref
6519 */
6520static int changed_extent(struct send_ctx *sctx,
6521                          enum btrfs_compare_tree_result result)
6522{
6523        int ret = 0;
6524
6525        /*
6526         * We have found an extent item that changed without the inode item
6527         * having changed. This can happen either after relocation (where the
6528         * disk_bytenr of an extent item is replaced at
6529         * relocation.c:replace_file_extents()) or after deduplication into a
6530         * file in both the parent and send snapshots (where an extent item can
6531         * get modified or replaced with a new one). Note that deduplication
6532         * updates the inode item, but it only changes the iversion (sequence
6533         * field in the inode item) of the inode, so if a file is deduplicated
6534         * the same amount of times in both the parent and send snapshots, its
6535         * iversion becomes the same in both snapshots, whence the inode item is
6536         * the same on both snapshots.
6537         */
6538        if (sctx->cur_ino != sctx->cmp_key->objectid)
6539                return 0;
6540
6541        if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
6542                if (result != BTRFS_COMPARE_TREE_DELETED)
6543                        ret = process_extent(sctx, sctx->left_path,
6544                                        sctx->cmp_key);
6545        }
6546
6547        return ret;
6548}
6549
6550static int dir_changed(struct send_ctx *sctx, u64 dir)
6551{
6552        u64 orig_gen, new_gen;
6553        int ret;
6554
6555        ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
6556                             NULL, NULL);
6557        if (ret)
6558                return ret;
6559
6560        ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
6561                             NULL, NULL, NULL);
6562        if (ret)
6563                return ret;
6564
6565        return (orig_gen != new_gen) ? 1 : 0;
6566}
6567
6568static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
6569                        struct btrfs_key *key)
6570{
6571        struct btrfs_inode_extref *extref;
6572        struct extent_buffer *leaf;
6573        u64 dirid = 0, last_dirid = 0;
6574        unsigned long ptr;
6575        u32 item_size;
6576        u32 cur_offset = 0;
6577        int ref_name_len;
6578        int ret = 0;
6579
6580        /* Easy case, just check this one dirid */
6581        if (key->type == BTRFS_INODE_REF_KEY) {
6582                dirid = key->offset;
6583
6584                ret = dir_changed(sctx, dirid);
6585                goto out;
6586        }
6587
6588        leaf = path->nodes[0];
6589        item_size = btrfs_item_size(leaf, path->slots[0]);
6590        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
6591        while (cur_offset < item_size) {
6592                extref = (struct btrfs_inode_extref *)(ptr +
6593                                                       cur_offset);
6594                dirid = btrfs_inode_extref_parent(leaf, extref);
6595                ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
6596                cur_offset += ref_name_len + sizeof(*extref);
6597                if (dirid == last_dirid)
6598                        continue;
6599                ret = dir_changed(sctx, dirid);
6600                if (ret)
6601                        break;
6602                last_dirid = dirid;
6603        }
6604out:
6605        return ret;
6606}
6607
6608/*
6609 * Updates compare related fields in sctx and simply forwards to the actual
6610 * changed_xxx functions.
6611 */
6612static int changed_cb(struct btrfs_path *left_path,
6613                      struct btrfs_path *right_path,
6614                      struct btrfs_key *key,
6615                      enum btrfs_compare_tree_result result,
6616                      struct send_ctx *sctx)
6617{
6618        int ret = 0;
6619
6620        /*
6621         * We can not hold the commit root semaphore here. This is because in
6622         * the case of sending and receiving to the same filesystem, using a
6623         * pipe, could result in a deadlock:
6624         *
6625         * 1) The task running send blocks on the pipe because it's full;
6626         *
6627         * 2) The task running receive, which is the only consumer of the pipe,
6628         *    is waiting for a transaction commit (for example due to a space
6629         *    reservation when doing a write or triggering a transaction commit
6630         *    when creating a subvolume);
6631         *
6632         * 3) The transaction is waiting to write lock the commit root semaphore,
6633         *    but can not acquire it since it's being held at 1).
6634         *
6635         * Down this call chain we write to the pipe through kernel_write().
6636         * The same type of problem can also happen when sending to a file that
6637         * is stored in the same filesystem - when reserving space for a write
6638         * into the file, we can trigger a transaction commit.
6639         *
6640         * Our caller has supplied us with clones of leaves from the send and
6641         * parent roots, so we're safe here from a concurrent relocation and
6642         * further reallocation of metadata extents while we are here. Below we
6643         * also assert that the leaves are clones.
6644         */
6645        lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
6646
6647        /*
6648         * We always have a send root, so left_path is never NULL. We will not
6649         * have a leaf when we have reached the end of the send root but have
6650         * not yet reached the end of the parent root.
6651         */
6652        if (left_path->nodes[0])
6653                ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
6654                                &left_path->nodes[0]->bflags));
6655        /*
6656         * When doing a full send we don't have a parent root, so right_path is
6657         * NULL. When doing an incremental send, we may have reached the end of
6658         * the parent root already, so we don't have a leaf at right_path.
6659         */
6660        if (right_path && right_path->nodes[0])
6661                ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
6662                                &right_path->nodes[0]->bflags));
6663
6664        if (result == BTRFS_COMPARE_TREE_SAME) {
6665                if (key->type == BTRFS_INODE_REF_KEY ||
6666                    key->type == BTRFS_INODE_EXTREF_KEY) {
6667                        ret = compare_refs(sctx, left_path, key);
6668                        if (!ret)
6669                                return 0;
6670                        if (ret < 0)
6671                                return ret;
6672                } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
6673                        return maybe_send_hole(sctx, left_path, key);
6674                } else {
6675                        return 0;
6676                }
6677                result = BTRFS_COMPARE_TREE_CHANGED;
6678                ret = 0;
6679        }
6680
6681        sctx->left_path = left_path;
6682        sctx->right_path = right_path;
6683        sctx->cmp_key = key;
6684
6685        ret = finish_inode_if_needed(sctx, 0);
6686        if (ret < 0)
6687                goto out;
6688
6689        /* Ignore non-FS objects */
6690        if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
6691            key->objectid == BTRFS_FREE_SPACE_OBJECTID)
6692                goto out;
6693
6694        if (key->type == BTRFS_INODE_ITEM_KEY) {
6695                ret = changed_inode(sctx, result);
6696        } else if (!sctx->ignore_cur_inode) {
6697                if (key->type == BTRFS_INODE_REF_KEY ||
6698                    key->type == BTRFS_INODE_EXTREF_KEY)
6699                        ret = changed_ref(sctx, result);
6700                else if (key->type == BTRFS_XATTR_ITEM_KEY)
6701                        ret = changed_xattr(sctx, result);
6702                else if (key->type == BTRFS_EXTENT_DATA_KEY)
6703                        ret = changed_extent(sctx, result);
6704        }
6705
6706out:
6707        return ret;
6708}
6709
6710static int search_key_again(const struct send_ctx *sctx,
6711                            struct btrfs_root *root,
6712                            struct btrfs_path *path,
6713                            const struct btrfs_key *key)
6714{
6715        int ret;
6716
6717        if (!path->need_commit_sem)
6718                lockdep_assert_held_read(&root->fs_info->commit_root_sem);
6719
6720        /*
6721         * Roots used for send operations are readonly and no one can add,
6722         * update or remove keys from them, so we should be able to find our
6723         * key again. The only exception is deduplication, which can operate on
6724         * readonly roots and add, update or remove keys to/from them - but at
6725         * the moment we don't allow it to run in parallel with send.
6726         */
6727        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
6728        ASSERT(ret <= 0);
6729        if (ret > 0) {
6730                btrfs_print_tree(path->nodes[path->lowest_level], false);
6731                btrfs_err(root->fs_info,
6732"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
6733                          key->objectid, key->type, key->offset,
6734                          (root == sctx->parent_root ? "parent" : "send"),
6735                          root->root_key.objectid, path->lowest_level,
6736                          path->slots[path->lowest_level]);
6737                return -EUCLEAN;
6738        }
6739
6740        return ret;
6741}
6742
6743static int full_send_tree(struct send_ctx *sctx)
6744{
6745        int ret;
6746        struct btrfs_root *send_root = sctx->send_root;
6747        struct btrfs_key key;
6748        struct btrfs_fs_info *fs_info = send_root->fs_info;
6749        struct btrfs_path *path;
6750
6751        path = alloc_path_for_send();
6752        if (!path)
6753                return -ENOMEM;
6754        path->reada = READA_FORWARD_ALWAYS;
6755
6756        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
6757        key.type = BTRFS_INODE_ITEM_KEY;
6758        key.offset = 0;
6759
6760        down_read(&fs_info->commit_root_sem);
6761        sctx->last_reloc_trans = fs_info->last_reloc_trans;
6762        up_read(&fs_info->commit_root_sem);
6763
6764        ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
6765        if (ret < 0)
6766                goto out;
6767        if (ret)
6768                goto out_finish;
6769
6770        while (1) {
6771                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
6772
6773                ret = changed_cb(path, NULL, &key,
6774                                 BTRFS_COMPARE_TREE_NEW, sctx);
6775                if (ret < 0)
6776                        goto out;
6777
6778                down_read(&fs_info->commit_root_sem);
6779                if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
6780                        sctx->last_reloc_trans = fs_info->last_reloc_trans;
6781                        up_read(&fs_info->commit_root_sem);
6782                        /*
6783                         * A transaction used for relocating a block group was
6784                         * committed or is about to finish its commit. Release
6785                         * our path (leaf) and restart the search, so that we
6786                         * avoid operating on any file extent items that are
6787                         * stale, with a disk_bytenr that reflects a pre
6788                         * relocation value. This way we avoid as much as
6789                         * possible to fallback to regular writes when checking
6790                         * if we can clone file ranges.
6791                         */
6792                        btrfs_release_path(path);
6793                        ret = search_key_again(sctx, send_root, path, &key);
6794                        if (ret < 0)
6795                                goto out;
6796                } else {
6797                        up_read(&fs_info->commit_root_sem);
6798                }
6799
6800                ret = btrfs_next_item(send_root, path);
6801                if (ret < 0)
6802                        goto out;
6803                if (ret) {
6804                        ret  = 0;
6805                        break;
6806                }
6807        }
6808
6809out_finish:
6810        ret = finish_inode_if_needed(sctx, 1);
6811
6812out:
6813        btrfs_free_path(path);
6814        return ret;
6815}
6816
6817static int replace_node_with_clone(struct btrfs_path *path, int level)
6818{
6819        struct extent_buffer *clone;
6820
6821        clone = btrfs_clone_extent_buffer(path->nodes[level]);
6822        if (!clone)
6823                return -ENOMEM;
6824
6825        free_extent_buffer(path->nodes[level]);
6826        path->nodes[level] = clone;
6827
6828        return 0;
6829}
6830
6831static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
6832{
6833        struct extent_buffer *eb;
6834        struct extent_buffer *parent = path->nodes[*level];
6835        int slot = path->slots[*level];
6836        const int nritems = btrfs_header_nritems(parent);
6837        u64 reada_max;
6838        u64 reada_done = 0;
6839
6840        lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
6841
6842        BUG_ON(*level == 0);
6843        eb = btrfs_read_node_slot(parent, slot);
6844        if (IS_ERR(eb))
6845                return PTR_ERR(eb);
6846
6847        /*
6848         * Trigger readahead for the next leaves we will process, so that it is
6849         * very likely that when we need them they are already in memory and we
6850         * will not block on disk IO. For nodes we only do readahead for one,
6851         * since the time window between processing nodes is typically larger.
6852         */
6853        reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
6854
6855        for (slot++; slot < nritems && reada_done < reada_max; slot++) {
6856                if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
6857                        btrfs_readahead_node_child(parent, slot);
6858                        reada_done += eb->fs_info->nodesize;
6859                }
6860        }
6861
6862        path->nodes[*level - 1] = eb;
6863        path->slots[*level - 1] = 0;
6864        (*level)--;
6865
6866        if (*level == 0)
6867                return replace_node_with_clone(path, 0);
6868
6869        return 0;
6870}
6871
6872static int tree_move_next_or_upnext(struct btrfs_path *path,
6873                                    int *level, int root_level)
6874{
6875        int ret = 0;
6876        int nritems;
6877        nritems = btrfs_header_nritems(path->nodes[*level]);
6878
6879        path->slots[*level]++;
6880
6881        while (path->slots[*level] >= nritems) {
6882                if (*level == root_level) {
6883                        path->slots[*level] = nritems - 1;
6884                        return -1;
6885                }
6886
6887                /* move upnext */
6888                path->slots[*level] = 0;
6889                free_extent_buffer(path->nodes[*level]);
6890                path->nodes[*level] = NULL;
6891                (*level)++;
6892                path->slots[*level]++;
6893
6894                nritems = btrfs_header_nritems(path->nodes[*level]);
6895                ret = 1;
6896        }
6897        return ret;
6898}
6899
6900/*
6901 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
6902 * or down.
6903 */
6904static int tree_advance(struct btrfs_path *path,
6905                        int *level, int root_level,
6906                        int allow_down,
6907                        struct btrfs_key *key,
6908                        u64 reada_min_gen)
6909{
6910        int ret;
6911
6912        if (*level == 0 || !allow_down) {
6913                ret = tree_move_next_or_upnext(path, level, root_level);
6914        } else {
6915                ret = tree_move_down(path, level, reada_min_gen);
6916        }
6917
6918        /*
6919         * Even if we have reached the end of a tree, ret is -1, update the key
6920         * anyway, so that in case we need to restart due to a block group
6921         * relocation, we can assert that the last key of the root node still
6922         * exists in the tree.
6923         */
6924        if (*level == 0)
6925                btrfs_item_key_to_cpu(path->nodes[*level], key,
6926                                      path->slots[*level]);
6927        else
6928                btrfs_node_key_to_cpu(path->nodes[*level], key,
6929                                      path->slots[*level]);
6930
6931        return ret;
6932}
6933
6934static int tree_compare_item(struct btrfs_path *left_path,
6935                             struct btrfs_path *right_path,
6936                             char *tmp_buf)
6937{
6938        int cmp;
6939        int len1, len2;
6940        unsigned long off1, off2;
6941
6942        len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
6943        len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
6944        if (len1 != len2)
6945                return 1;
6946
6947        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
6948        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
6949                                right_path->slots[0]);
6950
6951        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
6952
6953        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
6954        if (cmp)
6955                return 1;
6956        return 0;
6957}
6958
6959/*
6960 * A transaction used for relocating a block group was committed or is about to
6961 * finish its commit. Release our paths and restart the search, so that we are
6962 * not using stale extent buffers:
6963 *
6964 * 1) For levels > 0, we are only holding references of extent buffers, without
6965 *    any locks on them, which does not prevent them from having been relocated
6966 *    and reallocated after the last time we released the commit root semaphore.
6967 *    The exception are the root nodes, for which we always have a clone, see
6968 *    the comment at btrfs_compare_trees();
6969 *
6970 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
6971 *    we are safe from the concurrent relocation and reallocation. However they
6972 *    can have file extent items with a pre relocation disk_bytenr value, so we
6973 *    restart the start from the current commit roots and clone the new leaves so
6974 *    that we get the post relocation disk_bytenr values. Not doing so, could
6975 *    make us clone the wrong data in case there are new extents using the old
6976 *    disk_bytenr that happen to be shared.
6977 */
6978static int restart_after_relocation(struct btrfs_path *left_path,
6979                                    struct btrfs_path *right_path,
6980                                    const struct btrfs_key *left_key,
6981                                    const struct btrfs_key *right_key,
6982                                    int left_level,
6983                                    int right_level,
6984                                    const struct send_ctx *sctx)
6985{
6986        int root_level;
6987        int ret;
6988
6989        lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
6990
6991        btrfs_release_path(left_path);
6992        btrfs_release_path(right_path);
6993
6994        /*
6995         * Since keys can not be added or removed to/from our roots because they
6996         * are readonly and we do not allow deduplication to run in parallel
6997         * (which can add, remove or change keys), the layout of the trees should
6998         * not change.
6999         */
7000        left_path->lowest_level = left_level;
7001        ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
7002        if (ret < 0)
7003                return ret;
7004
7005        right_path->lowest_level = right_level;
7006        ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
7007        if (ret < 0)
7008                return ret;
7009
7010        /*
7011         * If the lowest level nodes are leaves, clone them so that they can be
7012         * safely used by changed_cb() while not under the protection of the
7013         * commit root semaphore, even if relocation and reallocation happens in
7014         * parallel.
7015         */
7016        if (left_level == 0) {
7017                ret = replace_node_with_clone(left_path, 0);
7018                if (ret < 0)
7019                        return ret;
7020        }
7021
7022        if (right_level == 0) {
7023                ret = replace_node_with_clone(right_path, 0);
7024                if (ret < 0)
7025                        return ret;
7026        }
7027
7028        /*
7029         * Now clone the root nodes (unless they happen to be the leaves we have
7030         * already cloned). This is to protect against concurrent snapshotting of
7031         * the send and parent roots (see the comment at btrfs_compare_trees()).
7032         */
7033        root_level = btrfs_header_level(sctx->send_root->commit_root);
7034        if (root_level > 0) {
7035                ret = replace_node_with_clone(left_path, root_level);
7036                if (ret < 0)
7037                        return ret;
7038        }
7039
7040        root_level = btrfs_header_level(sctx->parent_root->commit_root);
7041        if (root_level > 0) {
7042                ret = replace_node_with_clone(right_path, root_level);
7043                if (ret < 0)
7044                        return ret;
7045        }
7046
7047        return 0;
7048}
7049
7050/*
7051 * This function compares two trees and calls the provided callback for
7052 * every changed/new/deleted item it finds.
7053 * If shared tree blocks are encountered, whole subtrees are skipped, making
7054 * the compare pretty fast on snapshotted subvolumes.
7055 *
7056 * This currently works on commit roots only. As commit roots are read only,
7057 * we don't do any locking. The commit roots are protected with transactions.
7058 * Transactions are ended and rejoined when a commit is tried in between.
7059 *
7060 * This function checks for modifications done to the trees while comparing.
7061 * If it detects a change, it aborts immediately.
7062 */
7063static int btrfs_compare_trees(struct btrfs_root *left_root,
7064                        struct btrfs_root *right_root, struct send_ctx *sctx)
7065{
7066        struct btrfs_fs_info *fs_info = left_root->fs_info;
7067        int ret;
7068        int cmp;
7069        struct btrfs_path *left_path = NULL;
7070        struct btrfs_path *right_path = NULL;
7071        struct btrfs_key left_key;
7072        struct btrfs_key right_key;
7073        char *tmp_buf = NULL;
7074        int left_root_level;
7075        int right_root_level;
7076        int left_level;
7077        int right_level;
7078        int left_end_reached = 0;
7079        int right_end_reached = 0;
7080        int advance_left = 0;
7081        int advance_right = 0;
7082        u64 left_blockptr;
7083        u64 right_blockptr;
7084        u64 left_gen;
7085        u64 right_gen;
7086        u64 reada_min_gen;
7087
7088        left_path = btrfs_alloc_path();
7089        if (!left_path) {
7090                ret = -ENOMEM;
7091                goto out;
7092        }
7093        right_path = btrfs_alloc_path();
7094        if (!right_path) {
7095                ret = -ENOMEM;
7096                goto out;
7097        }
7098
7099        tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
7100        if (!tmp_buf) {
7101                ret = -ENOMEM;
7102                goto out;
7103        }
7104
7105        left_path->search_commit_root = 1;
7106        left_path->skip_locking = 1;
7107        right_path->search_commit_root = 1;
7108        right_path->skip_locking = 1;
7109
7110        /*
7111         * Strategy: Go to the first items of both trees. Then do
7112         *
7113         * If both trees are at level 0
7114         *   Compare keys of current items
7115         *     If left < right treat left item as new, advance left tree
7116         *       and repeat
7117         *     If left > right treat right item as deleted, advance right tree
7118         *       and repeat
7119         *     If left == right do deep compare of items, treat as changed if
7120         *       needed, advance both trees and repeat
7121         * If both trees are at the same level but not at level 0
7122         *   Compare keys of current nodes/leafs
7123         *     If left < right advance left tree and repeat
7124         *     If left > right advance right tree and repeat
7125         *     If left == right compare blockptrs of the next nodes/leafs
7126         *       If they match advance both trees but stay at the same level
7127         *         and repeat
7128         *       If they don't match advance both trees while allowing to go
7129         *         deeper and repeat
7130         * If tree levels are different
7131         *   Advance the tree that needs it and repeat
7132         *
7133         * Advancing a tree means:
7134         *   If we are at level 0, try to go to the next slot. If that's not
7135         *   possible, go one level up and repeat. Stop when we found a level
7136         *   where we could go to the next slot. We may at this point be on a
7137         *   node or a leaf.
7138         *
7139         *   If we are not at level 0 and not on shared tree blocks, go one
7140         *   level deeper.
7141         *
7142         *   If we are not at level 0 and on shared tree blocks, go one slot to
7143         *   the right if possible or go up and right.
7144         */
7145
7146        down_read(&fs_info->commit_root_sem);
7147        left_level = btrfs_header_level(left_root->commit_root);
7148        left_root_level = left_level;
7149        /*
7150         * We clone the root node of the send and parent roots to prevent races
7151         * with snapshot creation of these roots. Snapshot creation COWs the
7152         * root node of a tree, so after the transaction is committed the old
7153         * extent can be reallocated while this send operation is still ongoing.
7154         * So we clone them, under the commit root semaphore, to be race free.
7155         */
7156        left_path->nodes[left_level] =
7157                        btrfs_clone_extent_buffer(left_root->commit_root);
7158        if (!left_path->nodes[left_level]) {
7159                ret = -ENOMEM;
7160                goto out_unlock;
7161        }
7162
7163        right_level = btrfs_header_level(right_root->commit_root);
7164        right_root_level = right_level;
7165        right_path->nodes[right_level] =
7166                        btrfs_clone_extent_buffer(right_root->commit_root);
7167        if (!right_path->nodes[right_level]) {
7168                ret = -ENOMEM;
7169                goto out_unlock;
7170        }
7171        /*
7172         * Our right root is the parent root, while the left root is the "send"
7173         * root. We know that all new nodes/leaves in the left root must have
7174         * a generation greater than the right root's generation, so we trigger
7175         * readahead for those nodes and leaves of the left root, as we know we
7176         * will need to read them at some point.
7177         */
7178        reada_min_gen = btrfs_header_generation(right_root->commit_root);
7179
7180        if (left_level == 0)
7181                btrfs_item_key_to_cpu(left_path->nodes[left_level],
7182                                &left_key, left_path->slots[left_level]);
7183        else
7184                btrfs_node_key_to_cpu(left_path->nodes[left_level],
7185                                &left_key, left_path->slots[left_level]);
7186        if (right_level == 0)
7187                btrfs_item_key_to_cpu(right_path->nodes[right_level],
7188                                &right_key, right_path->slots[right_level]);
7189        else
7190                btrfs_node_key_to_cpu(right_path->nodes[right_level],
7191                                &right_key, right_path->slots[right_level]);
7192
7193        sctx->last_reloc_trans = fs_info->last_reloc_trans;
7194
7195        while (1) {
7196                if (need_resched() ||
7197                    rwsem_is_contended(&fs_info->commit_root_sem)) {
7198                        up_read(&fs_info->commit_root_sem);
7199                        cond_resched();
7200                        down_read(&fs_info->commit_root_sem);
7201                }
7202
7203                if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
7204                        ret = restart_after_relocation(left_path, right_path,
7205                                                       &left_key, &right_key,
7206                                                       left_level, right_level,
7207                                                       sctx);
7208                        if (ret < 0)
7209                                goto out_unlock;
7210                        sctx->last_reloc_trans = fs_info->last_reloc_trans;
7211                }
7212
7213                if (advance_left && !left_end_reached) {
7214                        ret = tree_advance(left_path, &left_level,
7215                                        left_root_level,
7216                                        advance_left != ADVANCE_ONLY_NEXT,
7217                                        &left_key, reada_min_gen);
7218                        if (ret == -1)
7219                                left_end_reached = ADVANCE;
7220                        else if (ret < 0)
7221                                goto out_unlock;
7222                        advance_left = 0;
7223                }
7224                if (advance_right && !right_end_reached) {
7225                        ret = tree_advance(right_path, &right_level,
7226                                        right_root_level,
7227                                        advance_right != ADVANCE_ONLY_NEXT,
7228                                        &right_key, reada_min_gen);
7229                        if (ret == -1)
7230                                right_end_reached = ADVANCE;
7231                        else if (ret < 0)
7232                                goto out_unlock;
7233                        advance_right = 0;
7234                }
7235
7236                if (left_end_reached && right_end_reached) {
7237                        ret = 0;
7238                        goto out_unlock;
7239                } else if (left_end_reached) {
7240                        if (right_level == 0) {
7241                                up_read(&fs_info->commit_root_sem);
7242                                ret = changed_cb(left_path, right_path,
7243                                                &right_key,
7244                                                BTRFS_COMPARE_TREE_DELETED,
7245                                                sctx);
7246                                if (ret < 0)
7247                                        goto out;
7248                                down_read(&fs_info->commit_root_sem);
7249                        }
7250                        advance_right = ADVANCE;
7251                        continue;
7252                } else if (right_end_reached) {
7253                        if (left_level == 0) {
7254                                up_read(&fs_info->commit_root_sem);
7255                                ret = changed_cb(left_path, right_path,
7256                                                &left_key,
7257                                                BTRFS_COMPARE_TREE_NEW,
7258                                                sctx);
7259                                if (ret < 0)
7260                                        goto out;
7261                                down_read(&fs_info->commit_root_sem);
7262                        }
7263                        advance_left = ADVANCE;
7264                        continue;
7265                }
7266
7267                if (left_level == 0 && right_level == 0) {
7268                        up_read(&fs_info->commit_root_sem);
7269                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
7270                        if (cmp < 0) {
7271                                ret = changed_cb(left_path, right_path,
7272                                                &left_key,
7273                                                BTRFS_COMPARE_TREE_NEW,
7274                                                sctx);
7275                                advance_left = ADVANCE;
7276                        } else if (cmp > 0) {
7277                                ret = changed_cb(left_path, right_path,
7278                                                &right_key,
7279                                                BTRFS_COMPARE_TREE_DELETED,
7280                                                sctx);
7281                                advance_right = ADVANCE;
7282                        } else {
7283                                enum btrfs_compare_tree_result result;
7284
7285                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
7286                                ret = tree_compare_item(left_path, right_path,
7287                                                        tmp_buf);
7288                                if (ret)
7289                                        result = BTRFS_COMPARE_TREE_CHANGED;
7290                                else
7291                                        result = BTRFS_COMPARE_TREE_SAME;
7292                                ret = changed_cb(left_path, right_path,
7293                                                 &left_key, result, sctx);
7294                                advance_left = ADVANCE;
7295                                advance_right = ADVANCE;
7296                        }
7297
7298                        if (ret < 0)
7299                                goto out;
7300                        down_read(&fs_info->commit_root_sem);
7301                } else if (left_level == right_level) {
7302                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
7303                        if (cmp < 0) {
7304                                advance_left = ADVANCE;
7305                        } else if (cmp > 0) {
7306                                advance_right = ADVANCE;
7307                        } else {
7308                                left_blockptr = btrfs_node_blockptr(
7309                                                left_path->nodes[left_level],
7310                                                left_path->slots[left_level]);
7311                                right_blockptr = btrfs_node_blockptr(
7312                                                right_path->nodes[right_level],
7313                                                right_path->slots[right_level]);
7314                                left_gen = btrfs_node_ptr_generation(
7315                                                left_path->nodes[left_level],
7316                                                left_path->slots[left_level]);
7317                                right_gen = btrfs_node_ptr_generation(
7318                                                right_path->nodes[right_level],
7319                                                right_path->slots[right_level]);
7320                                if (left_blockptr == right_blockptr &&
7321                                    left_gen == right_gen) {
7322                                        /*
7323                                         * As we're on a shared block, don't
7324                                         * allow to go deeper.
7325                                         */
7326                                        advance_left = ADVANCE_ONLY_NEXT;
7327                                        advance_right = ADVANCE_ONLY_NEXT;
7328                                } else {
7329                                        advance_left = ADVANCE;
7330                                        advance_right = ADVANCE;
7331                                }
7332                        }
7333                } else if (left_level < right_level) {
7334                        advance_right = ADVANCE;
7335                } else {
7336                        advance_left = ADVANCE;
7337                }
7338        }
7339
7340out_unlock:
7341        up_read(&fs_info->commit_root_sem);
7342out:
7343        btrfs_free_path(left_path);
7344        btrfs_free_path(right_path);
7345        kvfree(tmp_buf);
7346        return ret;
7347}
7348
7349static int send_subvol(struct send_ctx *sctx)
7350{
7351        int ret;
7352
7353        if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
7354                ret = send_header(sctx);
7355                if (ret < 0)
7356                        goto out;
7357        }
7358
7359        ret = send_subvol_begin(sctx);
7360        if (ret < 0)
7361                goto out;
7362
7363        if (sctx->parent_root) {
7364                ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
7365                if (ret < 0)
7366                        goto out;
7367                ret = finish_inode_if_needed(sctx, 1);
7368                if (ret < 0)
7369                        goto out;
7370        } else {
7371                ret = full_send_tree(sctx);
7372                if (ret < 0)
7373                        goto out;
7374        }
7375
7376out:
7377        free_recorded_refs(sctx);
7378        return ret;
7379}
7380
7381/*
7382 * If orphan cleanup did remove any orphans from a root, it means the tree
7383 * was modified and therefore the commit root is not the same as the current
7384 * root anymore. This is a problem, because send uses the commit root and
7385 * therefore can see inode items that don't exist in the current root anymore,
7386 * and for example make calls to btrfs_iget, which will do tree lookups based
7387 * on the current root and not on the commit root. Those lookups will fail,
7388 * returning a -ESTALE error, and making send fail with that error. So make
7389 * sure a send does not see any orphans we have just removed, and that it will
7390 * see the same inodes regardless of whether a transaction commit happened
7391 * before it started (meaning that the commit root will be the same as the
7392 * current root) or not.
7393 */
7394static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
7395{
7396        int i;
7397        struct btrfs_trans_handle *trans = NULL;
7398
7399again:
7400        if (sctx->parent_root &&
7401            sctx->parent_root->node != sctx->parent_root->commit_root)
7402                goto commit_trans;
7403
7404        for (i = 0; i < sctx->clone_roots_cnt; i++)
7405                if (sctx->clone_roots[i].root->node !=
7406                    sctx->clone_roots[i].root->commit_root)
7407                        goto commit_trans;
7408
7409        if (trans)
7410                return btrfs_end_transaction(trans);
7411
7412        return 0;
7413
7414commit_trans:
7415        /* Use any root, all fs roots will get their commit roots updated. */
7416        if (!trans) {
7417                trans = btrfs_join_transaction(sctx->send_root);
7418                if (IS_ERR(trans))
7419                        return PTR_ERR(trans);
7420                goto again;
7421        }
7422
7423        return btrfs_commit_transaction(trans);
7424}
7425
7426/*
7427 * Make sure any existing dellaloc is flushed for any root used by a send
7428 * operation so that we do not miss any data and we do not race with writeback
7429 * finishing and changing a tree while send is using the tree. This could
7430 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
7431 * a send operation then uses the subvolume.
7432 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
7433 */
7434static int flush_delalloc_roots(struct send_ctx *sctx)
7435{
7436        struct btrfs_root *root = sctx->parent_root;
7437        int ret;
7438        int i;
7439
7440        if (root) {
7441                ret = btrfs_start_delalloc_snapshot(root, false);
7442                if (ret)
7443                        return ret;
7444                btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
7445        }
7446
7447        for (i = 0; i < sctx->clone_roots_cnt; i++) {
7448                root = sctx->clone_roots[i].root;
7449                ret = btrfs_start_delalloc_snapshot(root, false);
7450                if (ret)
7451                        return ret;
7452                btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
7453        }
7454
7455        return 0;
7456}
7457
7458static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
7459{
7460        spin_lock(&root->root_item_lock);
7461        root->send_in_progress--;
7462        /*
7463         * Not much left to do, we don't know why it's unbalanced and
7464         * can't blindly reset it to 0.
7465         */
7466        if (root->send_in_progress < 0)
7467                btrfs_err(root->fs_info,
7468                          "send_in_progress unbalanced %d root %llu",
7469                          root->send_in_progress, root->root_key.objectid);
7470        spin_unlock(&root->root_item_lock);
7471}
7472
7473static void dedupe_in_progress_warn(const struct btrfs_root *root)
7474{
7475        btrfs_warn_rl(root->fs_info,
7476"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
7477                      root->root_key.objectid, root->dedupe_in_progress);
7478}
7479
7480long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
7481{
7482        int ret = 0;
7483        struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
7484        struct btrfs_fs_info *fs_info = send_root->fs_info;
7485        struct btrfs_root *clone_root;
7486        struct send_ctx *sctx = NULL;
7487        u32 i;
7488        u64 *clone_sources_tmp = NULL;
7489        int clone_sources_to_rollback = 0;
7490        size_t alloc_size;
7491        int sort_clone_roots = 0;
7492
7493        if (!capable(CAP_SYS_ADMIN))
7494                return -EPERM;
7495
7496        /*
7497         * The subvolume must remain read-only during send, protect against
7498         * making it RW. This also protects against deletion.
7499         */
7500        spin_lock(&send_root->root_item_lock);
7501        if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
7502                dedupe_in_progress_warn(send_root);
7503                spin_unlock(&send_root->root_item_lock);
7504                return -EAGAIN;
7505        }
7506        send_root->send_in_progress++;
7507        spin_unlock(&send_root->root_item_lock);
7508
7509        /*
7510         * Userspace tools do the checks and warn the user if it's
7511         * not RO.
7512         */
7513        if (!btrfs_root_readonly(send_root)) {
7514                ret = -EPERM;
7515                goto out;
7516        }
7517
7518        /*
7519         * Check that we don't overflow at later allocations, we request
7520         * clone_sources_count + 1 items, and compare to unsigned long inside
7521         * access_ok.
7522         */
7523        if (arg->clone_sources_count >
7524            ULONG_MAX / sizeof(struct clone_root) - 1) {
7525                ret = -EINVAL;
7526                goto out;
7527        }
7528
7529        if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
7530                ret = -EINVAL;
7531                goto out;
7532        }
7533
7534        sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
7535        if (!sctx) {
7536                ret = -ENOMEM;
7537                goto out;
7538        }
7539
7540        INIT_LIST_HEAD(&sctx->new_refs);
7541        INIT_LIST_HEAD(&sctx->deleted_refs);
7542        INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
7543        INIT_LIST_HEAD(&sctx->name_cache_list);
7544
7545        sctx->flags = arg->flags;
7546
7547        if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
7548                if (arg->version > BTRFS_SEND_STREAM_VERSION) {
7549                        ret = -EPROTO;
7550                        goto out;
7551                }
7552                /* Zero means "use the highest version" */
7553                sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
7554        } else {
7555                sctx->proto = 1;
7556        }
7557
7558        sctx->send_filp = fget(arg->send_fd);
7559        if (!sctx->send_filp) {
7560                ret = -EBADF;
7561                goto out;
7562        }
7563
7564        sctx->send_root = send_root;
7565        /*
7566         * Unlikely but possible, if the subvolume is marked for deletion but
7567         * is slow to remove the directory entry, send can still be started
7568         */
7569        if (btrfs_root_dead(sctx->send_root)) {
7570                ret = -EPERM;
7571                goto out;
7572        }
7573
7574        sctx->clone_roots_cnt = arg->clone_sources_count;
7575
7576        sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
7577        sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
7578        if (!sctx->send_buf) {
7579                ret = -ENOMEM;
7580                goto out;
7581        }
7582
7583        sctx->pending_dir_moves = RB_ROOT;
7584        sctx->waiting_dir_moves = RB_ROOT;
7585        sctx->orphan_dirs = RB_ROOT;
7586
7587        sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
7588                                     arg->clone_sources_count + 1,
7589                                     GFP_KERNEL);
7590        if (!sctx->clone_roots) {
7591                ret = -ENOMEM;
7592                goto out;
7593        }
7594
7595        alloc_size = array_size(sizeof(*arg->clone_sources),
7596                                arg->clone_sources_count);
7597
7598        if (arg->clone_sources_count) {
7599                clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
7600                if (!clone_sources_tmp) {
7601                        ret = -ENOMEM;
7602                        goto out;
7603                }
7604
7605                ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
7606                                alloc_size);
7607                if (ret) {
7608                        ret = -EFAULT;
7609                        goto out;
7610                }
7611
7612                for (i = 0; i < arg->clone_sources_count; i++) {
7613                        clone_root = btrfs_get_fs_root(fs_info,
7614                                                clone_sources_tmp[i], true);
7615                        if (IS_ERR(clone_root)) {
7616                                ret = PTR_ERR(clone_root);
7617                                goto out;
7618                        }
7619                        spin_lock(&clone_root->root_item_lock);
7620                        if (!btrfs_root_readonly(clone_root) ||
7621                            btrfs_root_dead(clone_root)) {
7622                                spin_unlock(&clone_root->root_item_lock);
7623                                btrfs_put_root(clone_root);
7624                                ret = -EPERM;
7625                                goto out;
7626                        }
7627                        if (clone_root->dedupe_in_progress) {
7628                                dedupe_in_progress_warn(clone_root);
7629                                spin_unlock(&clone_root->root_item_lock);
7630                                btrfs_put_root(clone_root);
7631                                ret = -EAGAIN;
7632                                goto out;
7633                        }
7634                        clone_root->send_in_progress++;
7635                        spin_unlock(&clone_root->root_item_lock);
7636
7637                        sctx->clone_roots[i].root = clone_root;
7638                        clone_sources_to_rollback = i + 1;
7639                }
7640                kvfree(clone_sources_tmp);
7641                clone_sources_tmp = NULL;
7642        }
7643
7644        if (arg->parent_root) {
7645                sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
7646                                                      true);
7647                if (IS_ERR(sctx->parent_root)) {
7648                        ret = PTR_ERR(sctx->parent_root);
7649                        goto out;
7650                }
7651
7652                spin_lock(&sctx->parent_root->root_item_lock);
7653                sctx->parent_root->send_in_progress++;
7654                if (!btrfs_root_readonly(sctx->parent_root) ||
7655                                btrfs_root_dead(sctx->parent_root)) {
7656                        spin_unlock(&sctx->parent_root->root_item_lock);
7657                        ret = -EPERM;
7658                        goto out;
7659                }
7660                if (sctx->parent_root->dedupe_in_progress) {
7661                        dedupe_in_progress_warn(sctx->parent_root);
7662                        spin_unlock(&sctx->parent_root->root_item_lock);
7663                        ret = -EAGAIN;
7664                        goto out;
7665                }
7666                spin_unlock(&sctx->parent_root->root_item_lock);
7667        }
7668
7669        /*
7670         * Clones from send_root are allowed, but only if the clone source
7671         * is behind the current send position. This is checked while searching
7672         * for possible clone sources.
7673         */
7674        sctx->clone_roots[sctx->clone_roots_cnt++].root =
7675                btrfs_grab_root(sctx->send_root);
7676
7677        /* We do a bsearch later */
7678        sort(sctx->clone_roots, sctx->clone_roots_cnt,
7679                        sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
7680                        NULL);
7681        sort_clone_roots = 1;
7682
7683        ret = flush_delalloc_roots(sctx);
7684        if (ret)
7685                goto out;
7686
7687        ret = ensure_commit_roots_uptodate(sctx);
7688        if (ret)
7689                goto out;
7690
7691        ret = send_subvol(sctx);
7692        if (ret < 0)
7693                goto out;
7694
7695        if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
7696                ret = begin_cmd(sctx, BTRFS_SEND_C_END);
7697                if (ret < 0)
7698                        goto out;
7699                ret = send_cmd(sctx);
7700                if (ret < 0)
7701                        goto out;
7702        }
7703
7704out:
7705        WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
7706        while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
7707                struct rb_node *n;
7708                struct pending_dir_move *pm;
7709
7710                n = rb_first(&sctx->pending_dir_moves);
7711                pm = rb_entry(n, struct pending_dir_move, node);
7712                while (!list_empty(&pm->list)) {
7713                        struct pending_dir_move *pm2;
7714
7715                        pm2 = list_first_entry(&pm->list,
7716                                               struct pending_dir_move, list);
7717                        free_pending_move(sctx, pm2);
7718                }
7719                free_pending_move(sctx, pm);
7720        }
7721
7722        WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
7723        while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
7724                struct rb_node *n;
7725                struct waiting_dir_move *dm;
7726
7727                n = rb_first(&sctx->waiting_dir_moves);
7728                dm = rb_entry(n, struct waiting_dir_move, node);
7729                rb_erase(&dm->node, &sctx->waiting_dir_moves);
7730                kfree(dm);
7731        }
7732
7733        WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
7734        while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
7735                struct rb_node *n;
7736                struct orphan_dir_info *odi;
7737
7738                n = rb_first(&sctx->orphan_dirs);
7739                odi = rb_entry(n, struct orphan_dir_info, node);
7740                free_orphan_dir_info(sctx, odi);
7741        }
7742
7743        if (sort_clone_roots) {
7744                for (i = 0; i < sctx->clone_roots_cnt; i++) {
7745                        btrfs_root_dec_send_in_progress(
7746                                        sctx->clone_roots[i].root);
7747                        btrfs_put_root(sctx->clone_roots[i].root);
7748                }
7749        } else {
7750                for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
7751                        btrfs_root_dec_send_in_progress(
7752                                        sctx->clone_roots[i].root);
7753                        btrfs_put_root(sctx->clone_roots[i].root);
7754                }
7755
7756                btrfs_root_dec_send_in_progress(send_root);
7757        }
7758        if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
7759                btrfs_root_dec_send_in_progress(sctx->parent_root);
7760                btrfs_put_root(sctx->parent_root);
7761        }
7762
7763        kvfree(clone_sources_tmp);
7764
7765        if (sctx) {
7766                if (sctx->send_filp)
7767                        fput(sctx->send_filp);
7768
7769                kvfree(sctx->clone_roots);
7770                kvfree(sctx->send_buf);
7771
7772                name_cache_free(sctx);
7773
7774                kfree(sctx);
7775        }
7776
7777        return ret;
7778}
7779