linux/fs/btrfs/reflink.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/blkdev.h>
   4#include <linux/iversion.h>
   5#include "compression.h"
   6#include "ctree.h"
   7#include "delalloc-space.h"
   8#include "reflink.h"
   9#include "transaction.h"
  10#include "subpage.h"
  11
  12#define BTRFS_MAX_DEDUPE_LEN    SZ_16M
  13
  14static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
  15                                     struct inode *inode,
  16                                     u64 endoff,
  17                                     const u64 destoff,
  18                                     const u64 olen,
  19                                     int no_time_update)
  20{
  21        struct btrfs_root *root = BTRFS_I(inode)->root;
  22        int ret;
  23
  24        inode_inc_iversion(inode);
  25        if (!no_time_update)
  26                inode->i_mtime = inode->i_ctime = current_time(inode);
  27        /*
  28         * We round up to the block size at eof when determining which
  29         * extents to clone above, but shouldn't round up the file size.
  30         */
  31        if (endoff > destoff + olen)
  32                endoff = destoff + olen;
  33        if (endoff > inode->i_size) {
  34                i_size_write(inode, endoff);
  35                btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
  36        }
  37
  38        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
  39        if (ret) {
  40                btrfs_abort_transaction(trans, ret);
  41                btrfs_end_transaction(trans);
  42                goto out;
  43        }
  44        ret = btrfs_end_transaction(trans);
  45out:
  46        return ret;
  47}
  48
  49static int copy_inline_to_page(struct btrfs_inode *inode,
  50                               const u64 file_offset,
  51                               char *inline_data,
  52                               const u64 size,
  53                               const u64 datal,
  54                               const u8 comp_type)
  55{
  56        struct btrfs_fs_info *fs_info = inode->root->fs_info;
  57        const u32 block_size = fs_info->sectorsize;
  58        const u64 range_end = file_offset + block_size - 1;
  59        const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
  60        char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
  61        struct extent_changeset *data_reserved = NULL;
  62        struct page *page = NULL;
  63        struct address_space *mapping = inode->vfs_inode.i_mapping;
  64        int ret;
  65
  66        ASSERT(IS_ALIGNED(file_offset, block_size));
  67
  68        /*
  69         * We have flushed and locked the ranges of the source and destination
  70         * inodes, we also have locked the inodes, so we are safe to do a
  71         * reservation here. Also we must not do the reservation while holding
  72         * a transaction open, otherwise we would deadlock.
  73         */
  74        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
  75                                           block_size);
  76        if (ret)
  77                goto out;
  78
  79        page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
  80                                   btrfs_alloc_write_mask(mapping));
  81        if (!page) {
  82                ret = -ENOMEM;
  83                goto out_unlock;
  84        }
  85
  86        ret = set_page_extent_mapped(page);
  87        if (ret < 0)
  88                goto out_unlock;
  89
  90        clear_extent_bit(&inode->io_tree, file_offset, range_end,
  91                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
  92                         0, 0, NULL);
  93        ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
  94        if (ret)
  95                goto out_unlock;
  96
  97        /*
  98         * After dirtying the page our caller will need to start a transaction,
  99         * and if we are low on metadata free space, that can cause flushing of
 100         * delalloc for all inodes in order to get metadata space released.
 101         * However we are holding the range locked for the whole duration of
 102         * the clone/dedupe operation, so we may deadlock if that happens and no
 103         * other task releases enough space. So mark this inode as not being
 104         * possible to flush to avoid such deadlock. We will clear that flag
 105         * when we finish cloning all extents, since a transaction is started
 106         * after finding each extent to clone.
 107         */
 108        set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
 109
 110        if (comp_type == BTRFS_COMPRESS_NONE) {
 111                memcpy_to_page(page, offset_in_page(file_offset), data_start,
 112                               datal);
 113                flush_dcache_page(page);
 114        } else {
 115                ret = btrfs_decompress(comp_type, data_start, page,
 116                                       offset_in_page(file_offset),
 117                                       inline_size, datal);
 118                if (ret)
 119                        goto out_unlock;
 120                flush_dcache_page(page);
 121        }
 122
 123        /*
 124         * If our inline data is smaller then the block/page size, then the
 125         * remaining of the block/page is equivalent to zeroes. We had something
 126         * like the following done:
 127         *
 128         * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
 129         * $ sync  # (or fsync)
 130         * $ xfs_io -c "falloc 0 4K" file
 131         * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
 132         *
 133         * So what's in the range [500, 4095] corresponds to zeroes.
 134         */
 135        if (datal < block_size) {
 136                memzero_page(page, datal, block_size - datal);
 137                flush_dcache_page(page);
 138        }
 139
 140        btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
 141        ClearPageChecked(page);
 142        btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
 143out_unlock:
 144        if (page) {
 145                unlock_page(page);
 146                put_page(page);
 147        }
 148        if (ret)
 149                btrfs_delalloc_release_space(inode, data_reserved, file_offset,
 150                                             block_size, true);
 151        btrfs_delalloc_release_extents(inode, block_size);
 152out:
 153        extent_changeset_free(data_reserved);
 154
 155        return ret;
 156}
 157
 158/*
 159 * Deal with cloning of inline extents. We try to copy the inline extent from
 160 * the source inode to destination inode when possible. When not possible we
 161 * copy the inline extent's data into the respective page of the inode.
 162 */
 163static int clone_copy_inline_extent(struct inode *dst,
 164                                    struct btrfs_path *path,
 165                                    struct btrfs_key *new_key,
 166                                    const u64 drop_start,
 167                                    const u64 datal,
 168                                    const u64 size,
 169                                    const u8 comp_type,
 170                                    char *inline_data,
 171                                    struct btrfs_trans_handle **trans_out)
 172{
 173        struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
 174        struct btrfs_root *root = BTRFS_I(dst)->root;
 175        const u64 aligned_end = ALIGN(new_key->offset + datal,
 176                                      fs_info->sectorsize);
 177        struct btrfs_trans_handle *trans = NULL;
 178        struct btrfs_drop_extents_args drop_args = { 0 };
 179        int ret;
 180        struct btrfs_key key;
 181
 182        if (new_key->offset > 0) {
 183                ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
 184                                          inline_data, size, datal, comp_type);
 185                goto out;
 186        }
 187
 188        key.objectid = btrfs_ino(BTRFS_I(dst));
 189        key.type = BTRFS_EXTENT_DATA_KEY;
 190        key.offset = 0;
 191        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 192        if (ret < 0) {
 193                return ret;
 194        } else if (ret > 0) {
 195                if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 196                        ret = btrfs_next_leaf(root, path);
 197                        if (ret < 0)
 198                                return ret;
 199                        else if (ret > 0)
 200                                goto copy_inline_extent;
 201                }
 202                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 203                if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
 204                    key.type == BTRFS_EXTENT_DATA_KEY) {
 205                        /*
 206                         * There's an implicit hole at file offset 0, copy the
 207                         * inline extent's data to the page.
 208                         */
 209                        ASSERT(key.offset > 0);
 210                        goto copy_to_page;
 211                }
 212        } else if (i_size_read(dst) <= datal) {
 213                struct btrfs_file_extent_item *ei;
 214
 215                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
 216                                    struct btrfs_file_extent_item);
 217                /*
 218                 * If it's an inline extent replace it with the source inline
 219                 * extent, otherwise copy the source inline extent data into
 220                 * the respective page at the destination inode.
 221                 */
 222                if (btrfs_file_extent_type(path->nodes[0], ei) ==
 223                    BTRFS_FILE_EXTENT_INLINE)
 224                        goto copy_inline_extent;
 225
 226                goto copy_to_page;
 227        }
 228
 229copy_inline_extent:
 230        /*
 231         * We have no extent items, or we have an extent at offset 0 which may
 232         * or may not be inlined. All these cases are dealt the same way.
 233         */
 234        if (i_size_read(dst) > datal) {
 235                /*
 236                 * At the destination offset 0 we have either a hole, a regular
 237                 * extent or an inline extent larger then the one we want to
 238                 * clone. Deal with all these cases by copying the inline extent
 239                 * data into the respective page at the destination inode.
 240                 */
 241                goto copy_to_page;
 242        }
 243
 244        /*
 245         * Release path before starting a new transaction so we don't hold locks
 246         * that would confuse lockdep.
 247         */
 248        btrfs_release_path(path);
 249        /*
 250         * If we end up here it means were copy the inline extent into a leaf
 251         * of the destination inode. We know we will drop or adjust at most one
 252         * extent item in the destination root.
 253         *
 254         * 1 unit - adjusting old extent (we may have to split it)
 255         * 1 unit - add new extent
 256         * 1 unit - inode update
 257         */
 258        trans = btrfs_start_transaction(root, 3);
 259        if (IS_ERR(trans)) {
 260                ret = PTR_ERR(trans);
 261                trans = NULL;
 262                goto out;
 263        }
 264        drop_args.path = path;
 265        drop_args.start = drop_start;
 266        drop_args.end = aligned_end;
 267        drop_args.drop_cache = true;
 268        ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
 269        if (ret)
 270                goto out;
 271        ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
 272        if (ret)
 273                goto out;
 274
 275        write_extent_buffer(path->nodes[0], inline_data,
 276                            btrfs_item_ptr_offset(path->nodes[0],
 277                                                  path->slots[0]),
 278                            size);
 279        btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
 280        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
 281        ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
 282out:
 283        if (!ret && !trans) {
 284                /*
 285                 * No transaction here means we copied the inline extent into a
 286                 * page of the destination inode.
 287                 *
 288                 * 1 unit to update inode item
 289                 */
 290                trans = btrfs_start_transaction(root, 1);
 291                if (IS_ERR(trans)) {
 292                        ret = PTR_ERR(trans);
 293                        trans = NULL;
 294                }
 295        }
 296        if (ret && trans) {
 297                btrfs_abort_transaction(trans, ret);
 298                btrfs_end_transaction(trans);
 299        }
 300        if (!ret)
 301                *trans_out = trans;
 302
 303        return ret;
 304
 305copy_to_page:
 306        /*
 307         * Release our path because we don't need it anymore and also because
 308         * copy_inline_to_page() needs to reserve data and metadata, which may
 309         * need to flush delalloc when we are low on available space and
 310         * therefore cause a deadlock if writeback of an inline extent needs to
 311         * write to the same leaf or an ordered extent completion needs to write
 312         * to the same leaf.
 313         */
 314        btrfs_release_path(path);
 315
 316        ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
 317                                  inline_data, size, datal, comp_type);
 318        goto out;
 319}
 320
 321/**
 322 * btrfs_clone() - clone a range from inode file to another
 323 *
 324 * @src: Inode to clone from
 325 * @inode: Inode to clone to
 326 * @off: Offset within source to start clone from
 327 * @olen: Original length, passed by user, of range to clone
 328 * @olen_aligned: Block-aligned value of olen
 329 * @destoff: Offset within @inode to start clone
 330 * @no_time_update: Whether to update mtime/ctime on the target inode
 331 */
 332static int btrfs_clone(struct inode *src, struct inode *inode,
 333                       const u64 off, const u64 olen, const u64 olen_aligned,
 334                       const u64 destoff, int no_time_update)
 335{
 336        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 337        struct btrfs_path *path = NULL;
 338        struct extent_buffer *leaf;
 339        struct btrfs_trans_handle *trans;
 340        char *buf = NULL;
 341        struct btrfs_key key;
 342        u32 nritems;
 343        int slot;
 344        int ret;
 345        const u64 len = olen_aligned;
 346        u64 last_dest_end = destoff;
 347
 348        ret = -ENOMEM;
 349        buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
 350        if (!buf)
 351                return ret;
 352
 353        path = btrfs_alloc_path();
 354        if (!path) {
 355                kvfree(buf);
 356                return ret;
 357        }
 358
 359        path->reada = READA_FORWARD;
 360        /* Clone data */
 361        key.objectid = btrfs_ino(BTRFS_I(src));
 362        key.type = BTRFS_EXTENT_DATA_KEY;
 363        key.offset = off;
 364
 365        while (1) {
 366                u64 next_key_min_offset = key.offset + 1;
 367                struct btrfs_file_extent_item *extent;
 368                u64 extent_gen;
 369                int type;
 370                u32 size;
 371                struct btrfs_key new_key;
 372                u64 disko = 0, diskl = 0;
 373                u64 datao = 0, datal = 0;
 374                u8 comp;
 375                u64 drop_start;
 376
 377                /* Note the key will change type as we walk through the tree */
 378                ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
 379                                0, 0);
 380                if (ret < 0)
 381                        goto out;
 382                /*
 383                 * First search, if no extent item that starts at offset off was
 384                 * found but the previous item is an extent item, it's possible
 385                 * it might overlap our target range, therefore process it.
 386                 */
 387                if (key.offset == off && ret > 0 && path->slots[0] > 0) {
 388                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 389                                              path->slots[0] - 1);
 390                        if (key.type == BTRFS_EXTENT_DATA_KEY)
 391                                path->slots[0]--;
 392                }
 393
 394                nritems = btrfs_header_nritems(path->nodes[0]);
 395process_slot:
 396                if (path->slots[0] >= nritems) {
 397                        ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
 398                        if (ret < 0)
 399                                goto out;
 400                        if (ret > 0)
 401                                break;
 402                        nritems = btrfs_header_nritems(path->nodes[0]);
 403                }
 404                leaf = path->nodes[0];
 405                slot = path->slots[0];
 406
 407                btrfs_item_key_to_cpu(leaf, &key, slot);
 408                if (key.type > BTRFS_EXTENT_DATA_KEY ||
 409                    key.objectid != btrfs_ino(BTRFS_I(src)))
 410                        break;
 411
 412                ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
 413
 414                extent = btrfs_item_ptr(leaf, slot,
 415                                        struct btrfs_file_extent_item);
 416                extent_gen = btrfs_file_extent_generation(leaf, extent);
 417                comp = btrfs_file_extent_compression(leaf, extent);
 418                type = btrfs_file_extent_type(leaf, extent);
 419                if (type == BTRFS_FILE_EXTENT_REG ||
 420                    type == BTRFS_FILE_EXTENT_PREALLOC) {
 421                        disko = btrfs_file_extent_disk_bytenr(leaf, extent);
 422                        diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
 423                        datao = btrfs_file_extent_offset(leaf, extent);
 424                        datal = btrfs_file_extent_num_bytes(leaf, extent);
 425                } else if (type == BTRFS_FILE_EXTENT_INLINE) {
 426                        /* Take upper bound, may be compressed */
 427                        datal = btrfs_file_extent_ram_bytes(leaf, extent);
 428                }
 429
 430                /*
 431                 * The first search might have left us at an extent item that
 432                 * ends before our target range's start, can happen if we have
 433                 * holes and NO_HOLES feature enabled.
 434                 */
 435                if (key.offset + datal <= off) {
 436                        path->slots[0]++;
 437                        goto process_slot;
 438                } else if (key.offset >= off + len) {
 439                        break;
 440                }
 441                next_key_min_offset = key.offset + datal;
 442                size = btrfs_item_size_nr(leaf, slot);
 443                read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
 444                                   size);
 445
 446                btrfs_release_path(path);
 447
 448                memcpy(&new_key, &key, sizeof(new_key));
 449                new_key.objectid = btrfs_ino(BTRFS_I(inode));
 450                if (off <= key.offset)
 451                        new_key.offset = key.offset + destoff - off;
 452                else
 453                        new_key.offset = destoff;
 454
 455                /*
 456                 * Deal with a hole that doesn't have an extent item that
 457                 * represents it (NO_HOLES feature enabled).
 458                 * This hole is either in the middle of the cloning range or at
 459                 * the beginning (fully overlaps it or partially overlaps it).
 460                 */
 461                if (new_key.offset != last_dest_end)
 462                        drop_start = last_dest_end;
 463                else
 464                        drop_start = new_key.offset;
 465
 466                if (type == BTRFS_FILE_EXTENT_REG ||
 467                    type == BTRFS_FILE_EXTENT_PREALLOC) {
 468                        struct btrfs_replace_extent_info clone_info;
 469
 470                        /*
 471                         *    a  | --- range to clone ---|  b
 472                         * | ------------- extent ------------- |
 473                         */
 474
 475                        /* Subtract range b */
 476                        if (key.offset + datal > off + len)
 477                                datal = off + len - key.offset;
 478
 479                        /* Subtract range a */
 480                        if (off > key.offset) {
 481                                datao += off - key.offset;
 482                                datal -= off - key.offset;
 483                        }
 484
 485                        clone_info.disk_offset = disko;
 486                        clone_info.disk_len = diskl;
 487                        clone_info.data_offset = datao;
 488                        clone_info.data_len = datal;
 489                        clone_info.file_offset = new_key.offset;
 490                        clone_info.extent_buf = buf;
 491                        clone_info.is_new_extent = false;
 492                        ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
 493                                        drop_start, new_key.offset + datal - 1,
 494                                        &clone_info, &trans);
 495                        if (ret)
 496                                goto out;
 497                } else if (type == BTRFS_FILE_EXTENT_INLINE) {
 498                        /*
 499                         * Inline extents always have to start at file offset 0
 500                         * and can never be bigger then the sector size. We can
 501                         * never clone only parts of an inline extent, since all
 502                         * reflink operations must start at a sector size aligned
 503                         * offset, and the length must be aligned too or end at
 504                         * the i_size (which implies the whole inlined data).
 505                         */
 506                        ASSERT(key.offset == 0);
 507                        ASSERT(datal <= fs_info->sectorsize);
 508                        if (key.offset != 0 || datal > fs_info->sectorsize)
 509                                return -EUCLEAN;
 510
 511                        ret = clone_copy_inline_extent(inode, path, &new_key,
 512                                                       drop_start, datal, size,
 513                                                       comp, buf, &trans);
 514                        if (ret)
 515                                goto out;
 516                }
 517
 518                btrfs_release_path(path);
 519
 520                /*
 521                 * If this is a new extent update the last_reflink_trans of both
 522                 * inodes. This is used by fsync to make sure it does not log
 523                 * multiple checksum items with overlapping ranges. For older
 524                 * extents we don't need to do it since inode logging skips the
 525                 * checksums for older extents. Also ignore holes and inline
 526                 * extents because they don't have checksums in the csum tree.
 527                 */
 528                if (extent_gen == trans->transid && disko > 0) {
 529                        BTRFS_I(src)->last_reflink_trans = trans->transid;
 530                        BTRFS_I(inode)->last_reflink_trans = trans->transid;
 531                }
 532
 533                last_dest_end = ALIGN(new_key.offset + datal,
 534                                      fs_info->sectorsize);
 535                ret = clone_finish_inode_update(trans, inode, last_dest_end,
 536                                                destoff, olen, no_time_update);
 537                if (ret)
 538                        goto out;
 539                if (new_key.offset + datal >= destoff + len)
 540                        break;
 541
 542                btrfs_release_path(path);
 543                key.offset = next_key_min_offset;
 544
 545                if (fatal_signal_pending(current)) {
 546                        ret = -EINTR;
 547                        goto out;
 548                }
 549
 550                cond_resched();
 551        }
 552        ret = 0;
 553
 554        if (last_dest_end < destoff + len) {
 555                /*
 556                 * We have an implicit hole that fully or partially overlaps our
 557                 * cloning range at its end. This means that we either have the
 558                 * NO_HOLES feature enabled or the implicit hole happened due to
 559                 * mixing buffered and direct IO writes against this file.
 560                 */
 561                btrfs_release_path(path);
 562
 563                /*
 564                 * When using NO_HOLES and we are cloning a range that covers
 565                 * only a hole (no extents) into a range beyond the current
 566                 * i_size, punching a hole in the target range will not create
 567                 * an extent map defining a hole, because the range starts at or
 568                 * beyond current i_size. If the file previously had an i_size
 569                 * greater than the new i_size set by this clone operation, we
 570                 * need to make sure the next fsync is a full fsync, so that it
 571                 * detects and logs a hole covering a range from the current
 572                 * i_size to the new i_size. If the clone range covers extents,
 573                 * besides a hole, then we know the full sync flag was already
 574                 * set by previous calls to btrfs_replace_file_extents() that
 575                 * replaced file extent items.
 576                 */
 577                if (last_dest_end >= i_size_read(inode))
 578                        set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 579                                &BTRFS_I(inode)->runtime_flags);
 580
 581                ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
 582                                last_dest_end, destoff + len - 1, NULL, &trans);
 583                if (ret)
 584                        goto out;
 585
 586                ret = clone_finish_inode_update(trans, inode, destoff + len,
 587                                                destoff, olen, no_time_update);
 588        }
 589
 590out:
 591        btrfs_free_path(path);
 592        kvfree(buf);
 593        clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
 594
 595        return ret;
 596}
 597
 598static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
 599                                       struct inode *inode2, u64 loff2, u64 len)
 600{
 601        unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
 602        unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 603}
 604
 605static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
 606                                     struct inode *inode2, u64 loff2, u64 len)
 607{
 608        if (inode1 < inode2) {
 609                swap(inode1, inode2);
 610                swap(loff1, loff2);
 611        } else if (inode1 == inode2 && loff2 < loff1) {
 612                swap(loff1, loff2);
 613        }
 614        lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
 615        lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
 616}
 617
 618static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
 619{
 620        if (inode1 < inode2)
 621                swap(inode1, inode2);
 622        down_write(&BTRFS_I(inode1)->i_mmap_lock);
 623        down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
 624}
 625
 626static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
 627{
 628        up_write(&BTRFS_I(inode1)->i_mmap_lock);
 629        up_write(&BTRFS_I(inode2)->i_mmap_lock);
 630}
 631
 632static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
 633                                   struct inode *dst, u64 dst_loff)
 634{
 635        const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
 636        int ret;
 637
 638        /*
 639         * Lock destination range to serialize with concurrent readpages() and
 640         * source range to serialize with relocation.
 641         */
 642        btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
 643        ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
 644        btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
 645
 646        return ret;
 647}
 648
 649static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 650                             struct inode *dst, u64 dst_loff)
 651{
 652        int ret;
 653        u64 i, tail_len, chunk_count;
 654        struct btrfs_root *root_dst = BTRFS_I(dst)->root;
 655
 656        spin_lock(&root_dst->root_item_lock);
 657        if (root_dst->send_in_progress) {
 658                btrfs_warn_rl(root_dst->fs_info,
 659"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
 660                              root_dst->root_key.objectid,
 661                              root_dst->send_in_progress);
 662                spin_unlock(&root_dst->root_item_lock);
 663                return -EAGAIN;
 664        }
 665        root_dst->dedupe_in_progress++;
 666        spin_unlock(&root_dst->root_item_lock);
 667
 668        tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
 669        chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
 670
 671        for (i = 0; i < chunk_count; i++) {
 672                ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
 673                                              dst, dst_loff);
 674                if (ret)
 675                        goto out;
 676
 677                loff += BTRFS_MAX_DEDUPE_LEN;
 678                dst_loff += BTRFS_MAX_DEDUPE_LEN;
 679        }
 680
 681        if (tail_len > 0)
 682                ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
 683out:
 684        spin_lock(&root_dst->root_item_lock);
 685        root_dst->dedupe_in_progress--;
 686        spin_unlock(&root_dst->root_item_lock);
 687
 688        return ret;
 689}
 690
 691static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 692                                        u64 off, u64 olen, u64 destoff)
 693{
 694        struct inode *inode = file_inode(file);
 695        struct inode *src = file_inode(file_src);
 696        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 697        int ret;
 698        int wb_ret;
 699        u64 len = olen;
 700        u64 bs = fs_info->sb->s_blocksize;
 701
 702        /*
 703         * VFS's generic_remap_file_range_prep() protects us from cloning the
 704         * eof block into the middle of a file, which would result in corruption
 705         * if the file size is not blocksize aligned. So we don't need to check
 706         * for that case here.
 707         */
 708        if (off + len == src->i_size)
 709                len = ALIGN(src->i_size, bs) - off;
 710
 711        if (destoff > inode->i_size) {
 712                const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
 713
 714                ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff);
 715                if (ret)
 716                        return ret;
 717                /*
 718                 * We may have truncated the last block if the inode's size is
 719                 * not sector size aligned, so we need to wait for writeback to
 720                 * complete before proceeding further, otherwise we can race
 721                 * with cloning and attempt to increment a reference to an
 722                 * extent that no longer exists (writeback completed right after
 723                 * we found the previous extent covering eof and before we
 724                 * attempted to increment its reference count).
 725                 */
 726                ret = btrfs_wait_ordered_range(inode, wb_start,
 727                                               destoff - wb_start);
 728                if (ret)
 729                        return ret;
 730        }
 731
 732        /*
 733         * Lock destination range to serialize with concurrent readpages() and
 734         * source range to serialize with relocation.
 735         */
 736        btrfs_double_extent_lock(src, off, inode, destoff, len);
 737        ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 738        btrfs_double_extent_unlock(src, off, inode, destoff, len);
 739
 740        /*
 741         * We may have copied an inline extent into a page of the destination
 742         * range, so wait for writeback to complete before truncating pages
 743         * from the page cache. This is a rare case.
 744         */
 745        wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
 746        ret = ret ? ret : wb_ret;
 747        /*
 748         * Truncate page cache pages so that future reads will see the cloned
 749         * data immediately and not the previous data.
 750         */
 751        truncate_inode_pages_range(&inode->i_data,
 752                                round_down(destoff, PAGE_SIZE),
 753                                round_up(destoff + len, PAGE_SIZE) - 1);
 754
 755        return ret;
 756}
 757
 758static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 759                                       struct file *file_out, loff_t pos_out,
 760                                       loff_t *len, unsigned int remap_flags)
 761{
 762        struct inode *inode_in = file_inode(file_in);
 763        struct inode *inode_out = file_inode(file_out);
 764        u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
 765        bool same_inode = inode_out == inode_in;
 766        u64 wb_len;
 767        int ret;
 768
 769        if (!(remap_flags & REMAP_FILE_DEDUP)) {
 770                struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
 771
 772                if (btrfs_root_readonly(root_out))
 773                        return -EROFS;
 774
 775                if (file_in->f_path.mnt != file_out->f_path.mnt ||
 776                    inode_in->i_sb != inode_out->i_sb)
 777                        return -EXDEV;
 778        }
 779
 780        /* Don't make the dst file partly checksummed */
 781        if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
 782            (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
 783                return -EINVAL;
 784        }
 785
 786        /*
 787         * Now that the inodes are locked, we need to start writeback ourselves
 788         * and can not rely on the writeback from the VFS's generic helper
 789         * generic_remap_file_range_prep() because:
 790         *
 791         * 1) For compression we must call filemap_fdatawrite_range() range
 792         *    twice (btrfs_fdatawrite_range() does it for us), and the generic
 793         *    helper only calls it once;
 794         *
 795         * 2) filemap_fdatawrite_range(), called by the generic helper only
 796         *    waits for the writeback to complete, i.e. for IO to be done, and
 797         *    not for the ordered extents to complete. We need to wait for them
 798         *    to complete so that new file extent items are in the fs tree.
 799         */
 800        if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
 801                wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
 802        else
 803                wb_len = ALIGN(*len, bs);
 804
 805        /*
 806         * Since we don't lock ranges, wait for ongoing lockless dio writes (as
 807         * any in progress could create its ordered extents after we wait for
 808         * existing ordered extents below).
 809         */
 810        inode_dio_wait(inode_in);
 811        if (!same_inode)
 812                inode_dio_wait(inode_out);
 813
 814        /*
 815         * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
 816         *
 817         * Btrfs' back references do not have a block level granularity, they
 818         * work at the whole extent level.
 819         * NOCOW buffered write without data space reserved may not be able
 820         * to fall back to CoW due to lack of data space, thus could cause
 821         * data loss.
 822         *
 823         * Here we take a shortcut by flushing the whole inode, so that all
 824         * nocow write should reach disk as nocow before we increase the
 825         * reference of the extent. We could do better by only flushing NOCOW
 826         * data, but that needs extra accounting.
 827         *
 828         * Also we don't need to check ASYNC_EXTENT, as async extent will be
 829         * CoWed anyway, not affecting nocow part.
 830         */
 831        ret = filemap_flush(inode_in->i_mapping);
 832        if (ret < 0)
 833                return ret;
 834
 835        ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
 836                                       wb_len);
 837        if (ret < 0)
 838                return ret;
 839        ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
 840                                       wb_len);
 841        if (ret < 0)
 842                return ret;
 843
 844        return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
 845                                            len, remap_flags);
 846}
 847
 848static bool file_sync_write(const struct file *file)
 849{
 850        if (file->f_flags & (__O_SYNC | O_DSYNC))
 851                return true;
 852        if (IS_SYNC(file_inode(file)))
 853                return true;
 854
 855        return false;
 856}
 857
 858loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
 859                struct file *dst_file, loff_t destoff, loff_t len,
 860                unsigned int remap_flags)
 861{
 862        struct inode *src_inode = file_inode(src_file);
 863        struct inode *dst_inode = file_inode(dst_file);
 864        bool same_inode = dst_inode == src_inode;
 865        int ret;
 866
 867        if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
 868                return -EINVAL;
 869
 870        if (same_inode) {
 871                btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
 872        } else {
 873                lock_two_nondirectories(src_inode, dst_inode);
 874                btrfs_double_mmap_lock(src_inode, dst_inode);
 875        }
 876
 877        ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
 878                                          &len, remap_flags);
 879        if (ret < 0 || len == 0)
 880                goto out_unlock;
 881
 882        if (remap_flags & REMAP_FILE_DEDUP)
 883                ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
 884        else
 885                ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
 886
 887out_unlock:
 888        if (same_inode) {
 889                btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
 890        } else {
 891                btrfs_double_mmap_unlock(src_inode, dst_inode);
 892                unlock_two_nondirectories(src_inode, dst_inode);
 893        }
 894
 895        /*
 896         * If either the source or the destination file was opened with O_SYNC,
 897         * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
 898         * source files/ranges, so that after a successful return (0) followed
 899         * by a power failure results in the reflinked data to be readable from
 900         * both files/ranges.
 901         */
 902        if (ret == 0 && len > 0 &&
 903            (file_sync_write(src_file) || file_sync_write(dst_file))) {
 904                ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
 905                if (ret == 0)
 906                        ret = btrfs_sync_file(dst_file, destoff,
 907                                              destoff + len - 1, 0);
 908        }
 909
 910        return ret < 0 ? ret : len;
 911}
 912