LXR linux/fs/btrfs/ioctl.c

   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/kernel.h>
   7#include <linux/bio.h>
   8#include <linux/file.h>
   9#include <linux/fs.h>
  10#include <linux/fsnotify.h>
  11#include <linux/pagemap.h>
  12#include <linux/highmem.h>
  13#include <linux/time.h>
  14#include <linux/string.h>
  15#include <linux/backing-dev.h>
  16#include <linux/mount.h>
  17#include <linux/namei.h>
  18#include <linux/writeback.h>
  19#include <linux/compat.h>
  20#include <linux/security.h>
  21#include <linux/xattr.h>
  22#include <linux/mm.h>
  23#include <linux/slab.h>
  24#include <linux/blkdev.h>
  25#include <linux/uuid.h>
  26#include <linux/btrfs.h>
  27#include <linux/uaccess.h>
  28#include <linux/iversion.h>
  29#include <linux/fileattr.h>
  30#include <linux/fsverity.h>
  31#include <linux/sched/xacct.h>
  32#include "ctree.h"
  33#include "disk-io.h"
  34#include "export.h"
  35#include "transaction.h"
  36#include "btrfs_inode.h"
  37#include "print-tree.h"
  38#include "volumes.h"
  39#include "locking.h"
  40#include "backref.h"
  41#include "rcu-string.h"
  42#include "send.h"
  43#include "dev-replace.h"
  44#include "props.h"
  45#include "sysfs.h"
  46#include "qgroup.h"
  47#include "tree-log.h"
  48#include "compression.h"
  49#include "space-info.h"
  50#include "delalloc-space.h"
  51#include "block-group.h"
  52#include "subpage.h"
  53
  54#ifdef CONFIG_64BIT
/*
 * If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 *
 * The struct below is packed, so it is exactly 12 bytes (8 + 4) with no
 * trailing padding.
 */
struct btrfs_ioctl_timespec_32 {
        __u64 sec;
        __u32 nsec;
} __attribute__ ((__packed__));
  64
/*
 * Packed argument layout used with BTRFS_IOC_SET_RECEIVED_SUBVOL_32. Both
 * timestamps use the packed btrfs_ioctl_timespec_32, and the struct itself is
 * packed so no padding is inserted between members. The in/out markers on the
 * fields give the direction of each member.
 */
struct btrfs_ioctl_received_subvol_args_32 {
        char    uuid[BTRFS_UUID_SIZE];  /* in */
        __u64   stransid;               /* in */
        __u64   rtransid;               /* out */
        struct btrfs_ioctl_timespec_32 stime; /* in */
        struct btrfs_ioctl_timespec_32 rtime; /* out */
        __u64   flags;                  /* in */
        __u64   reserved[16];           /* in */
} __attribute__ ((__packed__));

/*
 * Read/write ioctl, command number 37, whose encoded size is that of the
 * packed 32-bit argument struct above.
 */
#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
                                struct btrfs_ioctl_received_subvol_args_32)
  77#endif
  78
  79#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
/*
 * Packed argument layout used with BTRFS_IOC_SEND_32. The clone_sources
 * member is a compat_uptr_t rather than a native pointer, and the struct is
 * packed so no padding is inserted around it.
 */
struct btrfs_ioctl_send_args_32 {
        __s64 send_fd;                  /* in */
        __u64 clone_sources_count;      /* in */
        compat_uptr_t clone_sources;    /* in */
        __u64 parent_root;              /* in */
        __u64 flags;                    /* in */
        __u32 version;                  /* in */
        __u8  reserved[28];             /* in */
} __attribute__ ((__packed__));

/*
 * Write-only (_IOW) ioctl, command number 38, sized for the packed compat
 * argument struct above.
 */
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
                               struct btrfs_ioctl_send_args_32)
  92
/*
 * Argument layout used with the BTRFS_IOC_ENCODED_{READ,WRITE}_32 ioctls.
 * The iovec pointer and count use the compat types (compat_uptr_t and
 * compat_ulong_t). Unlike the other *_32 structs in this file, this one is
 * not marked packed.
 */
struct btrfs_ioctl_encoded_io_args_32 {
        compat_uptr_t iov;
        compat_ulong_t iovcnt;
        __s64 offset;
        __u64 flags;
        __u64 len;
        __u64 unencoded_len;
        __u64 unencoded_offset;
        __u32 compression;
        __u32 encryption;
        __u8 reserved[64];
};

/*
 * Both commands share number 64 and differ only in direction: _IOR for the
 * encoded read, _IOW for the encoded write.
 */
#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
                                       struct btrfs_ioctl_encoded_io_args_32)
#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
                                        struct btrfs_ioctl_encoded_io_args_32)
 110#endif
 111
 112/* Mask out flags that are inappropriate for the given type of inode. */
 113static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
 114                unsigned int flags)
 115{
 116        if (S_ISDIR(inode->i_mode))
 117                return flags;
 118        else if (S_ISREG(inode->i_mode))
 119                return flags & ~FS_DIRSYNC_FL;
 120        else
 121                return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
 122}
 123
 124/*
 125 * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
 126 * ioctl.
 127 */
 128static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
 129{
 130        unsigned int iflags = 0;
 131        u32 flags = binode->flags;
 132        u32 ro_flags = binode->ro_flags;
 133
 134        if (flags & BTRFS_INODE_SYNC)
 135                iflags |= FS_SYNC_FL;
 136        if (flags & BTRFS_INODE_IMMUTABLE)
 137                iflags |= FS_IMMUTABLE_FL;
 138        if (flags & BTRFS_INODE_APPEND)
 139                iflags |= FS_APPEND_FL;
 140        if (flags & BTRFS_INODE_NODUMP)
 141                iflags |= FS_NODUMP_FL;
 142        if (flags & BTRFS_INODE_NOATIME)
 143                iflags |= FS_NOATIME_FL;
 144        if (flags & BTRFS_INODE_DIRSYNC)
 145                iflags |= FS_DIRSYNC_FL;
 146        if (flags & BTRFS_INODE_NODATACOW)
 147                iflags |= FS_NOCOW_FL;
 148        if (ro_flags & BTRFS_INODE_RO_VERITY)
 149                iflags |= FS_VERITY_FL;
 150
 151        if (flags & BTRFS_INODE_NOCOMPRESS)
 152                iflags |= FS_NOCOMP_FL;
 153        else if (flags & BTRFS_INODE_COMPRESS)
 154                iflags |= FS_COMPR_FL;
 155
 156        return iflags;
 157}
 158
 159/*
 160 * Update inode->i_flags based on the btrfs internal flags.
 161 */
 162void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
 163{
 164        struct btrfs_inode *binode = BTRFS_I(inode);
 165        unsigned int new_fl = 0;
 166
 167        if (binode->flags & BTRFS_INODE_SYNC)
 168                new_fl |= S_SYNC;
 169        if (binode->flags & BTRFS_INODE_IMMUTABLE)
 170                new_fl |= S_IMMUTABLE;
 171        if (binode->flags & BTRFS_INODE_APPEND)
 172                new_fl |= S_APPEND;
 173        if (binode->flags & BTRFS_INODE_NOATIME)
 174                new_fl |= S_NOATIME;
 175        if (binode->flags & BTRFS_INODE_DIRSYNC)
 176                new_fl |= S_DIRSYNC;
 177        if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
 178                new_fl |= S_VERITY;
 179
 180        set_mask_bits(&inode->i_flags,
 181                      S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
 182                      S_VERITY, new_fl);
 183}
 184
 185/*
 186 * Check if @flags are a supported and valid set of FS_*_FL flags and that
 187 * the old and new flags are not conflicting
 188 */
 189static int check_fsflags(unsigned int old_flags, unsigned int flags)
 190{
 191        if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
 192                      FS_NOATIME_FL | FS_NODUMP_FL | \
 193                      FS_SYNC_FL | FS_DIRSYNC_FL | \
 194                      FS_NOCOMP_FL | FS_COMPR_FL |
 195                      FS_NOCOW_FL))
 196                return -EOPNOTSUPP;
 197
 198        /* COMPR and NOCOMP on new/old are valid */
 199        if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
 200                return -EINVAL;
 201
 202        if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
 203                return -EINVAL;
 204
 205        /* NOCOW and compression options are mutually exclusive */
 206        if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
 207                return -EINVAL;
 208        if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
 209                return -EINVAL;
 210
 211        return 0;
 212}
 213
 214static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
 215                                    unsigned int flags)
 216{
 217        if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
 218                return -EPERM;
 219
 220        return 0;
 221}
 222
 223/*
 224 * Set flags/xflags from the internal inode flags. The remaining items of
 225 * fsxattr are zeroed.
 226 */
 227int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 228{
 229        struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
 230
 231        fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
 232        return 0;
 233}
 234
/*
 * Apply the FS_*_FL flags carried in @fa to the inode behind @dentry.
 *
 * The requested flags are masked for the inode type, validated against the
 * current flags and the filesystem, converted to BTRFS_INODE_* bits and then
 * written to the inode inside a transaction. When the full flag set is valid
 * (fa->flags_valid), the "btrfs.compression" property is set or removed to
 * match the compression flags.
 *
 * Returns 0 on success or a negative errno: -EROFS for a read-only root,
 * -EOPNOTSUPP for fsx-style requests or unsupported flags, -ETXTBSY when
 * compression is requested on a swapfile, or whatever the validation,
 * transaction, property and inode update helpers return.
 */
int btrfs_fileattr_set(struct user_namespace *mnt_userns,
                       struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_inode *binode = BTRFS_I(inode);
        struct btrfs_root *root = binode->root;
        struct btrfs_trans_handle *trans;
        unsigned int fsflags, old_fsflags;
        int ret;
        const char *comp = NULL;
        u32 binode_flags;

        if (btrfs_root_readonly(root))
                return -EROFS;

        if (fileattr_has_fsx(fa))
                return -EOPNOTSUPP;

        fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
        old_fsflags = btrfs_inode_flags_to_fsflags(binode);
        ret = check_fsflags(old_fsflags, fsflags);
        if (ret)
                return ret;

        ret = check_fsflags_compatible(fs_info, fsflags);
        if (ret)
                return ret;

        /*
         * Work on a local copy; binode->flags is only overwritten at
         * update_flags, once a transaction handle is held.
         */
        binode_flags = binode->flags;
        if (fsflags & FS_SYNC_FL)
                binode_flags |= BTRFS_INODE_SYNC;
        else
                binode_flags &= ~BTRFS_INODE_SYNC;
        if (fsflags & FS_IMMUTABLE_FL)
                binode_flags |= BTRFS_INODE_IMMUTABLE;
        else
                binode_flags &= ~BTRFS_INODE_IMMUTABLE;
        if (fsflags & FS_APPEND_FL)
                binode_flags |= BTRFS_INODE_APPEND;
        else
                binode_flags &= ~BTRFS_INODE_APPEND;
        if (fsflags & FS_NODUMP_FL)
                binode_flags |= BTRFS_INODE_NODUMP;
        else
                binode_flags &= ~BTRFS_INODE_NODUMP;
        if (fsflags & FS_NOATIME_FL)
                binode_flags |= BTRFS_INODE_NOATIME;
        else
                binode_flags &= ~BTRFS_INODE_NOATIME;

        /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
        if (!fa->flags_valid) {
                /* 1 item for the inode */
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
                goto update_flags;
        }

        if (fsflags & FS_DIRSYNC_FL)
                binode_flags |= BTRFS_INODE_DIRSYNC;
        else
                binode_flags &= ~BTRFS_INODE_DIRSYNC;
        if (fsflags & FS_NOCOW_FL) {
                if (S_ISREG(inode->i_mode)) {
                        /*
                         * It's safe to turn csums off here, no extents exist.
                         * Otherwise we want the flag to reflect the real COW
                         * status of the file and will not set it.
                         */
                        if (inode->i_size == 0)
                                binode_flags |= BTRFS_INODE_NODATACOW |
                                                BTRFS_INODE_NODATASUM;
                } else {
                        binode_flags |= BTRFS_INODE_NODATACOW;
                }
        } else {
                /*
                 * Revert back under same assumptions as above
                 */
                if (S_ISREG(inode->i_mode)) {
                        if (inode->i_size == 0)
                                binode_flags &= ~(BTRFS_INODE_NODATACOW |
                                                  BTRFS_INODE_NODATASUM);
                } else {
                        binode_flags &= ~BTRFS_INODE_NODATACOW;
                }
        }

        /*
         * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
         * flag may be changed automatically if compression code won't make
         * things smaller.
         */
        if (fsflags & FS_NOCOMP_FL) {
                binode_flags &= ~BTRFS_INODE_COMPRESS;
                binode_flags |= BTRFS_INODE_NOCOMPRESS;
        } else if (fsflags & FS_COMPR_FL) {

                /* No transaction has been started yet, so a plain return is fine */
                if (IS_SWAPFILE(inode))
                        return -ETXTBSY;

                binode_flags |= BTRFS_INODE_COMPRESS;
                binode_flags &= ~BTRFS_INODE_NOCOMPRESS;

                /*
                 * Use the filesystem's compression type for the property,
                 * falling back to zlib when that yields no usable name.
                 */
                comp = btrfs_compress_type2str(fs_info->compress_type);
                if (!comp || comp[0] == 0)
                        comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
        } else {
                binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
        }

        /*
         * 1 for inode item
         * 2 for properties
         */
        trans = btrfs_start_transaction(root, 3);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        if (comp) {
                ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
                                     strlen(comp), 0);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_end_trans;
                }
        } else {
                /* NULL value: clear the property; -ENODATA is tolerated */
                ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
                                     0, 0);
                if (ret && ret != -ENODATA) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_end_trans;
                }
        }

update_flags:
        /* Both paths arrive here holding a started transaction in @trans */
        binode->flags = binode_flags;
        btrfs_sync_inode_flags_to_i_flags(inode);
        inode_inc_iversion(inode);
        inode->i_ctime = current_time(inode);
        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));

 out_end_trans:
        btrfs_end_transaction(trans);
        return ret;
}
 383
 384/*
 385 * Start exclusive operation @type, return true on success
 386 */
 387bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
 388                        enum btrfs_exclusive_operation type)
 389{
 390        bool ret = false;
 391
 392        spin_lock(&fs_info->super_lock);
 393        if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
 394                fs_info->exclusive_operation = type;
 395                ret = true;
 396        }
 397        spin_unlock(&fs_info->super_lock);
 398
 399        return ret;
 400}
 401
 402/*
 403 * Conditionally allow to enter the exclusive operation in case it's compatible
 404 * with the running one.  This must be paired with btrfs_exclop_start_unlock and
 405 * btrfs_exclop_finish.
 406 *
 407 * Compatibility:
 408 * - the same type is already running
 409 * - when trying to add a device and balance has been paused
 410 * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
 411 *   must check the condition first that would allow none -> @type
 412 */
 413bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
 414                                 enum btrfs_exclusive_operation type)
 415{
 416        spin_lock(&fs_info->super_lock);
 417        if (fs_info->exclusive_operation == type ||
 418            (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
 419             type == BTRFS_EXCLOP_DEV_ADD))
 420                return true;
 421
 422        spin_unlock(&fs_info->super_lock);
 423        return false;
 424}
 425
/*
 * Release fs_info->super_lock. Per the comment on
 * btrfs_exclop_start_try_lock(), this is the unlock half that pairs with a
 * successful call to that function, which returns with the lock held.
 */
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
{
        spin_unlock(&fs_info->super_lock);
}
 430
/*
 * Mark the exclusive operation as finished: reset the slot to
 * BTRFS_EXCLOP_NONE under fs_info->super_lock, then notify sysfs listeners of
 * the "exclusive_operation" attribute after the lock has been dropped.
 */
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
{
        spin_lock(&fs_info->super_lock);
        WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
        spin_unlock(&fs_info->super_lock);
        sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
 438
 439void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
 440                          enum btrfs_exclusive_operation op)
 441{
 442        switch (op) {
 443        case BTRFS_EXCLOP_BALANCE_PAUSED:
 444                spin_lock(&fs_info->super_lock);
 445                ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
 446                       fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD);
 447                fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
 448                spin_unlock(&fs_info->super_lock);
 449                break;
 450        case BTRFS_EXCLOP_BALANCE:
 451                spin_lock(&fs_info->super_lock);
 452                ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
 453                fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
 454                spin_unlock(&fs_info->super_lock);
 455                break;
 456        default:
 457                btrfs_warn(fs_info,
 458                        "invalid exclop balance operation %d requested", op);
 459        }
 460}
 461
/*
 * Copy inode->i_generation to the user-space int at @arg.
 *
 * Returns the result of put_user().
 */
static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
{
        return put_user(inode->i_generation, arg);
}
 466
 467static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
 468                                        void __user *arg)
 469{
 470        struct btrfs_device *device;
 471        struct fstrim_range range;
 472        u64 minlen = ULLONG_MAX;
 473        u64 num_devices = 0;
 474        int ret;
 475
 476        if (!capable(CAP_SYS_ADMIN))
 477                return -EPERM;
 478
 479        /*
 480         * btrfs_trim_block_group() depends on space cache, which is not
 481         * available in zoned filesystem. So, disallow fitrim on a zoned
 482         * filesystem for now.
 483         */
 484        if (btrfs_is_zoned(fs_info))
 485                return -EOPNOTSUPP;
 486
 487        /*
 488         * If the fs is mounted with nologreplay, which requires it to be
 489         * mounted in RO mode as well, we can not allow discard on free space
 490         * inside block groups, because log trees refer to extents that are not
 491         * pinned in a block group's free space cache (pinning the extents is
 492         * precisely the first phase of replaying a log tree).
 493         */
 494        if (btrfs_test_opt(fs_info, NOLOGREPLAY))
 495                return -EROFS;
 496
 497        rcu_read_lock();
 498        list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
 499                                dev_list) {
 500                if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
 501                        continue;
 502                num_devices++;
 503                minlen = min_t(u64, bdev_discard_granularity(device->bdev),
 504                                    minlen);
 505        }
 506        rcu_read_unlock();
 507
 508        if (!num_devices)
 509                return -EOPNOTSUPP;
 510        if (copy_from_user(&range, arg, sizeof(range)))
 511                return -EFAULT;
 512
 513        /*
 514         * NOTE: Don't truncate the range using super->total_bytes.  Bytenr of
 515         * block group is in the logical address space, which can be any
 516         * sectorsize aligned bytenr in  the range [0, U64_MAX].
 517         */
 518        if (range.len < fs_info->sb->s_blocksize)
 519                return -EINVAL;
 520
 521        range.minlen = max(range.minlen, minlen);
 522        ret = btrfs_trim_fs(fs_info, &range);
 523        if (ret < 0)
 524                return ret;
 525
 526        if (copy_to_user(arg, &range, sizeof(range)))
 527                return -EFAULT;
 528
 529        return 0;
 530}
 531
 532int __pure btrfs_is_empty_uuid(u8 *uuid)
 533{
 534        int i;
 535
 536        for (i = 0; i < BTRFS_UUID_SIZE; i++) {
 537                if (uuid[i])
 538                        return 0;
 539        }
 540        return 1;
 541}
 542
 543/*
 544 * Calculate the number of transaction items to reserve for creating a subvolume
 545 * or snapshot, not including the inode, directory entries, or parent directory.
 546 */
 547static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
 548{
 549        /*
 550         * 1 to add root block
 551         * 1 to add root item
 552         * 1 to add root ref
 553         * 1 to add root backref
 554         * 1 to add UUID item
 555         * 1 to add qgroup info
 556         * 1 to add qgroup limit
 557         *
 558         * Ideally the last two would only be accounted if qgroups are enabled,
 559         * but that can change between now and the time we would insert them.
 560         */
 561        unsigned int num_items = 7;
 562
 563        if (inherit) {
 564                /* 2 to add qgroup relations for each inherited qgroup */
 565                num_items += 2 * inherit->num_qgroups;
 566        }
 567        return num_items;
 568}
 569
/*
 * Create a new subvolume for @dentry inside directory @dir.
 *
 * @mnt_userns: user namespace passed on to btrfs_new_subvol_inode()
 * @dir:        parent directory inode
 * @dentry:     dentry to instantiate with the new subvolume's inode
 * @inherit:    optional qgroup inheritance description, may be NULL
 *
 * Resources are acquired in this order: root item buffer, objectid, anonymous
 * bdev, inode, new-inode args, metadata reservation, transaction. The labels
 * at the end of the function release them in reverse order.
 *
 * Returns 0 on success (after committing the transaction) or a negative
 * errno.
 */
static noinline int create_subvol(struct user_namespace *mnt_userns,
                                  struct inode *dir, struct dentry *dentry,
                                  struct btrfs_qgroup_inherit *inherit)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct btrfs_trans_handle *trans;
        struct btrfs_key key;
        struct btrfs_root_item *root_item;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *new_root;
        struct btrfs_block_rsv block_rsv;
        struct timespec64 cur_time = current_time(dir);
        struct btrfs_new_inode_args new_inode_args = {
                .dir = dir,
                .dentry = dentry,
                .subvol = true,
        };
        unsigned int trans_num_items;
        int ret;
        dev_t anon_dev;
        u64 objectid;

        root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
        if (!root_item)
                return -ENOMEM;

        ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
        if (ret)
                goto out_root_item;

        /*
         * Don't create subvolume whose level is not zero. Or qgroup will be
         * screwed up since it assumes subvolume qgroup's level to be 0.
         */
        if (btrfs_qgroup_level(objectid)) {
                ret = -ENOSPC;
                goto out_root_item;
        }

        ret = get_anon_bdev(&anon_dev);
        if (ret < 0)
                goto out_root_item;

        new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
        if (!new_inode_args.inode) {
                ret = -ENOMEM;
                goto out_anon_dev;
        }
        ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
        if (ret)
                goto out_inode;
        /* Add the subvolume-specific items on top of the inode's own count */
        trans_num_items += create_subvol_num_items(inherit);

        btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
        ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
                                               trans_num_items, false);
        if (ret)
                goto out_new_inode_args;

        /*
         * Start with 0 items: the space comes from the temporary block_rsv
         * reserved above, which is attached to the handle just below.
         */
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                btrfs_subvolume_release_metadata(root, &block_rsv);
                goto out_new_inode_args;
        }
        trans->block_rsv = &block_rsv;
        trans->bytes_reserved = block_rsv.size;

        ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
        if (ret)
                goto out;

        leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
                                      BTRFS_NESTING_NORMAL);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                goto out;
        }

        btrfs_mark_buffer_dirty(leaf);

        /* Fill in the inode item embedded in the root item */
        inode_item = &root_item->inode;
        btrfs_set_stack_inode_generation(inode_item, 1);
        btrfs_set_stack_inode_size(inode_item, 3);
        btrfs_set_stack_inode_nlink(inode_item, 1);
        btrfs_set_stack_inode_nbytes(inode_item,
                                     fs_info->nodesize);
        btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

        btrfs_set_root_flags(root_item, 0);
        btrfs_set_root_limit(root_item, 0);
        btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);

        /* The new root consists of the single leaf allocated above */
        btrfs_set_root_bytenr(root_item, leaf->start);
        btrfs_set_root_generation(root_item, trans->transid);
        btrfs_set_root_level(root_item, 0);
        btrfs_set_root_refs(root_item, 1);
        btrfs_set_root_used(root_item, leaf->len);
        btrfs_set_root_last_snapshot(root_item, 0);

        btrfs_set_root_generation_v2(root_item,
                        btrfs_root_generation(root_item));
        generate_random_guid(root_item->uuid);
        btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
        btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
        root_item->ctime = root_item->otime;
        btrfs_set_root_ctransid(root_item, trans->transid);
        btrfs_set_root_otransid(root_item, trans->transid);

        btrfs_tree_unlock(leaf);

        btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);

        key.objectid = objectid;
        key.offset = 0;
        key.type = BTRFS_ROOT_ITEM_KEY;
        ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
                                root_item);
        if (ret) {
                /*
                 * Since we don't abort the transaction in this case, free the
                 * tree block so that we don't leak space and leave the
                 * filesystem in an inconsistent state (an extent item in the
                 * extent tree with a backreference for a root that does not
                 * exists).
                 */
                btrfs_tree_lock(leaf);
                btrfs_clean_tree_block(leaf);
                btrfs_tree_unlock(leaf);
                btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
                free_extent_buffer(leaf);
                goto out;
        }

        free_extent_buffer(leaf);
        leaf = NULL;

        new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
                btrfs_abort_transaction(trans, ret);
                goto out;
        }
        /* anon_dev is owned by new_root now. */
        anon_dev = 0;
        BTRFS_I(new_inode_args.inode)->root = new_root;
        /* ... and new_root is owned by new_inode_args.inode now. */

        ret = btrfs_record_root_in_trans(trans, new_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = btrfs_uuid_tree_add(trans, root_item->uuid,
                                  BTRFS_UUID_KEY_SUBVOL, objectid);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = btrfs_create_new_inode(trans, &new_inode_args);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        d_instantiate_new(dentry, new_inode_args.inode);
        /* Clear the pointer so the iput() at out_inode gets NULL */
        new_inode_args.inode = NULL;

out:
        /* Detach the temporary reservation before releasing it */
        trans->block_rsv = NULL;
        trans->bytes_reserved = 0;
        btrfs_subvolume_release_metadata(root, &block_rsv);

        /* Commit only on success; on failure just end the handle */
        if (ret)
                btrfs_end_transaction(trans);
        else
                ret = btrfs_commit_transaction(trans);
out_new_inode_args:
        btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
        iput(new_inode_args.inode);
out_anon_dev:
        /* anon_dev is zeroed above once new_root has taken it over */
        if (anon_dev)
                free_anon_bdev(anon_dev);
out_root_item:
        kfree(root_item);
        return ret;
}
 762
/*
 * Create a snapshot of @root named by @dentry inside directory @dir.
 *
 * @root:     subvolume root to snapshot; must have BTRFS_ROOT_SHAREABLE set
 *            and no active swapfiles
 * @dir:      parent directory inode of the new snapshot
 * @dentry:   dentry to instantiate with the snapshot's inode
 * @readonly: stored in the pending snapshot request
 * @inherit:  optional qgroup inheritance description, may be NULL
 *
 * The function fills a btrfs_pending_snapshot, attaches it to a transaction
 * handle and commits that transaction; the outcome is then read back from
 * pending_snapshot->error and pending_snapshot->snap.
 * NOTE(review): the snapshot itself is presumably created by the commit path,
 * which is not visible in this function - confirm.
 *
 * Returns 0 on success or a negative errno (-EOPNOTSUPP with extent tree v2,
 * -EINVAL for a non-shareable root, -ETXTBSY with active swapfiles, -ENOMEM,
 * or an error from the helpers called).
 */
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
                           struct dentry *dentry, bool readonly,
                           struct btrfs_qgroup_inherit *inherit)
{
        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
        unsigned int trans_num_items;
        struct btrfs_trans_handle *trans;
        int ret;

        /* We do not support snapshotting right now. */
        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_warn(fs_info,
                           "extent tree v2 doesn't support snapshotting yet");
                return -EOPNOTSUPP;
        }

        if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
                return -EINVAL;

        if (atomic_read(&root->nr_swapfiles)) {
                btrfs_warn(fs_info,
                           "cannot snapshot subvolume with active swapfile");
                return -ETXTBSY;
        }

        /*
         * Zero-allocated, so the free_pending path can be taken at any point
         * below: unset members are still 0/NULL there.
         */
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
        if (!pending_snapshot)
                return -ENOMEM;

        ret = get_anon_bdev(&pending_snapshot->anon_dev);
        if (ret < 0)
                goto free_pending;
        pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
                        GFP_KERNEL);
        pending_snapshot->path = btrfs_alloc_path();
        /* Both allocations are checked together; free_pending frees both */
        if (!pending_snapshot->root_item || !pending_snapshot->path) {
                ret = -ENOMEM;
                goto free_pending;
        }

        btrfs_init_block_rsv(&pending_snapshot->block_rsv,
                             BTRFS_BLOCK_RSV_TEMP);
        /*
         * 1 to add dir item
         * 1 to add dir index
         * 1 to update parent inode item
         */
        trans_num_items = create_subvol_num_items(inherit) + 3;
        ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
                                               &pending_snapshot->block_rsv,
                                               trans_num_items, false);
        if (ret)
                goto free_pending;

        pending_snapshot->dentry = dentry;
        pending_snapshot->root = root;
        pending_snapshot->readonly = readonly;
        pending_snapshot->dir = dir;
        pending_snapshot->inherit = inherit;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto fail;
        }

        /* Hand the request to the transaction, then commit it */
        trans->pending_snapshot = pending_snapshot;

        ret = btrfs_commit_transaction(trans);
        if (ret)
                goto fail;

        /* A successful commit can still carry a per-snapshot error */
        ret = pending_snapshot->error;
        if (ret)
                goto fail;

        ret = btrfs_orphan_cleanup(pending_snapshot->snap);
        if (ret)
                goto fail;

        inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
        if (IS_ERR(inode)) {
                ret = PTR_ERR(inode);
                goto fail;
        }

        d_instantiate(dentry, inode);
        ret = 0;
        /* Success: zero our copy so free_pending does not free the anon_dev */
        pending_snapshot->anon_dev = 0;
fail:
        /* Prevent double freeing of anon_dev */
        if (ret && pending_snapshot->snap)
                pending_snapshot->snap->anon_dev = 0;
        btrfs_put_root(pending_snapshot->snap);
        btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
        if (pending_snapshot->anon_dev)
                free_anon_bdev(pending_snapshot->anon_dev);
        kfree(pending_snapshot->root_item);
        btrfs_free_path(pending_snapshot->path);
        kfree(pending_snapshot);

        return ret;
}
 869
 870/*  copy of may_delete in fs/namei.c()
 871 *      Check whether we can remove a link victim from directory dir, check
 872 *  whether the type of victim is right.
 873 *  1. We can't do it if dir is read-only (done in permission())
 874 *  2. We should have write and exec permissions on dir
 875 *  3. We can't remove anything from append-only dir
 876 *  4. We can't do anything with immutable dir (done in permission())
 877 *  5. If the sticky bit on dir is set we should either
 878 *      a. be owner of dir, or
 879 *      b. be owner of victim, or
 880 *      c. have CAP_FOWNER capability
 881 *  6. If the victim is append-only or immutable we can't do anything with
 882 *     links pointing to it.
 883 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 884 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 885 *  9. We can't remove a root or mountpoint.
 886 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 887 *     nfs_async_unlink().
 888 */
 889
 890static int btrfs_may_delete(struct user_namespace *mnt_userns,
 891                            struct inode *dir, struct dentry *victim, int isdir)
 892{
 893        int error;
 894
 895        if (d_really_is_negative(victim))
 896                return -ENOENT;
 897
 898        BUG_ON(d_inode(victim->d_parent) != dir);
 899        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 900
 901        error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 902        if (error)
 903                return error;
 904        if (IS_APPEND(dir))
 905                return -EPERM;
 906        if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
 907            IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
 908            IS_SWAPFILE(d_inode(victim)))
 909                return -EPERM;
 910        if (isdir) {
 911                if (!d_is_dir(victim))
 912                        return -ENOTDIR;
 913                if (IS_ROOT(victim))
 914                        return -EBUSY;
 915        } else if (d_is_dir(victim))
 916                return -EISDIR;
 917        if (IS_DEADDIR(dir))
 918                return -ENOENT;
 919        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
 920                return -EBUSY;
 921        return 0;
 922}
 923
 924/* copy of may_create in fs/namei.c() */
 925static inline int btrfs_may_create(struct user_namespace *mnt_userns,
 926                                   struct inode *dir, struct dentry *child)
 927{
 928        if (d_really_is_positive(child))
 929                return -EEXIST;
 930        if (IS_DEADDIR(dir))
 931                return -ENOENT;
 932        if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
 933                return -EOVERFLOW;
 934        return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
 935}
 936
 937/*
 938 * Create a new subvolume below @parent.  This is largely modeled after
 939 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
 940 * inside this filesystem so it's quite a bit simpler.
 941 */
 942static noinline int btrfs_mksubvol(const struct path *parent,
 943                                   struct user_namespace *mnt_userns,
 944                                   const char *name, int namelen,
 945                                   struct btrfs_root *snap_src,
 946                                   bool readonly,
 947                                   struct btrfs_qgroup_inherit *inherit)
 948{
 949        struct inode *dir = d_inode(parent->dentry);
 950        struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 951        struct dentry *dentry;
 952        int error;
 953
 954        error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
 955        if (error == -EINTR)
 956                return error;
 957
 958        dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
 959        error = PTR_ERR(dentry);
 960        if (IS_ERR(dentry))
 961                goto out_unlock;
 962
 963        error = btrfs_may_create(mnt_userns, dir, dentry);
 964        if (error)
 965                goto out_dput;
 966
 967        /*
 968         * even if this name doesn't exist, we may get hash collisions.
 969         * check for them now when we can safely fail
 970         */
 971        error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
 972                                               dir->i_ino, name,
 973                                               namelen);
 974        if (error)
 975                goto out_dput;
 976
 977        down_read(&fs_info->subvol_sem);
 978
 979        if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
 980                goto out_up_read;
 981
 982        if (snap_src)
 983                error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
 984        else
 985                error = create_subvol(mnt_userns, dir, dentry, inherit);
 986
 987        if (!error)
 988                fsnotify_mkdir(dir, dentry);
 989out_up_read:
 990        up_read(&fs_info->subvol_sem);
 991out_dput:
 992        dput(dentry);
 993out_unlock:
 994        btrfs_inode_unlock(dir, 0);
 995        return error;
 996}
 997
 998static noinline int btrfs_mksnapshot(const struct path *parent,
 999                                   struct user_namespace *mnt_userns,
1000                                   const char *name, int namelen,

1001                                   struct btrfs_root *root,
1002                                   bool readonly,
1003                                   struct btrfs_qgroup_inherit *inherit)
1004{
1005        int ret;
1006        bool snapshot_force_cow = false;
1007
1008        /*
1009         * Force new buffered writes to reserve space even when NOCOW is
1010         * possible. This is to avoid later writeback (running dealloc) to
1011         * fallback to COW mode and unexpectedly fail with ENOSPC.
1012         */
1013        btrfs_drew_read_lock(&root->snapshot_lock);
1014
1015        ret = btrfs_start_delalloc_snapshot(root, false);
1016        if (ret)
1017                goto out;
1018
1019        /*
1020         * All previous writes have started writeback in NOCOW mode, so now
1021         * we force future writes to fallback to COW mode during snapshot
1022         * creation.
1023         */
1024        atomic_inc(&root->snapshot_force_cow);
1025        snapshot_force_cow = true;
1026
1027        btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
1028
1029        ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
1030                             root, readonly, inherit);
1031out:
1032        if (snapshot_force_cow)
1033                atomic_dec(&root->snapshot_force_cow);
1034        btrfs_drew_read_unlock(&root->snapshot_lock);
1035        return ret;
1036}
1037
1038/*
1039 * Defrag specific helper to get an extent map.
1040 *
1041 * Differences between this and btrfs_get_extent() are:
1042 *
1043 * - No extent_map will be added to inode->extent_tree
1044 *   To reduce memory usage in the long run.
1045 *
1046 * - Extra optimization to skip file extents older than @newer_than
1047 *   By using btrfs_search_forward() we can skip entire file ranges that
1048 *   have extents created in past transactions, because btrfs_search_forward()
1049 *   will not visit leaves and nodes with a generation smaller than given
1050 *   minimal generation threshold (@newer_than).
1051 *
1052 * Return valid em if we find a file extent matching the requirement.
1053 * Return NULL if we can not find a file extent matching the requirement.
1054 *
1055 * Return ERR_PTR() for error.
1056 */
1057static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
1058                                            u64 start, u64 newer_than)
1059{
1060        struct btrfs_root *root = inode->root;
1061        struct btrfs_file_extent_item *fi;
1062        struct btrfs_path path = { 0 };
1063        struct extent_map *em;
1064        struct btrfs_key key;
1065        u64 ino = btrfs_ino(inode);
1066        int ret;
1067
1068        em = alloc_extent_map();
1069        if (!em) {
1070                ret = -ENOMEM;
1071                goto err;
1072        }
1073
1074        key.objectid = ino;
1075        key.type = BTRFS_EXTENT_DATA_KEY;
1076        key.offset = start;
1077
1078        if (newer_than) {
1079                ret = btrfs_search_forward(root, &key, &path, newer_than);
1080                if (ret < 0)
1081                        goto err;
1082                /* Can't find anything newer */
1083                if (ret > 0)
1084                        goto not_found;
1085        } else {
1086                ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
1087                if (ret < 0)
1088                        goto err;
1089        }
1090        if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
1091                /*
1092                 * If btrfs_search_slot() makes path to point beyond nritems,
1093                 * we should not have an empty leaf, as this inode must at
1094                 * least have its INODE_ITEM.
1095                 */
1096                ASSERT(btrfs_header_nritems(path.nodes[0]));
1097                path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
1098        }
1099        btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
1100        /* Perfect match, no need to go one slot back */
1101        if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
1102            key.offset == start)
1103                goto iterate;
1104
1105        /* We didn't find a perfect match, needs to go one slot back */
1106        if (path.slots[0] > 0) {
1107                btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
1108                if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
1109                        path.slots[0]--;
1110        }
1111
1112iterate:
1113        /* Iterate through the path to find a file extent covering @start */
1114        while (true) {
1115                u64 extent_end;
1116
1117                if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
1118                        goto next;
1119
1120                btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
1121
1122                /*
1123                 * We may go one slot back to INODE_REF/XATTR item, then
1124                 * need to go forward until we reach an EXTENT_DATA.
1125                 * But we should still has the correct ino as key.objectid.
1126                 */
1127                if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
1128                        goto next;
1129
1130                /* It's beyond our target range, definitely not extent found */
1131                if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
1132                        goto not_found;
1133
1134                /*
1135                 *      |       |<- File extent ->|
1136                 *      \- start
1137                 *
1138                 * This means there is a hole between start and key.offset.
1139                 */
1140                if (key.offset > start) {
1141                        em->start = start;
1142                        em->orig_start = start;
1143                        em->block_start = EXTENT_MAP_HOLE;
1144                        em->len = key.offset - start;
1145                        break;
1146                }
1147
1148                fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
1149                                    struct btrfs_file_extent_item);
1150                extent_end = btrfs_file_extent_end(&path);
1151
1152                /*
1153                 *      |<- file extent ->|     |
1154                 *                              \- start
1155                 *
1156                 * We haven't reached start, search next slot.
1157                 */
1158                if (extent_end <= start)
1159                        goto next;
1160
1161                /* Now this extent covers @start, convert it to em */
1162                btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
1163                break;
1164next:
1165                ret = btrfs_next_item(root, &path);
1166                if (ret < 0)
1167                        goto err;
1168                if (ret > 0)
1169                        goto not_found;
1170        }
1171        btrfs_release_path(&path);
1172        return em;
1173
1174not_found:
1175        btrfs_release_path(&path);
1176        free_extent_map(em);
1177        return NULL;
1178
1179err:
1180        btrfs_release_path(&path);
1181        free_extent_map(em);
1182        return ERR_PTR(ret);
1183}
1184
1185static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
1186                                               u64 newer_than, bool locked)
1187{
1188        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1189        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1190        struct extent_map *em;
1191        const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
1192
1193        /*
1194         * hopefully we have this extent in the tree already, try without
1195         * the full extent lock
1196         */
1197        read_lock(&em_tree->lock);
1198        em = lookup_extent_mapping(em_tree, start, sectorsize);
1199        read_unlock(&em_tree->lock);
1200
1201        /*
1202         * We can get a merged extent, in that case, we need to re-search
1203         * tree to get the original em for defrag.
1204         *
1205         * If @newer_than is 0 or em::generation < newer_than, we can trust
1206         * this em, as either we don't care about the generation, or the
1207         * merged extent map will be rejected anyway.
1208         */
1209        if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
1210            newer_than && em->generation >= newer_than) {
1211                free_extent_map(em);
1212                em = NULL;
1213        }
1214
1215        if (!em) {
1216                struct extent_state *cached = NULL;
1217                u64 end = start + sectorsize - 1;
1218
1219                /* get the big lock and read metadata off disk */
1220                if (!locked)
1221                        lock_extent_bits(io_tree, start, end, &cached);
1222                em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
1223                if (!locked)
1224                        unlock_extent_cached(io_tree, start, end, &cached);
1225
1226                if (IS_ERR(em))
1227                        return NULL;
1228        }
1229
1230        return em;
1231}
1232
1233static u32 get_extent_max_capacity(const struct extent_map *em)
1234{
1235        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
1236                return BTRFS_MAX_COMPRESSED;
1237        return BTRFS_MAX_EXTENT_SIZE;
1238}
1239
1240static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
1241                                     u32 extent_thresh, u64 newer_than, bool locked)
1242{
1243        struct extent_map *next;
1244        bool ret = false;
1245
1246        /* this is the last extent */
1247        if (em->start + em->len >= i_size_read(inode))
1248                return false;
1249
1250        /*
1251         * Here we need to pass @newer_then when checking the next extent, or
1252         * we will hit a case we mark current extent for defrag, but the next
1253         * one will not be a target.
1254         * This will just cause extra IO without really reducing the fragments.
1255         */
1256        next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
1257        /* No more em or hole */
1258        if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
1259                goto out;
1260        if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
1261                goto out;
1262        /*
1263         * If the next extent is at its max capacity, defragging current extent
1264         * makes no sense, as the total number of extents won't change.
1265         */
1266        if (next->len >= get_extent_max_capacity(em))
1267                goto out;
1268        /* Skip older extent */
1269        if (next->generation < newer_than)
1270                goto out;
1271        /* Also check extent size */
1272        if (next->len >= extent_thresh)
1273                goto out;
1274
1275        ret = true;
1276out:
1277        free_extent_map(next);
1278        return ret;
1279}
1280
1281/*
1282 * Prepare one page to be defragged.
1283 *
1284 * This will ensure:
1285 *
1286 * - Returned page is locked and has been set up properly.
1287 * - No ordered extent exists in the page.
1288 * - The page is uptodate.
1289 *
1290 * NOTE: Caller should also wait for page writeback after the cluster is
1291 * prepared, here we don't do writeback wait for each page.
1292 */
1293static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
1294                                            pgoff_t index)
1295{
1296        struct address_space *mapping = inode->vfs_inode.i_mapping;
1297        gfp_t mask = btrfs_alloc_write_mask(mapping);
1298        u64 page_start = (u64)index << PAGE_SHIFT;
1299        u64 page_end = page_start + PAGE_SIZE - 1;
1300        struct extent_state *cached_state = NULL;
1301        struct page *page;
1302        int ret;
1303
1304again:
1305        page = find_or_create_page(mapping, index, mask);
1306        if (!page)
1307                return ERR_PTR(-ENOMEM);
1308
1309        /*
1310         * Since we can defragment files opened read-only, we can encounter
1311         * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
1312         * can't do I/O using huge pages yet, so return an error for now.
1313         * Filesystem transparent huge pages are typically only used for
1314         * executables that explicitly enable them, so this isn't very
1315         * restrictive.
1316         */
1317        if (PageCompound(page)) {
1318                unlock_page(page);
1319                put_page(page);
1320                return ERR_PTR(-ETXTBSY);
1321        }
1322
1323        ret = set_page_extent_mapped(page);
1324        if (ret < 0) {
1325                unlock_page(page);
1326                put_page(page);
1327                return ERR_PTR(ret);
1328        }
1329
1330        /* Wait for any existing ordered extent in the range */
1331        while (1) {
1332                struct btrfs_ordered_extent *ordered;
1333
1334                lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
1335                ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
1336                unlock_extent_cached(&inode->io_tree, page_start, page_end,
1337                                     &cached_state);
1338                if (!ordered)
1339                        break;
1340
1341                unlock_page(page);
1342                btrfs_start_ordered_extent(ordered, 1);
1343                btrfs_put_ordered_extent(ordered);
1344                lock_page(page);
1345                /*
1346                 * We unlocked the page above, so we need check if it was
1347                 * released or not.
1348                 */
1349                if (page->mapping != mapping || !PagePrivate(page)) {
1350                        unlock_page(page);
1351                        put_page(page);
1352                        goto again;
1353                }
1354        }
1355
1356        /*
1357         * Now the page range has no ordered extent any more.  Read the page to
1358         * make it uptodate.
1359         */
1360        if (!PageUptodate(page)) {
1361                btrfs_read_folio(NULL, page_folio(page));
1362                lock_page(page);
1363                if (page->mapping != mapping || !PagePrivate(page)) {
1364                        unlock_page(page);
1365                        put_page(page);
1366                        goto again;
1367                }
1368                if (!PageUptodate(page)) {
1369                        unlock_page(page);
1370                        put_page(page);
1371                        return ERR_PTR(-EIO);
1372                }
1373        }
1374        return page;
1375}
1376
1377struct defrag_target_range {
1378        struct list_head list;
1379        u64 start;
1380        u64 len;
1381};
1382
1383/*
1384 * Collect all valid target extents.
1385 *
1386 * @start:         file offset to lookup
1387 * @len:           length to lookup
1388 * @extent_thresh: file extent size threshold, any extent size >= this value
1389 *                 will be ignored
1390 * @newer_than:    only defrag extents newer than this value
1391 * @do_compress:   whether the defrag is doing compression
1392 *                 if true, @extent_thresh will be ignored and all regular
1393 *                 file extents meeting @newer_than will be targets.
1394 * @locked:        if the range has already held extent lock
1395 * @target_list:   list of targets file extents
1396 */
1397static int defrag_collect_targets(struct btrfs_inode *inode,
1398                                  u64 start, u64 len, u32 extent_thresh,
1399                                  u64 newer_than, bool do_compress,
1400                                  bool locked, struct list_head *target_list,
1401                                  u64 *last_scanned_ret)
1402{
1403        bool last_is_target = false;
1404        u64 cur = start;
1405        int ret = 0;
1406
1407        while (cur < start + len) {
1408                struct extent_map *em;
1409                struct defrag_target_range *new;
1410                bool next_mergeable = true;
1411                u64 range_len;
1412
1413                last_is_target = false;
1414                em = defrag_lookup_extent(&inode->vfs_inode, cur,
1415                                          newer_than, locked);
1416                if (!em)
1417                        break;
1418
1419                /*
1420                 * If the file extent is an inlined one, we may still want to
1421                 * defrag it (fallthrough) if it will cause a regular extent.
1422                 * This is for users who want to convert inline extents to
1423                 * regular ones through max_inline= mount option.
1424                 */
1425                if (em->block_start == EXTENT_MAP_INLINE &&
1426                    em->len <= inode->root->fs_info->max_inline)
1427                        goto next;
1428
1429                /* Skip hole/delalloc/preallocated extents */
1430                if (em->block_start == EXTENT_MAP_HOLE ||
1431                    em->block_start == EXTENT_MAP_DELALLOC ||
1432                    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
1433                        goto next;
1434
1435                /* Skip older extent */
1436                if (em->generation < newer_than)
1437                        goto next;
1438
1439                /* This em is under writeback, no need to defrag */
1440                if (em->generation == (u64)-1)
1441                        goto next;
1442
1443                /*
1444                 * Our start offset might be in the middle of an existing extent
1445                 * map, so take that into account.
1446                 */
1447                range_len = em->len - (cur - em->start);
1448                /*
1449                 * If this range of the extent map is already flagged for delalloc,
1450                 * skip it, because:
1451                 *
1452                 * 1) We could deadlock later, when trying to reserve space for
1453                 *    delalloc, because in case we can't immediately reserve space
1454                 *    the flusher can start delalloc and wait for the respective
1455                 *    ordered extents to complete. The deadlock would happen
1456                 *    because we do the space reservation while holding the range
1457                 *    locked, and starting writeback, or finishing an ordered
1458                 *    extent, requires locking the range;
1459                 *
1460                 * 2) If there's delalloc there, it means there's dirty pages for
1461                 *    which writeback has not started yet (we clean the delalloc
1462                 *    flag when starting writeback and after creating an ordered
1463                 *    extent). If we mark pages in an adjacent range for defrag,
1464                 *    then we will have a larger contiguous range for delalloc,
1465                 *    very likely resulting in a larger extent after writeback is
1466                 *    triggered (except in a case of free space fragmentation).
1467                 */
1468                if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
1469                                   EXTENT_DELALLOC, 0, NULL))
1470                        goto next;
1471
1472                /*
1473                 * For do_compress case, we want to compress all valid file
1474                 * extents, thus no @extent_thresh or mergeable check.
1475                 */
1476                if (do_compress)
1477                        goto add;
1478
1479                /* Skip too large extent */
1480                if (range_len >= extent_thresh)
1481                        goto next;
1482
1483                /*
1484                 * Skip extents already at its max capacity, this is mostly for
1485                 * compressed extents, which max cap is only 128K.
1486                 */
1487                if (em->len >= get_extent_max_capacity(em))
1488                        goto next;
1489
1490                /*
1491                 * Normally there are no more extents after an inline one, thus
1492                 * @next_mergeable will normally be false and not defragged.
1493                 * So if an inline extent passed all above checks, just add it
1494                 * for defrag, and be converted to regular extents.
1495                 */
1496                if (em->block_start == EXTENT_MAP_INLINE)
1497                        goto add;
1498
1499                next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
1500                                                extent_thresh, newer_than, locked);
1501                if (!next_mergeable) {
1502                        struct defrag_target_range *last;
1503
1504                        /* Empty target list, no way to merge with last entry */
1505                        if (list_empty(target_list))
1506                                goto next;
1507                        last = list_entry(target_list->prev,
1508                                          struct defrag_target_range, list);
1509                        /* Not mergeable with last entry */
1510                        if (last->start + last->len != cur)
1511                                goto next;
1512
1513                        /* Mergeable, fall through to add it to @target_list. */
1514                }
1515
1516add:
1517                last_is_target = true;
1518                range_len = min(extent_map_end(em), start + len) - cur;
1519                /*
1520                 * This one is a good target, check if it can be merged into
1521                 * last range of the target list.
1522                 */
1523                if (!list_empty(target_list)) {
1524                        struct defrag_target_range *last;
1525
1526                        last = list_entry(target_list->prev,
1527                                          struct defrag_target_range, list);
1528                        ASSERT(last->start + last->len <= cur);
1529                        if (last->start + last->len == cur) {
1530                                /* Mergeable, enlarge the last entry */
1531                                last->len += range_len;
1532                                goto next;
1533                        }
1534                        /* Fall through to allocate a new entry */
1535                }
1536
1537                /* Allocate new defrag_target_range */
1538                new = kmalloc(sizeof(*new), GFP_NOFS);
1539                if (!new) {
1540                        free_extent_map(em);
1541                        ret = -ENOMEM;
1542                        break;
1543                }
1544                new->start = cur;
1545                new->len = range_len;
1546                list_add_tail(&new->list, target_list);
1547
1548next:
1549                cur = extent_map_end(em);
1550                free_extent_map(em);
1551        }
1552        if (ret < 0) {
1553                struct defrag_target_range *entry;
1554                struct defrag_target_range *tmp;
1555
1556                list_for_each_entry_safe(entry, tmp, target_list, list) {
1557                        list_del_init(&entry->list);
1558                        kfree(entry);
1559                }
1560        }
1561        if (!ret && last_scanned_ret) {
1562                /*
1563                 * If the last extent is not a target, the caller can skip to
1564                 * the end of that extent.
1565                 * Otherwise, we can only go the end of the specified range.
1566                 */
1567                if (!last_is_target)
1568                        *last_scanned_ret = max(cur, *last_scanned_ret);
1569                else
1570                        *last_scanned_ret = max(start + len, *last_scanned_ret);
1571        }
1572        return ret;
1573}
1574
1575#define CLUSTER_SIZE    (SZ_256K)
1576static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
1577
1578/*
1579 * Defrag one contiguous target range.
1580 *
1581 * @inode:      target inode
1582 * @target:     target range to defrag
1583 * @pages:      locked pages covering the defrag range
1584 * @nr_pages:   number of locked pages
1585 *
1586 * Caller should ensure:
1587 *
1588 * - Pages are prepared
1589 *   Pages should be locked, no ordered extent in the pages range,
1590 *   no writeback.
1591 *
1592 * - Extent bits are locked
1593 */
1594static int defrag_one_locked_target(struct btrfs_inode *inode,
1595                                    struct defrag_target_range *target,
1596                                    struct page **pages, int nr_pages,
1597                                    struct extent_state **cached_state)
1598{
1599        struct btrfs_fs_info *fs_info = inode->root->fs_info;
1600        struct extent_changeset *data_reserved = NULL;
1601        const u64 start = target->start;
1602        const u64 len = target->len;
1603        unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
1604        unsigned long start_index = start >> PAGE_SHIFT;
1605        unsigned long first_index = page_index(pages[0]);
1606        int ret = 0;
1607        int i;
1608
1609        ASSERT(last_index - first_index + 1 <= nr_pages);
1610
1611        ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
1612        if (ret < 0)
1613                return ret;
1614        clear_extent_bit(&inode->io_tree, start, start + len - 1,
1615                         EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1616                         EXTENT_DEFRAG, 0, 0, cached_state);
1617        set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
1618
1619        /* Update the page status */
1620        for (i = start_index - first_index; i <= last_index - first_index; i++) {
1621                ClearPageChecked(pages[i]);
1622                btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
1623        }
1624        btrfs_delalloc_release_extents(inode, len);
1625        extent_changeset_free(data_reserved);
1626
1627        return ret;
1628}
1629
1630static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
1631                            u32 extent_thresh, u64 newer_than, bool do_compress,
1632                            u64 *last_scanned_ret)
1633{
1634        struct extent_state *cached_state = NULL;
1635        struct defrag_target_range *entry;
1636        struct defrag_target_range *tmp;
1637        LIST_HEAD(target_list);
1638        struct page **pages;
1639        const u32 sectorsize = inode->root->fs_info->sectorsize;
1640        u64 last_index = (start + len - 1) >> PAGE_SHIFT;
1641        u64 start_index = start >> PAGE_SHIFT;
1642        unsigned int nr_pages = last_index - start_index + 1;
1643        int ret = 0;
1644        int i;
1645
1646        ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
1647        ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
1648
1649        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
1650        if (!pages)
1651                return -ENOMEM;
1652
1653        /* Prepare all pages */
1654        for (i = 0; i < nr_pages; i++) {
1655                pages[i] = defrag_prepare_one_page(inode, start_index + i);
1656                if (IS_ERR(pages[i])) {
1657                        ret = PTR_ERR(pages[i]);
1658                        pages[i] = NULL;
1659                        goto free_pages;
1660                }
1661        }
1662        for (i = 0; i < nr_pages; i++)
1663                wait_on_page_writeback(pages[i]);
1664
1665        /* Lock the pages range */
1666        lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
1667                         (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
1668                         &cached_state);
1669        /*
1670         * Now we have a consistent view about the extent map, re-check
1671         * which range really needs to be defragged.
1672         *
1673         * And this time we have extent locked already, pass @locked = true
1674         * so that we won't relock the extent range and cause deadlock.
1675         */
1676        ret = defrag_collect_targets(inode, start, len, extent_thresh,
1677                                     newer_than, do_compress, true,
1678                                     &target_list, last_scanned_ret);
1679        if (ret < 0)
1680                goto unlock_extent;
1681
1682        list_for_each_entry(entry, &target_list, list) {
1683                ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
1684                                               &cached_state);
1685                if (ret < 0)
1686                        break;
1687        }
1688
1689        list_for_each_entry_safe(entry, tmp, &target_list, list) {
1690                list_del_init(&entry->list);
1691                kfree(entry);
1692        }
1693unlock_extent:
1694        unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
1695                             (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
1696                             &cached_state);
1697free_pages:
1698        for (i = 0; i < nr_pages; i++) {
1699                if (pages[i]) {
1700                        unlock_page(pages[i]);
1701                        put_page(pages[i]);
1702                }
1703        }
1704        kfree(pages);
1705        return ret;
1706}
1707
1708static int defrag_one_cluster(struct btrfs_inode *inode,
1709                              struct file_ra_state *ra,
1710                              u64 start, u32 len, u32 extent_thresh,
1711                              u64 newer_than, bool do_compress,
1712                              unsigned long *sectors_defragged,
1713                              unsigned long max_sectors,
1714                              u64 *last_scanned_ret)
1715{
1716        const u32 sectorsize = inode->root->fs_info->sectorsize;
1717        struct defrag_target_range *entry;
1718        struct defrag_target_range *tmp;
1719        LIST_HEAD(target_list);
1720        int ret;
1721
1722        ret = defrag_collect_targets(inode, start, len, extent_thresh,
1723                                     newer_than, do_compress, false,
1724                                     &target_list, NULL);
1725        if (ret < 0)
1726                goto out;
1727
1728        list_for_each_entry(entry, &target_list, list) {
1729                u32 range_len = entry->len;
1730
1731                /* Reached or beyond the limit */
1732                if (max_sectors && *sectors_defragged >= max_sectors) {
1733                        ret = 1;
1734                        break;
1735                }
1736
1737                if (max_sectors)
1738                        range_len = min_t(u32, range_len,
1739                                (max_sectors - *sectors_defragged) * sectorsize);
1740
1741                /*
1742                 * If defrag_one_range() has updated last_scanned_ret,
1743                 * our range may already be invalid (e.g. hole punched).
1744                 * Skip if our range is before last_scanned_ret, as there is
1745                 * no need to defrag the range anymore.
1746                 */
1747                if (entry->start + range_len <= *last_scanned_ret)
1748                        continue;
1749
1750                if (ra)
1751                        page_cache_sync_readahead(inode->vfs_inode.i_mapping,
1752                                ra, NULL, entry->start >> PAGE_SHIFT,
1753                                ((entry->start + range_len - 1) >> PAGE_SHIFT) -
1754                                (entry->start >> PAGE_SHIFT) + 1);
1755                /*
1756                 * Here we may not defrag any range if holes are punched before
1757                 * we locked the pages.
1758                 * But that's fine, it only affects the @sectors_defragged
1759                 * accounting.
1760                 */
1761                ret = defrag_one_range(inode, entry->start, range_len,
1762                                       extent_thresh, newer_than, do_compress,
1763                                       last_scanned_ret);
1764                if (ret < 0)
1765                        break;
1766                *sectors_defragged += range_len >>
1767                                      inode->root->fs_info->sectorsize_bits;
1768        }
1769out:
1770        list_for_each_entry_safe(entry, tmp, &target_list, list) {
1771                list_del_init(&entry->list);
1772                kfree(entry);
1773        }
1774        if (ret >= 0)
1775                *last_scanned_ret = max(*last_scanned_ret, start + len);
1776        return ret;
1777}
1778
1779/*
1780 * Entry point to file defragmentation.
1781 *
1782 * @inode:         inode to be defragged
1783 * @ra:            readahead state (can be NUL)
1784 * @range:         defrag options including range and flags
1785 * @newer_than:    minimum transid to defrag
1786 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
1787 *                 will be defragged.
1788 *
1789 * Return <0 for error.
1790 * Return >=0 for the number of sectors defragged, and range->start will be updated
1791 * to indicate the file offset where next defrag should be started at.
1792 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
1793 *  defragging all the range).
1794 */
1795int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
1796                      struct btrfs_ioctl_defrag_range_args *range,
1797                      u64 newer_than, unsigned long max_to_defrag)
1798{
1799        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1800        unsigned long sectors_defragged = 0;
1801        u64 isize = i_size_read(inode);
1802        u64 cur;
1803        u64 last_byte;
1804        bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
1805        bool ra_allocated = false;
1806        int compress_type = BTRFS_COMPRESS_ZLIB;
1807        int ret = 0;
1808        u32 extent_thresh = range->extent_thresh;
1809        pgoff_t start_index;
1810
1811        if (isize == 0)
1812                return 0;
1813
1814        if (range->start >= isize)
1815                return -EINVAL;
1816
1817        if (do_compress) {
1818                if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1819                        return -EINVAL;
1820                if (range->compress_type)
1821                        compress_type = range->compress_type;
1822        }
1823
1824        if (extent_thresh == 0)
1825                extent_thresh = SZ_256K;
1826
1827        if (range->start + range->len > range->start) {
1828                /* Got a specific range */
1829                last_byte = min(isize, range->start + range->len);
1830        } else {
1831                /* Defrag until file end */
1832                last_byte = isize;
1833        }
1834
1835        /* Align the range */
1836        cur = round_down(range->start, fs_info->sectorsize);
1837        last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
1838
1839        /*
1840         * If we were not given a ra, allocate a readahead context. As
1841         * readahead is just an optimization, defrag will work without it so
1842         * we don't error out.
1843         */
1844        if (!ra) {
1845                ra_allocated = true;
1846                ra = kzalloc(sizeof(*ra), GFP_KERNEL);
1847                if (ra)
1848                        file_ra_state_init(ra, inode->i_mapping);
1849        }
1850
1851        /*
1852         * Make writeback start from the beginning of the range, so that the
1853         * defrag range can be written sequentially.
1854         */
1855        start_index = cur >> PAGE_SHIFT;
1856        if (start_index < inode->i_mapping->writeback_index)
1857                inode->i_mapping->writeback_index = start_index;
1858
1859        while (cur < last_byte) {
1860                const unsigned long prev_sectors_defragged = sectors_defragged;
1861                u64 last_scanned = cur;
1862                u64 cluster_end;
1863
1864                if (btrfs_defrag_cancelled(fs_info)) {
1865                        ret = -EAGAIN;
1866                        break;
1867                }
1868
1869                /* We want the cluster end at page boundary when possible */
1870                cluster_end = (((cur >> PAGE_SHIFT) +
1871                               (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
1872                cluster_end = min(cluster_end, last_byte);
1873
1874                btrfs_inode_lock(inode, 0);
1875                if (IS_SWAPFILE(inode)) {
1876                        ret = -ETXTBSY;
1877                        btrfs_inode_unlock(inode, 0);
1878                        break;
1879                }
1880                if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
1881                        btrfs_inode_unlock(inode, 0);
1882                        break;
1883                }
1884                if (do_compress)
1885                        BTRFS_I(inode)->defrag_compress = compress_type;
1886                ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
1887                                cluster_end + 1 - cur, extent_thresh,
1888                                newer_than, do_compress, &sectors_defragged,
1889                                max_to_defrag, &last_scanned);
1890
1891                if (sectors_defragged > prev_sectors_defragged)
1892                        balance_dirty_pages_ratelimited(inode->i_mapping);
1893
1894                btrfs_inode_unlock(inode, 0);
1895                if (ret < 0)
1896                        break;
1897                cur = max(cluster_end + 1, last_scanned);
1898                if (ret > 0) {
1899                        ret = 0;
1900                        break;
1901                }
1902                cond_resched();
1903        }
1904
1905        if (ra_allocated)
1906                kfree(ra);
1907        /*
1908         * Update range.start for autodefrag, this will indicate where to start
1909         * in next run.
1910         */
1911        range->start = cur;
1912        if (sectors_defragged) {
1913                /*
1914                 * We have defragged some sectors, for compression case they
1915                 * need to be written back immediately.
1916                 */
1917                if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
1918                        filemap_flush(inode->i_mapping);
1919                        if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1920                                     &BTRFS_I(inode)->runtime_flags))
1921                                filemap_flush(inode->i_mapping);
1922                }
1923                if (range->compress_type == BTRFS_COMPRESS_LZO)
1924                        btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1925                else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
1926                        btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
1927                ret = sectors_defragged;
1928        }
1929        if (do_compress) {
1930                btrfs_inode_lock(inode, 0);
1931                BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
1932                btrfs_inode_unlock(inode, 0);
1933        }
1934        return ret;
1935}
1936
1937/*
1938 * Try to start exclusive operation @type or cancel it if it's running.
1939 *
1940 * Return:
1941 *   0        - normal mode, newly claimed op started
1942 *  >0        - normal mode, something else is running,
1943 *              return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
1944 * ECANCELED  - cancel mode, successful cancel
1945 * ENOTCONN   - cancel mode, operation not running anymore
1946 */
1947static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
1948                        enum btrfs_exclusive_operation type, bool cancel)
1949{
1950        if (!cancel) {
1951                /* Start normal op */
1952                if (!btrfs_exclop_start(fs_info, type))
1953                        return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
1954                /* Exclusive operation is now claimed */
1955                return 0;
1956        }
1957
1958        /* Cancel running op */
1959        if (btrfs_exclop_start_try_lock(fs_info, type)) {
1960                /*
1961                 * This blocks any exclop finish from setting it to NONE, so we
1962                 * request cancellation. Either it runs and we will wait for it,
1963                 * or it has finished and no waiting will happen.
1964                 */
1965                atomic_inc(&fs_info->reloc_cancel_req);
1966                btrfs_exclop_start_unlock(fs_info);
1967
1968                if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
1969                        wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
1970                                    TASK_INTERRUPTIBLE);
1971
1972                return -ECANCELED;
1973        }
1974
1975        /* Something else is running or none */
1976        return -ENOTCONN;
1977}
1978
1979static noinline int btrfs_ioctl_resize(struct file *file,
1980                                        void __user *arg)
1981{
1982        BTRFS_DEV_LOOKUP_ARGS(args);
1983        struct inode *inode = file_inode(file);
1984        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1985        u64 new_size;
1986        u64 old_size;
1987        u64 devid = 1;
1988        struct btrfs_root *root = BTRFS_I(inode)->root;
1989        struct btrfs_ioctl_vol_args *vol_args;
1990        struct btrfs_trans_handle *trans;
1991        struct btrfs_device *device = NULL;
1992        char *sizestr;
1993        char *retptr;
1994        char *devstr = NULL;
1995        int ret = 0;
1996        int mod = 0;
1997        bool cancel;
1998
1999        if (!capable(CAP_SYS_ADMIN))
2000                return -EPERM;

2001
2002        ret = mnt_want_write_file(file);
2003        if (ret)
2004                return ret;
2005
2006        /*
2007         * Read the arguments before checking exclusivity to be able to
2008         * distinguish regular resize and cancel
2009         */
2010        vol_args = memdup_user(arg, sizeof(*vol_args));
2011        if (IS_ERR(vol_args)) {
2012                ret = PTR_ERR(vol_args);
2013                goto out_drop;
2014        }
2015        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2016        sizestr = vol_args->name;
2017        cancel = (strcmp("cancel", sizestr) == 0);
2018        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
2019        if (ret)
2020                goto out_free;
2021        /* Exclusive operation is now claimed */
2022
2023        devstr = strchr(sizestr, ':');
2024        if (devstr) {
2025                sizestr = devstr + 1;
2026                *devstr = '\0';
2027                devstr = vol_args->name;
2028                ret = kstrtoull(devstr, 10, &devid);
2029                if (ret)
2030                        goto out_finish;
2031                if (!devid) {
2032                        ret = -EINVAL;
2033                        goto out_finish;
2034                }
2035                btrfs_info(fs_info, "resizing devid %llu", devid);
2036        }
2037
2038        args.devid = devid;
2039        device = btrfs_find_device(fs_info->fs_devices, &args);
2040        if (!device) {
2041                btrfs_info(fs_info, "resizer unable to find device %llu",
2042                           devid);
2043                ret = -ENODEV;
2044                goto out_finish;
2045        }
2046
2047        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2048                btrfs_info(fs_info,
2049                           "resizer unable to apply on readonly device %llu",
2050                       devid);
2051                ret = -EPERM;
2052                goto out_finish;
2053        }
2054
2055        if (!strcmp(sizestr, "max"))
2056                new_size = bdev_nr_bytes(device->bdev);
2057        else {
2058                if (sizestr[0] == '-') {
2059                        mod = -1;
2060                        sizestr++;
2061                } else if (sizestr[0] == '+') {
2062                        mod = 1;
2063                        sizestr++;
2064                }
2065                new_size = memparse(sizestr, &retptr);
2066                if (*retptr != '\0' || new_size == 0) {
2067                        ret = -EINVAL;
2068                        goto out_finish;
2069                }
2070        }
2071
2072        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2073                ret = -EPERM;
2074                goto out_finish;
2075        }
2076
2077        old_size = btrfs_device_get_total_bytes(device);
2078
2079        if (mod < 0) {
2080                if (new_size > old_size) {
2081                        ret = -EINVAL;
2082                        goto out_finish;
2083                }
2084                new_size = old_size - new_size;
2085        } else if (mod > 0) {
2086                if (new_size > ULLONG_MAX - old_size) {
2087                        ret = -ERANGE;
2088                        goto out_finish;
2089                }
2090                new_size = old_size + new_size;
2091        }
2092
2093        if (new_size < SZ_256M) {
2094                ret = -EINVAL;
2095                goto out_finish;
2096        }
2097        if (new_size > bdev_nr_bytes(device->bdev)) {
2098                ret = -EFBIG;
2099                goto out_finish;
2100        }
2101
2102        new_size = round_down(new_size, fs_info->sectorsize);
2103
2104        if (new_size > old_size) {
2105                trans = btrfs_start_transaction(root, 0);
2106                if (IS_ERR(trans)) {
2107                        ret = PTR_ERR(trans);
2108                        goto out_finish;
2109                }
2110                ret = btrfs_grow_device(trans, device, new_size);
2111                btrfs_commit_transaction(trans);
2112        } else if (new_size < old_size) {
2113                ret = btrfs_shrink_device(device, new_size);
2114        } /* equal, nothing need to do */
2115
2116        if (ret == 0 && new_size != old_size)
2117                btrfs_info_in_rcu(fs_info,
2118                        "resize device %s (devid %llu) from %llu to %llu",
2119                        rcu_str_deref(device->name), device->devid,
2120                        old_size, new_size);
2121out_finish:
2122        btrfs_exclop_finish(fs_info);
2123out_free:
2124        kfree(vol_args);
2125out_drop:
2126        mnt_drop_write_file(file);
2127        return ret;
2128}
2129
2130static noinline int __btrfs_ioctl_snap_create(struct file *file,
2131                                struct user_namespace *mnt_userns,
2132                                const char *name, unsigned long fd, int subvol,
2133                                bool readonly,
2134                                struct btrfs_qgroup_inherit *inherit)
2135{
2136        int namelen;
2137        int ret = 0;
2138
2139        if (!S_ISDIR(file_inode(file)->i_mode))
2140                return -ENOTDIR;
2141
2142        ret = mnt_want_write_file(file);
2143        if (ret)
2144                goto out;
2145
2146        namelen = strlen(name);
2147        if (strchr(name, '/')) {
2148                ret = -EINVAL;
2149                goto out_drop_write;
2150        }
2151
2152        if (name[0] == '.' &&
2153           (namelen == 1 || (name[1] == '.' && namelen == 2))) {
2154                ret = -EEXIST;
2155                goto out_drop_write;
2156        }
2157
2158        if (subvol) {
2159                ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
2160                                     namelen, NULL, readonly, inherit);
2161        } else {
2162                struct fd src = fdget(fd);
2163                struct inode *src_inode;
2164                if (!src.file) {
2165                        ret = -EINVAL;
2166                        goto out_drop_write;
2167                }
2168
2169                src_inode = file_inode(src.file);
2170                if (src_inode->i_sb != file_inode(file)->i_sb) {
2171                        btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
2172                                   "Snapshot src from another FS");
2173                        ret = -EXDEV;
2174                } else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
2175                        /*
2176                         * Subvolume creation is not restricted, but snapshots
2177                         * are limited to own subvolumes only
2178                         */
2179                        ret = -EPERM;
2180                } else {
2181                        ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
2182                                               name, namelen,
2183                                               BTRFS_I(src_inode)->root,
2184                                               readonly, inherit);
2185                }
2186                fdput(src);
2187        }
2188out_drop_write:
2189        mnt_drop_write_file(file);
2190out:
2191        return ret;
2192}
2193
2194static noinline int btrfs_ioctl_snap_create(struct file *file,
2195                                            void __user *arg, int subvol)
2196{
2197        struct btrfs_ioctl_vol_args *vol_args;
2198        int ret;
2199
2200        if (!S_ISDIR(file_inode(file)->i_mode))
2201                return -ENOTDIR;
2202
2203        vol_args = memdup_user(arg, sizeof(*vol_args));
2204        if (IS_ERR(vol_args))
2205                return PTR_ERR(vol_args);
2206        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2207
2208        ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
2209                                        vol_args->name, vol_args->fd, subvol,
2210                                        false, NULL);
2211
2212        kfree(vol_args);
2213        return ret;
2214}
2215
2216static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
2217                                               void __user *arg, int subvol)
2218{
2219        struct btrfs_ioctl_vol_args_v2 *vol_args;
2220        int ret;
2221        bool readonly = false;
2222        struct btrfs_qgroup_inherit *inherit = NULL;
2223
2224        if (!S_ISDIR(file_inode(file)->i_mode))
2225                return -ENOTDIR;
2226
2227        vol_args = memdup_user(arg, sizeof(*vol_args));
2228        if (IS_ERR(vol_args))
2229                return PTR_ERR(vol_args);
2230        vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
2231
2232        if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
2233                ret = -EOPNOTSUPP;
2234                goto free_args;
2235        }
2236
2237        if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
2238                readonly = true;
2239        if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
2240                u64 nums;
2241
2242                if (vol_args->size < sizeof(*inherit) ||
2243                    vol_args->size > PAGE_SIZE) {
2244                        ret = -EINVAL;
2245                        goto free_args;
2246                }
2247                inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
2248                if (IS_ERR(inherit)) {
2249                        ret = PTR_ERR(inherit);
2250                        goto free_args;
2251                }
2252
2253                if (inherit->num_qgroups > PAGE_SIZE ||
2254                    inherit->num_ref_copies > PAGE_SIZE ||
2255                    inherit->num_excl_copies > PAGE_SIZE) {
2256                        ret = -EINVAL;
2257                        goto free_inherit;
2258                }
2259
2260                nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2261                       2 * inherit->num_excl_copies;
2262                if (vol_args->size != struct_size(inherit, qgroups, nums)) {
2263                        ret = -EINVAL;
2264                        goto free_inherit;
2265                }
2266        }
2267
2268        ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
2269                                        vol_args->name, vol_args->fd, subvol,
2270                                        readonly, inherit);
2271        if (ret)
2272                goto free_inherit;
2273free_inherit:
2274        kfree(inherit);
2275free_args:
2276        kfree(vol_args);
2277        return ret;
2278}
2279
2280static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
2281                                                void __user *arg)
2282{
2283        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2284        struct btrfs_root *root = BTRFS_I(inode)->root;
2285        int ret = 0;
2286        u64 flags = 0;
2287
2288        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
2289                return -EINVAL;
2290
2291        down_read(&fs_info->subvol_sem);
2292        if (btrfs_root_readonly(root))
2293                flags |= BTRFS_SUBVOL_RDONLY;
2294        up_read(&fs_info->subvol_sem);
2295
2296        if (copy_to_user(arg, &flags, sizeof(flags)))
2297                ret = -EFAULT;
2298
2299        return ret;
2300}
2301
2302static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
2303                                              void __user *arg)
2304{
2305        struct inode *inode = file_inode(file);
2306        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2307        struct btrfs_root *root = BTRFS_I(inode)->root;
2308        struct btrfs_trans_handle *trans;
2309        u64 root_flags;
2310        u64 flags;
2311        int ret = 0;
2312
2313        if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
2314                return -EPERM;
2315
2316        ret = mnt_want_write_file(file);
2317        if (ret)
2318                goto out;
2319
2320        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
2321                ret = -EINVAL;
2322                goto out_drop_write;
2323        }
2324
2325        if (copy_from_user(&flags, arg, sizeof(flags))) {
2326                ret = -EFAULT;
2327                goto out_drop_write;
2328        }
2329
2330        if (flags & ~BTRFS_SUBVOL_RDONLY) {
2331                ret = -EOPNOTSUPP;
2332                goto out_drop_write;
2333        }
2334
2335        down_write(&fs_info->subvol_sem);
2336
2337        /* nothing to do */
2338        if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
2339                goto out_drop_sem;
2340
2341        root_flags = btrfs_root_flags(&root->root_item);
2342        if (flags & BTRFS_SUBVOL_RDONLY) {
2343                btrfs_set_root_flags(&root->root_item,
2344                                     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
2345        } else {
2346                /*
2347                 * Block RO -> RW transition if this subvolume is involved in
2348                 * send
2349                 */
2350                spin_lock(&root->root_item_lock);
2351                if (root->send_in_progress == 0) {
2352                        btrfs_set_root_flags(&root->root_item,
2353                                     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
2354                        spin_unlock(&root->root_item_lock);
2355                } else {
2356                        spin_unlock(&root->root_item_lock);
2357                        btrfs_warn(fs_info,
2358                                   "Attempt to set subvolume %llu read-write during send",
2359                                   root->root_key.objectid);
2360                        ret = -EPERM;
2361                        goto out_drop_sem;
2362                }
2363        }
2364
2365        trans = btrfs_start_transaction(root, 1);
2366        if (IS_ERR(trans)) {
2367                ret = PTR_ERR(trans);
2368                goto out_reset;
2369        }
2370
2371        ret = btrfs_update_root(trans, fs_info->tree_root,
2372                                &root->root_key, &root->root_item);
2373        if (ret < 0) {
2374                btrfs_end_transaction(trans);
2375                goto out_reset;
2376        }
2377
2378        ret = btrfs_commit_transaction(trans);
2379
2380out_reset:
2381        if (ret)
2382                btrfs_set_root_flags(&root->root_item, root_flags);
2383out_drop_sem:
2384        up_write(&fs_info->subvol_sem);
2385out_drop_write:
2386        mnt_drop_write_file(file);
2387out:
2388        return ret;
2389}
2390
2391static noinline int key_in_sk(struct btrfs_key *key,
2392                              struct btrfs_ioctl_search_key *sk)
2393{
2394        struct btrfs_key test;
2395        int ret;
2396
2397        test.objectid = sk->min_objectid;
2398        test.type = sk->min_type;
2399        test.offset = sk->min_offset;
2400
2401        ret = btrfs_comp_cpu_keys(key, &test);
2402        if (ret < 0)
2403                return 0;
2404
2405        test.objectid = sk->max_objectid;
2406        test.type = sk->max_type;
2407        test.offset = sk->max_offset;
2408
2409        ret = btrfs_comp_cpu_keys(key, &test);
2410        if (ret > 0)
2411                return 0;
2412        return 1;
2413}
2414
2415static noinline int copy_to_sk(struct btrfs_path *path,
2416                               struct btrfs_key *key,
2417                               struct btrfs_ioctl_search_key *sk,
2418                               size_t *buf_size,
2419                               char __user *ubuf,
2420                               unsigned long *sk_offset,
2421                               int *num_found)
2422{
2423        u64 found_transid;
2424        struct extent_buffer *leaf;
2425        struct btrfs_ioctl_search_header sh;
2426        struct btrfs_key test;
2427        unsigned long item_off;
2428        unsigned long item_len;
2429        int nritems;
2430        int i;
2431        int slot;
2432        int ret = 0;
2433
2434        leaf = path->nodes[0];
2435        slot = path->slots[0];
2436        nritems = btrfs_header_nritems(leaf);
2437
2438        if (btrfs_header_generation(leaf) > sk->max_transid) {
2439                i = nritems;
2440                goto advance_key;
2441        }
2442        found_transid = btrfs_header_generation(leaf);
2443
2444        for (i = slot; i < nritems; i++) {
2445                item_off = btrfs_item_ptr_offset(leaf, i);
2446                item_len = btrfs_item_size(leaf, i);
2447
2448                btrfs_item_key_to_cpu(leaf, key, i);
2449                if (!key_in_sk(key, sk))
2450                        continue;
2451
2452                if (sizeof(sh) + item_len > *buf_size) {
2453                        if (*num_found) {
2454                                ret = 1;
2455                                goto out;
2456                        }
2457
2458                        /*
2459                         * return one empty item back for v1, which does not
2460                         * handle -EOVERFLOW
2461                         */
2462
2463                        *buf_size = sizeof(sh) + item_len;
2464                        item_len = 0;
2465                        ret = -EOVERFLOW;
2466                }
2467
2468                if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
2469                        ret = 1;
2470                        goto out;
2471                }
2472
2473                sh.objectid = key->objectid;
2474                sh.offset = key->offset;
2475                sh.type = key->type;
2476                sh.len = item_len;
2477                sh.transid = found_transid;
2478
2479                /*
2480                 * Copy search result header. If we fault then loop again so we
2481                 * can fault in the pages and -EFAULT there if there's a
2482                 * problem. Otherwise we'll fault and then copy the buffer in
2483                 * properly this next time through
2484                 */
2485                if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
2486                        ret = 0;
2487                        goto out;
2488                }
2489
2490                *sk_offset += sizeof(sh);
2491
2492                if (item_len) {
2493                        char __user *up = ubuf + *sk_offset;
2494                        /*
2495                         * Copy the item, same behavior as above, but reset the
2496                         * * sk_offset so we copy the full thing again.
2497                         */
2498                        if (read_extent_buffer_to_user_nofault(leaf, up,
2499                                                item_off, item_len)) {
2500                                ret = 0;
2501                                *sk_offset -= sizeof(sh);
2502                                goto out;
2503                        }
2504
2505                        *sk_offset += item_len;
2506                }
2507                (*num_found)++;
2508
2509                if (ret) /* -EOVERFLOW from above */
2510                        goto out;
2511
2512                if (*num_found >= sk->nr_items) {
2513                        ret = 1;
2514                        goto out;
2515                }
2516        }
2517advance_key:
2518        ret = 0;
2519        test.objectid = sk->max_objectid;
2520        test.type = sk->max_type;
2521        test.offset = sk->max_offset;
2522        if (btrfs_comp_cpu_keys(key, &test) >= 0)
2523                ret = 1;
2524        else if (key->offset < (u64)-1)
2525                key->offset++;
2526        else if (key->type < (u8)-1) {
2527                key->offset = 0;
2528                key->type++;
2529        } else if (key->objectid < (u64)-1) {
2530                key->offset = 0;
2531                key->type = 0;
2532                key->objectid++;
2533        } else
2534                ret = 1;
2535out:
2536        /*
2537         *  0: all items from this leaf copied, continue with next
2538         *  1: * more items can be copied, but unused buffer is too small
2539         *     * all items were found
2540         *     Either way, it will stops the loop which iterates to the next
2541         *     leaf
2542         *  -EOVERFLOW: item was to large for buffer
2543         *  -EFAULT: could not copy extent buffer back to userspace
2544         */
2545        return ret;
2546}
2547
2548static noinline int search_ioctl(struct inode *inode,
2549                                 struct btrfs_ioctl_search_key *sk,
2550                                 size_t *buf_size,
2551                                 char __user *ubuf)
2552{
2553        struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
2554        struct btrfs_root *root;
2555        struct btrfs_key key;
2556        struct btrfs_path *path;
2557        int ret;
2558        int num_found = 0;
2559        unsigned long sk_offset = 0;
2560
2561        if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
2562                *buf_size = sizeof(struct btrfs_ioctl_search_header);
2563                return -EOVERFLOW;
2564        }
2565
2566        path = btrfs_alloc_path();
2567        if (!path)
2568                return -ENOMEM;
2569
2570        if (sk->tree_id == 0) {
2571                /* search the root of the inode that was passed */
2572                root = btrfs_grab_root(BTRFS_I(inode)->root);
2573        } else {
2574                root = btrfs_get_fs_root(info, sk->tree_id, true);
2575                if (IS_ERR(root)) {
2576                        btrfs_free_path(path);
2577                        return PTR_ERR(root);
2578                }
2579        }
2580
2581        key.objectid = sk->min_objectid;
2582        key.type = sk->min_type;
2583        key.offset = sk->min_offset;
2584
2585        while (1) {
2586                ret = -EFAULT;
2587                /*
2588                 * Ensure that the whole user buffer is faulted in at sub-page
2589                 * granularity, otherwise the loop may live-lock.
2590                 */
2591                if (fault_in_subpage_writeable(ubuf + sk_offset,
2592                                               *buf_size - sk_offset))
2593                        break;
2594
2595                ret = btrfs_search_forward(root, &key, path, sk->min_transid);
2596                if (ret != 0) {
2597                        if (ret > 0)
2598                                ret = 0;
2599                        goto err;
2600                }
2601                ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
2602                                 &sk_offset, &num_found);
2603                btrfs_release_path(path);
2604                if (ret)
2605                        break;
2606
2607        }
2608        if (ret > 0)
2609                ret = 0;
2610err:
2611        sk->nr_items = num_found;
2612        btrfs_put_root(root);
2613        btrfs_free_path(path);
2614        return ret;
2615}
2616
2617static noinline int btrfs_ioctl_tree_search(struct inode *inode,
2618                                            void __user *argp)
2619{
2620        struct btrfs_ioctl_search_args __user *uargs = argp;
2621        struct btrfs_ioctl_search_key sk;
2622        int ret;
2623        size_t buf_size;
2624
2625        if (!capable(CAP_SYS_ADMIN))
2626                return -EPERM;
2627
2628        if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
2629                return -EFAULT;
2630
2631        buf_size = sizeof(uargs->buf);
2632
2633        ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
2634
2635        /*
2636         * In the origin implementation an overflow is handled by returning a
2637         * search header with a len of zero, so reset ret.
2638         */
2639        if (ret == -EOVERFLOW)
2640                ret = 0;
2641
2642        if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
2643                ret = -EFAULT;
2644        return ret;
2645}
2646
2647static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
2648                                               void __user *argp)
2649{
2650        struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
2651        struct btrfs_ioctl_search_args_v2 args;
2652        int ret;
2653        size_t buf_size;
2654        const size_t buf_limit = SZ_16M;
2655
2656        if (!capable(CAP_SYS_ADMIN))
2657                return -EPERM;
2658
2659        /* copy search header and buffer size */
2660        if (copy_from_user(&args, uarg, sizeof(args)))
2661                return -EFAULT;
2662
2663        buf_size = args.buf_size;
2664
2665        /* limit result size to 16MB */
2666        if (buf_size > buf_limit)
2667                buf_size = buf_limit;
2668
2669        ret = search_ioctl(inode, &args.key, &buf_size,
2670                           (char __user *)(&uarg->buf[0]));
2671        if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
2672                ret = -EFAULT;
2673        else if (ret == -EOVERFLOW &&
2674                copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
2675                ret = -EFAULT;
2676
2677        return ret;
2678}
2679
2680/*
2681 * Search INODE_REFs to identify path name of 'dirid' directory
2682 * in a 'tree_id' tree. and sets path name to 'name'.
2683 */
2684static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
2685                                u64 tree_id, u64 dirid, char *name)
2686{
2687        struct btrfs_root *root;
2688        struct btrfs_key key;
2689        char *ptr;
2690        int ret = -1;
2691        int slot;
2692        int len;
2693        int total_len = 0;
2694        struct btrfs_inode_ref *iref;
2695        struct extent_buffer *l;
2696        struct btrfs_path *path;
2697
2698        if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
2699                name[0]='\0';
2700                return 0;
2701        }
2702
2703        path = btrfs_alloc_path();
2704        if (!path)
2705                return -ENOMEM;
2706
2707        ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
2708
2709        root = btrfs_get_fs_root(info, tree_id, true);
2710        if (IS_ERR(root)) {
2711                ret = PTR_ERR(root);
2712                root = NULL;
2713                goto out;
2714        }
2715
2716        key.objectid = dirid;
2717        key.type = BTRFS_INODE_REF_KEY;
2718        key.offset = (u64)-1;
2719
2720        while (1) {
2721                ret = btrfs_search_backwards(root, &key, path);
2722                if (ret < 0)
2723                        goto out;
2724                else if (ret > 0) {
2725                        ret = -ENOENT;
2726                        goto out;
2727                }
2728
2729                l = path->nodes[0];
2730                slot = path->slots[0];
2731
2732                iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
2733                len = btrfs_inode_ref_name_len(l, iref);
2734                ptr -= len + 1;
2735                total_len += len + 1;
2736                if (ptr < name) {
2737                        ret = -ENAMETOOLONG;
2738                        goto out;
2739                }
2740
2741                *(ptr + len) = '/';
2742                read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
2743
2744                if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
2745                        break;
2746
2747                btrfs_release_path(path);
2748                key.objectid = key.offset;
2749                key.offset = (u64)-1;
2750                dirid = key.objectid;
2751        }
2752        memmove(name, ptr, total_len);
2753        name[total_len] = '\0';
2754        ret = 0;
2755out:
2756        btrfs_put_root(root);
2757        btrfs_free_path(path);
2758        return ret;
2759}
2760
2761static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
2762                                struct inode *inode,
2763                                struct btrfs_ioctl_ino_lookup_user_args *args)
2764{
2765        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2766        struct super_block *sb = inode->i_sb;
2767        struct btrfs_key upper_limit = BTRFS_I(inode)->location;
2768        u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
2769        u64 dirid = args->dirid;
2770        unsigned long item_off;
2771        unsigned long item_len;
2772        struct btrfs_inode_ref *iref;
2773        struct btrfs_root_ref *rref;
2774        struct btrfs_root *root = NULL;
2775        struct btrfs_path *path;
2776        struct btrfs_key key, key2;
2777        struct extent_buffer *leaf;
2778        struct inode *temp_inode;
2779        char *ptr;
2780        int slot;
2781        int len;
2782        int total_len = 0;
2783        int ret;
2784
2785        path = btrfs_alloc_path();
2786        if (!path)
2787                return -ENOMEM;
2788
2789        /*
2790         * If the bottom subvolume does not exist directly under upper_limit,
2791         * construct the path in from the bottom up.
2792         */
2793        if (dirid != upper_limit.objectid) {
2794                ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
2795
2796                root = btrfs_get_fs_root(fs_info, treeid, true);
2797                if (IS_ERR(root)) {
2798                        ret = PTR_ERR(root);
2799                        goto out;
2800                }
2801
2802                key.objectid = dirid;
2803                key.type = BTRFS_INODE_REF_KEY;
2804                key.offset = (u64)-1;
2805                while (1) {
2806                        ret = btrfs_search_backwards(root, &key, path);
2807                        if (ret < 0)
2808                                goto out_put;
2809                        else if (ret > 0) {
2810                                ret = -ENOENT;
2811                                goto out_put;
2812                        }
2813
2814                        leaf = path->nodes[0];
2815                        slot = path->slots[0];
2816
2817                        iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
2818                        len = btrfs_inode_ref_name_len(leaf, iref);
2819                        ptr -= len + 1;
2820                        total_len += len + 1;
2821                        if (ptr < args->path) {
2822                                ret = -ENAMETOOLONG;
2823                                goto out_put;
2824                        }
2825
2826                        *(ptr + len) = '/';
2827                        read_extent_buffer(leaf, ptr,
2828                                        (unsigned long)(iref + 1), len);
2829
2830                        /* Check the read+exec permission of this directory */
2831                        ret = btrfs_previous_item(root, path, dirid,
2832                                                  BTRFS_INODE_ITEM_KEY);
2833                        if (ret < 0) {
2834                                goto out_put;
2835                        } else if (ret > 0) {
2836                                ret = -ENOENT;
2837                                goto out_put;
2838                        }
2839
2840                        leaf = path->nodes[0];
2841                        slot = path->slots[0];
2842                        btrfs_item_key_to_cpu(leaf, &key2, slot);
2843                        if (key2.objectid != dirid) {
2844                                ret = -ENOENT;
2845                                goto out_put;
2846                        }
2847
2848                        temp_inode = btrfs_iget(sb, key2.objectid, root);
2849                        if (IS_ERR(temp_inode)) {
2850                                ret = PTR_ERR(temp_inode);
2851                                goto out_put;
2852                        }
2853                        ret = inode_permission(mnt_userns, temp_inode,
2854                                               MAY_READ | MAY_EXEC);
2855                        iput(temp_inode);
2856                        if (ret) {
2857                                ret = -EACCES;
2858                                goto out_put;
2859                        }
2860
2861                        if (key.offset == upper_limit.objectid)
2862                                break;
2863                        if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
2864                                ret = -EACCES;
2865                                goto out_put;
2866                        }
2867
2868                        btrfs_release_path(path);
2869                        key.objectid = key.offset;
2870                        key.offset = (u64)-1;
2871                        dirid = key.objectid;
2872                }
2873
2874                memmove(args->path, ptr, total_len);
2875                args->path[total_len] = '\0';
2876                btrfs_put_root(root);
2877                root = NULL;
2878                btrfs_release_path(path);
2879        }
2880
2881        /* Get the bottom subvolume's name from ROOT_REF */
2882        key.objectid = treeid;
2883        key.type = BTRFS_ROOT_REF_KEY;
2884        key.offset = args->treeid;
2885        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2886        if (ret < 0) {
2887                goto out;
2888        } else if (ret > 0) {
2889                ret = -ENOENT;
2890                goto out;
2891        }
2892
2893        leaf = path->nodes[0];
2894        slot = path->slots[0];
2895        btrfs_item_key_to_cpu(leaf, &key, slot);
2896
2897        item_off = btrfs_item_ptr_offset(leaf, slot);
2898        item_len = btrfs_item_size(leaf, slot);
2899        /* Check if dirid in ROOT_REF corresponds to passed dirid */
2900        rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
2901        if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
2902                ret = -EINVAL;
2903                goto out;
2904        }
2905
2906        /* Copy subvolume's name */
2907        item_off += sizeof(struct btrfs_root_ref);
2908        item_len -= sizeof(struct btrfs_root_ref);
2909        read_extent_buffer(leaf, args->name, item_off, item_len);
2910        args->name[item_len] = 0;
2911
2912out_put:
2913        btrfs_put_root(root);
2914out:
2915        btrfs_free_path(path);
2916        return ret;
2917}
2918
2919static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
2920                                           void __user *argp)
2921{
2922        struct btrfs_ioctl_ino_lookup_args *args;
2923        int ret = 0;
2924
2925        args = memdup_user(argp, sizeof(*args));
2926        if (IS_ERR(args))
2927                return PTR_ERR(args);
2928
2929        /*
2930         * Unprivileged query to obtain the containing subvolume root id. The
2931         * path is reset so it's consistent with btrfs_search_path_in_tree.
2932         */
2933        if (args->treeid == 0)
2934                args->treeid = root->root_key.objectid;
2935
2936        if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
2937                args->name[0] = 0;
2938                goto out;
2939        }
2940
2941        if (!capable(CAP_SYS_ADMIN)) {
2942                ret = -EPERM;
2943                goto out;
2944        }
2945
2946        ret = btrfs_search_path_in_tree(root->fs_info,
2947                                        args->treeid, args->objectid,
2948                                        args->name);
2949
2950out:
2951        if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
2952                ret = -EFAULT;
2953
2954        kfree(args);
2955        return ret;
2956}
2957
2958/*
2959 * Version of ino_lookup ioctl (unprivileged)
2960 *
2961 * The main differences from ino_lookup ioctl are:
2962 *
2963 *   1. Read + Exec permission will be checked using inode_permission() during
2964 *      path construction. -EACCES will be returned in case of failure.
2965 *   2. Path construction will be stopped at the inode number which corresponds
2966 *      to the fd with which this ioctl is called. If constructed path does not
2967 *      exist under fd's inode, -EACCES will be returned.
2968 *   3. The name of bottom subvolume is also searched and filled.
2969 */
2970static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
2971{
2972        struct btrfs_ioctl_ino_lookup_user_args *args;
2973        struct inode *inode;
2974        int ret;
2975
2976        args = memdup_user(argp, sizeof(*args));
2977        if (IS_ERR(args))
2978                return PTR_ERR(args);
2979
2980        inode = file_inode(file);
2981
2982        if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
2983            BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
2984                /*
2985                 * The subvolume does not exist under fd with which this is
2986                 * called
2987                 */
2988                kfree(args);
2989                return -EACCES;
2990        }
2991
2992        ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
2993
2994        if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
2995                ret = -EFAULT;
2996
2997        kfree(args);
2998        return ret;
2999}
3000

3001/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
3002static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
3003{
3004        struct btrfs_ioctl_get_subvol_info_args *subvol_info;
3005        struct btrfs_fs_info *fs_info;
3006        struct btrfs_root *root;
3007        struct btrfs_path *path;
3008        struct btrfs_key key;
3009        struct btrfs_root_item *root_item;
3010        struct btrfs_root_ref *rref;
3011        struct extent_buffer *leaf;
3012        unsigned long item_off;
3013        unsigned long item_len;
3014        int slot;
3015        int ret = 0;
3016
3017        path = btrfs_alloc_path();
3018        if (!path)
3019                return -ENOMEM;
3020
3021        subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
3022        if (!subvol_info) {
3023                btrfs_free_path(path);
3024                return -ENOMEM;
3025        }
3026
3027        fs_info = BTRFS_I(inode)->root->fs_info;
3028
3029        /* Get root_item of inode's subvolume */
3030        key.objectid = BTRFS_I(inode)->root->root_key.objectid;
3031        root = btrfs_get_fs_root(fs_info, key.objectid, true);
3032        if (IS_ERR(root)) {
3033                ret = PTR_ERR(root);
3034                goto out_free;
3035        }
3036        root_item = &root->root_item;
3037
3038        subvol_info->treeid = key.objectid;
3039
3040        subvol_info->generation = btrfs_root_generation(root_item);
3041        subvol_info->flags = btrfs_root_flags(root_item);
3042
3043        memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
3044        memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
3045                                                    BTRFS_UUID_SIZE);
3046        memcpy(subvol_info->received_uuid, root_item->received_uuid,
3047                                                    BTRFS_UUID_SIZE);
3048
3049        subvol_info->ctransid = btrfs_root_ctransid(root_item);
3050        subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
3051        subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
3052
3053        subvol_info->otransid = btrfs_root_otransid(root_item);
3054        subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
3055        subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
3056
3057        subvol_info->stransid = btrfs_root_stransid(root_item);
3058        subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
3059        subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
3060
3061        subvol_info->rtransid = btrfs_root_rtransid(root_item);
3062        subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
3063        subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
3064
3065        if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
3066                /* Search root tree for ROOT_BACKREF of this subvolume */
3067                key.type = BTRFS_ROOT_BACKREF_KEY;
3068                key.offset = 0;
3069                ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3070                if (ret < 0) {
3071                        goto out;
3072                } else if (path->slots[0] >=
3073                           btrfs_header_nritems(path->nodes[0])) {
3074                        ret = btrfs_next_leaf(fs_info->tree_root, path);
3075                        if (ret < 0) {
3076                                goto out;
3077                        } else if (ret > 0) {
3078                                ret = -EUCLEAN;
3079                                goto out;
3080                        }
3081                }
3082
3083                leaf = path->nodes[0];
3084                slot = path->slots[0];
3085                btrfs_item_key_to_cpu(leaf, &key, slot);
3086                if (key.objectid == subvol_info->treeid &&
3087                    key.type == BTRFS_ROOT_BACKREF_KEY) {
3088                        subvol_info->parent_id = key.offset;
3089
3090                        rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
3091                        subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
3092
3093                        item_off = btrfs_item_ptr_offset(leaf, slot)
3094                                        + sizeof(struct btrfs_root_ref);
3095                        item_len = btrfs_item_size(leaf, slot)
3096                                        - sizeof(struct btrfs_root_ref);
3097                        read_extent_buffer(leaf, subvol_info->name,
3098                                           item_off, item_len);
3099                } else {
3100                        ret = -ENOENT;
3101                        goto out;
3102                }
3103        }
3104
3105        if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
3106                ret = -EFAULT;
3107
3108out:
3109        btrfs_put_root(root);
3110out_free:
3111        btrfs_free_path(path);
3112        kfree(subvol_info);
3113        return ret;
3114}
3115
3116/*
3117 * Return ROOT_REF information of the subvolume containing this inode
3118 * except the subvolume name.
3119 */
3120static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
3121                                          void __user *argp)
3122{
3123        struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
3124        struct btrfs_root_ref *rref;
3125        struct btrfs_path *path;
3126        struct btrfs_key key;
3127        struct extent_buffer *leaf;
3128        u64 objectid;
3129        int slot;
3130        int ret;
3131        u8 found;
3132
3133        path = btrfs_alloc_path();
3134        if (!path)
3135                return -ENOMEM;
3136
3137        rootrefs = memdup_user(argp, sizeof(*rootrefs));
3138        if (IS_ERR(rootrefs)) {
3139                btrfs_free_path(path);
3140                return PTR_ERR(rootrefs);
3141        }
3142
3143        objectid = root->root_key.objectid;
3144        key.objectid = objectid;
3145        key.type = BTRFS_ROOT_REF_KEY;
3146        key.offset = rootrefs->min_treeid;
3147        found = 0;
3148
3149        root = root->fs_info->tree_root;
3150        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3151        if (ret < 0) {
3152                goto out;
3153        } else if (path->slots[0] >=
3154                   btrfs_header_nritems(path->nodes[0])) {
3155                ret = btrfs_next_leaf(root, path);
3156                if (ret < 0) {
3157                        goto out;
3158                } else if (ret > 0) {
3159                        ret = -EUCLEAN;
3160                        goto out;
3161                }
3162        }
3163        while (1) {
3164                leaf = path->nodes[0];
3165                slot = path->slots[0];
3166
3167                btrfs_item_key_to_cpu(leaf, &key, slot);
3168                if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
3169                        ret = 0;
3170                        goto out;
3171                }
3172
3173                if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
3174                        ret = -EOVERFLOW;
3175                        goto out;
3176                }
3177
3178                rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
3179                rootrefs->rootref[found].treeid = key.offset;
3180                rootrefs->rootref[found].dirid =
3181                                  btrfs_root_ref_dirid(leaf, rref);
3182                found++;
3183
3184                ret = btrfs_next_item(root, path);
3185                if (ret < 0) {
3186                        goto out;
3187                } else if (ret > 0) {
3188                        ret = -EUCLEAN;
3189                        goto out;
3190                }
3191        }
3192
3193out:
3194        if (!ret || ret == -EOVERFLOW) {
3195                rootrefs->num_items = found;
3196                /* update min_treeid for next search */
3197                if (found)
3198                        rootrefs->min_treeid =
3199                                rootrefs->rootref[found - 1].treeid + 1;
3200                if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
3201                        ret = -EFAULT;
3202        }
3203
3204        kfree(rootrefs);
3205        btrfs_free_path(path);
3206
3207        return ret;
3208}
3209
/*
 * Delete the subvolume identified by the ioctl argument.
 *
 * @file:       file the ioctl was issued on; its dentry is the initial parent
 *              directory used for the lookup
 * @arg:        user pointer to a struct btrfs_ioctl_vol_args, or to a
 *              struct btrfs_ioctl_vol_args_v2 when @destroy_v2 is true
 * @destroy_v2: selects the v2 argument layout
 *
 * The v1 layout, and v2 without BTRFS_SUBVOL_SPEC_BY_ID, name the subvolume
 * relative to @file's directory. v2 with BTRFS_SUBVOL_SPEC_BY_ID carries a
 * subvolume id instead; the parent dentry and the name are then derived from
 * that id.
 *
 * Return: 0 on success, negative errno on failure.
 */
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                                             void __user *arg,
                                             bool destroy_v2)
{
        struct dentry *parent = file->f_path.dentry;
        struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
        struct dentry *dentry;
        struct inode *dir = d_inode(parent);
        struct inode *inode;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_root *dest = NULL;
        struct btrfs_ioctl_vol_args *vol_args = NULL;
        struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
        struct user_namespace *mnt_userns = file_mnt_user_ns(file);
        /*
         * subvol_name is what gets looked up; subvol_name_ptr is only set
         * when the name was allocated for us (SPEC_BY_ID) and must be freed.
         */
        char *subvol_name, *subvol_name_ptr = NULL;
        int subvol_namelen;
        int err = 0;
        bool destroy_parent = false;

        /* We don't support snapshots with extent tree v2 yet. */
        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
                btrfs_err(fs_info,
                          "extent tree v2 doesn't support snapshot deletion yet");
                return -EOPNOTSUPP;
        }

        if (destroy_v2) {
                vol_args2 = memdup_user(arg, sizeof(*vol_args2));
                if (IS_ERR(vol_args2))
                        return PTR_ERR(vol_args2);

                /* Reject any flag bit outside the supported delete mask. */
                if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
                        err = -EOPNOTSUPP;
                        goto out;
                }

                /*
                 * If SPEC_BY_ID is not set, we are looking for the subvolume by
                 * name, same as v1 currently does.
                 */
                if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
                        /* Force termination of the user supplied name. */
                        vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
                        subvol_name = vol_args2->name;

                        err = mnt_want_write_file(file);
                        if (err)
                                goto out;
                } else {
                        struct inode *old_dir;

                        if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
                                err = -EINVAL;
                                goto out;
                        }

                        err = mnt_want_write_file(file);
                        if (err)
                                goto out;

                        dentry = btrfs_get_dentry(fs_info->sb,
                                        BTRFS_FIRST_FREE_OBJECTID,
                                        vol_args2->subvolid, 0, 0);
                        if (IS_ERR(dentry)) {
                                err = PTR_ERR(dentry);
                                goto out_drop_write;
                        }

                        /*
                         * Change the default parent since the subvolume being
                         * deleted can be outside of the current mount point.
                         */
                        parent = btrfs_get_parent(dentry);

                        /*
                         * At this point dentry->d_name can point to '/' if the
                         * subvolume we want to destroy is outside of the
                         * current mount point, so we need to release the
                         * current dentry and execute the lookup to return a new
                         * one with ->d_name pointing to the
                         * <mount point>/subvol_name.
                         */
                        dput(dentry);
                        if (IS_ERR(parent)) {
                                err = PTR_ERR(parent);
                                goto out_drop_write;
                        }
                        old_dir = dir;
                        dir = d_inode(parent);

                        /*
                         * If v2 was used with SPEC_BY_ID, a new parent was
                         * allocated since the subvolume can be outside of the
                         * current mount point. Later on we need to release this
                         * new parent dentry.
                         */
                        destroy_parent = true;

                        /*
                         * On idmapped mounts, deletion via subvolid is
                         * restricted to subvolumes that are immediate
                         * ancestors of the inode referenced by the file
                         * descriptor in the ioctl. Otherwise the idmapping
                         * could potentially be abused to delete subvolumes
                         * anywhere in the filesystem the user wouldn't be able
                         * to delete without an idmapped mount.
                         */
                        if (old_dir != dir && mnt_userns != &init_user_ns) {
                                err = -EOPNOTSUPP;
                                goto free_parent;
                        }

                        subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
                                                fs_info, vol_args2->subvolid);
                        if (IS_ERR(subvol_name_ptr)) {
                                err = PTR_ERR(subvol_name_ptr);
                                goto free_parent;
                        }
                        /* subvol_name_ptr is already nul terminated */
                        subvol_name = (char *)kbasename(subvol_name_ptr);
                }
        } else {
                vol_args = memdup_user(arg, sizeof(*vol_args));
                if (IS_ERR(vol_args))
                        return PTR_ERR(vol_args);

                /* Force termination of the user supplied name. */
                vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
                subvol_name = vol_args->name;

                err = mnt_want_write_file(file);
                if (err)
                        goto out;
        }

        subvol_namelen = strlen(subvol_name);

        /*
         * Only a single path component is accepted. Because the strncmp() is
         * bounded by the name length, it rejects "", "." and "..".
         */
        if (strchr(subvol_name, '/') ||
            strncmp(subvol_name, "..", subvol_namelen) == 0) {
                err = -EINVAL;
                goto free_subvol_name;
        }

        if (!S_ISDIR(dir->i_mode)) {
                err = -ENOTDIR;
                goto free_subvol_name;
        }

        /* Hold the parent directory lock across lookup and deletion. */
        err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
        if (err == -EINTR)
                goto free_subvol_name;
        dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
                goto out_unlock_dir;
        }

        if (d_really_is_negative(dentry)) {
                err = -ENOENT;
                goto out_dput;
        }

        inode = d_inode(dentry);
        dest = BTRFS_I(inode)->root;
        if (!capable(CAP_SYS_ADMIN)) {
                /*
                 * Regular user.  Only allow this with a special mount
                 * option, when the user has write+exec access to the
                 * subvol root, and when rmdir(2) would have been
                 * allowed.
                 *
                 * Note that this is _not_ a check that the subvol is
                 * empty or doesn't contain data that we wouldn't
                 * otherwise be able to delete.
                 *
                 * Users who want to delete empty subvols should try
                 * rmdir(2).
                 */
                err = -EPERM;
                if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
                        goto out_dput;

                /*
                 * Do not allow deletion if the parent dir is the same
                 * as the dir to be deleted.  That means the ioctl
                 * must be called on the dentry referencing the root
                 * of the subvol, not a random directory contained
                 * within it.
                 */
                err = -EINVAL;
                if (root == dest)
                        goto out_dput;

                err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
                if (err)
                        goto out_dput;
        }

        /* check if subvolume may be deleted by a user */
        err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
        if (err)
                goto out_dput;

        /* The looked up inode must be the root directory of a subvolume. */
        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
                err = -EINVAL;
                goto out_dput;
        }

        btrfs_inode_lock(inode, 0);
        err = btrfs_delete_subvolume(dir, dentry);
        btrfs_inode_unlock(inode, 0);
        if (!err)
                d_delete_notify(dir, dentry);

        /*
         * Unwind in reverse order of acquisition: each label releases one
         * resource and falls through to the next.
         */
out_dput:
        dput(dentry);
out_unlock_dir:
        btrfs_inode_unlock(dir, 0);
free_subvol_name:
        kfree(subvol_name_ptr);
free_parent:
        if (destroy_parent)
                dput(parent);
out_drop_write:
        mnt_drop_write_file(file);
out:
        /* At most one of these was allocated; the other is still NULL. */
        kfree(vol_args2);
        kfree(vol_args);
        return err;
}
3438
3439static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
3440{
3441        struct inode *inode = file_inode(file);
3442        struct btrfs_root *root = BTRFS_I(inode)->root;
3443        struct btrfs_ioctl_defrag_range_args range = {0};
3444        int ret;
3445
3446        ret = mnt_want_write_file(file);
3447        if (ret)
3448                return ret;
3449
3450        if (btrfs_root_readonly(root)) {
3451                ret = -EROFS;
3452                goto out;
3453        }
3454
3455        switch (inode->i_mode & S_IFMT) {
3456        case S_IFDIR:
3457                if (!capable(CAP_SYS_ADMIN)) {
3458                        ret = -EPERM;
3459                        goto out;
3460                }
3461                ret = btrfs_defrag_root(root);
3462                break;
3463        case S_IFREG:
3464                /*
3465                 * Note that this does not check the file descriptor for write
3466                 * access. This prevents defragmenting executables that are
3467                 * running and allows defrag on files open in read-only mode.
3468                 */
3469                if (!capable(CAP_SYS_ADMIN) &&
3470                    inode_permission(&init_user_ns, inode, MAY_WRITE)) {
3471                        ret = -EPERM;
3472                        goto out;
3473                }
3474
3475                if (argp) {
3476                        if (copy_from_user(&range, argp, sizeof(range))) {
3477                                ret = -EFAULT;
3478                                goto out;
3479                        }
3480                        /* compression requires us to start the IO */
3481                        if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
3482                                range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
3483                                range.extent_thresh = (u32)-1;
3484                        }
3485                } else {
3486                        /* the rest are all set to zero by kzalloc */
3487                        range.len = (u64)-1;
3488                }
3489                ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
3490                                        &range, BTRFS_OLDEST_GENERATION, 0);
3491                if (ret > 0)
3492                        ret = 0;
3493                break;
3494        default:
3495                ret = -EINVAL;
3496        }
3497out:
3498        mnt_drop_write_file(file);
3499        return ret;
3500}
3501
3502static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
3503{
3504        struct btrfs_ioctl_vol_args *vol_args;
3505        bool restore_op = false;
3506        int ret;
3507
3508        if (!capable(CAP_SYS_ADMIN))
3509                return -EPERM;
3510
3511        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3512                btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
3513                return -EINVAL;
3514        }
3515
3516        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
3517                if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
3518                        return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3519
3520                /*
3521                 * We can do the device add because we have a paused balanced,
3522                 * change the exclusive op type and remember we should bring
3523                 * back the paused balance
3524                 */
3525                fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
3526                btrfs_exclop_start_unlock(fs_info);
3527                restore_op = true;
3528        }
3529
3530        vol_args = memdup_user(arg, sizeof(*vol_args));
3531        if (IS_ERR(vol_args)) {
3532                ret = PTR_ERR(vol_args);
3533                goto out;
3534        }
3535
3536        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
3537        ret = btrfs_init_new_device(fs_info, vol_args->name);
3538
3539        if (!ret)
3540                btrfs_info(fs_info, "disk added %s", vol_args->name);
3541
3542        kfree(vol_args);
3543out:
3544        if (restore_op)
3545                btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
3546        else
3547                btrfs_exclop_finish(fs_info);
3548        return ret;
3549}
3550
3551static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
3552{
3553        BTRFS_DEV_LOOKUP_ARGS(args);
3554        struct inode *inode = file_inode(file);
3555        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3556        struct btrfs_ioctl_vol_args_v2 *vol_args;
3557        struct block_device *bdev = NULL;
3558        fmode_t mode;
3559        int ret;
3560        bool cancel = false;
3561
3562        if (!capable(CAP_SYS_ADMIN))
3563                return -EPERM;
3564
3565        vol_args = memdup_user(arg, sizeof(*vol_args));
3566        if (IS_ERR(vol_args))
3567                return PTR_ERR(vol_args);
3568
3569        if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
3570                ret = -EOPNOTSUPP;
3571                goto out;
3572        }
3573
3574        vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
3575        if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
3576                args.devid = vol_args->devid;
3577        } else if (!strcmp("cancel", vol_args->name)) {
3578                cancel = true;
3579        } else {
3580                ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
3581                if (ret)
3582                        goto out;
3583        }
3584
3585        ret = mnt_want_write_file(file);
3586        if (ret)
3587                goto out;
3588
3589        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
3590                                           cancel);
3591        if (ret)
3592                goto err_drop;
3593
3594        /* Exclusive operation is now claimed */
3595        ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
3596
3597        btrfs_exclop_finish(fs_info);
3598
3599        if (!ret) {
3600                if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
3601                        btrfs_info(fs_info, "device deleted: id %llu",
3602                                        vol_args->devid);
3603                else
3604                        btrfs_info(fs_info, "device deleted: %s",
3605                                        vol_args->name);
3606        }
3607err_drop:
3608        mnt_drop_write_file(file);
3609        if (bdev)
3610                blkdev_put(bdev, mode);
3611out:
3612        btrfs_put_dev_args_from_path(&args);
3613        kfree(vol_args);
3614        return ret;
3615}
3616
3617static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
3618{
3619        BTRFS_DEV_LOOKUP_ARGS(args);
3620        struct inode *inode = file_inode(file);
3621        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3622        struct btrfs_ioctl_vol_args *vol_args;
3623        struct block_device *bdev = NULL;
3624        fmode_t mode;
3625        int ret;
3626        bool cancel = false;
3627
3628        if (!capable(CAP_SYS_ADMIN))
3629                return -EPERM;
3630
3631        vol_args = memdup_user(arg, sizeof(*vol_args));
3632        if (IS_ERR(vol_args))
3633                return PTR_ERR(vol_args);
3634
3635        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
3636        if (!strcmp("cancel", vol_args->name)) {
3637                cancel = true;
3638        } else {
3639                ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
3640                if (ret)
3641                        goto out;
3642        }
3643
3644        ret = mnt_want_write_file(file);
3645        if (ret)
3646                goto out;
3647
3648        ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
3649                                           cancel);
3650        if (ret == 0) {
3651                ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
3652                if (!ret)
3653                        btrfs_info(fs_info, "disk deleted %s", vol_args->name);
3654                btrfs_exclop_finish(fs_info);
3655        }
3656
3657        mnt_drop_write_file(file);
3658        if (bdev)
3659                blkdev_put(bdev, mode);
3660out:
3661        btrfs_put_dev_args_from_path(&args);
3662        kfree(vol_args);
3663        return ret;
3664}
3665
3666static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
3667                                void __user *arg)
3668{
3669        struct btrfs_ioctl_fs_info_args *fi_args;
3670        struct btrfs_device *device;
3671        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3672        u64 flags_in;
3673        int ret = 0;
3674
3675        fi_args = memdup_user(arg, sizeof(*fi_args));
3676        if (IS_ERR(fi_args))
3677                return PTR_ERR(fi_args);
3678
3679        flags_in = fi_args->flags;
3680        memset(fi_args, 0, sizeof(*fi_args));
3681
3682        rcu_read_lock();
3683        fi_args->num_devices = fs_devices->num_devices;
3684
3685        list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
3686                if (device->devid > fi_args->max_id)
3687                        fi_args->max_id = device->devid;
3688        }
3689        rcu_read_unlock();
3690
3691        memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
3692        fi_args->nodesize = fs_info->nodesize;
3693        fi_args->sectorsize = fs_info->sectorsize;
3694        fi_args->clone_alignment = fs_info->sectorsize;
3695
3696        if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
3697                fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
3698                fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
3699                fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
3700        }
3701
3702        if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
3703                fi_args->generation = fs_info->generation;
3704                fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
3705        }
3706
3707        if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
3708                memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
3709                       sizeof(fi_args->metadata_uuid));
3710                fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
3711        }
3712
3713        if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
3714                ret = -EFAULT;
3715
3716        kfree(fi_args);
3717        return ret;
3718}
3719
3720static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
3721                                 void __user *arg)
3722{
3723        BTRFS_DEV_LOOKUP_ARGS(args);
3724        struct btrfs_ioctl_dev_info_args *di_args;
3725        struct btrfs_device *dev;
3726        int ret = 0;
3727
3728        di_args = memdup_user(arg, sizeof(*di_args));
3729        if (IS_ERR(di_args))
3730                return PTR_ERR(di_args);
3731
3732        args.devid = di_args->devid;
3733        if (!btrfs_is_empty_uuid(di_args->uuid))
3734                args.uuid = di_args->uuid;
3735
3736        rcu_read_lock();
3737        dev = btrfs_find_device(fs_info->fs_devices, &args);
3738        if (!dev) {
3739                ret = -ENODEV;
3740                goto out;
3741        }
3742
3743        di_args->devid = dev->devid;
3744        di_args->bytes_used = btrfs_device_get_bytes_used(dev);
3745        di_args->total_bytes = btrfs_device_get_total_bytes(dev);
3746        memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
3747        if (dev->name) {
3748                strncpy(di_args->path, rcu_str_deref(dev->name),
3749                                sizeof(di_args->path) - 1);
3750                di_args->path[sizeof(di_args->path) - 1] = 0;
3751        } else {
3752                di_args->path[0] = '\0';
3753        }
3754
3755out:
3756        rcu_read_unlock();
3757        if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
3758                ret = -EFAULT;
3759
3760        kfree(di_args);
3761        return ret;
3762}
3763
3764static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
3765{
3766        struct inode *inode = file_inode(file);
3767        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3768        struct btrfs_root *root = BTRFS_I(inode)->root;
3769        struct btrfs_root *new_root;
3770        struct btrfs_dir_item *di;
3771        struct btrfs_trans_handle *trans;
3772        struct btrfs_path *path = NULL;
3773        struct btrfs_disk_key disk_key;
3774        u64 objectid = 0;
3775        u64 dir_id;
3776        int ret;
3777
3778        if (!capable(CAP_SYS_ADMIN))
3779                return -EPERM;
3780
3781        ret = mnt_want_write_file(file);
3782        if (ret)
3783                return ret;
3784
3785        if (copy_from_user(&objectid, argp, sizeof(objectid))) {
3786                ret = -EFAULT;
3787                goto out;
3788        }
3789
3790        if (!objectid)
3791                objectid = BTRFS_FS_TREE_OBJECTID;
3792
3793        new_root = btrfs_get_fs_root(fs_info, objectid, true);
3794        if (IS_ERR(new_root)) {
3795                ret = PTR_ERR(new_root);
3796                goto out;
3797        }
3798        if (!is_fstree(new_root->root_key.objectid)) {
3799                ret = -ENOENT;
3800                goto out_free;
3801        }
3802
3803        path = btrfs_alloc_path();
3804        if (!path) {
3805                ret = -ENOMEM;
3806                goto out_free;
3807        }
3808
3809        trans = btrfs_start_transaction(root, 1);
3810        if (IS_ERR(trans)) {
3811                ret = PTR_ERR(trans);
3812                goto out_free;
3813        }
3814
3815        dir_id = btrfs_super_root_dir(fs_info->super_copy);
3816        di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
3817                                   dir_id, "default", 7, 1);
3818        if (IS_ERR_OR_NULL(di)) {
3819                btrfs_release_path(path);
3820                btrfs_end_transaction(trans);
3821                btrfs_err(fs_info,
3822                          "Umm, you don't have the default diritem, this isn't going to work");
3823                ret = -ENOENT;
3824                goto out_free;
3825        }
3826
3827        btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
3828        btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
3829        btrfs_mark_buffer_dirty(path->nodes[0]);
3830        btrfs_release_path(path);
3831
3832        btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
3833        btrfs_end_transaction(trans);
3834out_free:
3835        btrfs_put_root(new_root);
3836        btrfs_free_path(path);
3837out:
3838        mnt_drop_write_file(file);
3839        return ret;
3840}
3841
3842static void get_block_group_info(struct list_head *groups_list,
3843                                 struct btrfs_ioctl_space_info *space)
3844{
3845        struct btrfs_block_group *block_group;
3846
3847        space->total_bytes = 0;
3848        space->used_bytes = 0;
3849        space->flags = 0;
3850        list_for_each_entry(block_group, groups_list, list) {
3851                space->flags = block_group->flags;
3852                space->total_bytes += block_group->length;
3853                space->used_bytes += block_group->used;
3854        }
3855}
3856
3857static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
3858                                   void __user *arg)
3859{
3860        struct btrfs_ioctl_space_args space_args;
3861        struct btrfs_ioctl_space_info space;
3862        struct btrfs_ioctl_space_info *dest;
3863        struct btrfs_ioctl_space_info *dest_orig;
3864        struct btrfs_ioctl_space_info __user *user_dest;
3865        struct btrfs_space_info *info;
3866        static const u64 types[] = {
3867                BTRFS_BLOCK_GROUP_DATA,
3868                BTRFS_BLOCK_GROUP_SYSTEM,
3869                BTRFS_BLOCK_GROUP_METADATA,
3870                BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
3871        };
3872        int num_types = 4;
3873        int alloc_size;
3874        int ret = 0;
3875        u64 slot_count = 0;
3876        int i, c;
3877
3878        if (copy_from_user(&space_args,
3879                           (struct btrfs_ioctl_space_args __user *)arg,
3880                           sizeof(space_args)))
3881                return -EFAULT;
3882
3883        for (i = 0; i < num_types; i++) {
3884                struct btrfs_space_info *tmp;
3885
3886                info = NULL;
3887                list_for_each_entry(tmp, &fs_info->space_info, list) {
3888                        if (tmp->flags == types[i]) {
3889                                info = tmp;
3890                                break;
3891                        }
3892                }
3893
3894                if (!info)
3895                        continue;
3896
3897                down_read(&info->groups_sem);
3898                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3899                        if (!list_empty(&info->block_groups[c]))
3900                                slot_count++;
3901                }
3902                up_read(&info->groups_sem);
3903        }
3904
3905        /*
3906         * Global block reserve, exported as a space_info
3907         */
3908        slot_count++;
3909
3910        /* space_slots == 0 means they are asking for a count */
3911        if (space_args.space_slots == 0) {
3912                space_args.total_spaces = slot_count;
3913                goto out;
3914        }
3915
3916        slot_count = min_t(u64, space_args.space_slots, slot_count);
3917
3918        alloc_size = sizeof(*dest) * slot_count;
3919
3920        /* we generally have at most 6 or so space infos, one for each raid
3921         * level.  So, a whole page should be more than enough for everyone
3922         */
3923        if (alloc_size > PAGE_SIZE)
3924                return -ENOMEM;
3925
3926        space_args.total_spaces = 0;
3927        dest = kmalloc(alloc_size, GFP_KERNEL);
3928        if (!dest)
3929                return -ENOMEM;
3930        dest_orig = dest;
3931
3932        /* now we have a buffer to copy into */
3933        for (i = 0; i < num_types; i++) {
3934                struct btrfs_space_info *tmp;
3935
3936                if (!slot_count)
3937                        break;
3938
3939                info = NULL;
3940                list_for_each_entry(tmp, &fs_info->space_info, list) {
3941                        if (tmp->flags == types[i]) {
3942                                info = tmp;
3943                                break;
3944                        }
3945                }
3946
3947                if (!info)
3948                        continue;
3949                down_read(&info->groups_sem);
3950                for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3951                        if (!list_empty(&info->block_groups[c])) {
3952                                get_block_group_info(&info->block_groups[c],
3953                                                     &space);
3954                                memcpy(dest, &space, sizeof(space));
3955                                dest++;
3956                                space_args.total_spaces++;
3957                                slot_count--;
3958                        }
3959                        if (!slot_count)
3960                                break;
3961                }
3962                up_read(&info->groups_sem);
3963        }
3964
3965        /*
3966         * Add global block reserve
3967         */
3968        if (slot_count) {
3969                struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3970
3971                spin_lock(&block_rsv->lock);
3972                space.total_bytes = block_rsv->size;
3973                space.used_bytes = block_rsv->size - block_rsv->reserved;
3974                spin_unlock(&block_rsv->lock);
3975                space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
3976                memcpy(dest, &space, sizeof(space));
3977                space_args.total_spaces++;
3978        }
3979
3980        user_dest = (struct btrfs_ioctl_space_info __user *)
3981                (arg + sizeof(struct btrfs_ioctl_space_args));
3982
3983        if (copy_to_user(user_dest, dest_orig, alloc_size))
3984                ret = -EFAULT;
3985
3986        kfree(dest_orig);
3987out:
3988        if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
3989                ret = -EFAULT;
3990
3991        return ret;
3992}
3993
3994static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3995                                            void __user *argp)
3996{
3997        struct btrfs_trans_handle *trans;
3998        u64 transid;
3999
4000        trans = btrfs_attach_transaction_barrier(root);

4001        if (IS_ERR(trans)) {
4002                if (PTR_ERR(trans) != -ENOENT)
4003                        return PTR_ERR(trans);
4004
4005                /* No running transaction, don't bother */
4006                transid = root->fs_info->last_trans_committed;
4007                goto out;
4008        }
4009        transid = trans->transid;
4010        btrfs_commit_transaction_async(trans);
4011out:
4012        if (argp)
4013                if (copy_to_user(argp, &transid, sizeof(transid)))
4014                        return -EFAULT;
4015        return 0;
4016}
4017
4018static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
4019                                           void __user *argp)
4020{
4021        u64 transid;
4022
4023        if (argp) {
4024                if (copy_from_user(&transid, argp, sizeof(transid)))
4025                        return -EFAULT;
4026        } else {
4027                transid = 0;  /* current trans */
4028        }
4029        return btrfs_wait_for_commit(fs_info, transid);
4030}
4031
4032static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
4033{
4034        struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
4035        struct btrfs_ioctl_scrub_args *sa;
4036        int ret;
4037
4038        if (!capable(CAP_SYS_ADMIN))
4039                return -EPERM;
4040
4041        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
4042                btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
4043                return -EINVAL;
4044        }
4045
4046        sa = memdup_user(arg, sizeof(*sa));
4047        if (IS_ERR(sa))
4048                return PTR_ERR(sa);
4049
4050        if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
4051                ret = mnt_want_write_file(file);
4052                if (ret)
4053                        goto out;
4054        }
4055
4056        ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
4057                              &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
4058                              0);
4059
4060        /*
4061         * Copy scrub args to user space even if btrfs_scrub_dev() returned an
4062         * error. This is important as it allows user space to know how much
4063         * progress scrub has done. For example, if scrub is canceled we get
4064         * -ECANCELED from btrfs_scrub_dev() and return that error back to user
4065         * space. Later user space can inspect the progress from the structure
4066         * btrfs_ioctl_scrub_args and resume scrub from where it left off
4067         * previously (btrfs-progs does this).
4068         * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
4069         * then return -EFAULT to signal the structure was not copied or it may
4070         * be corrupt and unreliable due to a partial copy.
4071         */
4072        if (copy_to_user(arg, sa, sizeof(*sa)))
4073                ret = -EFAULT;
4074
4075        if (!(sa->flags & BTRFS_SCRUB_READONLY))
4076                mnt_drop_write_file(file);
4077out:
4078        kfree(sa);
4079        return ret;
4080}
4081
4082static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
4083{
4084        if (!capable(CAP_SYS_ADMIN))
4085                return -EPERM;
4086
4087        return btrfs_scrub_cancel(fs_info);
4088}
4089
4090static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
4091                                       void __user *arg)
4092{
4093        struct btrfs_ioctl_scrub_args *sa;
4094        int ret;
4095
4096        if (!capable(CAP_SYS_ADMIN))
4097                return -EPERM;
4098
4099        sa = memdup_user(arg, sizeof(*sa));
4100        if (IS_ERR(sa))
4101                return PTR_ERR(sa);
4102
4103        ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
4104
4105        if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
4106                ret = -EFAULT;
4107
4108        kfree(sa);
4109        return ret;
4110}
4111
4112static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
4113                                      void __user *arg)
4114{
4115        struct btrfs_ioctl_get_dev_stats *sa;
4116        int ret;
4117
4118        sa = memdup_user(arg, sizeof(*sa));
4119        if (IS_ERR(sa))
4120                return PTR_ERR(sa);
4121
4122        if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
4123                kfree(sa);
4124                return -EPERM;
4125        }
4126
4127        ret = btrfs_get_dev_stats(fs_info, sa);
4128
4129        if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
4130                ret = -EFAULT;
4131
4132        kfree(sa);
4133        return ret;
4134}
4135
4136static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
4137                                    void __user *arg)
4138{
4139        struct btrfs_ioctl_dev_replace_args *p;
4140        int ret;
4141
4142        if (!capable(CAP_SYS_ADMIN))
4143                return -EPERM;
4144
4145        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
4146                btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
4147                return -EINVAL;
4148        }
4149
4150        p = memdup_user(arg, sizeof(*p));
4151        if (IS_ERR(p))
4152                return PTR_ERR(p);
4153
4154        switch (p->cmd) {
4155        case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
4156                if (sb_rdonly(fs_info->sb)) {
4157                        ret = -EROFS;
4158                        goto out;
4159                }
4160                if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
4161                        ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
4162                } else {
4163                        ret = btrfs_dev_replace_by_ioctl(fs_info, p);
4164                        btrfs_exclop_finish(fs_info);
4165                }
4166                break;
4167        case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
4168                btrfs_dev_replace_status(fs_info, p);
4169                ret = 0;
4170                break;
4171        case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
4172                p->result = btrfs_dev_replace_cancel(fs_info);
4173                ret = 0;
4174                break;
4175        default:
4176                ret = -EINVAL;
4177                break;
4178        }
4179
4180        if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
4181                ret = -EFAULT;
4182out:
4183        kfree(p);
4184        return ret;
4185}
4186
4187static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
4188{
4189        int ret = 0;
4190        int i;
4191        u64 rel_ptr;
4192        int size;
4193        struct btrfs_ioctl_ino_path_args *ipa = NULL;
4194        struct inode_fs_paths *ipath = NULL;
4195        struct btrfs_path *path;
4196
4197        if (!capable(CAP_DAC_READ_SEARCH))
4198                return -EPERM;
4199
4200        path = btrfs_alloc_path();
4201        if (!path) {
4202                ret = -ENOMEM;
4203                goto out;
4204        }
4205
4206        ipa = memdup_user(arg, sizeof(*ipa));
4207        if (IS_ERR(ipa)) {
4208                ret = PTR_ERR(ipa);
4209                ipa = NULL;
4210                goto out;
4211        }
4212
4213        size = min_t(u32, ipa->size, 4096);
4214        ipath = init_ipath(size, root, path);
4215        if (IS_ERR(ipath)) {
4216                ret = PTR_ERR(ipath);
4217                ipath = NULL;
4218                goto out;
4219        }
4220
4221        ret = paths_from_inode(ipa->inum, ipath);
4222        if (ret < 0)
4223                goto out;
4224
4225        for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
4226                rel_ptr = ipath->fspath->val[i] -
4227                          (u64)(unsigned long)ipath->fspath->val;
4228                ipath->fspath->val[i] = rel_ptr;
4229        }
4230
4231        ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
4232                           ipath->fspath, size);
4233        if (ret) {
4234                ret = -EFAULT;
4235                goto out;
4236        }
4237
4238out:
4239        btrfs_free_path(path);
4240        free_ipath(ipath);
4241        kfree(ipa);
4242
4243        return ret;
4244}
4245
4246static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
4247{
4248        struct btrfs_data_container *inodes = ctx;
4249        const size_t c = 3 * sizeof(u64);
4250
4251        if (inodes->bytes_left >= c) {
4252                inodes->bytes_left -= c;
4253                inodes->val[inodes->elem_cnt] = inum;
4254                inodes->val[inodes->elem_cnt + 1] = offset;
4255                inodes->val[inodes->elem_cnt + 2] = root;
4256                inodes->elem_cnt += 3;
4257        } else {
4258                inodes->bytes_missing += c - inodes->bytes_left;
4259                inodes->bytes_left = 0;
4260                inodes->elem_missed += 3;
4261        }
4262
4263        return 0;
4264}
4265
4266static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
4267                                        void __user *arg, int version)
4268{
4269        int ret = 0;
4270        int size;
4271        struct btrfs_ioctl_logical_ino_args *loi;
4272        struct btrfs_data_container *inodes = NULL;
4273        struct btrfs_path *path = NULL;
4274        bool ignore_offset;
4275
4276        if (!capable(CAP_SYS_ADMIN))
4277                return -EPERM;
4278
4279        loi = memdup_user(arg, sizeof(*loi));
4280        if (IS_ERR(loi))
4281                return PTR_ERR(loi);
4282
4283        if (version == 1) {
4284                ignore_offset = false;
4285                size = min_t(u32, loi->size, SZ_64K);
4286        } else {
4287                /* All reserved bits must be 0 for now */
4288                if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
4289                        ret = -EINVAL;
4290                        goto out_loi;
4291                }
4292                /* Only accept flags we have defined so far */
4293                if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
4294                        ret = -EINVAL;
4295                        goto out_loi;
4296                }
4297                ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
4298                size = min_t(u32, loi->size, SZ_16M);
4299        }
4300
4301        path = btrfs_alloc_path();
4302        if (!path) {
4303                ret = -ENOMEM;
4304                goto out;
4305        }
4306
4307        inodes = init_data_container(size);
4308        if (IS_ERR(inodes)) {
4309                ret = PTR_ERR(inodes);
4310                inodes = NULL;
4311                goto out;
4312        }
4313
4314        ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
4315                                          build_ino_list, inodes, ignore_offset);
4316        if (ret == -EINVAL)
4317                ret = -ENOENT;
4318        if (ret < 0)
4319                goto out;
4320
4321        ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
4322                           size);
4323        if (ret)
4324                ret = -EFAULT;
4325
4326out:
4327        btrfs_free_path(path);
4328        kvfree(inodes);
4329out_loi:
4330        kfree(loi);
4331
4332        return ret;
4333}
4334
4335void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
4336                               struct btrfs_ioctl_balance_args *bargs)
4337{
4338        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4339
4340        bargs->flags = bctl->flags;
4341
4342        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
4343                bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
4344        if (atomic_read(&fs_info->balance_pause_req))
4345                bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
4346        if (atomic_read(&fs_info->balance_cancel_req))
4347                bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
4348
4349        memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
4350        memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
4351        memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
4352
4353        spin_lock(&fs_info->balance_lock);
4354        memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
4355        spin_unlock(&fs_info->balance_lock);
4356}
4357
/*
 * Start a new balance or resume a paused one.
 *
 * @file: open file on the filesystem; write access to its mount is held for
 *        the whole call
 * @arg:  user pointer to a struct btrfs_ioctl_balance_args (in/out)
 *
 * Requires CAP_SYS_ADMIN.  The function first tries to claim the balance
 * exclusive operation; if that fails it inspects fs_info->balance_ctl under
 * balance_mutex to tell apart "another operation is running", "balance is
 * running" and "balance is paused".  Only the paused case may proceed (to
 * allow a resume).
 *
 * The arguments are copied back to user space when btrfs_balance() returns 0
 * or -ECANCELED.  Returns 0, a negative errno, or
 * BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS when another exclusive operation holds
 * the slot.
 */
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ioctl_balance_args *bargs;
	struct btrfs_balance_control *bctl;
	bool need_unlock; /* for mut. excl. ops lock */
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	bargs = memdup_user(arg, sizeof(*bargs));
	if (IS_ERR(bargs)) {
		ret = PTR_ERR(bargs);
		/* Keep the error pointer away from kfree() at the out label. */
		bargs = NULL;
		goto out;
	}

again:
	/* Fast path: we claimed the exclusive operation ourselves. */
	if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		mutex_lock(&fs_info->balance_mutex);
		need_unlock = true;
		goto locked;
	}

	/*
	 * mut. excl. ops lock is locked.  Three possibilities:
	 *   (1) some other op is running
	 *   (2) balance is running
	 *   (3) balance is paused -- special case (think resume)
	 */
	mutex_lock(&fs_info->balance_mutex);
	if (fs_info->balance_ctl) {
		/* this is either (2) or (3) */
		if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
			mutex_unlock(&fs_info->balance_mutex);
			/*
			 * Lock released to allow other waiters to continue,
			 * we'll reexamine the status again.
			 */
			mutex_lock(&fs_info->balance_mutex);

			if (fs_info->balance_ctl &&
			    !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
				/* this is (3) */
				/* We did not claim the exclop, so do not release it. */
				need_unlock = false;
				goto locked;
			}

			/* State changed while the mutex was dropped: start over. */
			mutex_unlock(&fs_info->balance_mutex);
			goto again;
		} else {
			/* this is (2) */
			mutex_unlock(&fs_info->balance_mutex);
			ret = -EINPROGRESS;
			goto out;
		}
	} else {
		/* this is (1) */
		mutex_unlock(&fs_info->balance_mutex);
		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
		goto out;
	}

locked:
	/* From here on balance_mutex is held until out_unlock. */
	if (bargs->flags & BTRFS_BALANCE_RESUME) {
		/* A resume needs an existing (paused) balance to pick up. */
		if (!fs_info->balance_ctl) {
			ret = -ENOTCONN;
			goto out_unlock;
		}

		bctl = fs_info->balance_ctl;
		spin_lock(&fs_info->balance_lock);
		bctl->flags |= BTRFS_BALANCE_RESUME;
		spin_unlock(&fs_info->balance_lock);
		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);

		goto do_balance;
	}

	/* Reject flag bits outside the argument and type masks. */
	if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* A fresh start is not allowed while a balance control still exists. */
	if (fs_info->balance_ctl) {
		ret = -EINPROGRESS;
		goto out_unlock;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
	if (!bctl) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	/* Seed the new control structure from the user's request. */
	memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
	memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
	memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));

	bctl->flags = bargs->flags;
do_balance:
	/*
	 * Ownership of bctl and exclusive operation goes to btrfs_balance.
	 * bctl is freed in reset_balance_state, or, if restriper was paused
	 * all the way until unmount, in free_fs_info.  The flag should be
	 * cleared after reset_balance_state.
	 */
	need_unlock = false;

	ret = btrfs_balance(fs_info, bctl, bargs);
	/* bctl is no longer ours; forget the pointer. */
	bctl = NULL;

	if (ret == 0 || ret == -ECANCELED) {
		if (copy_to_user(arg, bargs, sizeof(*bargs)))
			ret = -EFAULT;
	}

	/* bctl was set to NULL just above, so this kfree() is a no-op. */
	kfree(bctl);
out_unlock:
	mutex_unlock(&fs_info->balance_mutex);
	/* Release the exclusive operation only if we claimed and still own it. */
	if (need_unlock)
		btrfs_exclop_finish(fs_info);
out:
	mnt_drop_write_file(file);
	kfree(bargs);
	return ret;
}
4491
4492static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
4493{
4494        if (!capable(CAP_SYS_ADMIN))
4495                return -EPERM;
4496
4497        switch (cmd) {
4498        case BTRFS_BALANCE_CTL_PAUSE:
4499                return btrfs_pause_balance(fs_info);
4500        case BTRFS_BALANCE_CTL_CANCEL:
4501                return btrfs_cancel_balance(fs_info);
4502        }
4503
4504        return -EINVAL;
4505}
4506
4507static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
4508                                         void __user *arg)
4509{
4510        struct btrfs_ioctl_balance_args *bargs;
4511        int ret = 0;
4512
4513        if (!capable(CAP_SYS_ADMIN))
4514                return -EPERM;
4515
4516        mutex_lock(&fs_info->balance_mutex);
4517        if (!fs_info->balance_ctl) {
4518                ret = -ENOTCONN;
4519                goto out;
4520        }
4521
4522        bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
4523        if (!bargs) {
4524                ret = -ENOMEM;
4525                goto out;
4526        }
4527
4528        btrfs_update_ioctl_balance_args(fs_info, bargs);
4529
4530        if (copy_to_user(arg, bargs, sizeof(*bargs)))
4531                ret = -EFAULT;
4532
4533        kfree(bargs);
4534out:
4535        mutex_unlock(&fs_info->balance_mutex);
4536        return ret;
4537}
4538
4539static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
4540{
4541        struct inode *inode = file_inode(file);
4542        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4543        struct btrfs_ioctl_quota_ctl_args *sa;
4544        int ret;
4545
4546        if (!capable(CAP_SYS_ADMIN))
4547                return -EPERM;
4548
4549        ret = mnt_want_write_file(file);
4550        if (ret)
4551                return ret;
4552
4553        sa = memdup_user(arg, sizeof(*sa));
4554        if (IS_ERR(sa)) {
4555                ret = PTR_ERR(sa);
4556                goto drop_write;
4557        }
4558
4559        down_write(&fs_info->subvol_sem);
4560
4561        switch (sa->cmd) {
4562        case BTRFS_QUOTA_CTL_ENABLE:
4563                ret = btrfs_quota_enable(fs_info);
4564                break;
4565        case BTRFS_QUOTA_CTL_DISABLE:
4566                ret = btrfs_quota_disable(fs_info);
4567                break;
4568        default:
4569                ret = -EINVAL;
4570                break;
4571        }
4572
4573        kfree(sa);
4574        up_write(&fs_info->subvol_sem);
4575drop_write:
4576        mnt_drop_write_file(file);
4577        return ret;
4578}
4579
4580static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
4581{
4582        struct inode *inode = file_inode(file);
4583        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4584        struct btrfs_root *root = BTRFS_I(inode)->root;
4585        struct btrfs_ioctl_qgroup_assign_args *sa;
4586        struct btrfs_trans_handle *trans;
4587        int ret;
4588        int err;
4589
4590        if (!capable(CAP_SYS_ADMIN))
4591                return -EPERM;
4592
4593        ret = mnt_want_write_file(file);
4594        if (ret)
4595                return ret;
4596
4597        sa = memdup_user(arg, sizeof(*sa));
4598        if (IS_ERR(sa)) {
4599                ret = PTR_ERR(sa);
4600                goto drop_write;
4601        }
4602
4603        trans = btrfs_join_transaction(root);
4604        if (IS_ERR(trans)) {
4605                ret = PTR_ERR(trans);
4606                goto out;
4607        }
4608
4609        if (sa->assign) {
4610                ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
4611        } else {
4612                ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
4613        }
4614
4615        /* update qgroup status and info */
4616        err = btrfs_run_qgroups(trans);
4617        if (err < 0)
4618                btrfs_handle_fs_error(fs_info, err,
4619                                      "failed to update qgroup status and info");
4620        err = btrfs_end_transaction(trans);
4621        if (err && !ret)
4622                ret = err;
4623
4624out:
4625        kfree(sa);
4626drop_write:
4627        mnt_drop_write_file(file);
4628        return ret;
4629}
4630
4631static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
4632{
4633        struct inode *inode = file_inode(file);
4634        struct btrfs_root *root = BTRFS_I(inode)->root;
4635        struct btrfs_ioctl_qgroup_create_args *sa;
4636        struct btrfs_trans_handle *trans;
4637        int ret;
4638        int err;
4639
4640        if (!capable(CAP_SYS_ADMIN))
4641                return -EPERM;
4642
4643        ret = mnt_want_write_file(file);
4644        if (ret)
4645                return ret;
4646
4647        sa = memdup_user(arg, sizeof(*sa));
4648        if (IS_ERR(sa)) {
4649                ret = PTR_ERR(sa);
4650                goto drop_write;
4651        }
4652
4653        if (!sa->qgroupid) {
4654                ret = -EINVAL;
4655                goto out;
4656        }
4657
4658        trans = btrfs_join_transaction(root);
4659        if (IS_ERR(trans)) {
4660                ret = PTR_ERR(trans);
4661                goto out;
4662        }
4663
4664        if (sa->create) {
4665                ret = btrfs_create_qgroup(trans, sa->qgroupid);
4666        } else {
4667                ret = btrfs_remove_qgroup(trans, sa->qgroupid);
4668        }
4669
4670        err = btrfs_end_transaction(trans);
4671        if (err && !ret)
4672                ret = err;
4673
4674out:
4675        kfree(sa);
4676drop_write:
4677        mnt_drop_write_file(file);
4678        return ret;
4679}
4680
4681static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
4682{
4683        struct inode *inode = file_inode(file);
4684        struct btrfs_root *root = BTRFS_I(inode)->root;
4685        struct btrfs_ioctl_qgroup_limit_args *sa;
4686        struct btrfs_trans_handle *trans;
4687        int ret;
4688        int err;
4689        u64 qgroupid;
4690
4691        if (!capable(CAP_SYS_ADMIN))
4692                return -EPERM;
4693
4694        ret = mnt_want_write_file(file);
4695        if (ret)
4696                return ret;
4697
4698        sa = memdup_user(arg, sizeof(*sa));
4699        if (IS_ERR(sa)) {
4700                ret = PTR_ERR(sa);
4701                goto drop_write;
4702        }
4703
4704        trans = btrfs_join_transaction(root);
4705        if (IS_ERR(trans)) {
4706                ret = PTR_ERR(trans);
4707                goto out;
4708        }
4709
4710        qgroupid = sa->qgroupid;
4711        if (!qgroupid) {
4712                /* take the current subvol as qgroup */
4713                qgroupid = root->root_key.objectid;
4714        }
4715
4716        ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
4717
4718        err = btrfs_end_transaction(trans);
4719        if (err && !ret)
4720                ret = err;
4721
4722out:
4723        kfree(sa);
4724drop_write:
4725        mnt_drop_write_file(file);
4726        return ret;
4727}
4728
4729static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
4730{
4731        struct inode *inode = file_inode(file);
4732        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4733        struct btrfs_ioctl_quota_rescan_args *qsa;
4734        int ret;
4735
4736        if (!capable(CAP_SYS_ADMIN))
4737                return -EPERM;
4738
4739        ret = mnt_want_write_file(file);
4740        if (ret)
4741                return ret;
4742
4743        qsa = memdup_user(arg, sizeof(*qsa));
4744        if (IS_ERR(qsa)) {
4745                ret = PTR_ERR(qsa);
4746                goto drop_write;
4747        }
4748
4749        if (qsa->flags) {
4750                ret = -EINVAL;
4751                goto out;
4752        }
4753
4754        ret = btrfs_qgroup_rescan(fs_info);
4755
4756out:
4757        kfree(qsa);
4758drop_write:
4759        mnt_drop_write_file(file);
4760        return ret;
4761}
4762
4763static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
4764                                                void __user *arg)
4765{
4766        struct btrfs_ioctl_quota_rescan_args qsa = {0};
4767
4768        if (!capable(CAP_SYS_ADMIN))
4769                return -EPERM;
4770
4771        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
4772                qsa.flags = 1;
4773                qsa.progress = fs_info->qgroup_rescan_progress.objectid;
4774        }
4775
4776        if (copy_to_user(arg, &qsa, sizeof(qsa)))
4777                return -EFAULT;
4778
4779        return 0;
4780}
4781
4782static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
4783                                                void __user *arg)
4784{
4785        if (!capable(CAP_SYS_ADMIN))
4786                return -EPERM;
4787
4788        return btrfs_qgroup_wait_for_completion(fs_info, true);
4789}
4790
/*
 * Record "received" metadata (uuid, send/receive transids and times) in the
 * root item of the subvolume that @file refers to.
 *
 * @file:       open file; must be the subvolume's top-level inode
 * @mnt_userns: user namespace of the mount, used for the ownership check
 * @sa:         kernel copy of the arguments; ->rtransid and ->rtime are filled
 *              in here, the remaining fields are taken from the caller
 *
 * The caller must own the inode or be suitably capable.  Runs with write
 * access to the mount and with subvol_sem held for writing.
 *
 * Returns 0 on success, -EPERM, -EINVAL (not the top-level inode), -EROFS
 * (read-only subvolume) or an error from the transaction / tree updates.
 */
static long _btrfs_ioctl_set_received_subvol(struct file *file,
					    struct user_namespace *mnt_userns,
					    struct btrfs_ioctl_received_subvol_args *sa)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct btrfs_trans_handle *trans;
	struct timespec64 ct = current_time(inode);
	int ret = 0;
	int received_uuid_changed;

	if (!inode_owner_or_capable(mnt_userns, inode))
		return -EPERM;

	ret = mnt_want_write_file(file);
	if (ret < 0)
		return ret;

	down_write(&fs_info->subvol_sem);

	/* Only the subvolume's first inode may be used for this ioctl. */
	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_root_readonly(root)) {
		ret = -EROFS;
		goto out;
	}

	/*
	 * 1 - root item
	 * 2 - uuid items (received uuid + subvol uuid)
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	/* The receive-side transid and time are stamped by the kernel. */
	sa->rtransid = trans->transid;
	sa->rtime.sec = ct.tv_sec;
	sa->rtime.nsec = ct.tv_nsec;

	/* Non-zero when the new uuid differs from the one currently stored. */
	received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
				       BTRFS_UUID_SIZE);
	/* Drop the uuid-tree entry of the old received uuid, if there was one. */
	if (received_uuid_changed &&
	    !btrfs_is_empty_uuid(root_item->received_uuid)) {
		ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					  root->root_key.objectid);
		/* A missing entry (-ENOENT) is tolerated. */
		if (ret && ret != -ENOENT) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}
	/* Update the in-memory root item with the received metadata. */
	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
	btrfs_set_root_stransid(root_item, sa->stransid);
	btrfs_set_root_rtransid(root_item, sa->rtransid);
	btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
	btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
	btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
	btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);
	if (ret < 0) {
		/* Note: this path ends the transaction without aborting it. */
		btrfs_end_transaction(trans);
		goto out;
	}
	/* Register the new received uuid, unless it is the empty uuid. */
	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
		ret = btrfs_uuid_tree_add(trans, sa->uuid,
					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					  root->root_key.objectid);
		/* An already existing entry (-EEXIST) is tolerated. */
		if (ret < 0 && ret != -EEXIST) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			goto out;
		}
	}
	ret = btrfs_commit_transaction(trans);
out:
	up_write(&fs_info->subvol_sem);
	mnt_drop_write_file(file);
	return ret;
}
4881
#ifdef CONFIG_64BIT
/*
 * Variant of the set-received-subvol ioctl for 32-bit user space on a 64-bit
 * kernel: the user structure uses the packed 32-bit layout, so it is
 * converted field by field to the native structure, handed to
 * _btrfs_ioctl_set_received_subvol(), and converted back on success.
 *
 * Returns 0 on success, -ENOMEM, -EFAULT or the error from the helper.
 */
static long btrfs_ioctl_set_received_subvol_32(struct file *file,
						void __user *arg)
{
	struct btrfs_ioctl_received_subvol_args_32 *compat;
	struct btrfs_ioctl_received_subvol_args *native;
	int ret;

	compat = memdup_user(arg, sizeof(*compat));
	if (IS_ERR(compat))
		return PTR_ERR(compat);

	native = kmalloc(sizeof(*native), GFP_KERNEL);
	if (!native) {
		kfree(compat);
		return -ENOMEM;
	}

	/* 32-bit layout -> native layout */
	memcpy(native->uuid, compat->uuid, BTRFS_UUID_SIZE);
	native->stransid = compat->stransid;
	native->rtransid = compat->rtransid;
	native->stime.sec = compat->stime.sec;
	native->stime.nsec = compat->stime.nsec;
	native->rtime.sec = compat->rtime.sec;
	native->rtime.nsec = compat->rtime.nsec;
	native->flags = compat->flags;

	ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file),
					       native);
	if (!ret) {
		/* native layout -> 32-bit layout, then back to the caller */
		memcpy(compat->uuid, native->uuid, BTRFS_UUID_SIZE);
		compat->stransid = native->stransid;
		compat->rtransid = native->rtransid;
		compat->stime.sec = native->stime.sec;
		compat->stime.nsec = native->stime.nsec;
		compat->rtime.sec = native->rtime.sec;
		compat->rtime.nsec = native->rtime.nsec;
		compat->flags = native->flags;

		if (copy_to_user(arg, compat, sizeof(*compat)))
			ret = -EFAULT;
	}

	kfree(compat);
	kfree(native);
	return ret;
}
#endif
4932
4933static long btrfs_ioctl_set_received_subvol(struct file *file,
4934                                            void __user *arg)
4935{
4936        struct btrfs_ioctl_received_subvol_args *sa = NULL;
4937        int ret = 0;
4938
4939        sa = memdup_user(arg, sizeof(*sa));
4940        if (IS_ERR(sa))
4941                return PTR_ERR(sa);
4942
4943        ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
4944
4945        if (ret)
4946                goto out;
4947
4948        ret = copy_to_user(arg, sa, sizeof(*sa));
4949        if (ret)
4950                ret = -EFAULT;
4951
4952out:
4953        kfree(sa);
4954        return ret;
4955}
4956
4957static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
4958                                        void __user *arg)
4959{
4960        size_t len;
4961        int ret;
4962        char label[BTRFS_LABEL_SIZE];
4963
4964        spin_lock(&fs_info->super_lock);
4965        memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
4966        spin_unlock(&fs_info->super_lock);
4967
4968        len = strnlen(label, BTRFS_LABEL_SIZE);
4969
4970        if (len == BTRFS_LABEL_SIZE) {
4971                btrfs_warn(fs_info,
4972                           "label is too long, return the first %zu bytes",
4973                           --len);
4974        }
4975
4976        ret = copy_to_user(arg, label, len);
4977
4978        return ret ? -EFAULT : 0;
4979}
4980
4981static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4982{
4983        struct inode *inode = file_inode(file);
4984        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4985        struct btrfs_root *root = BTRFS_I(inode)->root;
4986        struct btrfs_super_block *super_block = fs_info->super_copy;
4987        struct btrfs_trans_handle *trans;
4988        char label[BTRFS_LABEL_SIZE];
4989        int ret;
4990
4991        if (!capable(CAP_SYS_ADMIN))
4992                return -EPERM;
4993
4994        if (copy_from_user(label, arg, sizeof(label)))
4995                return -EFAULT;
4996
4997        if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
4998                btrfs_err(fs_info,
4999                          "unable to set label with more than %d bytes",
5000                          BTRFS_LABEL_SIZE - 1);

5001                return -EINVAL;
5002        }
5003
5004        ret = mnt_want_write_file(file);
5005        if (ret)
5006                return ret;
5007
5008        trans = btrfs_start_transaction(root, 0);
5009        if (IS_ERR(trans)) {
5010                ret = PTR_ERR(trans);
5011                goto out_unlock;
5012        }
5013
5014        spin_lock(&fs_info->super_lock);
5015        strcpy(super_block->label, label);
5016        spin_unlock(&fs_info->super_lock);
5017        ret = btrfs_commit_transaction(trans);
5018
5019out_unlock:
5020        mnt_drop_write_file(file);
5021        return ret;
5022}
5023
/*
 * Initializer for one struct btrfs_ioctl_feature_flags entry, built from the
 * BTRFS_FEATURE_COMPAT_<suffix>, BTRFS_FEATURE_COMPAT_RO_<suffix> and
 * BTRFS_FEATURE_INCOMPAT_<suffix> masks.
 */
#define INIT_FEATURE_FLAGS(suffix)                                      \
        {                                                               \
                .compat_flags = BTRFS_FEATURE_COMPAT_##suffix,          \
                .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix,    \
                .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix       \
        }
5028
5029int btrfs_ioctl_get_supported_features(void __user *arg)
5030{
5031        static const struct btrfs_ioctl_feature_flags features[3] = {
5032                INIT_FEATURE_FLAGS(SUPP),
5033                INIT_FEATURE_FLAGS(SAFE_SET),
5034                INIT_FEATURE_FLAGS(SAFE_CLEAR)
5035        };
5036
5037        if (copy_to_user(arg, &features, sizeof(features)))
5038                return -EFAULT;
5039
5040        return 0;
5041}
5042
5043static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
5044                                        void __user *arg)
5045{
5046        struct btrfs_super_block *super_block = fs_info->super_copy;
5047        struct btrfs_ioctl_feature_flags features;
5048
5049        features.compat_flags = btrfs_super_compat_flags(super_block);
5050        features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
5051        features.incompat_flags = btrfs_super_incompat_flags(super_block);
5052
5053        if (copy_to_user(arg, &features, sizeof(features)))
5054                return -EFAULT;
5055
5056        return 0;
5057}
5058
5059static int check_feature_bits(struct btrfs_fs_info *fs_info,
5060                              enum btrfs_feature_set set,
5061                              u64 change_mask, u64 flags, u64 supported_flags,
5062                              u64 safe_set, u64 safe_clear)
5063{
5064        const char *type = btrfs_feature_set_name(set);
5065        char *names;
5066        u64 disallowed, unsupported;
5067        u64 set_mask = flags & change_mask;
5068        u64 clear_mask = ~flags & change_mask;
5069
5070        unsupported = set_mask & ~supported_flags;
5071        if (unsupported) {
5072                names = btrfs_printable_features(set, unsupported);
5073                if (names) {
5074                        btrfs_warn(fs_info,
5075                                   "this kernel does not support the %s feature bit%s",
5076                                   names, strchr(names, ',') ? "s" : "");
5077                        kfree(names);
5078                } else
5079                        btrfs_warn(fs_info,
5080                                   "this kernel does not support %s bits 0x%llx",
5081                                   type, unsupported);
5082                return -EOPNOTSUPP;
5083        }
5084
5085        disallowed = set_mask & ~safe_set;
5086        if (disallowed) {
5087                names = btrfs_printable_features(set, disallowed);
5088                if (names) {
5089                        btrfs_warn(fs_info,
5090                                   "can't set the %s feature bit%s while mounted",
5091                                   names, strchr(names, ',') ? "s" : "");
5092                        kfree(names);
5093                } else
5094                        btrfs_warn(fs_info,
5095                                   "can't set %s bits 0x%llx while mounted",
5096                                   type, disallowed);
5097                return -EPERM;
5098        }
5099
5100        disallowed = clear_mask & ~safe_clear;
5101        if (disallowed) {
5102                names = btrfs_printable_features(set, disallowed);
5103                if (names) {
5104                        btrfs_warn(fs_info,
5105                                   "can't clear the %s feature bit%s while mounted",
5106                                   names, strchr(names, ',') ? "s" : "");
5107                        kfree(names);
5108                } else
5109                        btrfs_warn(fs_info,
5110                                   "can't clear %s bits 0x%llx while mounted",
5111                                   type, disallowed);
5112                return -EPERM;
5113        }
5114
5115        return 0;
5116}
5117
/*
 * Call check_feature_bits() for the flag set named by mask_base (COMPAT,
 * COMPAT_RO or INCOMPAT), supplying the matching FEAT_<mask_base> selector
 * and the BTRFS_FEATURE_<mask_base>_{SUPP,SAFE_SET,SAFE_CLEAR} masks.
 */
#define check_feature(fs_info, change_mask, flags, mask_base)           \
        check_feature_bits(fs_info, FEAT_##mask_base,                   \
                           change_mask, flags,                          \
                           BTRFS_FEATURE_ ## mask_base ## _SUPP,        \
                           BTRFS_FEATURE_ ## mask_base ## _SAFE_SET,    \
                           BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
5123
5124static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
5125{
5126        struct inode *inode = file_inode(file);
5127        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5128        struct btrfs_root *root = BTRFS_I(inode)->root;
5129        struct btrfs_super_block *super_block = fs_info->super_copy;
5130        struct btrfs_ioctl_feature_flags flags[2];
5131        struct btrfs_trans_handle *trans;
5132        u64 newflags;
5133        int ret;
5134
5135        if (!capable(CAP_SYS_ADMIN))
5136                return -EPERM;
5137
5138        if (copy_from_user(flags, arg, sizeof(flags)))
5139                return -EFAULT;
5140
5141        /* Nothing to do */
5142        if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
5143            !flags[0].incompat_flags)
5144                return 0;
5145
5146        ret = check_feature(fs_info, flags[0].compat_flags,
5147                            flags[1].compat_flags, COMPAT);
5148        if (ret)
5149                return ret;
5150
5151        ret = check_feature(fs_info, flags[0].compat_ro_flags,
5152                            flags[1].compat_ro_flags, COMPAT_RO);
5153        if (ret)
5154                return ret;
5155
5156        ret = check_feature(fs_info, flags[0].incompat_flags,
5157                            flags[1].incompat_flags, INCOMPAT);
5158        if (ret)
5159                return ret;
5160
5161        ret = mnt_want_write_file(file);
5162        if (ret)
5163                return ret;
5164
5165        trans = btrfs_start_transaction(root, 0);
5166        if (IS_ERR(trans)) {
5167                ret = PTR_ERR(trans);
5168                goto out_drop_write;
5169        }
5170
5171        spin_lock(&fs_info->super_lock);
5172        newflags = btrfs_super_compat_flags(super_block);
5173        newflags |= flags[0].compat_flags & flags[1].compat_flags;
5174        newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
5175        btrfs_set_super_compat_flags(super_block, newflags);
5176
5177        newflags = btrfs_super_compat_ro_flags(super_block);
5178        newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
5179        newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
5180        btrfs_set_super_compat_ro_flags(super_block, newflags);
5181
5182        newflags = btrfs_super_incompat_flags(super_block);
5183        newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
5184        newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
5185        btrfs_set_super_incompat_flags(super_block, newflags);
5186        spin_unlock(&fs_info->super_lock);
5187
5188        ret = btrfs_commit_transaction(trans);
5189out_drop_write:
5190        mnt_drop_write_file(file);
5191
5192        return ret;
5193}
5194
5195static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
5196{
5197        struct btrfs_ioctl_send_args *arg;
5198        int ret;
5199
5200        if (compat) {
5201#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5202                struct btrfs_ioctl_send_args_32 args32;
5203
5204                ret = copy_from_user(&args32, argp, sizeof(args32));
5205                if (ret)
5206                        return -EFAULT;
5207                arg = kzalloc(sizeof(*arg), GFP_KERNEL);
5208                if (!arg)
5209                        return -ENOMEM;
5210                arg->send_fd = args32.send_fd;
5211                arg->clone_sources_count = args32.clone_sources_count;
5212                arg->clone_sources = compat_ptr(args32.clone_sources);
5213                arg->parent_root = args32.parent_root;
5214                arg->flags = args32.flags;
5215                memcpy(arg->reserved, args32.reserved,
5216                       sizeof(args32.reserved));
5217#else
5218                return -ENOTTY;
5219#endif
5220        } else {
5221                arg = memdup_user(argp, sizeof(*arg));
5222                if (IS_ERR(arg))
5223                        return PTR_ERR(arg);
5224        }
5225        ret = btrfs_ioctl_send(inode, arg);
5226        kfree(arg);
5227        return ret;
5228}
5229
5230static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
5231                                    bool compat)
5232{
5233        struct btrfs_ioctl_encoded_io_args args = { 0 };
5234        size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
5235                                             flags);
5236        size_t copy_end;
5237        struct iovec iovstack[UIO_FASTIOV];
5238        struct iovec *iov = iovstack;
5239        struct iov_iter iter;
5240        loff_t pos;
5241        struct kiocb kiocb;
5242        ssize_t ret;
5243
5244        if (!capable(CAP_SYS_ADMIN)) {
5245                ret = -EPERM;
5246                goto out_acct;
5247        }
5248
5249        if (compat) {
5250#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5251                struct btrfs_ioctl_encoded_io_args_32 args32;
5252
5253                copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
5254                                       flags);
5255                if (copy_from_user(&args32, argp, copy_end)) {
5256                        ret = -EFAULT;
5257                        goto out_acct;
5258                }
5259                args.iov = compat_ptr(args32.iov);
5260                args.iovcnt = args32.iovcnt;
5261                args.offset = args32.offset;
5262                args.flags = args32.flags;
5263#else
5264                return -ENOTTY;
5265#endif
5266        } else {
5267                copy_end = copy_end_kernel;
5268                if (copy_from_user(&args, argp, copy_end)) {
5269                        ret = -EFAULT;
5270                        goto out_acct;
5271                }
5272        }
5273        if (args.flags != 0) {
5274                ret = -EINVAL;
5275                goto out_acct;
5276        }
5277
5278        ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
5279                           &iov, &iter);
5280        if (ret < 0)
5281                goto out_acct;
5282
5283        if (iov_iter_count(&iter) == 0) {
5284                ret = 0;
5285                goto out_iov;
5286        }
5287        pos = args.offset;
5288        ret = rw_verify_area(READ, file, &pos, args.len);
5289        if (ret < 0)
5290                goto out_iov;
5291
5292        init_sync_kiocb(&kiocb, file);
5293        kiocb.ki_pos = pos;
5294
5295        ret = btrfs_encoded_read(&kiocb, &iter, &args);
5296        if (ret >= 0) {
5297                fsnotify_access(file);
5298                if (copy_to_user(argp + copy_end,
5299                                 (char *)&args + copy_end_kernel,
5300                                 sizeof(args) - copy_end_kernel))
5301                        ret = -EFAULT;
5302        }
5303
5304out_iov:
5305        kfree(iov);
5306out_acct:
5307        if (ret > 0)
5308                add_rchar(current, ret);
5309        inc_syscr(current);
5310        return ret;
5311}
5312
5313static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
5314{
5315        struct btrfs_ioctl_encoded_io_args args;
5316        struct iovec iovstack[UIO_FASTIOV];
5317        struct iovec *iov = iovstack;
5318        struct iov_iter iter;
5319        loff_t pos;
5320        struct kiocb kiocb;
5321        ssize_t ret;
5322
5323        if (!capable(CAP_SYS_ADMIN)) {
5324                ret = -EPERM;
5325                goto out_acct;
5326        }
5327
5328        if (!(file->f_mode & FMODE_WRITE)) {
5329                ret = -EBADF;
5330                goto out_acct;
5331        }
5332
5333        if (compat) {
5334#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5335                struct btrfs_ioctl_encoded_io_args_32 args32;
5336
5337                if (copy_from_user(&args32, argp, sizeof(args32))) {
5338                        ret = -EFAULT;
5339                        goto out_acct;
5340                }
5341                args.iov = compat_ptr(args32.iov);
5342                args.iovcnt = args32.iovcnt;
5343                args.offset = args32.offset;
5344                args.flags = args32.flags;
5345                args.len = args32.len;
5346                args.unencoded_len = args32.unencoded_len;
5347                args.unencoded_offset = args32.unencoded_offset;
5348                args.compression = args32.compression;
5349                args.encryption = args32.encryption;
5350                memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
5351#else
5352                return -ENOTTY;
5353#endif
5354        } else {
5355                if (copy_from_user(&args, argp, sizeof(args))) {
5356                        ret = -EFAULT;
5357                        goto out_acct;
5358                }
5359        }
5360
5361        ret = -EINVAL;
5362        if (args.flags != 0)
5363                goto out_acct;
5364        if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
5365                goto out_acct;
5366        if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
5367            args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
5368                goto out_acct;
5369        if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
5370            args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
5371                goto out_acct;
5372        if (args.unencoded_offset > args.unencoded_len)
5373                goto out_acct;
5374        if (args.len > args.unencoded_len - args.unencoded_offset)
5375                goto out_acct;
5376
5377        ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
5378                           &iov, &iter);
5379        if (ret < 0)
5380                goto out_acct;
5381
5382        file_start_write(file);
5383
5384        if (iov_iter_count(&iter) == 0) {
5385                ret = 0;
5386                goto out_end_write;
5387        }
5388        pos = args.offset;
5389        ret = rw_verify_area(WRITE, file, &pos, args.len);
5390        if (ret < 0)
5391                goto out_end_write;
5392
5393        init_sync_kiocb(&kiocb, file);
5394        ret = kiocb_set_rw_flags(&kiocb, 0);
5395        if (ret)
5396                goto out_end_write;
5397        kiocb.ki_pos = pos;
5398
5399        ret = btrfs_do_write_iter(&kiocb, &iter, &args);
5400        if (ret > 0)
5401                fsnotify_modify(file);
5402
5403out_end_write:
5404        file_end_write(file);
5405        kfree(iov);
5406out_acct:
5407        if (ret > 0)
5408                add_wchar(current, ret);
5409        inc_syscw(current);
5410        return ret;
5411}
5412
5413long btrfs_ioctl(struct file *file, unsigned int
5414                cmd, unsigned long arg)
5415{
5416        struct inode *inode = file_inode(file);
5417        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5418        struct btrfs_root *root = BTRFS_I(inode)->root;
5419        void __user *argp = (void __user *)arg;
5420
5421        switch (cmd) {
5422        case FS_IOC_GETVERSION:
5423                return btrfs_ioctl_getversion(inode, argp);
5424        case FS_IOC_GETFSLABEL:
5425                return btrfs_ioctl_get_fslabel(fs_info, argp);
5426        case FS_IOC_SETFSLABEL:
5427                return btrfs_ioctl_set_fslabel(file, argp);
5428        case FITRIM:
5429                return btrfs_ioctl_fitrim(fs_info, argp);
5430        case BTRFS_IOC_SNAP_CREATE:
5431                return btrfs_ioctl_snap_create(file, argp, 0);
5432        case BTRFS_IOC_SNAP_CREATE_V2:
5433                return btrfs_ioctl_snap_create_v2(file, argp, 0);
5434        case BTRFS_IOC_SUBVOL_CREATE:
5435                return btrfs_ioctl_snap_create(file, argp, 1);
5436        case BTRFS_IOC_SUBVOL_CREATE_V2:
5437                return btrfs_ioctl_snap_create_v2(file, argp, 1);
5438        case BTRFS_IOC_SNAP_DESTROY:
5439                return btrfs_ioctl_snap_destroy(file, argp, false);
5440        case BTRFS_IOC_SNAP_DESTROY_V2:
5441                return btrfs_ioctl_snap_destroy(file, argp, true);
5442        case BTRFS_IOC_SUBVOL_GETFLAGS:
5443                return btrfs_ioctl_subvol_getflags(inode, argp);
5444        case BTRFS_IOC_SUBVOL_SETFLAGS:
5445                return btrfs_ioctl_subvol_setflags(file, argp);
5446        case BTRFS_IOC_DEFAULT_SUBVOL:
5447                return btrfs_ioctl_default_subvol(file, argp);
5448        case BTRFS_IOC_DEFRAG:
5449                return btrfs_ioctl_defrag(file, NULL);
5450        case BTRFS_IOC_DEFRAG_RANGE:
5451                return btrfs_ioctl_defrag(file, argp);
5452        case BTRFS_IOC_RESIZE:
5453                return btrfs_ioctl_resize(file, argp);
5454        case BTRFS_IOC_ADD_DEV:
5455                return btrfs_ioctl_add_dev(fs_info, argp);
5456        case BTRFS_IOC_RM_DEV:
5457                return btrfs_ioctl_rm_dev(file, argp);
5458        case BTRFS_IOC_RM_DEV_V2:
5459                return btrfs_ioctl_rm_dev_v2(file, argp);
5460        case BTRFS_IOC_FS_INFO:
5461                return btrfs_ioctl_fs_info(fs_info, argp);
5462        case BTRFS_IOC_DEV_INFO:
5463                return btrfs_ioctl_dev_info(fs_info, argp);
5464        case BTRFS_IOC_TREE_SEARCH:
5465                return btrfs_ioctl_tree_search(inode, argp);
5466        case BTRFS_IOC_TREE_SEARCH_V2:
5467                return btrfs_ioctl_tree_search_v2(inode, argp);
5468        case BTRFS_IOC_INO_LOOKUP:
5469                return btrfs_ioctl_ino_lookup(root, argp);
5470        case BTRFS_IOC_INO_PATHS:
5471                return btrfs_ioctl_ino_to_path(root, argp);
5472        case BTRFS_IOC_LOGICAL_INO:
5473                return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
5474        case BTRFS_IOC_LOGICAL_INO_V2:
5475                return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
5476        case BTRFS_IOC_SPACE_INFO:
5477                return btrfs_ioctl_space_info(fs_info, argp);
5478        case BTRFS_IOC_SYNC: {
5479                int ret;
5480
5481                ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
5482                if (ret)
5483                        return ret;
5484                ret = btrfs_sync_fs(inode->i_sb, 1);
5485                /*
5486                 * The transaction thread may want to do more work,
5487                 * namely it pokes the cleaner kthread that will start
5488                 * processing uncleaned subvols.
5489                 */
5490                wake_up_process(fs_info->transaction_kthread);
5491                return ret;
5492        }
5493        case BTRFS_IOC_START_SYNC:
5494                return btrfs_ioctl_start_sync(root, argp);
5495        case BTRFS_IOC_WAIT_SYNC:
5496                return btrfs_ioctl_wait_sync(fs_info, argp);
5497        case BTRFS_IOC_SCRUB:
5498                return btrfs_ioctl_scrub(file, argp);
5499        case BTRFS_IOC_SCRUB_CANCEL:
5500                return btrfs_ioctl_scrub_cancel(fs_info);
5501        case BTRFS_IOC_SCRUB_PROGRESS:
5502                return btrfs_ioctl_scrub_progress(fs_info, argp);
5503        case BTRFS_IOC_BALANCE_V2:
5504                return btrfs_ioctl_balance(file, argp);
5505        case BTRFS_IOC_BALANCE_CTL:
5506                return btrfs_ioctl_balance_ctl(fs_info, arg);
5507        case BTRFS_IOC_BALANCE_PROGRESS:
5508                return btrfs_ioctl_balance_progress(fs_info, argp);
5509        case BTRFS_IOC_SET_RECEIVED_SUBVOL:
5510                return btrfs_ioctl_set_received_subvol(file, argp);
5511#ifdef CONFIG_64BIT
5512        case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
5513                return btrfs_ioctl_set_received_subvol_32(file, argp);
5514#endif
5515        case BTRFS_IOC_SEND:
5516                return _btrfs_ioctl_send(inode, argp, false);
5517#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5518        case BTRFS_IOC_SEND_32:
5519                return _btrfs_ioctl_send(inode, argp, true);
5520#endif
5521        case BTRFS_IOC_GET_DEV_STATS:
5522                return btrfs_ioctl_get_dev_stats(fs_info, argp);
5523        case BTRFS_IOC_QUOTA_CTL:
5524                return btrfs_ioctl_quota_ctl(file, argp);
5525        case BTRFS_IOC_QGROUP_ASSIGN:
5526                return btrfs_ioctl_qgroup_assign(file, argp);
5527        case BTRFS_IOC_QGROUP_CREATE:
5528                return btrfs_ioctl_qgroup_create(file, argp);
5529        case BTRFS_IOC_QGROUP_LIMIT:
5530                return btrfs_ioctl_qgroup_limit(file, argp);
5531        case BTRFS_IOC_QUOTA_RESCAN:
5532                return btrfs_ioctl_quota_rescan(file, argp);
5533        case BTRFS_IOC_QUOTA_RESCAN_STATUS:
5534                return btrfs_ioctl_quota_rescan_status(fs_info, argp);
5535        case BTRFS_IOC_QUOTA_RESCAN_WAIT:
5536                return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
5537        case BTRFS_IOC_DEV_REPLACE:
5538                return btrfs_ioctl_dev_replace(fs_info, argp);
5539        case BTRFS_IOC_GET_SUPPORTED_FEATURES:
5540                return btrfs_ioctl_get_supported_features(argp);
5541        case BTRFS_IOC_GET_FEATURES:
5542                return btrfs_ioctl_get_features(fs_info, argp);
5543        case BTRFS_IOC_SET_FEATURES:
5544                return btrfs_ioctl_set_features(file, argp);
5545        case BTRFS_IOC_GET_SUBVOL_INFO:
5546                return btrfs_ioctl_get_subvol_info(inode, argp);
5547        case BTRFS_IOC_GET_SUBVOL_ROOTREF:
5548                return btrfs_ioctl_get_subvol_rootref(root, argp);
5549        case BTRFS_IOC_INO_LOOKUP_USER:
5550                return btrfs_ioctl_ino_lookup_user(file, argp);
5551        case FS_IOC_ENABLE_VERITY:
5552                return fsverity_ioctl_enable(file, (const void __user *)argp);
5553        case FS_IOC_MEASURE_VERITY:
5554                return fsverity_ioctl_measure(file, argp);
5555        case BTRFS_IOC_ENCODED_READ:
5556                return btrfs_ioctl_encoded_read(file, argp, false);
5557        case BTRFS_IOC_ENCODED_WRITE:
5558                return btrfs_ioctl_encoded_write(file, argp, false);
5559#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5560        case BTRFS_IOC_ENCODED_READ_32:
5561                return btrfs_ioctl_encoded_read(file, argp, true);
5562        case BTRFS_IOC_ENCODED_WRITE_32:
5563                return btrfs_ioctl_encoded_write(file, argp, true);
5564#endif
5565        }
5566
5567        return -ENOTTY;
5568}
5569
#ifdef CONFIG_COMPAT
/*
 * ioctl entry point for compat callers.
 *
 * FS_IOC32_GETVERSION is rewritten to FS_IOC_GETVERSION; every other command
 * number is passed through unchanged.  @arg is converted with compat_ptr()
 * before being handed to btrfs_ioctl().
 */
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        /*
         * This command accesses a 32-bit value anyway, so translating the
         * command number is all the handling that is necessary.
         */
        if (cmd == FS_IOC32_GETVERSION)
                cmd = FS_IOC_GETVERSION;

        return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
5586