linux/fs/btrfs/btrfs_inode.h
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0 */
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#ifndef BTRFS_INODE_H
   7#define BTRFS_INODE_H
   8
   9#include <linux/hash.h>
  10#include <linux/refcount.h>
  11#include "extent_map.h"
  12#include "extent_io.h"
  13#include "ordered-data.h"
  14#include "delayed-inode.h"
  15
/*
 * ordered_data_close is set by truncate when a file that used
 * to have good data has been truncated to zero.  When it is set
 * the btrfs file release call will add this inode to the
 * ordered operations list so that we make sure to flush out any
 * new data the application may have written before commit.
 *
 * NOTE(review): no identifier named ordered_data_close is visible in this
 * header; the paragraph above presumably describes
 * BTRFS_INODE_FLUSH_ON_CLOSE - confirm and update the name.
 *
 * The enumerators below are presumably bit numbers used with
 * btrfs_inode::runtime_flags - TODO confirm against the users. Their order
 * determines their values, so new entries should be appended.
 */
enum {
        BTRFS_INODE_FLUSH_ON_CLOSE,
        BTRFS_INODE_DUMMY,
        BTRFS_INODE_IN_DEFRAG,
        BTRFS_INODE_HAS_ASYNC_EXTENT,
        /*
         * Always set under the VFS' inode lock, otherwise it can cause races
         * during fsync (we start as a fast fsync and then end up in a full
         * fsync racing with ordered extent completion).
         */
        BTRFS_INODE_NEEDS_FULL_SYNC,
        BTRFS_INODE_COPY_EVERYTHING,
        BTRFS_INODE_IN_DELALLOC_LIST,
        BTRFS_INODE_HAS_PROPS,
        BTRFS_INODE_SNAPSHOT_FLUSH,
        /*
         * Set and used when logging an inode and it serves to signal that an
         * inode does not have xattrs, so subsequent fsyncs can avoid searching
         * for xattrs to log. This bit must be cleared whenever a xattr is added
         * to an inode.
         */
        BTRFS_INODE_NO_XATTRS,
        /*
         * Set when we are in a context where we need to start a transaction and
         * have dirty pages with the respective file range locked. This is to
         * ensure that when reserving space for the transaction, if we are low
         * on available space and need to flush delalloc, we will not flush
         * delalloc for this inode, because that could result in a deadlock (on
         * the file range, inode's io_tree).
         */
        BTRFS_INODE_NO_DELALLOC_FLUSH,
        /*
         * Set when we are working on enabling verity for a file. Computing and
         * writing the whole Merkle tree can take a while so we want to prevent
         * races where two separate tasks attempt to simultaneously start verity
         * on the same file.
         */
        BTRFS_INODE_VERITY_IN_PROGRESS,
};
  62
/*
 * In-memory btrfs inode.
 *
 * Embeds the VFS inode as the vfs_inode member; BTRFS_I() recovers the
 * containing btrfs_inode from a struct inode pointer with container_of().
 * Member order defines the structure layout, so do not reorder fields
 * casually.
 */
struct btrfs_inode {
        /* which subvolume this inode belongs to */
        struct btrfs_root *root;

        /* key used to find this inode on disk.  This is used by the code
         * to read in roots of subvolumes
         */
        struct btrfs_key location;

        /*
         * Lock for counters and all fields used to determine if the inode is in
         * the log or not (last_trans, last_sub_trans, last_log_commit,
         * logged_trans), to access/update new_delalloc_bytes and to update the
         * VFS' inode number of bytes used.
         */
        spinlock_t lock;

        /* the extent_tree has caches of all the extent mappings to disk */
        struct extent_map_tree extent_tree;

        /* the io_tree does range state (DIRTY, LOCKED etc) */
        struct extent_io_tree io_tree;

        /* special utility tree used to record which mirrors have already been
         * tried when checksums fail for a given block
         */
        struct extent_io_tree io_failure_tree;

        /*
         * Keep track of where the inode has extent items mapped in order to
         * make sure the i_size adjustments are accurate
         */
        struct extent_io_tree file_extent_tree;

        /* held while logging the inode in tree-log.c */
        struct mutex log_mutex;

        /* used to order data wrt metadata */
        struct btrfs_ordered_inode_tree ordered_tree;

        /* list of all the delalloc inodes in the FS.  There are times we need
         * to write all the delalloc pages to disk, and this list is used
         * to walk them all.
         */
        struct list_head delalloc_inodes;

        /* node for the red-black tree that links inodes in subvolume root */
        struct rb_node rb_node;

        /*
         * Presumably a bitmap indexed by the BTRFS_INODE_* enumerators declared
         * in this header - TODO confirm against the users.
         */
        unsigned long runtime_flags;

        /* Keep track of who's O_SYNC/fsyncing currently */
        atomic_t sync_writers;

        /* full 64 bit generation number, the VFS inode (struct inode) doesn't
         * have a big enough field for this.
         */
        u64 generation;

        /*
         * transid of the trans_handle that last modified this inode
         */
        u64 last_trans;

        /*
         * transid that last logged this inode
         */
        u64 logged_trans;

        /*
         * log transid when this inode was last modified
         */
        int last_sub_trans;

        /* a local copy of root's last_log_commit */
        int last_log_commit;

        /* total number of bytes pending delalloc, used by stat to calc the
         * real block usage of the file
         */
        u64 delalloc_bytes;

        /*
         * Total number of bytes pending delalloc that fall within a file
         * range that is either a hole or beyond EOF (and no prealloc extent
         * exists in the range). This is always <= delalloc_bytes.
         */
        u64 new_delalloc_bytes;

        /*
         * total number of bytes pending defrag, used by stat to check whether
         * it needs COW.
         */
        u64 defrag_bytes;

        /*
         * the size of the file stored in the metadata on disk.  data=ordered
         * means the in-memory i_size might be larger than the size on disk
         * because not all the blocks are written yet.
         */
        u64 disk_i_size;

        /*
         * if this is a directory then index_cnt is the counter for the index
         * number for new files that are created
         */
        u64 index_cnt;

        /* Cache the directory index number to speed the dir/file remove */
        u64 dir_index;

        /* the fsync log has some corner cases that mean we have to check
         * directories to see if any unlinks have been done before
         * the directory was logged.  See tree-log.c for all the
         * details
         */
        u64 last_unlink_trans;

        /*
         * The id/generation of the last transaction where this inode was
         * either the source or the destination of a clone/dedupe operation.
         * Used when logging an inode to know if there are shared extents that
         * need special care when logging checksum items, to avoid duplicate
         * checksum items in a log (which can lead to a corruption where we end
         * up with missing checksum ranges after log replay).
         * Protected by the vfs inode lock.
         */
        u64 last_reflink_trans;

        /*
         * Number of bytes outstanding that are going to need csums.  This is
         * used in ENOSPC accounting.
         */
        u64 csum_bytes;

        /* Backwards incompatible flags, lower half of inode_item::flags  */
        u32 flags;
        /* Read-only compatibility flags, upper half of inode_item::flags */
        u32 ro_flags;

        /*
         * Counter to keep track of the number of extent items we may use due
         * to delalloc and such.  outstanding_extents is the number of extent
         * items we think we'll end up using.  It is modified through
         * btrfs_mod_outstanding_extents(), which asserts that ->lock is held.
         *
         * NOTE(review): older wording of this comment also described a
         * reserved_extents counter; no such member exists in this structure.
         */
        unsigned outstanding_extents;

        /*
         * NOTE(review): presumably the per-inode metadata space reservation
         * backing outstanding_extents - confirm against the users.
         */
        struct btrfs_block_rsv block_rsv;

        /*
         * Cached values of inode properties
         */
        unsigned prop_compress;         /* per-file compression algorithm */
        /*
         * Force compression on the file using the defrag ioctl, could be
         * different from prop_compress and takes precedence if set
         */
        unsigned defrag_compress;

        /*
         * NOTE(review): presumably the delayed-inode node associated with this
         * inode (see delayed-inode.h) - confirm.
         */
        struct btrfs_delayed_node *delayed_node;

        /* File creation time. */
        struct timespec64 i_otime;

        /* Hook into fs_info->delayed_iputs */
        struct list_head delayed_iput;

        /*
         * NOTE(review): usage is not visible in this header; presumably
         * serializes memory mapped access against other operations - confirm.
         */
        struct rw_semaphore i_mmap_lock;
        /* Embedded VFS inode; BTRFS_I() relies on this member's name. */
        struct inode vfs_inode;
};
 235
 236static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
 237{
 238        return inode->root->fs_info->sectorsize;
 239}
 240
 241static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 242{
 243        return container_of(inode, struct btrfs_inode, vfs_inode);
 244}
 245
 246static inline unsigned long btrfs_inode_hash(u64 objectid,
 247                                             const struct btrfs_root *root)
 248{
 249        u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
 250
 251#if BITS_PER_LONG == 32
 252        h = (h >> 32) ^ (h & 0xffffffff);
 253#endif
 254
 255        return (unsigned long)h;
 256}
 257
 258static inline void btrfs_insert_inode_hash(struct inode *inode)
 259{
 260        unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
 261
 262        __insert_inode_hash(inode, h);
 263}
 264
 265static inline u64 btrfs_ino(const struct btrfs_inode *inode)
 266{
 267        u64 ino = inode->location.objectid;
 268
 269        /*
 270         * !ino: btree_inode
 271         * type == BTRFS_ROOT_ITEM_KEY: subvol dir
 272         */
 273        if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
 274                ino = inode->vfs_inode.i_ino;
 275        return ino;
 276}
 277
 278static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
 279{
 280        i_size_write(&inode->vfs_inode, size);
 281        inode->disk_i_size = size;
 282}
 283
 284static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
 285{
 286        struct btrfs_root *root = inode->root;
 287
 288        if (root == root->fs_info->tree_root &&
 289            btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
 290                return true;
 291        if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
 292                return true;
 293        return false;
 294}
 295
 296static inline bool is_data_inode(struct inode *inode)
 297{
 298        return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
 299}
 300
 301static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
 302                                                 int mod)
 303{
 304        lockdep_assert_held(&inode->lock);
 305        inode->outstanding_extents += mod;
 306        if (btrfs_is_free_space_inode(inode))
 307                return;
 308        trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
 309                                                  mod);
 310}
 311
 312/*
 313 * Called every time after doing a buffered, direct IO or memory mapped write.
 314 *
 315 * This is to ensure that if we write to a file that was previously fsynced in
 316 * the current transaction, then try to fsync it again in the same transaction,
 317 * we will know that there were changes in the file and that it needs to be
 318 * logged.
 319 */
 320static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
 321{
 322        spin_lock(&inode->lock);
 323        inode->last_sub_trans = inode->root->log_transid;
 324        spin_unlock(&inode->lock);
 325}
 326
 327static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 328{
 329        bool ret = false;
 330
 331        spin_lock(&inode->lock);
 332        if (inode->logged_trans == generation &&
 333            inode->last_sub_trans <= inode->last_log_commit &&
 334            inode->last_sub_trans <= inode->root->last_log_commit)
 335                ret = true;
 336        spin_unlock(&inode->lock);
 337        return ret;
 338}
 339
/*
 * Private state for a btrfs direct IO (dio) request.
 *
 * NOTE(review): allocation and lifetime are not visible in this header. The
 * structure ends in a flexible array member (csums), so its storage has to be
 * allocated together with the space for the checksums.
 */
struct btrfs_dio_private {
        /* NOTE(review): presumably the inode the IO targets - confirm */
        struct inode *inode;
        u64 logical_offset;
        u64 disk_bytenr;
        /* Used for bio::bi_size */
        u32 bytes;

        /*
         * References to this structure. There is one reference per in-flight
         * bio plus one while we're still setting up.
         */
        refcount_t refs;

        /* dio_bio came from fs/direct-io.c */
        struct bio *dio_bio;

        /* Array of checksums (flexible array member, must stay last) */
        u8 csums[];
};
 359
 360/*
 361 * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
 362 * separate u32s. These two functions convert between the two representations.
 363 */
 364static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
 365{
 366        return (flags | ((u64)ro_flags << 32));
 367}
 368
 369static inline void btrfs_inode_split_flags(u64 inode_item_flags,
 370                                           u32 *flags, u32 *ro_flags)
 371{
 372        *flags = (u32)inode_item_flags;
 373        *ro_flags = (u32)(inode_item_flags >> 32);
 374}
 375
 376/* Array of bytes with variable length, hexadecimal format 0x1234 */
 377#define CSUM_FMT                                "0x%*phN"
 378#define CSUM_FMT_VALUE(size, bytes)             size, bytes
 379
 380static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
 381                u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
 382{
 383        struct btrfs_root *root = inode->root;
 384        const u32 csum_size = root->fs_info->csum_size;
 385
 386        /* Output minus objectid, which is more meaningful */
 387        if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
 388                btrfs_warn_rl(root->fs_info,
 389"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 390                        root->root_key.objectid, btrfs_ino(inode),
 391                        logical_start,
 392                        CSUM_FMT_VALUE(csum_size, csum),
 393                        CSUM_FMT_VALUE(csum_size, csum_expected),
 394                        mirror_num);
 395        else
 396                btrfs_warn_rl(root->fs_info,
 397"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 398                        root->root_key.objectid, btrfs_ino(inode),
 399                        logical_start,
 400                        CSUM_FMT_VALUE(csum_size, csum),
 401                        CSUM_FMT_VALUE(csum_size, csum_expected),
 402                        mirror_num);
 403}
 404
 405#endif
 406