linux/fs/btrfs/btrfs_inode.h
<<
>>
Prefs
   1/* SPDX-License-Identifier: GPL-2.0 */
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#ifndef BTRFS_INODE_H
   7#define BTRFS_INODE_H
   8
   9#include <linux/hash.h>
  10#include <linux/refcount.h>
  11#include "extent_map.h"
  12#include "extent_io.h"
  13#include "ordered-data.h"
  14#include "delayed-inode.h"
  15
/*
 * Runtime flags of an in-memory btrfs inode.
 *
 * NOTE(review): the comments below speak of setting and clearing "bits", so
 * these enumerators are presumably bit numbers within
 * btrfs_inode::runtime_flags -- confirm against the users of the flags.
 *
 * ordered_data_close is set by truncate when a file that used
 * to have good data has been truncated to zero.  When it is set
 * the btrfs file release call will add this inode to the
 * ordered operations list so that we make sure to flush out any
 * new data the application may have written before commit.
 *
 * NOTE(review): no enumerator named ordered_data_close exists here; the
 * paragraph above presumably describes BTRFS_INODE_FLUSH_ON_CLOSE -- confirm.
 */
enum {
        BTRFS_INODE_FLUSH_ON_CLOSE,
        BTRFS_INODE_DUMMY,
        BTRFS_INODE_IN_DEFRAG,
        BTRFS_INODE_HAS_ASYNC_EXTENT,
        /*
         * Always set under the VFS' inode lock, otherwise it can cause races
         * during fsync (we start as a fast fsync and then end up in a full
         * fsync racing with ordered extent completion).
         */
        BTRFS_INODE_NEEDS_FULL_SYNC,
        BTRFS_INODE_COPY_EVERYTHING,
        BTRFS_INODE_IN_DELALLOC_LIST,
        BTRFS_INODE_HAS_PROPS,
        BTRFS_INODE_SNAPSHOT_FLUSH,
        /*
         * Set and used when logging an inode and it serves to signal that an
         * inode does not have xattrs, so subsequent fsyncs can avoid searching
         * for xattrs to log. This bit must be cleared whenever a xattr is added
         * to an inode.
         */
        BTRFS_INODE_NO_XATTRS,
        /*
         * Set when we are in a context where we need to start a transaction and
         * have dirty pages with the respective file range locked. This is to
         * ensure that when reserving space for the transaction, if we are low
         * on available space and need to flush delalloc, we will not flush
         * delalloc for this inode, because that could result in a deadlock (on
         * the file range, inode's io_tree).
         */
        BTRFS_INODE_NO_DELALLOC_FLUSH,
};
  55
  56/* in memory btrfs inode */
  57struct btrfs_inode {
  58        /* which subvolume this inode belongs to */
  59        struct btrfs_root *root;
  60
  61        /* key used to find this inode on disk.  This is used by the code
  62         * to read in roots of subvolumes
  63         */
  64        struct btrfs_key location;
  65
  66        /*
  67         * Lock for counters and all fields used to determine if the inode is in
  68         * the log or not (last_trans, last_sub_trans, last_log_commit,
  69         * logged_trans), to access/update new_delalloc_bytes and to update the
  70         * VFS' inode number of bytes used.
  71         */
  72        spinlock_t lock;
  73
  74        /* the extent_tree has caches of all the extent mappings to disk */
  75        struct extent_map_tree extent_tree;
  76
  77        /* the io_tree does range state (DIRTY, LOCKED etc) */
  78        struct extent_io_tree io_tree;
  79
  80        /* special utility tree used to record which mirrors have already been
  81         * tried when checksums fail for a given block
  82         */
  83        struct extent_io_tree io_failure_tree;
  84
  85        /*
  86         * Keep track of where the inode has extent items mapped in order to
  87         * make sure the i_size adjustments are accurate
  88         */
  89        struct extent_io_tree file_extent_tree;
  90
  91        /* held while logging the inode in tree-log.c */
  92        struct mutex log_mutex;
  93
  94        /* used to order data wrt metadata */
  95        struct btrfs_ordered_inode_tree ordered_tree;
  96
  97        /* list of all the delalloc inodes in the FS.  There are times we need
  98         * to write all the delalloc pages to disk, and this list is used
  99         * to walk them all.
 100         */
 101        struct list_head delalloc_inodes;
 102
 103        /* node for the red-black tree that links inodes in subvolume root */
 104        struct rb_node rb_node;
 105
 106        unsigned long runtime_flags;
 107
 108        /* Keep track of who's O_SYNC/fsyncing currently */
 109        atomic_t sync_writers;
 110
 111        /* full 64 bit generation number, struct vfs_inode doesn't have a big
 112         * enough field for this.
 113         */
 114        u64 generation;
 115
 116        /*
 117         * transid of the trans_handle that last modified this inode
 118         */
 119        u64 last_trans;
 120
 121        /*
 122         * transid that last logged this inode
 123         */
 124        u64 logged_trans;
 125
 126        /*
 127         * log transid when this inode was last modified
 128         */
 129        int last_sub_trans;
 130
 131        /* a local copy of root's last_log_commit */
 132        int last_log_commit;
 133
 134        /* total number of bytes pending delalloc, used by stat to calc the
 135         * real block usage of the file
 136         */
 137        u64 delalloc_bytes;
 138
 139        /*
 140         * Total number of bytes pending delalloc that fall within a file
 141         * range that is either a hole or beyond EOF (and no prealloc extent
 142         * exists in the range). This is always <= delalloc_bytes.
 143         */
 144        u64 new_delalloc_bytes;
 145
 146        /*
 147         * total number of bytes pending defrag, used by stat to check whether
 148         * it needs COW.
 149         */
 150        u64 defrag_bytes;
 151
 152        /*
 153         * the size of the file stored in the metadata on disk.  data=ordered
 154         * means the in-memory i_size might be larger than the size on disk
 155         * because not all the blocks are written yet.
 156         */
 157        u64 disk_i_size;
 158
 159        /*
 160         * if this is a directory then index_cnt is the counter for the index
 161         * number for new files that are created
 162         */
 163        u64 index_cnt;
 164
 165        /* Cache the directory index number to speed the dir/file remove */
 166        u64 dir_index;
 167
 168        /* the fsync log has some corner cases that mean we have to check
 169         * directories to see if any unlinks have been done before
 170         * the directory was logged.  See tree-log.c for all the
 171         * details
 172         */
 173        u64 last_unlink_trans;
 174
 175        /*
 176         * The id/generation of the last transaction where this inode was
 177         * either the source or the destination of a clone/dedupe operation.
 178         * Used when logging an inode to know if there are shared extents that
 179         * need special care when logging checksum items, to avoid duplicate
 180         * checksum items in a log (which can lead to a corruption where we end
 181         * up with missing checksum ranges after log replay).
 182         * Protected by the vfs inode lock.
 183         */
 184        u64 last_reflink_trans;
 185
 186        /*
 187         * Number of bytes outstanding that are going to need csums.  This is
 188         * used in ENOSPC accounting.
 189         */
 190        u64 csum_bytes;
 191
 192        /* flags field from the on disk inode */
 193        u32 flags;
 194
 195        /*
 196         * Counters to keep track of the number of extent item's we may use due
 197         * to delalloc and such.  outstanding_extents is the number of extent
 198         * items we think we'll end up using, and reserved_extents is the number
 199         * of extent items we've reserved metadata for.
 200         */
 201        unsigned outstanding_extents;
 202
 203        struct btrfs_block_rsv block_rsv;
 204
 205        /*
 206         * Cached values of inode properties
 207         */
 208        unsigned prop_compress;         /* per-file compression algorithm */
 209        /*
 210         * Force compression on the file using the defrag ioctl, could be
 211         * different from prop_compress and takes precedence if set
 212         */
 213        unsigned defrag_compress;
 214
 215        struct btrfs_delayed_node *delayed_node;
 216
 217        /* File creation time. */
 218        struct timespec64 i_otime;
 219
 220        /* Hook into fs_info->delayed_iputs */
 221        struct list_head delayed_iput;
 222
 223        struct rw_semaphore i_mmap_lock;
 224        struct inode vfs_inode;
 225};
 226
 227static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
 228{
 229        return inode->root->fs_info->sectorsize;
 230}
 231
 232static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
 233{
 234        return container_of(inode, struct btrfs_inode, vfs_inode);
 235}
 236
 237static inline unsigned long btrfs_inode_hash(u64 objectid,
 238                                             const struct btrfs_root *root)
 239{
 240        u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
 241
 242#if BITS_PER_LONG == 32
 243        h = (h >> 32) ^ (h & 0xffffffff);
 244#endif
 245
 246        return (unsigned long)h;
 247}
 248
 249static inline void btrfs_insert_inode_hash(struct inode *inode)
 250{
 251        unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
 252
 253        __insert_inode_hash(inode, h);
 254}
 255
 256static inline u64 btrfs_ino(const struct btrfs_inode *inode)
 257{
 258        u64 ino = inode->location.objectid;
 259
 260        /*
 261         * !ino: btree_inode
 262         * type == BTRFS_ROOT_ITEM_KEY: subvol dir
 263         */
 264        if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
 265                ino = inode->vfs_inode.i_ino;
 266        return ino;
 267}
 268
/*
 * Set the size of @inode to @size in both places it is tracked: the VFS
 * i_size (through i_size_write()) and the btrfs disk_i_size field.
 */
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
{
        i_size_write(&inode->vfs_inode, size);
        inode->disk_i_size = size;
}
 274
 275static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
 276{
 277        struct btrfs_root *root = inode->root;
 278
 279        if (root == root->fs_info->tree_root &&
 280            btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
 281                return true;
 282        if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
 283                return true;
 284        return false;
 285}
 286
 287static inline bool is_data_inode(struct inode *inode)
 288{
 289        return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
 290}
 291
 292static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
 293                                                 int mod)
 294{
 295        lockdep_assert_held(&inode->lock);
 296        inode->outstanding_extents += mod;
 297        if (btrfs_is_free_space_inode(inode))
 298                return;
 299        trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
 300                                                  mod);
 301}
 302
/*
 * Called every time after doing a buffered, direct IO or memory mapped write.
 *
 * This is to ensure that if we write to a file that was previously fsynced in
 * the current transaction, then try to fsync it again in the same transaction,
 * we will know that there were changes in the file and that it needs to be
 * logged.
 *
 * Copies the current log_transid of the inode's root into
 * inode->last_sub_trans while holding inode->lock.
 */
static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
{
        spin_lock(&inode->lock);
        inode->last_sub_trans = inode->root->log_transid;
        spin_unlock(&inode->lock);
}
 317
 318static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
 319{
 320        bool ret = false;
 321
 322        spin_lock(&inode->lock);
 323        if (inode->logged_trans == generation &&
 324            inode->last_sub_trans <= inode->last_log_commit &&
 325            inode->last_sub_trans <= inode->root->last_log_commit)
 326                ret = true;
 327        spin_unlock(&inode->lock);
 328        return ret;
 329}
 330
/*
 * Private bookkeeping attached to a direct IO request.
 *
 * NOTE(review): the meaning of logical_offset and disk_bytenr is inferred
 * from their names only (file offset and on-disk byte number) -- confirm
 * against the direct IO code that fills them in.
 */
struct btrfs_dio_private {
        /* Inode the request is for */
        struct inode *inode;
        u64 logical_offset;
        u64 disk_bytenr;
        /* Used for bio::bi_size */
        u32 bytes;

        /*
         * References to this structure. There is one reference per in-flight
         * bio plus one while we're still setting up.
         */
        refcount_t refs;

        /* dio_bio came from fs/direct-io.c */
        struct bio *dio_bio;

        /*
         * Array of checksums.  Flexible array member: its length is not
         * recorded in this structure and is determined by the allocation.
         */
        u8 csums[];
};
 350
/*
 * Array of bytes with variable length, hexadecimal format 0x1234.
 *
 * CSUM_FMT is the printk format fragment; it consumes two arguments, the
 * number of bytes to print (the '*' field width of %*phN) followed by the
 * pointer to the bytes.  CSUM_FMT_VALUE() expands to exactly that argument
 * pair, so the two macros are meant to be used together.
 */
#define CSUM_FMT                                "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes)             size, bytes
 354
 355static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
 356                u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
 357{
 358        struct btrfs_root *root = inode->root;
 359        const u32 csum_size = root->fs_info->csum_size;
 360
 361        /* Output minus objectid, which is more meaningful */
 362        if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
 363                btrfs_warn_rl(root->fs_info,
 364"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 365                        root->root_key.objectid, btrfs_ino(inode),
 366                        logical_start,
 367                        CSUM_FMT_VALUE(csum_size, csum),
 368                        CSUM_FMT_VALUE(csum_size, csum_expected),
 369                        mirror_num);
 370        else
 371                btrfs_warn_rl(root->fs_info,
 372"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
 373                        root->root_key.objectid, btrfs_ino(inode),
 374                        logical_start,
 375                        CSUM_FMT_VALUE(csum_size, csum),
 376                        CSUM_FMT_VALUE(csum_size, csum_expected),
 377                        mirror_num);
 378}
 379
 380#endif
 381