linux/fs/btrfs/qgroup.h
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2014 Facebook.  All rights reserved.
 */

#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H

#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"

/*
 * Btrfs qgroup overview
 *
 * Btrfs qgroup splits into 3 main parts:
 * 1) Reserve
 *    Reserve metadata/data space for incoming operations.
 *    Affects how qgroup limits work.
 *
 * 2) Trace
 *    Tell btrfs qgroup to trace dirty extents.
 *
 *    Dirty extents include:
 *    - Newly allocated extents
 *    - Extents going to be deleted (in this trans)
 *    - Extents whose owner is going to be modified
 *
 *    This is the main part that affects whether qgroup numbers will stay
 *    consistent.
 *    Btrfs qgroup can trace clean extents without causing any problem,
 *    but doing so consumes extra CPU time, so it should be avoided if
 *    possible.
 *
 * 3) Account
 *    Btrfs qgroup updates its numbers based on the dirty extents traced
 *    in the previous step.
 *
 *    This normally happens at qgroup rescan and transaction commit time.
 *
 * A usage sketch of the three phases follows this comment.
 */
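/*
 * A minimal sketch of the three phases from a hypothetical write path.
 * @inode, @trans, @start, @len, @bytenr and @num_bytes are stand-ins for
 * what a real caller would have at hand (illustrative only, not the exact
 * call sites btrfs uses):
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	// 1) Reserve: claim qgroup data space before writing
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
 *	if (ret < 0)
 *		return ret;
 *
 *	// 2) Trace: record the dirty extent for later accounting
 *	ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, GFP_NOFS);
 *
 *	// 3) Account: numbers are updated for all traced extents,
 *	//    normally at transaction commit time
 *	ret = btrfs_qgroup_account_extents(trans);
 */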

/*
 * Special performance optimization for balance.
 *
 * For balance, we need to swap the subtrees of the subvolume and reloc
 * trees.
 * In theory, we need to trace all subtree blocks of both the subvolume
 * and reloc trees, since their owner has changed during such a swap.
 *
 * However, since balance has ensured that both subtrees contain the
 * same contents and have the same tree structure, such a swap won't
 * cause a qgroup number change.
 *
 * But there is a race window between subtree swap and transaction commit.
 * During that window, if we increase/decrease the tree level or
 * merge/split tree blocks, we still need to trace the original subtrees.
 *
 * So for balance, we use delayed subtree tracing, whose workflow is:
 *
 * 1) Record the subtree root block that gets swapped.
 *
 *    During subtree swap:
 *    O = Old tree blocks
 *    N = New tree blocks
 *          reloc tree                     subvolume tree X
 *             Root                               Root
 *            /    \                             /    \
 *          NA     OB                          OA      OB
 *        /  |     |  \                      /  |      |  \
 *      NC  ND     OE  OF                   OC  OD     OE  OF
 *
 *    In this case, NA and OA are going to be swapped, so record (NA, OA)
 *    into subvolume tree X.
 *
 * 2) After subtree swap.
 *          reloc tree                     subvolume tree X
 *             Root                               Root
 *            /    \                             /    \
 *          OA     OB                          NA      OB
 *        /  |     |  \                      /  |      |  \
 *      OC  OD     OE  OF                   NC  ND     OE  OF
 *
 * 3a) COW happens for OB
 *     If we are going to COW tree block OB, we check OB's bytenr against
 *     tree X's swapped_blocks structure.
 *     If it doesn't match any record, nothing happens.
 *
 * 3b) COW happens for NA
 *     Check NA's bytenr against tree X's swapped_blocks and get a hit.
 *     Then we do a subtree scan on both subtrees OA and NA.
 *     This results in 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
 *
 *     Then no matter what we do to subvolume tree X, qgroup numbers will
 *     still be correct.
 *     Then NA's record gets removed from X's swapped_blocks.
 *
 * 4)  Transaction commit
 *     Any record left in X's swapped_blocks gets removed, since there has
 *     been no modification to the swapped subtrees and there is no need to
 *     trigger a heavy qgroup subtree rescan for them.
 *
 * A code sketch of this flow follows this comment.
 */
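/*
 * A minimal sketch of the delayed subtree tracing flow above, using the
 * swapped-blocks helpers declared at the bottom of this header. The
 * variables (@trans, @subvol_root, @bg, parent buffers/slots, @eb) are
 * hypothetical stand-ins for what the balance and COW paths would pass
 * (illustrative only):
 *
 *	// 1) At subtree swap time, record the swapped block pair
 *	ret = btrfs_qgroup_add_swapped_blocks(trans, subvol_root, bg,
 *			subvol_parent, subvol_slot,
 *			reloc_parent, reloc_slot, last_snapshot);
 *
 *	// 3a)/3b) At COW time, check @eb against the recorded blocks and
 *	// trace both subtrees on a hit
 *	ret = btrfs_qgroup_trace_subtree_after_cow(trans, root, eb);
 *
 *	// 4) At transaction commit, drop any remaining records
 *	btrfs_qgroup_clean_swapped_blocks(root);
 */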

/*
 * Record a dirty extent, and inform qgroup to update the quota on it.
 * TODO: Use a kmem cache to allocate it.
 */
struct btrfs_qgroup_extent_record {
	struct rb_node node;
	u64 bytenr;
	u64 num_bytes;

	/*
	 * For qgroup reserved data space freeing.
	 *
	 * @data_rsv_refroot and @data_rsv will be recorded after
	 * BTRFS_ADD_DELAYED_EXTENT is called.
	 * They will be used to free the reserved qgroup space at
	 * transaction commit time.
	 */
	u32 data_rsv;		/* reserved data space needing to be freed */
	u64 data_rsv_refroot;	/* which root the reserved data belongs to */
	struct ulist *old_roots;
};
 123
 124struct btrfs_qgroup_swapped_block {
 125        struct rb_node node;
 126
 127        int level;
 128        bool trace_leaf;
 129
 130        /* bytenr/generation of the tree block in subvolume tree after swap */
 131        u64 subvol_bytenr;
 132        u64 subvol_generation;
 133
 134        /* bytenr/generation of the tree block in reloc tree after swap */
 135        u64 reloc_bytenr;
 136        u64 reloc_generation;
 137
 138        u64 last_snapshot;
 139        struct btrfs_key first_key;
 140};

/*
 * Qgroup reservation types:
 *
 * DATA:
 *	Space reserved for data.
 *
 * META_PERTRANS:
 *	Space reserved for metadata (per-transaction).
 *	Since qgroup numbers are only updated at transaction commit time,
 *	reserved metadata space must be kept until the transaction commits.
 *	Any metadata reservation used in btrfs_start_transaction() should
 *	be of this type.
 *
 * META_PREALLOC:
 *	There are cases where metadata space is reserved before starting
 *	a transaction, followed by btrfs_join_transaction() to get a trans
 *	handle.
 *	Any metadata reserved for such usage should be of this type.
 *	After join_transaction(), part (or all) of such a reservation should
 *	be converted into META_PERTRANS (see the sketch near
 *	btrfs_qgroup_convert_reserved_meta() below).
 */
enum btrfs_qgroup_rsv_type {
	BTRFS_QGROUP_RSV_DATA,
	BTRFS_QGROUP_RSV_META_PERTRANS,
	BTRFS_QGROUP_RSV_META_PREALLOC,
	BTRFS_QGROUP_RSV_LAST,
};

/*
 * Represents how many bytes we have reserved for this qgroup.
 *
 * Each type should have different reservation behavior.
 * E.g., data follows its io_tree flag modification, while
 * *currently* metadata is just reserve-and-clear during the transaction.
 *
 * TODO: Add a new type for reservations which can survive transaction
 * commit. The current metadata reservation behavior is not suitable for
 * such a case.
 */
struct btrfs_qgroup_rsv {
	u64 values[BTRFS_QGROUP_RSV_LAST];
};

/*
 * One struct for each qgroup, organized in fs_info->qgroup_tree.
 */
struct btrfs_qgroup {
	u64 qgroupid;

	/*
	 * state
	 */
	u64 rfer;	/* referenced */
	u64 rfer_cmpr;	/* referenced compressed */
	u64 excl;	/* exclusive */
	u64 excl_cmpr;	/* exclusive compressed */

	/*
	 * limits
	 */
	u64 lim_flags;	/* which limits are set */
	u64 max_rfer;
	u64 max_excl;
	u64 rsv_rfer;
	u64 rsv_excl;

	/*
	 * reservation tracking
	 */
	struct btrfs_qgroup_rsv rsv;

	/*
	 * lists
	 */
	struct list_head groups;  /* groups this group is member of */
	struct list_head members; /* groups that are members of this group */
	struct list_head dirty;   /* dirty groups */
	struct rb_node node;	  /* tree of qgroups */

	/*
	 * temp variables for accounting operations
	 * Refer to qgroup_shared_accounting() for details.
	 */
	u64 old_refcnt;
	u64 new_refcnt;
};

/*
 * For qgroup event trace points only
 */
#define QGROUP_RESERVE		(1<<0)
#define QGROUP_RELEASE		(1<<1)
#define QGROUP_FREE		(1<<2)

int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
				     bool interruptible);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;

/*
 * Inform qgroup to trace one dirty extent, whose info is recorded in
 * @record, so that qgroup can account it at transaction commit time.
 *
 * No-lock version; the caller must hold the delayed ref lock and have
 * allocated the memory, then call btrfs_qgroup_trace_extent_post()
 * after exiting the lock context.
 *
 * Return 0 for successful insertion.
 * Return >0 if the record already exists; the caller can free @record safely.
 * Errors are not possible.
 */
int btrfs_qgroup_trace_extent_nolock(
		struct btrfs_fs_info *fs_info,
		struct btrfs_delayed_ref_root *delayed_refs,
		struct btrfs_qgroup_extent_record *record);

/*
 * Post handler after qgroup_trace_extent_nolock().
 *
 * NOTE: Currently qgroup does the expensive backref walk at transaction
 * commit time with TRANS_STATE_COMMIT_DOING, which blocks incoming
 * new transactions.
 * This is designed to allow btrfs_find_all_roots() to get a correct
 * new_roots result.
 *
 * However, for old_roots there is no need to do the backref walk at that
 * time, since we search commit roots to walk backrefs and the result will
 * always be correct.
 *
 * Due to the nature of the no-lock version, we can't do the backref walk
 * there. So we must call btrfs_qgroup_trace_extent_post() after exiting
 * the spinlock context.
 *
 * TODO: If we can fix and prove that btrfs_find_all_roots() can get a
 * correct result using the current root, then we can move the whole
 * expensive backref walk out of transaction commit; but not now, as
 * qgroup accounting would be wrong again.
 */
int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
				   struct btrfs_qgroup_extent_record *qrecord);
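/*
 * A minimal sketch of the nolock/post pairing, assuming a hypothetical
 * @record allocated by the caller and a delayed refs root at hand
 * (illustrative only):
 *
 *	record = kzalloc(sizeof(*record), GFP_NOFS);
 *	if (!record)
 *		return -ENOMEM;
 *	// fill in record->bytenr / record->num_bytes here
 *	spin_lock(&delayed_refs->lock);
 *	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
 *	spin_unlock(&delayed_refs->lock);
 *	if (ret > 0)
 *		kfree(record);	// a record for this extent already exists
 *	else
 *		btrfs_qgroup_trace_extent_post(fs_info, record);
 */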

/*
 * Inform qgroup to trace one dirty extent, specified by @bytenr and
 * @num_bytes, so that qgroup can account it at transaction commit time.
 *
 * Better encapsulated version, with memory allocation and backref walk
 * for commit roots, so this can sleep.
 *
 * Return 0 if the operation is done.
 * Return <0 for error, like memory allocation failure or invalid parameter
 * (NULL trans).
 */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes, gfp_t gfp_flag);

/*
 * Inform qgroup to trace all data leaf items in @eb.
 *
 * Return 0 for success.
 * Return <0 for error (ENOMEM).
 */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb);
/*
 * Inform qgroup to trace a whole subtree, including all its child tree
 * blocks and data.
 * The root tree block is specified by @root_eb.
 *
 * Normally used by relocation (tree block swap) and subvolume deletion.
 *
 * Return 0 for success.
 * Return <0 for error (ENOMEM or tree search error).
 */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
			       struct extent_buffer *root_eb,
			       u64 root_gen, int root_level);
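/*
 * A minimal sketch, assuming @root_eb was already read and locked by the
 * caller (illustrative only); the generation/level come from the extent
 * buffer header accessors:
 *
 *	ret = btrfs_qgroup_trace_subtree(trans, root_eb,
 *			btrfs_header_generation(root_eb),
 *			btrfs_header_level(root_eb));
 */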
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				u64 num_bytes, struct ulist *old_roots,
				struct ulist *new_roots);
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
			 u64 objectid, struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
			       u64 ref_root, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl);
#endif

/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len);
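/*
 * A minimal sketch of the data reservation life cycle, assuming a
 * hypothetical buffered write path where @write_failed is a stand-in for
 * the caller's error handling (illustrative only):
 *
 *	struct extent_changeset *reserved = NULL;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
 *	if (ret < 0)
 *		return ret;
 *	if (write_failed)
 *		// error path: return the reserved range to the qgroup
 *		btrfs_qgroup_free_data(inode, reserved, start, len);
 *	else
 *		// success path: release the reservation; accounting is
 *		// finished at transaction commit time
 *		btrfs_qgroup_release_data(inode, start, len);
 */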

int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
				int num_bytes, bool enforce)
{
	return __btrfs_qgroup_reserve_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
				int num_bytes, bool enforce)
{
	return __btrfs_qgroup_reserve_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type);

/* Free per-transaction meta reservation for error handling */
static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
						   int num_bytes)
{
	__btrfs_qgroup_free_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PERTRANS);
}

/* Pre-allocated meta reservations can be freed as needed */
static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
						   int num_bytes)
{
	__btrfs_qgroup_free_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PREALLOC);
}

/*
 * Per-transaction meta reservations should all be freed at transaction
 * commit time.
 */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);

/*
 * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
 *
 * This is called when a preallocated meta reservation needs to be used.
 * Normally after a btrfs_join_transaction() call.
 */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
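/*
 * A minimal sketch of the PREALLOC -> PERTRANS conversion, assuming a
 * hypothetical @root and @nr_bytes from the caller (illustrative only):
 *
 *	ret = btrfs_qgroup_reserve_meta_prealloc(root, nr_bytes, true);
 *	if (ret < 0)
 *		return ret;
 *	trans = btrfs_join_transaction(root);
 *	if (IS_ERR(trans)) {
 *		btrfs_qgroup_free_meta_prealloc(root, nr_bytes);
 *		return PTR_ERR(trans);
 *	}
 *	// the reservation must now live until transaction commit
 *	btrfs_qgroup_convert_reserved_meta(root, nr_bytes);
 */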

void btrfs_qgroup_check_reserved_leak(struct inode *inode);

/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks);

void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
		struct btrfs_root *subvol_root,
		struct btrfs_block_group_cache *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot);
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
		struct btrfs_root *root, struct extent_buffer *eb);

#endif