linux/fs/btrfs/qgroup.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2011 STRATO.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/pagemap.h>
   8#include <linux/writeback.h>
   9#include <linux/blkdev.h>
  10#include <linux/rbtree.h>
  11#include <linux/slab.h>
  12#include <linux/workqueue.h>
  13#include <linux/btrfs.h>
  14#include <linux/sched/mm.h>
  15
  16#include "ctree.h"
  17#include "transaction.h"
  18#include "disk-io.h"
  19#include "locking.h"
  20#include "ulist.h"
  21#include "backref.h"
  22#include "extent_io.h"
  23#include "qgroup.h"
  24#include "block-group.h"
  25#include "sysfs.h"
  26#include "tree-mod-log.h"
  27
  28/*
  29 * Helpers to access qgroup reservation
  30 *
  31 * Callers should ensure the lock context and type are valid
  32 */
  33
  34static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
  35{
  36        u64 ret = 0;
  37        int i;
  38
  39        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
  40                ret += qgroup->rsv.values[i];
  41
  42        return ret;
  43}
  44
  45#ifdef CONFIG_BTRFS_DEBUG
  46static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
  47{
  48        if (type == BTRFS_QGROUP_RSV_DATA)
  49                return "data";
  50        if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
  51                return "meta_pertrans";
  52        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
  53                return "meta_prealloc";
  54        return NULL;
  55}
  56#endif
  57
  58static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
  59                           struct btrfs_qgroup *qgroup, u64 num_bytes,
  60                           enum btrfs_qgroup_rsv_type type)
  61{
  62        trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
  63        qgroup->rsv.values[type] += num_bytes;
  64}
  65
  66static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
  67                               struct btrfs_qgroup *qgroup, u64 num_bytes,
  68                               enum btrfs_qgroup_rsv_type type)
  69{
  70        trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
  71        if (qgroup->rsv.values[type] >= num_bytes) {
  72                qgroup->rsv.values[type] -= num_bytes;
  73                return;
  74        }
  75#ifdef CONFIG_BTRFS_DEBUG
  76        WARN_RATELIMIT(1,
  77                "qgroup %llu %s reserved space underflow, have %llu to free %llu",
  78                qgroup->qgroupid, qgroup_rsv_type_str(type),
  79                qgroup->rsv.values[type], num_bytes);
  80#endif
  81        qgroup->rsv.values[type] = 0;
  82}
  83
  84static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
  85                                     struct btrfs_qgroup *dest,
  86                                     struct btrfs_qgroup *src)
  87{
  88        int i;
  89
  90        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
  91                qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
  92}
  93
  94static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
  95                                         struct btrfs_qgroup *dest,
  96                                          struct btrfs_qgroup *src)
  97{
  98        int i;
  99
 100        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
 101                qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
 102}
 103
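     /*
      * The old/new refcnts below are reset lazily: a refcnt smaller than the
      * sequence number passed in counts as zero, so the counters never need to
      * be cleared between accounting runs.  The effective value is therefore
      * refcnt - seq, which is what the get helpers further down return.
      */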
 104static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
 105                                           int mod)
 106{
 107        if (qg->old_refcnt < seq)
 108                qg->old_refcnt = seq;
 109        qg->old_refcnt += mod;
 110}
 111
 112static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
 113                                           int mod)
 114{
 115        if (qg->new_refcnt < seq)
 116                qg->new_refcnt = seq;
 117        qg->new_refcnt += mod;
 118}
 119
 120static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
 121{
 122        if (qg->old_refcnt < seq)
 123                return 0;
 124        return qg->old_refcnt - seq;
 125}
 126
 127static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
 128{
 129        if (qg->new_refcnt < seq)
 130                return 0;
 131        return qg->new_refcnt - seq;
 132}
 133
 134/*
 135 * glue structure to represent the relations between qgroups.
 136 */
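     /*
      * next_group links an entry into member->groups and next_member links the
      * same entry into parent->members, so one allocation sits on both lists
      * (see __add_relation_rb() below).
      */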
 137struct btrfs_qgroup_list {
 138        struct list_head next_group;
 139        struct list_head next_member;
 140        struct btrfs_qgroup *group;
 141        struct btrfs_qgroup *member;
 142};
 143
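     /*
      * Helpers to stash a btrfs_qgroup pointer in the u64 aux field of a ulist
      * node and to get it back, so qgroup walks can reuse the generic ulist.
      */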
 144static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
 145{
 146        return (u64)(uintptr_t)qg;
 147}
 148
 149static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
 150{
 151        return (struct btrfs_qgroup *)(uintptr_t)n->aux;
 152}
 153
 154static int
 155qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 156                   int init_flags);
 157static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
 158
 159/* must be called with qgroup_ioctl_lock held */
 160static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
 161                                           u64 qgroupid)
 162{
 163        struct rb_node *n = fs_info->qgroup_tree.rb_node;
 164        struct btrfs_qgroup *qgroup;
 165
 166        while (n) {
 167                qgroup = rb_entry(n, struct btrfs_qgroup, node);
 168                if (qgroup->qgroupid < qgroupid)
 169                        n = n->rb_left;
 170                else if (qgroup->qgroupid > qgroupid)
 171                        n = n->rb_right;
 172                else
 173                        return qgroup;
 174        }
 175        return NULL;
 176}
 177
 178/* must be called with qgroup_lock held */
 179static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
 180                                          u64 qgroupid)
 181{
 182        struct rb_node **p = &fs_info->qgroup_tree.rb_node;
 183        struct rb_node *parent = NULL;
 184        struct btrfs_qgroup *qgroup;
 185
 186        while (*p) {
 187                parent = *p;
 188                qgroup = rb_entry(parent, struct btrfs_qgroup, node);
 189
 190                if (qgroup->qgroupid < qgroupid)
 191                        p = &(*p)->rb_left;
 192                else if (qgroup->qgroupid > qgroupid)
 193                        p = &(*p)->rb_right;
 194                else
 195                        return qgroup;
 196        }
 197
 198        qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
 199        if (!qgroup)
 200                return ERR_PTR(-ENOMEM);
 201
 202        qgroup->qgroupid = qgroupid;
 203        INIT_LIST_HEAD(&qgroup->groups);
 204        INIT_LIST_HEAD(&qgroup->members);
 205        INIT_LIST_HEAD(&qgroup->dirty);
 206
 207        rb_link_node(&qgroup->node, parent, p);
 208        rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
 209
 210        return qgroup;
 211}
 212
 213static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
 214                            struct btrfs_qgroup *qgroup)
 215{
 216        struct btrfs_qgroup_list *list;
 217
 218        list_del(&qgroup->dirty);
 219        while (!list_empty(&qgroup->groups)) {
 220                list = list_first_entry(&qgroup->groups,
 221                                        struct btrfs_qgroup_list, next_group);
 222                list_del(&list->next_group);
 223                list_del(&list->next_member);
 224                kfree(list);
 225        }
 226
 227        while (!list_empty(&qgroup->members)) {
 228                list = list_first_entry(&qgroup->members,
 229                                        struct btrfs_qgroup_list, next_member);
 230                list_del(&list->next_group);
 231                list_del(&list->next_member);
 232                kfree(list);
 233        }
 234}
 235
 236/* must be called with qgroup_lock held */
 237static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
 238{
 239        struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
 240
 241        if (!qgroup)
 242                return -ENOENT;
 243
 244        rb_erase(&qgroup->node, &fs_info->qgroup_tree);
 245        __del_qgroup_rb(fs_info, qgroup);
 246        return 0;
 247}
 248
 249/*
 250 * Add relation specified by two qgroups.
 251 *
 252 * Must be called with qgroup_lock held.
 253 *
 254 * Return: 0        on success
 255 *         -ENOENT  if one of the qgroups is NULL
 256 *         <0       other errors
 257 */
 258static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
 259{
 260        struct btrfs_qgroup_list *list;
 261
 262        if (!member || !parent)
 263                return -ENOENT;
 264
 265        list = kzalloc(sizeof(*list), GFP_ATOMIC);
 266        if (!list)
 267                return -ENOMEM;
 268
 269        list->group = parent;
 270        list->member = member;
 271        list_add_tail(&list->next_group, &member->groups);
 272        list_add_tail(&list->next_member, &parent->members);
 273
 274        return 0;
 275}
 276
 277/*
  278 * Add relation specified by two qgroup ids.
 279 *
 280 * Must be called with qgroup_lock held.
 281 *
 282 * Return: 0        on success
 283 *         -ENOENT  if one of the ids does not exist
 284 *         <0       other errors
 285 */
 286static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
 287{
 288        struct btrfs_qgroup *member;
 289        struct btrfs_qgroup *parent;
 290
 291        member = find_qgroup_rb(fs_info, memberid);
 292        parent = find_qgroup_rb(fs_info, parentid);
 293
 294        return __add_relation_rb(member, parent);
 295}
 296
 297/* Must be called with qgroup_lock held */
 298static int del_relation_rb(struct btrfs_fs_info *fs_info,
 299                           u64 memberid, u64 parentid)
 300{
 301        struct btrfs_qgroup *member;
 302        struct btrfs_qgroup *parent;
 303        struct btrfs_qgroup_list *list;
 304
 305        member = find_qgroup_rb(fs_info, memberid);
 306        parent = find_qgroup_rb(fs_info, parentid);
 307        if (!member || !parent)
 308                return -ENOENT;
 309
 310        list_for_each_entry(list, &member->groups, next_group) {
 311                if (list->group == parent) {
 312                        list_del(&list->next_group);
 313                        list_del(&list->next_member);
 314                        kfree(list);
 315                        return 0;
 316                }
 317        }
 318        return -ENOENT;
 319}
 320
 321#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 322int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
 323                               u64 rfer, u64 excl)
 324{
 325        struct btrfs_qgroup *qgroup;
 326
 327        qgroup = find_qgroup_rb(fs_info, qgroupid);
 328        if (!qgroup)
 329                return -EINVAL;
 330        if (qgroup->rfer != rfer || qgroup->excl != excl)
 331                return -EINVAL;
 332        return 0;
 333}
 334#endif
 335
 336/*
 337 * The full config is read in one go, only called from open_ctree()
 338 * It doesn't use any locking, as at this point we're still single-threaded
 339 */
 340int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
 341{
 342        struct btrfs_key key;
 343        struct btrfs_key found_key;
 344        struct btrfs_root *quota_root = fs_info->quota_root;
 345        struct btrfs_path *path = NULL;
 346        struct extent_buffer *l;
 347        int slot;
 348        int ret = 0;
 349        u64 flags = 0;
 350        u64 rescan_progress = 0;
 351
 352        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 353                return 0;
 354
 355        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
 356        if (!fs_info->qgroup_ulist) {
 357                ret = -ENOMEM;
 358                goto out;
 359        }
 360
 361        path = btrfs_alloc_path();
 362        if (!path) {
 363                ret = -ENOMEM;
 364                goto out;
 365        }
 366
 367        ret = btrfs_sysfs_add_qgroups(fs_info);
 368        if (ret < 0)
 369                goto out;
 370        /* default this to quota off, in case no status key is found */
 371        fs_info->qgroup_flags = 0;
 372
 373        /*
 374         * pass 1: read status, all qgroup infos and limits
 375         */
 376        key.objectid = 0;
 377        key.type = 0;
 378        key.offset = 0;
 379        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
 380        if (ret)
 381                goto out;
 382
 383        while (1) {
 384                struct btrfs_qgroup *qgroup;
 385
 386                slot = path->slots[0];
 387                l = path->nodes[0];
 388                btrfs_item_key_to_cpu(l, &found_key, slot);
 389
 390                if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
 391                        struct btrfs_qgroup_status_item *ptr;
 392
 393                        ptr = btrfs_item_ptr(l, slot,
 394                                             struct btrfs_qgroup_status_item);
 395
 396                        if (btrfs_qgroup_status_version(l, ptr) !=
 397                            BTRFS_QGROUP_STATUS_VERSION) {
 398                                btrfs_err(fs_info,
 399                                 "old qgroup version, quota disabled");
 400                                goto out;
 401                        }
 402                        if (btrfs_qgroup_status_generation(l, ptr) !=
 403                            fs_info->generation) {
 404                                flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 405                                btrfs_err(fs_info,
 406                                        "qgroup generation mismatch, marked as inconsistent");
 407                        }
 408                        fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
 409                                                                          ptr);
 410                        rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
 411                        goto next1;
 412                }
 413
 414                if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
 415                    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
 416                        goto next1;
 417
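                     /*
                      * INFO items sort before LIMIT items in the quota tree, so the
                      * rb entry for a qgroup is normally created when its INFO item
                      * is read.  An INFO key for an already known qgroup, or a LIMIT
                      * key for an unknown one, therefore means the on-disk config is
                      * inconsistent.
                      */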
 418                qgroup = find_qgroup_rb(fs_info, found_key.offset);
 419                if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
 420                    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
 421                        btrfs_err(fs_info, "inconsistent qgroup config");
 422                        flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 423                }
 424                if (!qgroup) {
 425                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
 426                        if (IS_ERR(qgroup)) {
 427                                ret = PTR_ERR(qgroup);
 428                                goto out;
 429                        }
 430                }
 431                ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
 432                if (ret < 0)
 433                        goto out;
 434
 435                switch (found_key.type) {
 436                case BTRFS_QGROUP_INFO_KEY: {
 437                        struct btrfs_qgroup_info_item *ptr;
 438
 439                        ptr = btrfs_item_ptr(l, slot,
 440                                             struct btrfs_qgroup_info_item);
 441                        qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
 442                        qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
 443                        qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
 444                        qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
 445                        /* generation currently unused */
 446                        break;
 447                }
 448                case BTRFS_QGROUP_LIMIT_KEY: {
 449                        struct btrfs_qgroup_limit_item *ptr;
 450
 451                        ptr = btrfs_item_ptr(l, slot,
 452                                             struct btrfs_qgroup_limit_item);
 453                        qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
 454                        qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
 455                        qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
 456                        qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
 457                        qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
 458                        break;
 459                }
 460                }
 461next1:
 462                ret = btrfs_next_item(quota_root, path);
 463                if (ret < 0)
 464                        goto out;
 465                if (ret)
 466                        break;
 467        }
 468        btrfs_release_path(path);
 469
 470        /*
 471         * pass 2: read all qgroup relations
 472         */
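             /*
              * Each relation is stored as two mirrored items, (member, parent) and
              * (parent, member) - btrfs_add_qgroup_relation() inserts both.  Only
              * the member -> parent direction is needed to rebuild the in-memory
              * config, the reverse one is skipped in the loop below.
              */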
 473        key.objectid = 0;
 474        key.type = BTRFS_QGROUP_RELATION_KEY;
 475        key.offset = 0;
 476        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
 477        if (ret)
 478                goto out;
 479        while (1) {
 480                slot = path->slots[0];
 481                l = path->nodes[0];
 482                btrfs_item_key_to_cpu(l, &found_key, slot);
 483
 484                if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
 485                        goto next2;
 486
 487                if (found_key.objectid > found_key.offset) {
 488                        /* parent <- member, not needed to build config */
 489                        /* FIXME should we omit the key completely? */
 490                        goto next2;
 491                }
 492
 493                ret = add_relation_rb(fs_info, found_key.objectid,
 494                                      found_key.offset);
 495                if (ret == -ENOENT) {
 496                        btrfs_warn(fs_info,
 497                                "orphan qgroup relation 0x%llx->0x%llx",
 498                                found_key.objectid, found_key.offset);
 499                        ret = 0;        /* ignore the error */
 500                }
 501                if (ret)
 502                        goto out;
 503next2:
 504                ret = btrfs_next_item(quota_root, path);
 505                if (ret < 0)
 506                        goto out;
 507                if (ret)
 508                        break;
 509        }
 510out:
 511        btrfs_free_path(path);
 512        fs_info->qgroup_flags |= flags;
 513        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
 514                clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
 515        else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
 516                 ret >= 0)
 517                ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
 518
 519        if (ret < 0) {
 520                ulist_free(fs_info->qgroup_ulist);
 521                fs_info->qgroup_ulist = NULL;
 522                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 523                btrfs_sysfs_del_qgroups(fs_info);
 524        }
 525
 526        return ret < 0 ? ret : 0;
 527}
 528
 529/*
 530 * Called in close_ctree() when quota is still enabled.  This verifies we don't
  531 * leak any reserved space.
 532 *
 533 * Return false if no reserved space is left.
 534 * Return true if some reserved space is leaked.
 535 */
 536bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
 537{
 538        struct rb_node *node;
 539        bool ret = false;
 540
 541        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
 542                return ret;
 543        /*
 544         * Since we're unmounting, there is no race and no need to grab qgroup
  545         * lock.  And here we iterate in sorted order rather than post-order, to
  546         * provide a more user friendly result.
 547         */
 548        for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
 549                struct btrfs_qgroup *qgroup;
 550                int i;
 551
 552                qgroup = rb_entry(node, struct btrfs_qgroup, node);
 553                for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
 554                        if (qgroup->rsv.values[i]) {
 555                                ret = true;
 556                                btrfs_warn(fs_info,
 557                "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
 558                                   btrfs_qgroup_level(qgroup->qgroupid),
 559                                   btrfs_qgroup_subvolid(qgroup->qgroupid),
 560                                   i, qgroup->rsv.values[i]);
 561                        }
 562                }
 563        }
 564        return ret;
 565}
 566
 567/*
 568 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
  569 * the first two of which are single-threaded paths.  And for the third one, we
  570 * have already set quota_root to NULL with qgroup_lock held before, so it is
  571 * safe to clean up the in-memory structures without holding qgroup_lock.
 572 */
 573void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 574{
 575        struct rb_node *n;
 576        struct btrfs_qgroup *qgroup;
 577
 578        while ((n = rb_first(&fs_info->qgroup_tree))) {
 579                qgroup = rb_entry(n, struct btrfs_qgroup, node);
 580                rb_erase(n, &fs_info->qgroup_tree);
 581                __del_qgroup_rb(fs_info, qgroup);
 582                btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
 583                kfree(qgroup);
 584        }
 585        /*
  586         * We call btrfs_free_qgroup_config() both when unmounting the
  587         * filesystem and when disabling quota, so set qgroup_ulist to
  588         * NULL here to avoid a double free.
 589         */
 590        ulist_free(fs_info->qgroup_ulist);
 591        fs_info->qgroup_ulist = NULL;
 592        btrfs_sysfs_del_qgroups(fs_info);
 593}
 594
 595static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
 596                                    u64 dst)
 597{
 598        int ret;
 599        struct btrfs_root *quota_root = trans->fs_info->quota_root;
 600        struct btrfs_path *path;
 601        struct btrfs_key key;
 602
 603        path = btrfs_alloc_path();
 604        if (!path)
 605                return -ENOMEM;
 606
 607        key.objectid = src;
 608        key.type = BTRFS_QGROUP_RELATION_KEY;
 609        key.offset = dst;
 610
 611        ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);
 612
 613        btrfs_mark_buffer_dirty(path->nodes[0]);
 614
 615        btrfs_free_path(path);
 616        return ret;
 617}
 618
 619static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
 620                                    u64 dst)
 621{
 622        int ret;
 623        struct btrfs_root *quota_root = trans->fs_info->quota_root;
 624        struct btrfs_path *path;
 625        struct btrfs_key key;
 626
 627        path = btrfs_alloc_path();
 628        if (!path)
 629                return -ENOMEM;
 630
 631        key.objectid = src;
 632        key.type = BTRFS_QGROUP_RELATION_KEY;
 633        key.offset = dst;
 634
 635        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
 636        if (ret < 0)
 637                goto out;
 638
 639        if (ret > 0) {
 640                ret = -ENOENT;
 641                goto out;
 642        }
 643
 644        ret = btrfs_del_item(trans, quota_root, path);
 645out:
 646        btrfs_free_path(path);
 647        return ret;
 648}
 649
 650static int add_qgroup_item(struct btrfs_trans_handle *trans,
 651                           struct btrfs_root *quota_root, u64 qgroupid)
 652{
 653        int ret;
 654        struct btrfs_path *path;
 655        struct btrfs_qgroup_info_item *qgroup_info;
 656        struct btrfs_qgroup_limit_item *qgroup_limit;
 657        struct extent_buffer *leaf;
 658        struct btrfs_key key;
 659
 660        if (btrfs_is_testing(quota_root->fs_info))
 661                return 0;
 662
 663        path = btrfs_alloc_path();
 664        if (!path)
 665                return -ENOMEM;
 666
 667        key.objectid = 0;
 668        key.type = BTRFS_QGROUP_INFO_KEY;
 669        key.offset = qgroupid;
 670
 671        /*
 672         * Avoid a transaction abort by catching -EEXIST here. In that
 673         * case, we proceed by re-initializing the existing structure
 674         * on disk.
 675         */
 676
 677        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
 678                                      sizeof(*qgroup_info));
 679        if (ret && ret != -EEXIST)
 680                goto out;
 681
 682        leaf = path->nodes[0];
 683        qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
 684                                 struct btrfs_qgroup_info_item);
 685        btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
 686        btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
 687        btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
 688        btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
 689        btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);
 690
 691        btrfs_mark_buffer_dirty(leaf);
 692
 693        btrfs_release_path(path);
 694
 695        key.type = BTRFS_QGROUP_LIMIT_KEY;
 696        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
 697                                      sizeof(*qgroup_limit));
 698        if (ret && ret != -EEXIST)
 699                goto out;
 700
 701        leaf = path->nodes[0];
 702        qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
 703                                  struct btrfs_qgroup_limit_item);
 704        btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
 705        btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
 706        btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
 707        btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
 708        btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);
 709
 710        btrfs_mark_buffer_dirty(leaf);
 711
 712        ret = 0;
 713out:
 714        btrfs_free_path(path);
 715        return ret;
 716}
 717
 718static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
 719{
 720        int ret;
 721        struct btrfs_root *quota_root = trans->fs_info->quota_root;
 722        struct btrfs_path *path;
 723        struct btrfs_key key;
 724
 725        path = btrfs_alloc_path();
 726        if (!path)
 727                return -ENOMEM;
 728
 729        key.objectid = 0;
 730        key.type = BTRFS_QGROUP_INFO_KEY;
 731        key.offset = qgroupid;
 732        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
 733        if (ret < 0)
 734                goto out;
 735
 736        if (ret > 0) {
 737                ret = -ENOENT;
 738                goto out;
 739        }
 740
 741        ret = btrfs_del_item(trans, quota_root, path);
 742        if (ret)
 743                goto out;
 744
 745        btrfs_release_path(path);
 746
 747        key.type = BTRFS_QGROUP_LIMIT_KEY;
 748        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
 749        if (ret < 0)
 750                goto out;
 751
 752        if (ret > 0) {
 753                ret = -ENOENT;
 754                goto out;
 755        }
 756
 757        ret = btrfs_del_item(trans, quota_root, path);
 758
 759out:
 760        btrfs_free_path(path);
 761        return ret;
 762}
 763
 764static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
 765                                    struct btrfs_qgroup *qgroup)
 766{
 767        struct btrfs_root *quota_root = trans->fs_info->quota_root;
 768        struct btrfs_path *path;
 769        struct btrfs_key key;
 770        struct extent_buffer *l;
 771        struct btrfs_qgroup_limit_item *qgroup_limit;
 772        int ret;
 773        int slot;
 774
 775        key.objectid = 0;
 776        key.type = BTRFS_QGROUP_LIMIT_KEY;
 777        key.offset = qgroup->qgroupid;
 778
 779        path = btrfs_alloc_path();
 780        if (!path)
 781                return -ENOMEM;
 782
 783        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
 784        if (ret > 0)
 785                ret = -ENOENT;
 786
 787        if (ret)
 788                goto out;
 789
 790        l = path->nodes[0];
 791        slot = path->slots[0];
 792        qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
 793        btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
 794        btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
 795        btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
 796        btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
 797        btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
 798
 799        btrfs_mark_buffer_dirty(l);
 800
 801out:
 802        btrfs_free_path(path);
 803        return ret;
 804}
 805
 806static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
 807                                   struct btrfs_qgroup *qgroup)
 808{
 809        struct btrfs_fs_info *fs_info = trans->fs_info;
 810        struct btrfs_root *quota_root = fs_info->quota_root;
 811        struct btrfs_path *path;
 812        struct btrfs_key key;
 813        struct extent_buffer *l;
 814        struct btrfs_qgroup_info_item *qgroup_info;
 815        int ret;
 816        int slot;
 817
 818        if (btrfs_is_testing(fs_info))
 819                return 0;
 820
 821        key.objectid = 0;
 822        key.type = BTRFS_QGROUP_INFO_KEY;
 823        key.offset = qgroup->qgroupid;
 824
 825        path = btrfs_alloc_path();
 826        if (!path)
 827                return -ENOMEM;
 828
 829        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
 830        if (ret > 0)
 831                ret = -ENOENT;
 832
 833        if (ret)
 834                goto out;
 835
 836        l = path->nodes[0];
 837        slot = path->slots[0];
 838        qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
 839        btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
 840        btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
 841        btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
 842        btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
 843        btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);
 844
 845        btrfs_mark_buffer_dirty(l);
 846
 847out:
 848        btrfs_free_path(path);
 849        return ret;
 850}
 851
 852static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
 853{
 854        struct btrfs_fs_info *fs_info = trans->fs_info;
 855        struct btrfs_root *quota_root = fs_info->quota_root;
 856        struct btrfs_path *path;
 857        struct btrfs_key key;
 858        struct extent_buffer *l;
 859        struct btrfs_qgroup_status_item *ptr;
 860        int ret;
 861        int slot;
 862
 863        key.objectid = 0;
 864        key.type = BTRFS_QGROUP_STATUS_KEY;
 865        key.offset = 0;
 866
 867        path = btrfs_alloc_path();
 868        if (!path)
 869                return -ENOMEM;
 870
 871        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
 872        if (ret > 0)
 873                ret = -ENOENT;
 874
 875        if (ret)
 876                goto out;
 877
 878        l = path->nodes[0];
 879        slot = path->slots[0];
 880        ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
 881        btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
 882        btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
 883        btrfs_set_qgroup_status_rescan(l, ptr,
 884                                fs_info->qgroup_rescan_progress.objectid);
 885
 886        btrfs_mark_buffer_dirty(l);
 887
 888out:
 889        btrfs_free_path(path);
 890        return ret;
 891}
 892
 893/*
 894 * called with qgroup_lock held
 895 */
 896static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
 897                                  struct btrfs_root *root)
 898{
 899        struct btrfs_path *path;
 900        struct btrfs_key key;
 901        struct extent_buffer *leaf = NULL;
 902        int ret;
 903        int nr = 0;
 904
 905        path = btrfs_alloc_path();
 906        if (!path)
 907                return -ENOMEM;
 908
 909        key.objectid = 0;
 910        key.offset = 0;
 911        key.type = 0;
 912
 913        while (1) {
 914                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 915                if (ret < 0)
 916                        goto out;
 917                leaf = path->nodes[0];
 918                nr = btrfs_header_nritems(leaf);
 919                if (!nr)
 920                        break;
 921                /*
  922                 * Delete one leaf at a time (all of its
  923                 * items at once), since the whole tree
  924                 * is going to be deleted anyway.
 925                 */
 926                path->slots[0] = 0;
 927                ret = btrfs_del_items(trans, root, path, 0, nr);
 928                if (ret)
 929                        goto out;
 930
 931                btrfs_release_path(path);
 932        }
 933        ret = 0;
 934out:
 935        btrfs_free_path(path);
 936        return ret;
 937}
 938
 939int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
 940{
 941        struct btrfs_root *quota_root;
 942        struct btrfs_root *tree_root = fs_info->tree_root;
 943        struct btrfs_path *path = NULL;
 944        struct btrfs_qgroup_status_item *ptr;
 945        struct extent_buffer *leaf;
 946        struct btrfs_key key;
 947        struct btrfs_key found_key;
 948        struct btrfs_qgroup *qgroup = NULL;
 949        struct btrfs_trans_handle *trans = NULL;
 950        struct ulist *ulist = NULL;
 951        int ret = 0;
 952        int slot;
 953
 954        /*
 955         * We need to have subvol_sem write locked, to prevent races between
 956         * concurrent tasks trying to enable quotas, because we will unlock
 957         * and relock qgroup_ioctl_lock before setting fs_info->quota_root
 958         * and before setting BTRFS_FS_QUOTA_ENABLED.
 959         */
 960        lockdep_assert_held_write(&fs_info->subvol_sem);
 961
 962        if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
 963                btrfs_err(fs_info,
 964                          "qgroups are currently unsupported in extent tree v2");
 965                return -EINVAL;
 966        }
 967
 968        mutex_lock(&fs_info->qgroup_ioctl_lock);
 969        if (fs_info->quota_root)
 970                goto out;
 971
 972        ulist = ulist_alloc(GFP_KERNEL);
 973        if (!ulist) {
 974                ret = -ENOMEM;
 975                goto out;
 976        }
 977
 978        ret = btrfs_sysfs_add_qgroups(fs_info);
 979        if (ret < 0)
 980                goto out;
 981
 982        /*
 983         * Unlock qgroup_ioctl_lock before starting the transaction. This is to
 984         * avoid lock acquisition inversion problems (reported by lockdep) between
 985         * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
 986         * start a transaction.
 987         * After we started the transaction lock qgroup_ioctl_lock again and
 988         * check if someone else created the quota root in the meanwhile. If so,
 989         * just return success and release the transaction handle.
 990         *
 991         * Also we don't need to worry about someone else calling
 992         * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
 993         * that function returns 0 (success) when the sysfs entries already exist.
 994         */
 995        mutex_unlock(&fs_info->qgroup_ioctl_lock);
 996
 997        /*
 998         * 1 for quota root item
 999         * 1 for BTRFS_QGROUP_STATUS item
1000         *
 1001         * Yet we also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
1002         * per subvolume. However those are not currently reserved since it
1003         * would be a lot of overkill.
1004         */
1005        trans = btrfs_start_transaction(tree_root, 2);
1006
1007        mutex_lock(&fs_info->qgroup_ioctl_lock);
1008        if (IS_ERR(trans)) {
1009                ret = PTR_ERR(trans);
1010                trans = NULL;
1011                goto out;
1012        }
1013
1014        if (fs_info->quota_root)
1015                goto out;
1016
1017        fs_info->qgroup_ulist = ulist;
1018        ulist = NULL;
1019
1020        /*
1021         * initially create the quota tree
1022         */
1023        quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
1024        if (IS_ERR(quota_root)) {
1025                ret =  PTR_ERR(quota_root);
1026                btrfs_abort_transaction(trans, ret);
1027                goto out;
1028        }
1029
1030        path = btrfs_alloc_path();
1031        if (!path) {
1032                ret = -ENOMEM;
1033                btrfs_abort_transaction(trans, ret);
1034                goto out_free_root;
1035        }
1036
1037        key.objectid = 0;
1038        key.type = BTRFS_QGROUP_STATUS_KEY;
1039        key.offset = 0;
1040
1041        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
1042                                      sizeof(*ptr));
1043        if (ret) {
1044                btrfs_abort_transaction(trans, ret);
1045                goto out_free_path;
1046        }
1047
1048        leaf = path->nodes[0];
1049        ptr = btrfs_item_ptr(leaf, path->slots[0],
1050                                 struct btrfs_qgroup_status_item);
1051        btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
1052        btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
1053        fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
1054                                BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1055        btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
1056        btrfs_set_qgroup_status_rescan(leaf, ptr, 0);
1057
1058        btrfs_mark_buffer_dirty(leaf);
1059
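             /*
              * Walk all ROOT_REF items in the tree root and create a qgroup item
              * plus an in-memory qgroup for every existing subvolume, so that
              * quotas start out with one level 0 qgroup per subvolume.
              */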
1060        key.objectid = 0;
1061        key.type = BTRFS_ROOT_REF_KEY;
1062        key.offset = 0;
1063
1064        btrfs_release_path(path);
1065        ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
1066        if (ret > 0)
1067                goto out_add_root;
1068        if (ret < 0) {
1069                btrfs_abort_transaction(trans, ret);
1070                goto out_free_path;
1071        }
1072
1073        while (1) {
1074                slot = path->slots[0];
1075                leaf = path->nodes[0];
1076                btrfs_item_key_to_cpu(leaf, &found_key, slot);
1077
1078                if (found_key.type == BTRFS_ROOT_REF_KEY) {
1079
1080                        /* Release locks on tree_root before we access quota_root */
1081                        btrfs_release_path(path);
1082
1083                        ret = add_qgroup_item(trans, quota_root,
1084                                              found_key.offset);
1085                        if (ret) {
1086                                btrfs_abort_transaction(trans, ret);
1087                                goto out_free_path;
1088                        }
1089
1090                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
1091                        if (IS_ERR(qgroup)) {
1092                                ret = PTR_ERR(qgroup);
1093                                btrfs_abort_transaction(trans, ret);
1094                                goto out_free_path;
1095                        }
1096                        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1097                        if (ret < 0) {
1098                                btrfs_abort_transaction(trans, ret);
1099                                goto out_free_path;
1100                        }
1101                        ret = btrfs_search_slot_for_read(tree_root, &found_key,
1102                                                         path, 1, 0);
1103                        if (ret < 0) {
1104                                btrfs_abort_transaction(trans, ret);
1105                                goto out_free_path;
1106                        }
1107                        if (ret > 0) {
1108                                /*
1109                                 * Shouldn't happen, but in case it does we
1110                                 * don't need to do the btrfs_next_item, just
1111                                 * continue.
1112                                 */
1113                                continue;
1114                        }
1115                }
1116                ret = btrfs_next_item(tree_root, path);
1117                if (ret < 0) {
1118                        btrfs_abort_transaction(trans, ret);
1119                        goto out_free_path;
1120                }
1121                if (ret)
1122                        break;
1123        }
1124
1125out_add_root:
1126        btrfs_release_path(path);
1127        ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
1128        if (ret) {
1129                btrfs_abort_transaction(trans, ret);
1130                goto out_free_path;
1131        }
1132
1133        qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
1134        if (IS_ERR(qgroup)) {
1135                ret = PTR_ERR(qgroup);
1136                btrfs_abort_transaction(trans, ret);
1137                goto out_free_path;
1138        }
1139        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1140        if (ret < 0) {
1141                btrfs_abort_transaction(trans, ret);
1142                goto out_free_path;
1143        }
1144
1145        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1146        /*
1147         * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
1148         * a deadlock with tasks concurrently doing other qgroup operations, such
 1149         * as adding/removing qgroups or adding/deleting qgroup relations for example,
1150         * because all qgroup operations first start or join a transaction and then
1151         * lock the qgroup_ioctl_lock mutex.
1152         * We are safe from a concurrent task trying to enable quotas, by calling
1153         * this function, since we are serialized by fs_info->subvol_sem.
1154         */
1155        ret = btrfs_commit_transaction(trans);
1156        trans = NULL;
1157        mutex_lock(&fs_info->qgroup_ioctl_lock);
1158        if (ret)
1159                goto out_free_path;
1160
1161        /*
1162         * Set quota enabled flag after committing the transaction, to avoid
1163         * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
1164         * creation.
1165         */
1166        spin_lock(&fs_info->qgroup_lock);
1167        fs_info->quota_root = quota_root;
1168        set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1169        spin_unlock(&fs_info->qgroup_lock);
1170
1171        ret = qgroup_rescan_init(fs_info, 0, 1);
1172        if (!ret) {
1173                qgroup_rescan_zero_tracking(fs_info);
1174                fs_info->qgroup_rescan_running = true;
1175                btrfs_queue_work(fs_info->qgroup_rescan_workers,
1176                                 &fs_info->qgroup_rescan_work);
1177        }
1178
1179out_free_path:
1180        btrfs_free_path(path);
1181out_free_root:
1182        if (ret)
1183                btrfs_put_root(quota_root);
1184out:
1185        if (ret) {
1186                ulist_free(fs_info->qgroup_ulist);
1187                fs_info->qgroup_ulist = NULL;
1188                btrfs_sysfs_del_qgroups(fs_info);
1189        }
1190        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1191        if (ret && trans)
1192                btrfs_end_transaction(trans);
1193        else if (trans)
1194                ret = btrfs_end_transaction(trans);
1195        ulist_free(ulist);
1196        return ret;
1197}
1198
1199int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
1200{
1201        struct btrfs_root *quota_root;
1202        struct btrfs_trans_handle *trans = NULL;
1203        int ret = 0;
1204
1205        /*
1206         * We need to have subvol_sem write locked, to prevent races between
1207         * concurrent tasks trying to disable quotas, because we will unlock
1208         * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
1209         */
1210        lockdep_assert_held_write(&fs_info->subvol_sem);
1211
1212        mutex_lock(&fs_info->qgroup_ioctl_lock);
1213        if (!fs_info->quota_root)
1214                goto out;
1215
1216        /*
1217         * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
1218         * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
1219         * to lock that mutex while holding a transaction handle and the rescan
1220         * worker needs to commit a transaction.
1221         */
1222        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1223
1224        /*
1225         * Request qgroup rescan worker to complete and wait for it. This wait
1226         * must be done before transaction start for quota disable since it may
1227         * deadlock with transaction by the qgroup rescan worker.
1228         */
1229        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1230        btrfs_qgroup_wait_for_completion(fs_info, false);
1231
1232        /*
1233         * 1 For the root item
1234         *
1235         * We should also reserve enough items for the quota tree deletion in
1236         * btrfs_clean_quota_tree but this is not done.
1237         *
1238         * Also, we must always start a transaction without holding the mutex
1239         * qgroup_ioctl_lock, see btrfs_quota_enable().
1240         */
1241        trans = btrfs_start_transaction(fs_info->tree_root, 1);
1242
1243        mutex_lock(&fs_info->qgroup_ioctl_lock);
1244        if (IS_ERR(trans)) {
1245                ret = PTR_ERR(trans);
1246                trans = NULL;
1247                set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
1248                goto out;
1249        }
1250
1251        if (!fs_info->quota_root)
1252                goto out;
1253
1254        spin_lock(&fs_info->qgroup_lock);
1255        quota_root = fs_info->quota_root;
1256        fs_info->quota_root = NULL;
1257        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
1258        spin_unlock(&fs_info->qgroup_lock);
1259
1260        btrfs_free_qgroup_config(fs_info);
1261
1262        ret = btrfs_clean_quota_tree(trans, quota_root);
1263        if (ret) {
1264                btrfs_abort_transaction(trans, ret);
1265                goto out;
1266        }
1267
1268        ret = btrfs_del_root(trans, &quota_root->root_key);
1269        if (ret) {
1270                btrfs_abort_transaction(trans, ret);
1271                goto out;
1272        }
1273
1274        list_del(&quota_root->dirty_list);
1275
1276        btrfs_tree_lock(quota_root->node);
1277        btrfs_clean_tree_block(quota_root->node);
1278        btrfs_tree_unlock(quota_root->node);
1279        btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
1280                              quota_root->node, 0, 1);
1281
1282        btrfs_put_root(quota_root);
1283
1284out:
1285        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1286        if (ret && trans)
1287                btrfs_end_transaction(trans);
1288        else if (trans)
1289                ret = btrfs_end_transaction(trans);
1290
1291        return ret;
1292}
1293
1294static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1295                         struct btrfs_qgroup *qgroup)
1296{
1297        if (list_empty(&qgroup->dirty))
1298                list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1299}
1300
1301/*
 1302 * The easy accounting: we're updating a qgroup relationship whose child
 1303 * qgroup only has exclusive extents.
 1304 *
 1305 * In this case, all exclusive extents are also exclusive for the parent, so
 1306 * excl/rfer just get added/removed.
 1307 *
 1308 * The same holds for qgroup reservation space, which is also added to or
 1309 * released from the parent.
 1310 * Otherwise, when the child later releases reservation space, the parent
 1311 * would underflow its reservation (for the relationship adding case).
1312 *
1313 * Caller should hold fs_info->qgroup_lock.
1314 */
1315static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1316                                    struct ulist *tmp, u64 ref_root,
1317                                    struct btrfs_qgroup *src, int sign)
1318{
1319        struct btrfs_qgroup *qgroup;
1320        struct btrfs_qgroup_list *glist;
1321        struct ulist_node *unode;
1322        struct ulist_iterator uiter;
1323        u64 num_bytes = src->excl;
1324        int ret = 0;
1325
1326        qgroup = find_qgroup_rb(fs_info, ref_root);
1327        if (!qgroup)
1328                goto out;
1329
1330        qgroup->rfer += sign * num_bytes;
1331        qgroup->rfer_cmpr += sign * num_bytes;
1332
1333        WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1334        qgroup->excl += sign * num_bytes;
1335        qgroup->excl_cmpr += sign * num_bytes;
1336
1337        if (sign > 0)
1338                qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
1339        else
1340                qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
1341
1342        qgroup_dirty(fs_info, qgroup);
1343
1344        /* Get all of the parent groups that contain this qgroup */
1345        list_for_each_entry(glist, &qgroup->groups, next_group) {
1346                ret = ulist_add(tmp, glist->group->qgroupid,
1347                                qgroup_to_aux(glist->group), GFP_ATOMIC);
1348                if (ret < 0)
1349                        goto out;
1350        }
1351
 1352        /* Iterate all of the parents and adjust their rfer/excl counters */
1353        ULIST_ITER_INIT(&uiter);
1354        while ((unode = ulist_next(tmp, &uiter))) {
1355                qgroup = unode_aux_to_qgroup(unode);
1356                qgroup->rfer += sign * num_bytes;
1357                qgroup->rfer_cmpr += sign * num_bytes;
1358                WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1359                qgroup->excl += sign * num_bytes;
1360                if (sign > 0)
1361                        qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
1362                else
1363                        qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
1364                qgroup->excl_cmpr += sign * num_bytes;
1365                qgroup_dirty(fs_info, qgroup);
1366
1367                /* Add any parents of the parents */
1368                list_for_each_entry(glist, &qgroup->groups, next_group) {
1369                        ret = ulist_add(tmp, glist->group->qgroupid,
1370                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
1371                        if (ret < 0)
1372                                goto out;
1373                }
1374        }
1375        ret = 0;
1376out:
1377        return ret;
1378}
1379
1380
1381/*
 1382 * Quick path for updating a qgroup with only exclusive refs.
 1383 *
 1384 * In that case, just updating all parents is enough.
 1385 * Otherwise we need to do a full rescan.
 1386 * Caller should also hold fs_info->qgroup_lock.
 1387 *
 1388 * Return 0 for a quick update, return >0 if a full rescan is needed,
 1389 * in which case the INCONSISTENT flag is also set.
 1390 * Return < 0 for other errors.
1391 */
1392static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1393                                   struct ulist *tmp, u64 src, u64 dst,
1394                                   int sign)
1395{
1396        struct btrfs_qgroup *qgroup;
1397        int ret = 1;
1398        int err = 0;
1399
1400        qgroup = find_qgroup_rb(fs_info, src);
1401        if (!qgroup)
1402                goto out;
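             /*
              * excl == rfer means the source qgroup shares no extents with any
              * other qgroup, so all of its bytes can simply be added to or
              * removed from the parent's counters.
              */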
1403        if (qgroup->excl == qgroup->rfer) {
1404                ret = 0;
1405                err = __qgroup_excl_accounting(fs_info, tmp, dst,
1406                                               qgroup, sign);
1407                if (err < 0) {
1408                        ret = err;
1409                        goto out;
1410                }
1411        }
1412out:
1413        if (ret)
1414                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1415        return ret;
1416}
1417
1418int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1419                              u64 dst)
1420{
1421        struct btrfs_fs_info *fs_info = trans->fs_info;
1422        struct btrfs_qgroup *parent;
1423        struct btrfs_qgroup *member;
1424        struct btrfs_qgroup_list *list;
1425        struct ulist *tmp;
1426        unsigned int nofs_flag;
1427        int ret = 0;
1428
1429        /* Check the level of src and dst first */
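             /*
              * The parent (dst) must be at a strictly higher level than the
              * member (src); the level is stored in the top 16 bits of the
              * qgroup id.
              */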
1430        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
1431                return -EINVAL;
1432
1433        /* We hold a transaction handle open, must do a NOFS allocation. */
1434        nofs_flag = memalloc_nofs_save();
1435        tmp = ulist_alloc(GFP_KERNEL);
1436        memalloc_nofs_restore(nofs_flag);
1437        if (!tmp)
1438                return -ENOMEM;
1439
1440        mutex_lock(&fs_info->qgroup_ioctl_lock);
1441        if (!fs_info->quota_root) {
1442                ret = -ENOTCONN;
1443                goto out;
1444        }
1445        member = find_qgroup_rb(fs_info, src);
1446        parent = find_qgroup_rb(fs_info, dst);
1447        if (!member || !parent) {
1448                ret = -EINVAL;
1449                goto out;
1450        }
1451
 1452        /* Check if such a qgroup relation already exists */
1453        list_for_each_entry(list, &member->groups, next_group) {
1454                if (list->group == parent) {
1455                        ret = -EEXIST;
1456                        goto out;
1457                }
1458        }
1459
1460        ret = add_qgroup_relation_item(trans, src, dst);
1461        if (ret)
1462                goto out;
1463
1464        ret = add_qgroup_relation_item(trans, dst, src);
1465        if (ret) {
1466                del_qgroup_relation_item(trans, src, dst);
1467                goto out;
1468        }
1469
1470        spin_lock(&fs_info->qgroup_lock);
1471        ret = __add_relation_rb(member, parent);
1472        if (ret < 0) {
1473                spin_unlock(&fs_info->qgroup_lock);
1474                goto out;
1475        }
1476        ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
1477        spin_unlock(&fs_info->qgroup_lock);
1478out:
1479        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1480        ulist_free(tmp);
1481        return ret;
1482}
1483
1484static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1485                                 u64 dst)
1486{
1487        struct btrfs_fs_info *fs_info = trans->fs_info;
1488        struct btrfs_qgroup *parent;
1489        struct btrfs_qgroup *member;
1490        struct btrfs_qgroup_list *list;
1491        struct ulist *tmp;
1492        bool found = false;
1493        unsigned int nofs_flag;
1494        int ret = 0;
1495        int ret2;
1496
1497        /* We hold a transaction handle open, must do a NOFS allocation. */
1498        nofs_flag = memalloc_nofs_save();
1499        tmp = ulist_alloc(GFP_KERNEL);
1500        memalloc_nofs_restore(nofs_flag);
1501        if (!tmp)
1502                return -ENOMEM;
1503
1504        if (!fs_info->quota_root) {
1505                ret = -ENOTCONN;
1506                goto out;
1507        }
1508
1509        member = find_qgroup_rb(fs_info, src);
1510        parent = find_qgroup_rb(fs_info, dst);
1511        /*
 1512         * If the parent/member pair doesn't exist, then only try to delete the
 1513         * dead relation items.
1514         */
1515        if (!member || !parent)
1516                goto delete_item;
1517
 1518        /* Check if such a qgroup relation already exists */
1519        list_for_each_entry(list, &member->groups, next_group) {
1520                if (list->group == parent) {
1521                        found = true;
1522                        break;
1523                }
1524        }
1525
1526delete_item:
1527        ret = del_qgroup_relation_item(trans, src, dst);
1528        if (ret < 0 && ret != -ENOENT)
1529                goto out;
1530        ret2 = del_qgroup_relation_item(trans, dst, src);
1531        if (ret2 < 0 && ret2 != -ENOENT)
1532                goto out;
1533
1534        /* At least one deletion succeeded, return 0 */
1535        if (!ret || !ret2)
1536                ret = 0;
1537
1538        if (found) {
1539                spin_lock(&fs_info->qgroup_lock);
1540                del_relation_rb(fs_info, src, dst);
1541                ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1542                spin_unlock(&fs_info->qgroup_lock);
1543        }
1544out:
1545        ulist_free(tmp);
1546        return ret;
1547}
1548
1549int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
1550                              u64 dst)
1551{
1552        struct btrfs_fs_info *fs_info = trans->fs_info;
1553        int ret = 0;
1554
1555        mutex_lock(&fs_info->qgroup_ioctl_lock);
1556        ret = __del_qgroup_relation(trans, src, dst);
1557        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1558
1559        return ret;
1560}
1561
1562int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1563{
1564        struct btrfs_fs_info *fs_info = trans->fs_info;
1565        struct btrfs_root *quota_root;
1566        struct btrfs_qgroup *qgroup;
1567        int ret = 0;
1568
1569        mutex_lock(&fs_info->qgroup_ioctl_lock);
1570        if (!fs_info->quota_root) {
1571                ret = -ENOTCONN;
1572                goto out;
1573        }
1574        quota_root = fs_info->quota_root;
1575        qgroup = find_qgroup_rb(fs_info, qgroupid);
1576        if (qgroup) {
1577                ret = -EEXIST;
1578                goto out;
1579        }
1580
1581        ret = add_qgroup_item(trans, quota_root, qgroupid);
1582        if (ret)
1583                goto out;
1584
1585        spin_lock(&fs_info->qgroup_lock);
1586        qgroup = add_qgroup_rb(fs_info, qgroupid);
1587        spin_unlock(&fs_info->qgroup_lock);
1588
1589        if (IS_ERR(qgroup)) {
1590                ret = PTR_ERR(qgroup);
1591                goto out;
1592        }
1593        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
1594out:
1595        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1596        return ret;
1597}
1598
1599int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1600{
1601        struct btrfs_fs_info *fs_info = trans->fs_info;
1602        struct btrfs_qgroup *qgroup;
1603        struct btrfs_qgroup_list *list;
1604        int ret = 0;
1605
1606        mutex_lock(&fs_info->qgroup_ioctl_lock);
1607        if (!fs_info->quota_root) {
1608                ret = -ENOTCONN;
1609                goto out;
1610        }
1611
1612        qgroup = find_qgroup_rb(fs_info, qgroupid);
1613        if (!qgroup) {
1614                ret = -ENOENT;
1615                goto out;
1616        }
1617
1618        /* Check if there are no children of this qgroup */
1619        if (!list_empty(&qgroup->members)) {
1620                ret = -EBUSY;
1621                goto out;
1622        }
1623
1624        ret = del_qgroup_item(trans, qgroupid);
1625        if (ret && ret != -ENOENT)
1626                goto out;
1627
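            /*
             * Remove all relations this qgroup has with its parent groups
             * before deleting it from the qgroup rbtree below.
             */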
1628        while (!list_empty(&qgroup->groups)) {
1629                list = list_first_entry(&qgroup->groups,
1630                                        struct btrfs_qgroup_list, next_group);
1631                ret = __del_qgroup_relation(trans, qgroupid,
1632                                            list->group->qgroupid);
1633                if (ret)
1634                        goto out;
1635        }
1636
1637        spin_lock(&fs_info->qgroup_lock);
1638        del_qgroup_rb(fs_info, qgroupid);
1639        spin_unlock(&fs_info->qgroup_lock);
1640
1641        /*
1642         * Remove the qgroup from sysfs now without holding the qgroup_lock
1643         * spinlock, since the sysfs_remove_group() function needs to take
1644         * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
1645         */
1646        btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
1647        kfree(qgroup);
1648out:
1649        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1650        return ret;
1651}
1652
1653int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
1654                       struct btrfs_qgroup_limit *limit)
1655{
1656        struct btrfs_fs_info *fs_info = trans->fs_info;
1657        struct btrfs_qgroup *qgroup;
1658        int ret = 0;
1659        /* Sometimes we want to clear the limit on this qgroup.
1660         * To meet this requirement, we treat -1 as a special value
1661         * which tells the kernel to clear the limit on this qgroup.
1662         */
1663        const u64 CLEAR_VALUE = -1;
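            /*
             * For example, a limit update with BTRFS_QGROUP_LIMIT_MAX_RFER set
             * and max_rfer == (u64)-1 clears the referenced-bytes limit below
             * instead of setting it to U64_MAX.
             */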
1664
1665        mutex_lock(&fs_info->qgroup_ioctl_lock);
1666        if (!fs_info->quota_root) {
1667                ret = -ENOTCONN;
1668                goto out;
1669        }
1670
1671        qgroup = find_qgroup_rb(fs_info, qgroupid);
1672        if (!qgroup) {
1673                ret = -ENOENT;
1674                goto out;
1675        }
1676
1677        spin_lock(&fs_info->qgroup_lock);
1678        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
1679                if (limit->max_rfer == CLEAR_VALUE) {
1680                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1681                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1682                        qgroup->max_rfer = 0;
1683                } else {
1684                        qgroup->max_rfer = limit->max_rfer;
1685                }
1686        }
1687        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
1688                if (limit->max_excl == CLEAR_VALUE) {
1689                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1690                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1691                        qgroup->max_excl = 0;
1692                } else {
1693                        qgroup->max_excl = limit->max_excl;
1694                }
1695        }
1696        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
1697                if (limit->rsv_rfer == CLEAR_VALUE) {
1698                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1699                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1700                        qgroup->rsv_rfer = 0;
1701                } else {
1702                        qgroup->rsv_rfer = limit->rsv_rfer;
1703                }
1704        }
1705        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
1706                if (limit->rsv_excl == CLEAR_VALUE) {
1707                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1708                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1709                        qgroup->rsv_excl = 0;
1710                } else {
1711                        qgroup->rsv_excl = limit->rsv_excl;
1712                }
1713        }
1714        qgroup->lim_flags |= limit->flags;
1715
1716        spin_unlock(&fs_info->qgroup_lock);
1717
1718        ret = update_qgroup_limit_item(trans, qgroup);
1719        if (ret) {
1720                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1721                btrfs_info(fs_info, "unable to update quota limit for %llu",
1722                       qgroupid);
1723        }
1724
1725out:
1726        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1727        return ret;
1728}
1729
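    /*
     * Insert @record into the delayed refs' dirty extent rbtree, keyed by
     * bytenr.  Return 0 if the record was inserted, or 1 if an entry for the
     * same bytenr already exists; in that case any data reservation info from
     * @record is merged into the existing entry and the caller remains
     * responsible for freeing @record.
     */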
1730int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1731                                struct btrfs_delayed_ref_root *delayed_refs,
1732                                struct btrfs_qgroup_extent_record *record)
1733{
1734        struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1735        struct rb_node *parent_node = NULL;
1736        struct btrfs_qgroup_extent_record *entry;
1737        u64 bytenr = record->bytenr;
1738
1739        lockdep_assert_held(&delayed_refs->lock);
1740        trace_btrfs_qgroup_trace_extent(fs_info, record);
1741
1742        while (*p) {
1743                parent_node = *p;
1744                entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1745                                 node);
1746                if (bytenr < entry->bytenr) {
1747                        p = &(*p)->rb_left;
1748                } else if (bytenr > entry->bytenr) {
1749                        p = &(*p)->rb_right;
1750                } else {
1751                        if (record->data_rsv && !entry->data_rsv) {
1752                                entry->data_rsv = record->data_rsv;
1753                                entry->data_rsv_refroot =
1754                                        record->data_rsv_refroot;
1755                        }
1756                        return 1;
1757                }
1758        }
1759
1760        rb_link_node(&record->node, parent_node, p);
1761        rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1762        return 0;
1763}
1764
1765int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
1766                                   struct btrfs_qgroup_extent_record *qrecord)
1767{
1768        struct ulist *old_root;
1769        u64 bytenr = qrecord->bytenr;
1770        int ret;
1771
1772        /*
1773         * We are always called in a context where we are already holding a
1774         * transaction handle. Often we are called when adding a data delayed
1775         * reference from btrfs_truncate_inode_items() (truncating or unlinking),
1776         * in which case we will be holding a write lock on extent buffer from a
1777         * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
1778         * acquire fs_info->commit_root_sem, because that is a higher level lock
1779         * that must be acquired before locking any extent buffers.
1780         *
1781         * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
1782         * but we can't pass it a non-NULL transaction handle, because otherwise
1783         * it would not use commit roots and would lock extent buffers, causing
1784         * a deadlock if it ends up trying to read lock the same extent buffer
1785         * that was previously write locked at btrfs_truncate_inode_items().
1786         *
1787         * So pass a NULL transaction handle to btrfs_find_all_roots() and
1788         * explicitly tell it to not acquire the commit_root_sem - if we are
1789         * holding a transaction handle we don't need its protection.
1790         */
1791        ASSERT(trans != NULL);
1792
1793        ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
1794                                   true);
1795        if (ret < 0) {
1796                trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1797                btrfs_warn(trans->fs_info,
1798"error accounting new delayed refs extent (err code: %d), quota inconsistent",
1799                        ret);
1800                return 0;
1801        }
1802
1803        /*
1804         * Here we don't need to take the lock of
1805         * trans->transaction->delayed_refs, since the inserted qrecord won't
1806         * be deleted; only qrecord->node may be modified (by a new qrecord
1807         * insert).
1808         * So modifying qrecord->old_roots is safe here.
1809         */
1810        qrecord->old_roots = old_root;
1811        return 0;
1812}
1813
1814int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1815                              u64 num_bytes, gfp_t gfp_flag)
1816{
1817        struct btrfs_fs_info *fs_info = trans->fs_info;
1818        struct btrfs_qgroup_extent_record *record;
1819        struct btrfs_delayed_ref_root *delayed_refs;
1820        int ret;
1821
1822        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
1823            || bytenr == 0 || num_bytes == 0)
1824                return 0;
1825        record = kzalloc(sizeof(*record), gfp_flag);
1826        if (!record)
1827                return -ENOMEM;
1828
1829        delayed_refs = &trans->transaction->delayed_refs;
1830        record->bytenr = bytenr;
1831        record->num_bytes = num_bytes;
1832        record->old_roots = NULL;
1833
1834        spin_lock(&delayed_refs->lock);
1835        ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
1836        spin_unlock(&delayed_refs->lock);
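            /* ret > 0 means an existing record already covers this bytenr. */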
1837        if (ret > 0) {
1838                kfree(record);
1839                return 0;
1840        }
1841        return btrfs_qgroup_trace_extent_post(trans, record);
1842}
1843
1844int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1845                                  struct extent_buffer *eb)
1846{
1847        struct btrfs_fs_info *fs_info = trans->fs_info;
1848        int nr = btrfs_header_nritems(eb);
1849        int i, extent_type, ret;
1850        struct btrfs_key key;
1851        struct btrfs_file_extent_item *fi;
1852        u64 bytenr, num_bytes;
1853
1854        /* We can be called directly from walk_up_proc() */
1855        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1856                return 0;
1857
1858        for (i = 0; i < nr; i++) {
1859                btrfs_item_key_to_cpu(eb, &key, i);
1860
1861                if (key.type != BTRFS_EXTENT_DATA_KEY)
1862                        continue;
1863
1864                fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
1865                /* Filter out non-qgroup-accountable extents */
1866                extent_type = btrfs_file_extent_type(eb, fi);
1867
1868                if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1869                        continue;
1870
1871                bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1872                if (!bytenr)
1873                        continue;
1874
1875                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1876
1877                ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
1878                                                GFP_NOFS);
1879                if (ret)
1880                        return ret;
1881        }
1882        cond_resched();
1883        return 0;
1884}
1885
1886/*
1887 * Walk up the tree from the bottom, freeing leaves and any interior
1888 * nodes which have had all slots visited. If a node (leaf or
1889 * interior) is freed, the node above it will have its slot
1890 * incremented. The root node will never be freed.
1891 *
1892 * At the end of this function, we should have a path which has all
1893 * slots incremented to the next position for a search. If we need to
1894 * read a new node it will be NULL and the node above it will have the
1895 * correct slot selected for a later read.
1896 *
1897 * If we increment the root node's slot counter past the number of
1898 * elements, 1 is returned to signal completion of the search.
1899 */
1900static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
1901{
1902        int level = 0;
1903        int nr, slot;
1904        struct extent_buffer *eb;
1905
1906        if (root_level == 0)
1907                return 1;
1908
1909        while (level <= root_level) {
1910                eb = path->nodes[level];
1911                nr = btrfs_header_nritems(eb);
1912                path->slots[level]++;
1913                slot = path->slots[level];
1914                if (slot >= nr || level == 0) {
1915                        /*
1916                         * Don't free the root -  we will detect this
1917                         * condition after our loop and return a
1918                         * positive value for caller to stop walking the tree.
1919                         */
1920                        if (level != root_level) {
1921                                btrfs_tree_unlock_rw(eb, path->locks[level]);
1922                                path->locks[level] = 0;
1923
1924                                free_extent_buffer(eb);
1925                                path->nodes[level] = NULL;
1926                                path->slots[level] = 0;
1927                        }
1928                } else {
1929                        /*
1930                         * We have a valid slot to walk back down
1931                         * from. Stop here so caller can process these
1932                         * new nodes.
1933                         */
1934                        break;
1935                }
1936
1937                level++;
1938        }
1939
1940        eb = path->nodes[root_level];
1941        if (path->slots[root_level] >= btrfs_header_nritems(eb))
1942                return 1;
1943
1944        return 0;
1945}
1946
1947/*
1948 * Helper function to trace a subtree tree block swap.
1949 *
1950 * The swap will happen at the highest tree block, but there may be a lot of
1951 * tree blocks involved.
1952 *
1953 * For example:
1954 *  OO = Old tree blocks
1955 *  NN = New tree blocks allocated during balance
1956 *
1957 *           File tree (257)                  Reloc tree for 257
1958 * L2              OO                                NN
1959 *               /    \                            /    \
1960 * L1          OO      OO (a)                    OO      NN (a)
1961 *            / \     / \                       / \     / \
1962 * L0       OO   OO OO   OO                   OO   OO NN   NN
1963 *                  (b)  (c)                          (b)  (c)
1964 *
1965 * When calling qgroup_trace_extent_swap(), we will pass:
1966 * @src_eb = OO(a)
1967 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1968 * @dst_level = 0
1969 * @root_level = 1
1970 *
1971 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1972 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1973 *
1974 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1975 *
1976 * 1) Tree search from @src_eb
1977 *    It acts as a simplified btrfs_search_slot().
1978 *    The key for search can be extracted from @dst_path->nodes[dst_level]
1979 *    (first key).
1980 *
1981 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1982 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
1983 *    They should be marked during previous (@dst_level = 1) iteration.
1984 *
1985 * 3) Mark file extents in leaves dirty
1986 *    We don't have a good way to pick out only the new file extents.
1987 *    So we still follow the old method by scanning all file extents in
1988 *    the leaf.
1989 *
1990 * This function frees us from keeping two paths, thus later we only need
1991 * to care about how to iterate all new tree blocks in the reloc tree.
1992 */
1993static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
1994                                    struct extent_buffer *src_eb,
1995                                    struct btrfs_path *dst_path,
1996                                    int dst_level, int root_level,
1997                                    bool trace_leaf)
1998{
1999        struct btrfs_key key;
2000        struct btrfs_path *src_path;
2001        struct btrfs_fs_info *fs_info = trans->fs_info;
2002        u32 nodesize = fs_info->nodesize;
2003        int cur_level = root_level;
2004        int ret;
2005
2006        BUG_ON(dst_level > root_level);
2007        /* Level mismatch */
2008        if (btrfs_header_level(src_eb) != root_level)
2009                return -EINVAL;
2010
2011        src_path = btrfs_alloc_path();
2012        if (!src_path) {
2013                ret = -ENOMEM;
2014                goto out;
2015        }
2016
2017        if (dst_level)
2018                btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2019        else
2020                btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
2021
2022        /* For src_path */
2023        atomic_inc(&src_eb->refs);
2024        src_path->nodes[root_level] = src_eb;
2025        src_path->slots[root_level] = dst_path->slots[root_level];
2026        src_path->locks[root_level] = 0;
2027
2028        /* A simplified version of btrfs_search_slot() */
2029        while (cur_level >= dst_level) {
2030                struct btrfs_key src_key;
2031                struct btrfs_key dst_key;
2032
2033                if (src_path->nodes[cur_level] == NULL) {
2034                        struct extent_buffer *eb;
2035                        int parent_slot;
2036
2037                        eb = src_path->nodes[cur_level + 1];
2038                        parent_slot = src_path->slots[cur_level + 1];
2039
2040                        eb = btrfs_read_node_slot(eb, parent_slot);
2041                        if (IS_ERR(eb)) {
2042                                ret = PTR_ERR(eb);
2043                                goto out;
2044                        }
2045
2046                        src_path->nodes[cur_level] = eb;
2047
2048                        btrfs_tree_read_lock(eb);
2049                        src_path->locks[cur_level] = BTRFS_READ_LOCK;
2050                }
2051
2052                src_path->slots[cur_level] = dst_path->slots[cur_level];
2053                if (cur_level) {
2054                        btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
2055                                        &dst_key, dst_path->slots[cur_level]);
2056                        btrfs_node_key_to_cpu(src_path->nodes[cur_level],
2057                                        &src_key, src_path->slots[cur_level]);
2058                } else {
2059                        btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
2060                                        &dst_key, dst_path->slots[cur_level]);
2061                        btrfs_item_key_to_cpu(src_path->nodes[cur_level],
2062                                        &src_key, src_path->slots[cur_level]);
2063                }
2064                /* Content mismatch, something went wrong */
2065                if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
2066                        ret = -ENOENT;
2067                        goto out;
2068                }
2069                cur_level--;
2070        }
2071
2072        /*
2073         * Now both @dst_path and @src_path have been populated, record the tree
2074         * blocks for qgroup accounting.
2075         */
2076        ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2077                        nodesize, GFP_NOFS);
2078        if (ret < 0)
2079                goto out;
2080        ret = btrfs_qgroup_trace_extent(trans,
2081                        dst_path->nodes[dst_level]->start,
2082                        nodesize, GFP_NOFS);
2083        if (ret < 0)
2084                goto out;
2085
2086        /* Record leaf file extents */
2087        if (dst_level == 0 && trace_leaf) {
2088                ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2089                if (ret < 0)
2090                        goto out;
2091                ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2092        }
2093out:
2094        btrfs_free_path(src_path);
2095        return ret;
2096}
2097
2098/*
2099 * Helper function to do recursive generation-aware depth-first search, to
2100 * locate all new tree blocks in a subtree of reloc tree.
2101 *
2102 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2103 *         reloc tree
2104 * L2         NN (a)
2105 *          /    \
2106 * L1    OO        NN (b)
2107 *      /  \      /  \
2108 * L0  OO  OO    OO  NN
2109 *               (c) (d)
2110 * If we pass:
2111 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2112 * @cur_level = 1
2113 * @root_level = 1
2114 *
2115 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
2116 * the above tree blocks along with their counterparts in the file tree.
2117 * During the search, old tree blocks like OO(c) will be skipped, as the tree
2118 * block swap won't affect OO(c).
2119 */
2120static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
2121                                           struct extent_buffer *src_eb,
2122                                           struct btrfs_path *dst_path,
2123                                           int cur_level, int root_level,
2124                                           u64 last_snapshot, bool trace_leaf)
2125{
2126        struct btrfs_fs_info *fs_info = trans->fs_info;
2127        struct extent_buffer *eb;
2128        bool need_cleanup = false;
2129        int ret = 0;
2130        int i;
2131
2132        /* Level sanity check */
2133        if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2134            root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2135            root_level < cur_level) {
2136                btrfs_err_rl(fs_info,
2137                        "%s: bad levels, cur_level=%d root_level=%d",
2138                        __func__, cur_level, root_level);
2139                return -EUCLEAN;
2140        }
2141
2142        /* Read the tree block if needed */
2143        if (dst_path->nodes[cur_level] == NULL) {
2144                int parent_slot;
2145                u64 child_gen;
2146
2147                /*
2148                 * dst_path->nodes[root_level] must be initialized before
2149                 * calling this function.
2150                 */
2151                if (cur_level == root_level) {
2152                        btrfs_err_rl(fs_info,
2153        "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2154                                __func__, root_level, root_level, cur_level);
2155                        return -EUCLEAN;
2156                }
2157
2158                /*
2159                 * We need to get child blockptr/gen from parent before we can
2160                 * read it.
2161                 */
2162                eb = dst_path->nodes[cur_level + 1];
2163                parent_slot = dst_path->slots[cur_level + 1];
2164                child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2165
2166                /* This node is old, no need to trace */
2167                if (child_gen < last_snapshot)
2168                        goto out;
2169
2170                eb = btrfs_read_node_slot(eb, parent_slot);
2171                if (IS_ERR(eb)) {
2172                        ret = PTR_ERR(eb);
2173                        goto out;
2174                }
2175
2176                dst_path->nodes[cur_level] = eb;
2177                dst_path->slots[cur_level] = 0;
2178
2179                btrfs_tree_read_lock(eb);
2180                dst_path->locks[cur_level] = BTRFS_READ_LOCK;
2181                need_cleanup = true;
2182        }
2183
2184        /* Now record this tree block and its counter part for qgroups */
2185        ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2186                                       root_level, trace_leaf);
2187        if (ret < 0)
2188                goto cleanup;
2189
2190        eb = dst_path->nodes[cur_level];
2191
2192        if (cur_level > 0) {
2193                /* Iterate all child tree blocks */
2194                for (i = 0; i < btrfs_header_nritems(eb); i++) {
2195                        /* Skip old tree blocks as they won't be swapped */
2196                        if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2197                                continue;
2198                        dst_path->slots[cur_level] = i;
2199
2200                        /* Recursive call (at most 7 times) */
2201                        ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2202                                        dst_path, cur_level - 1, root_level,
2203                                        last_snapshot, trace_leaf);
2204                        if (ret < 0)
2205                                goto cleanup;
2206                }
2207        }
2208
2209cleanup:
2210        if (need_cleanup) {
2211                /* Clean up */
2212                btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2213                                     dst_path->locks[cur_level]);
2214                free_extent_buffer(dst_path->nodes[cur_level]);
2215                dst_path->nodes[cur_level] = NULL;
2216                dst_path->slots[cur_level] = 0;
2217                dst_path->locks[cur_level] = 0;
2218        }
2219out:
2220        return ret;
2221}
2222
2223static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2224                                struct extent_buffer *src_eb,
2225                                struct extent_buffer *dst_eb,
2226                                u64 last_snapshot, bool trace_leaf)
2227{
2228        struct btrfs_fs_info *fs_info = trans->fs_info;
2229        struct btrfs_path *dst_path = NULL;
2230        int level;
2231        int ret;
2232
2233        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2234                return 0;
2235
2236        /* Wrong parameter order */
2237        if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2238                btrfs_err_rl(fs_info,
2239                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2240                             btrfs_header_generation(src_eb),
2241                             btrfs_header_generation(dst_eb));
2242                return -EUCLEAN;
2243        }
2244
2245        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2246                ret = -EIO;
2247                goto out;
2248        }
2249
2250        level = btrfs_header_level(dst_eb);
2251        dst_path = btrfs_alloc_path();
2252        if (!dst_path) {
2253                ret = -ENOMEM;
2254                goto out;
2255        }
2256        /* For dst_path */
2257        atomic_inc(&dst_eb->refs);
2258        dst_path->nodes[level] = dst_eb;
2259        dst_path->slots[level] = 0;
2260        dst_path->locks[level] = 0;
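            /* Lock value 0 so release_path doesn't try to unlock dst_eb. */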
2261
2262        /* Do the generation-aware depth-first search */
2263        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2264                                              level, last_snapshot, trace_leaf);
2265        if (ret < 0)
2266                goto out;
2267        ret = 0;
2268
2269out:
2270        btrfs_free_path(dst_path);
2271        if (ret < 0)
2272                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2273        return ret;
2274}
2275
2276int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2277                               struct extent_buffer *root_eb,
2278                               u64 root_gen, int root_level)
2279{
2280        struct btrfs_fs_info *fs_info = trans->fs_info;
2281        int ret = 0;
2282        int level;
2283        struct extent_buffer *eb = root_eb;
2284        struct btrfs_path *path = NULL;
2285
2286        BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2287        BUG_ON(root_eb == NULL);
2288
2289        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2290                return 0;
2291
2292        if (!extent_buffer_uptodate(root_eb)) {
2293                ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
2294                if (ret)
2295                        goto out;
2296        }
2297
2298        if (root_level == 0) {
2299                ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2300                goto out;
2301        }
2302
2303        path = btrfs_alloc_path();
2304        if (!path)
2305                return -ENOMEM;
2306
2307        /*
2308         * Walk down the tree.  Missing extent blocks are filled in as
2309         * we go. Metadata is accounted every time we read a new
2310         * extent block.
2311         *
2312         * When we reach a leaf, we account for file extent items in it,
2313         * walk back up the tree (adjusting slot pointers as we go)
2314         * and restart the search process.
2315         */
2316        atomic_inc(&root_eb->refs);     /* For path */
2317        path->nodes[root_level] = root_eb;
2318        path->slots[root_level] = 0;
2319        path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2320walk_down:
2321        level = root_level;
2322        while (level >= 0) {
2323                if (path->nodes[level] == NULL) {
2324                        int parent_slot;
2325                        u64 child_bytenr;
2326
2327                        /*
2328                         * We need to get child blockptr from parent before we
2329                         * can read it.
2330                         */
2331                        eb = path->nodes[level + 1];
2332                        parent_slot = path->slots[level + 1];
2333                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2334
2335                        eb = btrfs_read_node_slot(eb, parent_slot);
2336                        if (IS_ERR(eb)) {
2337                                ret = PTR_ERR(eb);
2338                                goto out;
2339                        }
2340
2341                        path->nodes[level] = eb;
2342                        path->slots[level] = 0;
2343
2344                        btrfs_tree_read_lock(eb);
2345                        path->locks[level] = BTRFS_READ_LOCK;
2346
2347                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2348                                                        fs_info->nodesize,
2349                                                        GFP_NOFS);
2350                        if (ret)
2351                                goto out;
2352                }
2353
2354                if (level == 0) {
2355                        ret = btrfs_qgroup_trace_leaf_items(trans,
2356                                                            path->nodes[level]);
2357                        if (ret)
2358                                goto out;
2359
2360                        /* Nonzero return here means we completed our search */
2361                        ret = adjust_slots_upwards(path, root_level);
2362                        if (ret)
2363                                break;
2364
2365                        /* Restart search with new slots */
2366                        goto walk_down;
2367                }
2368
2369                level--;
2370        }
2371
2372        ret = 0;
2373out:
2374        btrfs_free_path(path);
2375
2376        return ret;
2377}
2378
2379#define UPDATE_NEW      0
2380#define UPDATE_OLD      1
2381/*
2382 * Walk all of the roots that point to the bytenr and adjust their refcnts.
2383 */
2384static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2385                                struct ulist *roots, struct ulist *tmp,
2386                                struct ulist *qgroups, u64 seq, int update_old)
2387{
2388        struct ulist_node *unode;
2389        struct ulist_iterator uiter;
2390        struct ulist_node *tmp_unode;
2391        struct ulist_iterator tmp_uiter;
2392        struct btrfs_qgroup *qg;
2393        int ret = 0;
2394
2395        if (!roots)
2396                return 0;
2397        ULIST_ITER_INIT(&uiter);
2398        while ((unode = ulist_next(roots, &uiter))) {
2399                qg = find_qgroup_rb(fs_info, unode->val);
2400                if (!qg)
2401                        continue;
2402
2403                ulist_reinit(tmp);
2404                ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2405                                GFP_ATOMIC);
2406                if (ret < 0)
2407                        return ret;
2408                ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2409                if (ret < 0)
2410                        return ret;
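                    /*
                     * Walk up the qgroup hierarchy for this root: @tmp starts
                     * with the root's own qgroup and grows as parent groups are
                     * added, so each ancestor's refcnt is bumped once per root.
                     */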
2411                ULIST_ITER_INIT(&tmp_uiter);
2412                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2413                        struct btrfs_qgroup_list *glist;
2414
2415                        qg = unode_aux_to_qgroup(tmp_unode);
2416                        if (update_old)
2417                                btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2418                        else
2419                                btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2420                        list_for_each_entry(glist, &qg->groups, next_group) {
2421                                ret = ulist_add(qgroups, glist->group->qgroupid,
2422                                                qgroup_to_aux(glist->group),
2423                                                GFP_ATOMIC);
2424                                if (ret < 0)
2425                                        return ret;
2426                                ret = ulist_add(tmp, glist->group->qgroupid,
2427                                                qgroup_to_aux(glist->group),
2428                                                GFP_ATOMIC);
2429                                if (ret < 0)
2430                                        return ret;
2431                        }
2432                }
2433        }
2434        return 0;
2435}
2436
2437/*
2438 * Update qgroup rfer/excl counters.
2439 * Rfer update is easy, the code is self-explanatory.
2440 *
2441 * Excl update is tricky, the update is split into 2 parts.
2442 * Part 1: Possible exclusive <-> sharing detection:
2443 *      |       A       |       !A      |
2444 *  -------------------------------------
2445 *  B   |       *       |       -       |
2446 *  -------------------------------------
2447 *  !B  |       +       |       **      |
2448 *  -------------------------------------
2449 *
2450 * Conditions:
2451 * A:   cur_old_roots < nr_old_roots    (not exclusive before)
2452 * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
2453 * B:   cur_new_roots < nr_new_roots    (not exclusive now)
2454 * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
2455 *
2456 * Results:
2457 * +: Possible sharing -> exclusive     -: Possible exclusive -> sharing
2458 * *: Definitely not changed.           **: Possible unchanged.
2459 *
2460 * For !A and !B conditions, the exception is the cur_old/new_roots == 0 case.
2461 *
2462 * To make the logic clear, we first use conditions A and B to split the
2463 * combinations into 4 results.
2464 *
2465 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
2466 * those only one variant may be 0.
2467 *
2468 * Lastly, check result "**"; since there both variants may be 0, split them
2469 * again (2x2).
2470 * But this time we don't need to consider anything else, the code and logic
2471 * are easy to understand now.
2472 */
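    /*
     * Worked example: an extent goes from being referenced by two subvolumes
     * (nr_old_roots == 2) to just one (nr_new_roots == 1).  For the surviving
     * subvolume's qgroup, cur_old_count == 1 < 2 and cur_new_count == 1 == 1,
     * so it takes the shared -> exclusive transition and excl grows by
     * num_bytes (rfer is unchanged).  For the dropped subvolume's qgroup,
     * cur_new_count == 0, so rfer shrinks by num_bytes while excl is untouched
     * since it was never the exclusive owner.
     */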
2473static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2474                                  struct ulist *qgroups,
2475                                  u64 nr_old_roots,
2476                                  u64 nr_new_roots,
2477                                  u64 num_bytes, u64 seq)
2478{
2479        struct ulist_node *unode;
2480        struct ulist_iterator uiter;
2481        struct btrfs_qgroup *qg;
2482        u64 cur_new_count, cur_old_count;
2483
2484        ULIST_ITER_INIT(&uiter);
2485        while ((unode = ulist_next(qgroups, &uiter))) {
2486                bool dirty = false;
2487
2488                qg = unode_aux_to_qgroup(unode);
2489                cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2490                cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2491
2492                trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2493                                             cur_new_count);
2494
2495                /* Rfer update part */
2496                if (cur_old_count == 0 && cur_new_count > 0) {
2497                        qg->rfer += num_bytes;
2498                        qg->rfer_cmpr += num_bytes;
2499                        dirty = true;
2500                }
2501                if (cur_old_count > 0 && cur_new_count == 0) {
2502                        qg->rfer -= num_bytes;
2503                        qg->rfer_cmpr -= num_bytes;
2504                        dirty = true;
2505                }
2506
2507                /* Excl update part */
2508                /* Exclusive/none -> shared case */
2509                if (cur_old_count == nr_old_roots &&
2510                    cur_new_count < nr_new_roots) {
2511                        /* Exclusive -> shared */
2512                        if (cur_old_count != 0) {
2513                                qg->excl -= num_bytes;
2514                                qg->excl_cmpr -= num_bytes;
2515                                dirty = true;
2516                        }
2517                }
2518
2519                /* Shared -> exclusive/none case */
2520                if (cur_old_count < nr_old_roots &&
2521                    cur_new_count == nr_new_roots) {
2522                        /* Shared->exclusive */
2523                        if (cur_new_count != 0) {
2524                                qg->excl += num_bytes;
2525                                qg->excl_cmpr += num_bytes;
2526                                dirty = true;
2527                        }
2528                }
2529
2530                /* Exclusive/none -> exclusive/none case */
2531                if (cur_old_count == nr_old_roots &&
2532                    cur_new_count == nr_new_roots) {
2533                        if (cur_old_count == 0) {
2534                                /* None -> exclusive/none */
2535
2536                                if (cur_new_count != 0) {
2537                                        /* None -> exclusive */
2538                                        qg->excl += num_bytes;
2539                                        qg->excl_cmpr += num_bytes;
2540                                        dirty = true;
2541                                }
2542                                /* None -> none, nothing changed */
2543                        } else {
2544                                /* Exclusive -> exclusive/none */
2545
2546                                if (cur_new_count == 0) {
2547                                        /* Exclusive -> none */
2548                                        qg->excl -= num_bytes;
2549                                        qg->excl_cmpr -= num_bytes;
2550                                        dirty = true;
2551                                }
2552                                /* Exclusive -> exclusive, nothing changed */
2553                        }
2554                }
2555
2556                if (dirty)
2557                        qgroup_dirty(fs_info, qg);
2558        }
2559        return 0;
2560}
2561
2562/*
2563 * Check if @roots is potentially a list of fs tree roots
2564 *
2565 * Return 0 if the ulist definitely does not contain fs/subvol tree roots
2566 * Return 1 if the list may contain fs/subvol tree roots (an empty list
2567 *          counts as well)
2568 */
2569static int maybe_fs_roots(struct ulist *roots)
2570{
2571        struct ulist_node *unode;
2572        struct ulist_iterator uiter;
2573
2574        /* Empty one, still possible for fs roots */
2575        if (!roots || roots->nnodes == 0)
2576                return 1;
2577
2578        ULIST_ITER_INIT(&uiter);
2579        unode = ulist_next(roots, &uiter);
2580        if (!unode)
2581                return 1;
2582
2583        /*
2584         * If it contains fs tree roots, then it must belong to fs/subvol
2585         * trees.
2586         * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2587         */
2588        return is_fstree(unode->val);
2589}
2590
2591int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2592                                u64 num_bytes, struct ulist *old_roots,
2593                                struct ulist *new_roots)
2594{
2595        struct btrfs_fs_info *fs_info = trans->fs_info;
2596        struct ulist *qgroups = NULL;
2597        struct ulist *tmp = NULL;
2598        u64 seq;
2599        u64 nr_new_roots = 0;
2600        u64 nr_old_roots = 0;
2601        int ret = 0;
2602
2603        /*
2604         * If quotas get disabled meanwhile, the resources need to be freed and
2605         * we can't just exit here.
2606         */
2607        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2608                goto out_free;
2609
2610        if (new_roots) {
2611                if (!maybe_fs_roots(new_roots))
2612                        goto out_free;
2613                nr_new_roots = new_roots->nnodes;
2614        }
2615        if (old_roots) {
2616                if (!maybe_fs_roots(old_roots))
2617                        goto out_free;
2618                nr_old_roots = old_roots->nnodes;
2619        }
2620
2621        /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2622        if (nr_old_roots == 0 && nr_new_roots == 0)
2623                goto out_free;
2624
2625        BUG_ON(!fs_info->quota_root);
2626
2627        trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2628                                        num_bytes, nr_old_roots, nr_new_roots);
2629
2630        qgroups = ulist_alloc(GFP_NOFS);
2631        if (!qgroups) {
2632                ret = -ENOMEM;
2633                goto out_free;
2634        }
2635        tmp = ulist_alloc(GFP_NOFS);
2636        if (!tmp) {
2637                ret = -ENOMEM;
2638                goto out_free;
2639        }
2640
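            /*
             * If a rescan is in progress and has not yet passed this bytenr,
             * skip the accounting here; the rescan worker will account the
             * extent when it reaches it.
             */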
2641        mutex_lock(&fs_info->qgroup_rescan_lock);
2642        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2643                if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2644                        mutex_unlock(&fs_info->qgroup_rescan_lock);
2645                        ret = 0;
2646                        goto out_free;
2647                }
2648        }
2649        mutex_unlock(&fs_info->qgroup_rescan_lock);
2650
2651        spin_lock(&fs_info->qgroup_lock);
2652        seq = fs_info->qgroup_seq;
2653
2654        /* Update old refcnts using old_roots */
2655        ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2656                                   UPDATE_OLD);
2657        if (ret < 0)
2658                goto out;
2659
2660        /* Update new refcnts using new_roots */
2661        ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2662                                   UPDATE_NEW);
2663        if (ret < 0)
2664                goto out;
2665
2666        qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2667                               num_bytes, seq);
2668
2669        /*
2670         * Bump qgroup_seq past the seq values used above to avoid overlap
2671         */
2672        fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2673out:
2674        spin_unlock(&fs_info->qgroup_lock);
2675out_free:
2676        ulist_free(tmp);
2677        ulist_free(qgroups);
2678        ulist_free(old_roots);
2679        ulist_free(new_roots);
2680        return ret;
2681}
2682
2683int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2684{
2685        struct btrfs_fs_info *fs_info = trans->fs_info;
2686        struct btrfs_qgroup_extent_record *record;
2687        struct btrfs_delayed_ref_root *delayed_refs;
2688        struct ulist *new_roots = NULL;
2689        struct rb_node *node;
2690        u64 num_dirty_extents = 0;
2691        u64 qgroup_to_skip;
2692        int ret = 0;
2693
2694        delayed_refs = &trans->transaction->delayed_refs;
2695        qgroup_to_skip = delayed_refs->qgroup_to_skip;
2696        while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2697                record = rb_entry(node, struct btrfs_qgroup_extent_record,
2698                                  node);
2699
2700                num_dirty_extents++;
2701                trace_btrfs_qgroup_account_extents(fs_info, record);
2702
2703                if (!ret) {
2704                        /*
2705                         * Old roots should have been searched when the qgroup
2706                         * extent record was inserted
2707                         */
2708                        if (WARN_ON(!record->old_roots)) {
2709                                /* Search commit root to find old_roots */
2710                                ret = btrfs_find_all_roots(NULL, fs_info,
2711                                                record->bytenr, 0,
2712                                                &record->old_roots, false);
2713                                if (ret < 0)
2714                                        goto cleanup;
2715                        }
2716
2717                        /* Free the reserved data space */
2718                        btrfs_qgroup_free_refroot(fs_info,
2719                                        record->data_rsv_refroot,
2720                                        record->data_rsv,
2721                                        BTRFS_QGROUP_RSV_DATA);
2722                        /*
2723                         * Use BTRFS_SEQ_LAST as time_seq to do special search,
2724                         * which doesn't lock tree or delayed_refs and searches
2725                         * the current root. It's safe inside commit_transaction().
2726                         */
2727                        ret = btrfs_find_all_roots(trans, fs_info,
2728                           record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
2729                        if (ret < 0)
2730                                goto cleanup;
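                            /*
                             * Exclude the root recorded in qgroup_to_skip from
                             * both root lists so it doesn't contribute to this
                             * extent's accounting.
                             */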
2731                        if (qgroup_to_skip) {
2732                                ulist_del(new_roots, qgroup_to_skip, 0);
2733                                ulist_del(record->old_roots, qgroup_to_skip,
2734                                          0);
2735                        }
2736                        ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2737                                                          record->num_bytes,
2738                                                          record->old_roots,
2739                                                          new_roots);
2740                        record->old_roots = NULL;
2741                        new_roots = NULL;
2742                }
2743cleanup:
2744                ulist_free(record->old_roots);
2745                ulist_free(new_roots);
2746                new_roots = NULL;
2747                rb_erase(node, &delayed_refs->dirty_extent_root);
2748                kfree(record);
2749
2750        }
2751        trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2752                                       num_dirty_extents);
2753        return ret;
2754}
2755
2756/*
2757 * Called from commit_transaction(). Writes all changed qgroups to disk.
2758 */
2759int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2760{
2761        struct btrfs_fs_info *fs_info = trans->fs_info;
2762        int ret = 0;
2763
2764        if (!fs_info->quota_root)
2765                return ret;
2766
2767        spin_lock(&fs_info->qgroup_lock);
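            /*
             * Write out each dirty qgroup's info and limit items.  The
             * spinlock is dropped around the tree updates below, since those
             * can block, and re-taken before looking at the list again.
             */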
2768        while (!list_empty(&fs_info->dirty_qgroups)) {
2769                struct btrfs_qgroup *qgroup;
2770                qgroup = list_first_entry(&fs_info->dirty_qgroups,
2771                                          struct btrfs_qgroup, dirty);
2772                list_del_init(&qgroup->dirty);
2773                spin_unlock(&fs_info->qgroup_lock);
2774                ret = update_qgroup_info_item(trans, qgroup);
2775                if (ret)
2776                        fs_info->qgroup_flags |=
2777                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2778                ret = update_qgroup_limit_item(trans, qgroup);
2779                if (ret)
2780                        fs_info->qgroup_flags |=
2781                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2782                spin_lock(&fs_info->qgroup_lock);
2783        }
2784        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2785                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2786        else
2787                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2788        spin_unlock(&fs_info->qgroup_lock);
2789
2790        ret = update_qgroup_status_item(trans);
2791        if (ret)
2792                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2793
2794        return ret;
2795}
2796
2797/*
2798 * Copy the accounting information between qgroups. This is necessary
2799 * when a snapshot or a subvolume is created. Throwing an error will
2800 * cause a transaction abort so we take extra care here to only error
2801 * when a readonly fs is a reasonable outcome.
2802 */
2803int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2804                         u64 objectid, struct btrfs_qgroup_inherit *inherit)
2805{
2806        int ret = 0;
2807        int i;
2808        u64 *i_qgroups;
2809        bool committing = false;
2810        struct btrfs_fs_info *fs_info = trans->fs_info;
2811        struct btrfs_root *quota_root;
2812        struct btrfs_qgroup *srcgroup;
2813        struct btrfs_qgroup *dstgroup;
2814        bool need_rescan = false;
2815        u32 level_size = 0;
2816        u64 nums;
2817
2818        /*
2819         * There are only two callers of this function.
2820         *
2821         * One in create_subvol() in the ioctl context, which needs to hold
2822         * the qgroup_ioctl_lock.
2823         *
2824         * The other one in create_pending_snapshot() where no other qgroup
2825         * code can modify the fs as it would need to either start a new
2826         * trans or hold a trans handle, thus we don't need to hold the
2827         * qgroup_ioctl_lock.
2828         * This avoids a long and complex lock chain and makes lockdep happy.
2829         */
2830        spin_lock(&fs_info->trans_lock);
2831        if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2832                committing = true;
2833        spin_unlock(&fs_info->trans_lock);
2834
2835        if (!committing)
2836                mutex_lock(&fs_info->qgroup_ioctl_lock);
2837        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2838                goto out;
2839
2840        quota_root = fs_info->quota_root;
2841        if (!quota_root) {
2842                ret = -EINVAL;
2843                goto out;
2844        }
2845
2846        if (inherit) {
2847                i_qgroups = (u64 *)(inherit + 1);
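                /*
                 * The u64 array following the inherit struct holds
                 * inherit->num_qgroups qgroup ids to add the new subvolume to,
                 * followed by num_ref_copies and then num_excl_copies pairs of
                 * (src, dst) qgroup ids, hence the size computed below.
                 */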
2848                nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2849                       2 * inherit->num_excl_copies;
2850                for (i = 0; i < nums; ++i) {
2851                        srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2852
2853                        /*
2854                         * Zero out invalid groups (missing, or at a level not
2855                         * above the new subvolume's) so we can ignore them later.
2856                         */
2857                        if (!srcgroup ||
2858                            ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2859                                *i_qgroups = 0ULL;
2860
2861                        ++i_qgroups;
2862                }
2863        }
2864
2865        /*
2866         * create a tracking group for the subvol itself
2867         */
2868        ret = add_qgroup_item(trans, quota_root, objectid);
2869        if (ret)
2870                goto out;
2871
2872        /*
2873         * add qgroup to all inherited groups
2874         */
2875        if (inherit) {
2876                i_qgroups = (u64 *)(inherit + 1);
2877                for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
2878                        if (*i_qgroups == 0)
2879                                continue;
2880                        ret = add_qgroup_relation_item(trans, objectid,
2881                                                       *i_qgroups);
2882                        if (ret && ret != -EEXIST)
2883                                goto out;
2884                        ret = add_qgroup_relation_item(trans, *i_qgroups,
2885                                                       objectid);
2886                        if (ret && ret != -EEXIST)
2887                                goto out;
2888                }
2889                ret = 0;
2890        }
2891
2892
2893        spin_lock(&fs_info->qgroup_lock);
2894
2895        dstgroup = add_qgroup_rb(fs_info, objectid);
2896        if (IS_ERR(dstgroup)) {
2897                ret = PTR_ERR(dstgroup);
2898                goto unlock;
2899        }
2900
2901        if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2902                dstgroup->lim_flags = inherit->lim.flags;
2903                dstgroup->max_rfer = inherit->lim.max_rfer;
2904                dstgroup->max_excl = inherit->lim.max_excl;
2905                dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2906                dstgroup->rsv_excl = inherit->lim.rsv_excl;
2907
2908                ret = update_qgroup_limit_item(trans, dstgroup);
2909                if (ret) {
2910                        fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2911                        btrfs_info(fs_info,
2912                                   "unable to update quota limit for %llu",
2913                                   dstgroup->qgroupid);
2914                        goto unlock;
2915                }
2916        }
2917
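            /*
             * When srcid is given, start the new qgroup from the source
             * qgroup's numbers and limits (the snapshot case).
             */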
2918        if (srcid) {
2919                srcgroup = find_qgroup_rb(fs_info, srcid);
2920                if (!srcgroup)
2921                        goto unlock;
2922
2923                /*
2924                 * We call inherit after we clone the root in order to make sure
2925                 * our counts don't go crazy, so at this point the only
2926                 * difference between the two roots should be the root node.
2927                 */
2928                level_size = fs_info->nodesize;
2929                dstgroup->rfer = srcgroup->rfer;
2930                dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2931                dstgroup->excl = level_size;
2932                dstgroup->excl_cmpr = level_size;
2933                srcgroup->excl = level_size;
2934                srcgroup->excl_cmpr = level_size;
2935
2936                /* inherit the limit info */
2937                dstgroup->lim_flags = srcgroup->lim_flags;
2938                dstgroup->max_rfer = srcgroup->max_rfer;
2939                dstgroup->max_excl = srcgroup->max_excl;
2940                dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2941                dstgroup->rsv_excl = srcgroup->rsv_excl;
2942
2943                qgroup_dirty(fs_info, dstgroup);
2944                qgroup_dirty(fs_info, srcgroup);
2945        }
2946
2947        if (!inherit)
2948                goto unlock;
2949
2950        i_qgroups = (u64 *)(inherit + 1);
2951        for (i = 0; i < inherit->num_qgroups; ++i) {
2952                if (*i_qgroups) {
2953                        ret = add_relation_rb(fs_info, objectid, *i_qgroups);
2954                        if (ret)
2955                                goto unlock;
2956                }
2957                ++i_qgroups;
2958
2959                /*
2960                 * If we're doing a snapshot, and adding the snapshot to a new
2961                 * qgroup, the numbers are guaranteed to be incorrect.
2962                 */
2963                if (srcid)
2964                        need_rescan = true;
2965        }
2966
2967        for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
2968                struct btrfs_qgroup *src;
2969                struct btrfs_qgroup *dst;
2970
2971                if (!i_qgroups[0] || !i_qgroups[1])
2972                        continue;
2973
2974                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2975                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2976
2977                if (!src || !dst) {
2978                        ret = -EINVAL;
2979                        goto unlock;
2980                }
2981
2982                dst->rfer = src->rfer - level_size;
2983                dst->rfer_cmpr = src->rfer_cmpr - level_size;
2984
2985                /* Manually tweaking numbers certainly needs a rescan */
2986                need_rescan = true;
2987        }
2988        for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
2989                struct btrfs_qgroup *src;
2990                struct btrfs_qgroup *dst;
2991
2992                if (!i_qgroups[0] || !i_qgroups[1])
2993                        continue;
2994
2995                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2996                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2997
2998                if (!src || !dst) {
2999                        ret = -EINVAL;
3000                        goto unlock;
3001                }
3002
3003                dst->excl = src->excl + level_size;
3004                dst->excl_cmpr = src->excl_cmpr + level_size;
3005                need_rescan = true;
3006        }
3007
3008unlock:
3009        spin_unlock(&fs_info->qgroup_lock);
3010        if (!ret)
3011                ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
3012out:
3013        if (!committing)
3014                mutex_unlock(&fs_info->qgroup_ioctl_lock);
3015        if (need_rescan)
3016                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3017        return ret;
3018}
3019
3020static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
3021{
3022        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
3023            qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
3024                return false;
3025
3026        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
3027            qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
3028                return false;
3029
3030        return true;
3031}
3032
3033static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
3034                          enum btrfs_qgroup_rsv_type type)
3035{
3036        struct btrfs_qgroup *qgroup;
3037        struct btrfs_fs_info *fs_info = root->fs_info;
3038        u64 ref_root = root->root_key.objectid;
3039        int ret = 0;
3040        struct ulist_node *unode;
3041        struct ulist_iterator uiter;
3042
3043        if (!is_fstree(ref_root))
3044                return 0;
3045
3046        if (num_bytes == 0)
3047                return 0;
3048
3049        if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
3050            capable(CAP_SYS_RESOURCE))
3051                enforce = false;
3052
3053        spin_lock(&fs_info->qgroup_lock);
3054        if (!fs_info->quota_root)
3055                goto out;
3056
3057        qgroup = find_qgroup_rb(fs_info, ref_root);
3058        if (!qgroup)
3059                goto out;
3060
3061        /*
3062         * In the first step, check all affected qgroups to see whether any
3063         * limit would be exceeded.
3064         */
3065        ulist_reinit(fs_info->qgroup_ulist);
3066        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3067                        qgroup_to_aux(qgroup), GFP_ATOMIC);
3068        if (ret < 0)
3069                goto out;
3070        ULIST_ITER_INIT(&uiter);
3071        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3072                struct btrfs_qgroup *qg;
3073                struct btrfs_qgroup_list *glist;
3074
3075                qg = unode_aux_to_qgroup(unode);
3076
3077                if (enforce && !qgroup_check_limits(qg, num_bytes)) {
3078                        ret = -EDQUOT;
3079                        goto out;
3080                }
3081
3082                list_for_each_entry(glist, &qg->groups, next_group) {
3083                        ret = ulist_add(fs_info->qgroup_ulist,
3084                                        glist->group->qgroupid,
3085                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3086                        if (ret < 0)
3087                                goto out;
3088                }
3089        }
3090        ret = 0;
3091        /*
3092         * No limits exceeded, now record the reservation in all qgroups.
3093         */
3094        ULIST_ITER_INIT(&uiter);
3095        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3096                struct btrfs_qgroup *qg;
3097
3098                qg = unode_aux_to_qgroup(unode);
3099
3100                qgroup_rsv_add(fs_info, qg, num_bytes, type);
3101        }
3102
3103out:
3104        spin_unlock(&fs_info->qgroup_lock);
3105        return ret;
3106}
3107
3108/*
3109 * Free @num_bytes of reserved space with @type for the qgroup identified by
3110 * @ref_root (normally a level 0 qgroup).
3111 *
3112 * Will handle all higher level qgroups too.
3113 *
3114 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3115 * This special case is only used for META_PERTRANS type.
3116 */
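    /*
     * Example (sketch): freeing the whole pertrans reservation of a subvolume,
     * as btrfs_qgroup_free_meta_all_pertrans() below does:
     *
     *   btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
     *                             BTRFS_QGROUP_RSV_META_PERTRANS);
     */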
3117void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3118                               u64 ref_root, u64 num_bytes,
3119                               enum btrfs_qgroup_rsv_type type)
3120{
3121        struct btrfs_qgroup *qgroup;
3122        struct ulist_node *unode;
3123        struct ulist_iterator uiter;
3124        int ret = 0;
3125
3126        if (!is_fstree(ref_root))
3127                return;
3128
3129        if (num_bytes == 0)
3130                return;
3131
3132        if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3133                WARN(1, "%s: Invalid type to free", __func__);
3134                return;
3135        }
3136        spin_lock(&fs_info->qgroup_lock);
3137
3138        if (!fs_info->quota_root)
3139                goto out;
3140
3141        qgroup = find_qgroup_rb(fs_info, ref_root);
3142        if (!qgroup)
3143                goto out;
3144
3145        if (num_bytes == (u64)-1)
3146                /*
3147                 * We're freeing all pertrans rsv, get the reserved value
3148                 * from the level 0 qgroup as the real num_bytes to free.
3149                 */
3150                num_bytes = qgroup->rsv.values[type];
3151
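            /*
             * Walk the qgroup and all of its ancestor qgroups, releasing the
             * reservation from each of them.
             */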
3152        ulist_reinit(fs_info->qgroup_ulist);
3153        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3154                        qgroup_to_aux(qgroup), GFP_ATOMIC);
3155        if (ret < 0)
3156                goto out;
3157        ULIST_ITER_INIT(&uiter);
3158        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3159                struct btrfs_qgroup *qg;
3160                struct btrfs_qgroup_list *glist;
3161
3162                qg = unode_aux_to_qgroup(unode);
3163
3164                qgroup_rsv_release(fs_info, qg, num_bytes, type);
3165
3166                list_for_each_entry(glist, &qg->groups, next_group) {
3167                        ret = ulist_add(fs_info->qgroup_ulist,
3168                                        glist->group->qgroupid,
3169                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3170                        if (ret < 0)
3171                                goto out;
3172                }
3173        }
3174
3175out:
3176        spin_unlock(&fs_info->qgroup_lock);
3177}
3178
3179/*
3180 * Check if the leaf is the last leaf, which means all node pointers
3181 * are at their last position.
3182 */
3183static bool is_last_leaf(struct btrfs_path *path)
3184{
3185        int i;
3186
3187        for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3188                if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3189                        return false;
3190        }
3191        return true;
3192}
3193
3194/*
3195 * Returns < 0 on error, 0 when more leaves are to be scanned.
3196 * Returns 1 when done.
3197 */
3198static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3199                              struct btrfs_path *path)
3200{
3201        struct btrfs_fs_info *fs_info = trans->fs_info;
3202        struct btrfs_root *extent_root;
3203        struct btrfs_key found;
3204        struct extent_buffer *scratch_leaf = NULL;
3205        struct ulist *roots = NULL;
3206        u64 num_bytes;
3207        bool done;
3208        int slot;
3209        int ret;
3210
3211        mutex_lock(&fs_info->qgroup_rescan_lock);
3212        extent_root = btrfs_extent_root(fs_info,
3213                                fs_info->qgroup_rescan_progress.objectid);
3214        ret = btrfs_search_slot_for_read(extent_root,
3215                                         &fs_info->qgroup_rescan_progress,
3216                                         path, 1, 0);
3217
3218        btrfs_debug(fs_info,
3219                "current progress key (%llu %u %llu), search_slot ret %d",
3220                fs_info->qgroup_rescan_progress.objectid,
3221                fs_info->qgroup_rescan_progress.type,
3222                fs_info->qgroup_rescan_progress.offset, ret);
3223
3224        if (ret) {
3225                /*
3226                 * The rescan is about to end, we will not be scanning any
3227                 * further blocks. We cannot unset the RESCAN flag here, because
3228                 * we want to commit the transaction if everything went well.
3229                 * To make the live accounting work in this phase, we set our
3230                 * scan progress pointer such that every real extent objectid
3231                 * will be smaller.
3232                 */
3233                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3234                btrfs_release_path(path);
3235                mutex_unlock(&fs_info->qgroup_rescan_lock);
3236                return ret;
3237        }
3238        done = is_last_leaf(path);
3239
3240        btrfs_item_key_to_cpu(path->nodes[0], &found,
3241                              btrfs_header_nritems(path->nodes[0]) - 1);
3242        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3243
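            /*
             * Clone the leaf so the path and the rescan lock can be released
             * while the extent items are accounted below.
             */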
3244        scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3245        if (!scratch_leaf) {
3246                ret = -ENOMEM;
3247                mutex_unlock(&fs_info->qgroup_rescan_lock);
3248                goto out;
3249        }
3250        slot = path->slots[0];
3251        btrfs_release_path(path);
3252        mutex_unlock(&fs_info->qgroup_rescan_lock);
3253
3254        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3255                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3256                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3257                    found.type != BTRFS_METADATA_ITEM_KEY)
3258                        continue;
3259                if (found.type == BTRFS_METADATA_ITEM_KEY)
3260                        num_bytes = fs_info->nodesize;
3261                else
3262                        num_bytes = found.offset;
3263
3264                ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3265                                           &roots, false);
3266                if (ret < 0)
3267                        goto out;
3268                /* For rescan, just pass old_roots as NULL */
3269                ret = btrfs_qgroup_account_extent(trans, found.objectid,
3270                                                  num_bytes, NULL, roots);
3271                if (ret < 0)
3272                        goto out;
3273        }
3274out:
3275        if (scratch_leaf)
3276                free_extent_buffer(scratch_leaf);
3277
3278        if (done && !ret) {
3279                ret = 1;
3280                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3281        }
3282        return ret;
3283}
3284
3285static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
3286{
3287        return btrfs_fs_closing(fs_info) ||
3288                test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
3289                !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
3290}
3291
3292static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3293{
3294        struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3295                                                     qgroup_rescan_work);
3296        struct btrfs_path *path;
3297        struct btrfs_trans_handle *trans = NULL;
3298        int err = -ENOMEM;
3299        int ret = 0;
3300        bool stopped = false;
3301
3302        path = btrfs_alloc_path();
3303        if (!path)
3304                goto out;
3305        /*
3306         * Rescan should only search the commit root, any later difference is
3307         * recorded by the qgroup accounting code.
3308         */
3309        path->search_commit_root = 1;
3310        path->skip_locking = 1;
3311
3312        err = 0;
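            /*
             * Scan one leaf per transaction until qgroup_rescan_leaf() returns
             * 1 (done), an error occurs, or the rescan has to stop.
             */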
3313        while (!err && !(stopped = rescan_should_stop(fs_info))) {
3314                trans = btrfs_start_transaction(fs_info->fs_root, 0);
3315                if (IS_ERR(trans)) {
3316                        err = PTR_ERR(trans);
3317                        break;
3318                }
3319
3320                err = qgroup_rescan_leaf(trans, path);
3321
3322                if (err > 0)
3323                        btrfs_commit_transaction(trans);
3324                else
3325                        btrfs_end_transaction(trans);
3326        }
3327
3328out:
3329        btrfs_free_path(path);
3330
3331        mutex_lock(&fs_info->qgroup_rescan_lock);
3332        if (err > 0 &&
3333            fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3334                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3335        } else if (err < 0 || stopped) {
3336                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3337        }
3338        mutex_unlock(&fs_info->qgroup_rescan_lock);
3339
3340        /*
3341         * Only update the status item, since the previous part has already
3342         * updated the qgroup info.
3343         */
3344        trans = btrfs_start_transaction(fs_info->quota_root, 1);
3345        if (IS_ERR(trans)) {
3346                err = PTR_ERR(trans);
3347                trans = NULL;
3348                btrfs_err(fs_info,
3349                          "fail to start transaction for status update: %d",
3350                          err);
3351        }
3352
3353        mutex_lock(&fs_info->qgroup_rescan_lock);
3354        if (!stopped)
3355                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3356        if (trans) {
3357                ret = update_qgroup_status_item(trans);
3358                if (ret < 0) {
3359                        err = ret;
3360                        btrfs_err(fs_info, "fail to update qgroup status: %d",
3361                                  err);
3362                }
3363        }
3364        fs_info->qgroup_rescan_running = false;
3365        complete_all(&fs_info->qgroup_rescan_completion);
3366        mutex_unlock(&fs_info->qgroup_rescan_lock);
3367
3368        if (!trans)
3369                return;
3370
3371        btrfs_end_transaction(trans);
3372
3373        if (stopped) {
3374                btrfs_info(fs_info, "qgroup scan paused");
3375        } else if (err >= 0) {
3376                btrfs_info(fs_info, "qgroup scan completed%s",
3377                        err > 0 ? " (inconsistency flag cleared)" : "");
3378        } else {
3379                btrfs_err(fs_info, "qgroup scan failed with %d", err);
3380        }
3381}
3382
3383/*
3384 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3385 * memory required for the rescan context.
3386 */
3387static int
3388qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3389                   int init_flags)
3390{
3391        int ret = 0;
3392
3393        if (!init_flags) {
3394                /* we're resuming qgroup rescan at mount time */
3395                if (!(fs_info->qgroup_flags &
3396                      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3397                        btrfs_warn(fs_info,
3398                        "qgroup rescan init failed, qgroup rescan is not queued");
3399                        ret = -EINVAL;
3400                } else if (!(fs_info->qgroup_flags &
3401                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3402                        btrfs_warn(fs_info,
3403                        "qgroup rescan init failed, qgroup is not enabled");
3404                        ret = -EINVAL;
3405                }
3406
3407                if (ret)
3408                        return ret;
3409        }
3410
3411        mutex_lock(&fs_info->qgroup_rescan_lock);
3412
3413        if (init_flags) {
3414                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3415                        btrfs_warn(fs_info,
3416                                   "qgroup rescan is already in progress");
3417                        ret = -EINPROGRESS;
3418                } else if (!(fs_info->qgroup_flags &
3419                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3420                        btrfs_warn(fs_info,
3421                        "qgroup rescan init failed, qgroup is not enabled");
3422                        ret = -EINVAL;
3423                } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3424                        /* Quota disable is in progress */
3425                        ret = -EBUSY;
3426                }
3427
3428                if (ret) {
3429                        mutex_unlock(&fs_info->qgroup_rescan_lock);
3430                        return ret;
3431                }
3432                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3433        }
3434
3435        memset(&fs_info->qgroup_rescan_progress, 0,
3436                sizeof(fs_info->qgroup_rescan_progress));
3437        fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3438        init_completion(&fs_info->qgroup_rescan_completion);
3439        mutex_unlock(&fs_info->qgroup_rescan_lock);
3440
3441        btrfs_init_work(&fs_info->qgroup_rescan_work,
3442                        btrfs_qgroup_rescan_worker, NULL, NULL);
3443        return 0;
3444}
3445
3446static void
3447qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3448{
3449        struct rb_node *n;
3450        struct btrfs_qgroup *qgroup;
3451
3452        spin_lock(&fs_info->qgroup_lock);
3453        /* clear all current qgroup tracking information */
3454        for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3455                qgroup = rb_entry(n, struct btrfs_qgroup, node);
3456                qgroup->rfer = 0;
3457                qgroup->rfer_cmpr = 0;
3458                qgroup->excl = 0;
3459                qgroup->excl_cmpr = 0;
3460                qgroup_dirty(fs_info, qgroup);
3461        }
3462        spin_unlock(&fs_info->qgroup_lock);
3463}
3464
3465int
3466btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3467{
3468        int ret = 0;
3469        struct btrfs_trans_handle *trans;
3470
3471        ret = qgroup_rescan_init(fs_info, 0, 1);
3472        if (ret)
3473                return ret;
3474
3475        /*
3476         * We have set the rescan_progress to 0, which means no more
3477         * delayed refs will be accounted by btrfs_qgroup_account_ref.
3478         * However, btrfs_qgroup_account_ref may already be past its call
3479         * to btrfs_find_all_roots, in which case it would still do the
3480         * accounting.
3481         * To solve this, we're committing the transaction, which will
3482         * ensure we run all delayed refs and only after that, we are
3483         * going to clear all tracking information for a clean start.
3484         */
3485
3486        trans = btrfs_join_transaction(fs_info->fs_root);
3487        if (IS_ERR(trans)) {
3488                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3489                return PTR_ERR(trans);
3490        }
3491        ret = btrfs_commit_transaction(trans);
3492        if (ret) {
3493                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3494                return ret;
3495        }
3496
3497        qgroup_rescan_zero_tracking(fs_info);
3498
3499        mutex_lock(&fs_info->qgroup_rescan_lock);
3500        fs_info->qgroup_rescan_running = true;
3501        btrfs_queue_work(fs_info->qgroup_rescan_workers,
3502                         &fs_info->qgroup_rescan_work);
3503        mutex_unlock(&fs_info->qgroup_rescan_lock);
3504
3505        return 0;
3506}
3507
3508int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3509                                     bool interruptible)
3510{
3511        int running;
3512        int ret = 0;
3513
3514        mutex_lock(&fs_info->qgroup_rescan_lock);
3515        running = fs_info->qgroup_rescan_running;
3516        mutex_unlock(&fs_info->qgroup_rescan_lock);
3517
3518        if (!running)
3519                return 0;
3520
3521        if (interruptible)
3522                ret = wait_for_completion_interruptible(
3523                                        &fs_info->qgroup_rescan_completion);
3524        else
3525                wait_for_completion(&fs_info->qgroup_rescan_completion);
3526
3527        return ret;
3528}
3529
3530/*
3531 * This is only called from open_ctree() where we're still single threaded,
3532 * thus locking is omitted here.
3533 */
3534void
3535btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3536{
3537        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3538                mutex_lock(&fs_info->qgroup_rescan_lock);
3539                fs_info->qgroup_rescan_running = true;
3540                btrfs_queue_work(fs_info->qgroup_rescan_workers,
3541                                 &fs_info->qgroup_rescan_work);
3542                mutex_unlock(&fs_info->qgroup_rescan_lock);
3543        }
3544}
3545
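    /*
     * Iterate an rbtree starting from @start while caching the next node, so
     * the current node can be removed safely inside the loop body.
     */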
3546#define rbtree_iterate_from_safe(node, next, start)                             \
3547       for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
3548
3549static int qgroup_unreserve_range(struct btrfs_inode *inode,
3550                                  struct extent_changeset *reserved, u64 start,
3551                                  u64 len)
3552{
3553        struct rb_node *node;
3554        struct rb_node *next;
3555        struct ulist_node *entry;
3556        int ret = 0;
3557
3558        node = reserved->range_changed.root.rb_node;
3559        if (!node)
3560                return 0;
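            /*
             * Walk down the rbtree to find the entry closest to @start, then
             * step back one node if needed so the iteration below starts at
             * or before @start.
             */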
3561        while (node) {
3562                entry = rb_entry(node, struct ulist_node, rb_node);
3563                if (entry->val < start)
3564                        node = node->rb_right;
3565                else
3566                        node = node->rb_left;
3567        }
3568
3569        if (entry->val > start && rb_prev(&entry->rb_node))
3570                entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3571                                 rb_node);
3572
3573        rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3574                u64 entry_start;
3575                u64 entry_end;
3576                u64 entry_len;
3577                int clear_ret;
3578
3579                entry = rb_entry(node, struct ulist_node, rb_node);
3580                entry_start = entry->val;
3581                entry_end = entry->aux;
3582                entry_len = entry_end - entry_start + 1;
3583
3584                if (entry_start >= start + len)
3585                        break;
3586                if (entry_start + entry_len <= start)
3587                        continue;
3588                /*
3589                 * Now the entry is in [start, start + len), revert the
3590                 * EXTENT_QGROUP_RESERVED bit.
3591                 */
3592                clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3593                                              entry_end, EXTENT_QGROUP_RESERVED);
3594                if (!ret && clear_ret < 0)
3595                        ret = clear_ret;
3596
3597                ulist_del(&reserved->range_changed, entry->val, entry->aux);
3598                if (likely(reserved->bytes_changed >= entry_len)) {
3599                        reserved->bytes_changed -= entry_len;
3600                } else {
3601                        WARN_ON(1);
3602                        reserved->bytes_changed = 0;
3603                }
3604        }
3605
3606        return ret;
3607}
3608
3609/*
3610 * Try to free some space for qgroup.
3611 *
3612 * For qgroup, there are only 3 ways to free qgroup space:
3613 * - Flush nodatacow write
3614 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
3615 *   In theory, we should only flush nodatacow inodes, but it's not yet
3616 *   possible, so we need to flush the whole root.
3617 *
3618 * - Wait for ordered extents
3619 *   When ordered extents are finished, their reserved metadata is finally
3620 *   converted to pertrans status, which can be freed by a later transaction
3621 *   commit.
3622 *
3623 * - Commit transaction
3624 *   This would free the meta_pertrans space.
3625 *   In theory this shouldn't provide much space, but any extra qgroup space
3626 *   helps.
3627 */
3628static int try_flush_qgroup(struct btrfs_root *root)
3629{
3630        struct btrfs_trans_handle *trans;
3631        int ret;
3632
3633        /* Can't hold an open transaction or we run the risk of deadlocking. */
3634        ASSERT(current->journal_info == NULL);
3635        if (WARN_ON(current->journal_info))
3636                return 0;
3637
3638        /*
3639         * We don't want to run the flush again and again, so if one is
3640         * already running, just wait for it to finish and exit directly.
3641         */
3642        if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3643                wait_event(root->qgroup_flush_wait,
3644                        !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3645                return 0;
3646        }
3647
3648        ret = btrfs_start_delalloc_snapshot(root, true);
3649        if (ret < 0)
3650                goto out;
3651        btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3652
3653        trans = btrfs_join_transaction(root);
3654        if (IS_ERR(trans)) {
3655                ret = PTR_ERR(trans);
3656                goto out;
3657        }
3658
3659        ret = btrfs_commit_transaction(trans);
3660out:
3661        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3662        wake_up(&root->qgroup_flush_wait);
3663        return ret;
3664}
3665
3666static int qgroup_reserve_data(struct btrfs_inode *inode,
3667                        struct extent_changeset **reserved_ret, u64 start,
3668                        u64 len)
3669{
3670        struct btrfs_root *root = inode->root;
3671        struct extent_changeset *reserved;
3672        bool new_reserved = false;
3673        u64 orig_reserved;
3674        u64 to_reserve;
3675        int ret;
3676
3677        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3678            !is_fstree(root->root_key.objectid) || len == 0)
3679                return 0;
3680
3681        /* @reserved parameter is mandatory for qgroup */
3682        if (WARN_ON(!reserved_ret))
3683                return -EINVAL;
3684        if (!*reserved_ret) {
3685                new_reserved = true;
3686                *reserved_ret = extent_changeset_alloc();
3687                if (!*reserved_ret)
3688                        return -ENOMEM;
3689        }
3690        reserved = *reserved_ret;
3691        /* Record already reserved space */
3692        orig_reserved = reserved->bytes_changed;
3693        ret = set_record_extent_bits(&inode->io_tree, start,
3694                        start + len -1, EXTENT_QGROUP_RESERVED, reserved);
3695
3696        /* Newly reserved space */
3697        to_reserve = reserved->bytes_changed - orig_reserved;
3698        trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
3699                                        to_reserve, QGROUP_RESERVE);
3700        if (ret < 0)
3701                goto out;
3702        ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3703        if (ret < 0)
3704                goto cleanup;
3705
3706        return ret;
3707
3708cleanup:
3709        qgroup_unreserve_range(inode, reserved, start, len);
3710out:
3711        if (new_reserved) {
3712                extent_changeset_free(reserved);
3713                *reserved_ret = NULL;
3714        }
3715        return ret;
3716}
3717
3718/*
3719 * Reserve qgroup space for range [start, start + len).
3720 *
3721 * This function will either reserve space from related qgroups or do nothing
3722 * if the range is already reserved.
3723 *
3724 * Return 0 for successful reservation
3725 * Return <0 for error (including -EDQUOT)
3726 *
3727 * NOTE: This function may sleep for memory allocation, dirty page flushing and
3728 * transaction commit. So the caller should not hold any dirty page locked.
3729 */
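    /*
     * Rough usage sketch (see also btrfs_qgroup_free_data() and
     * btrfs_qgroup_release_data() below):
     *
     *   struct extent_changeset *reserved = NULL;
     *
     *   ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
     *   ... on error or page invalidation:
     *   btrfs_qgroup_free_data(inode, reserved, start, len);
     *   ... once the data is written to disk:
     *   btrfs_qgroup_release_data(inode, start, len);
     */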
3730int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3731                        struct extent_changeset **reserved_ret, u64 start,
3732                        u64 len)
3733{
3734        int ret;
3735
3736        ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3737        if (ret <= 0 && ret != -EDQUOT)
3738                return ret;
3739
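            /*
             * We hit -EDQUOT, try to flush to free up some qgroup space and
             * retry the reservation once.
             */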
3740        ret = try_flush_qgroup(inode->root);
3741        if (ret < 0)
3742                return ret;
3743        return qgroup_reserve_data(inode, reserved_ret, start, len);
3744}
3745
3746/* Free ranges specified by @reserved, normally in the error path */
3747static int qgroup_free_reserved_data(struct btrfs_inode *inode,
3748                        struct extent_changeset *reserved, u64 start, u64 len)
3749{
3750        struct btrfs_root *root = inode->root;
3751        struct ulist_node *unode;
3752        struct ulist_iterator uiter;
3753        struct extent_changeset changeset;
3754        int freed = 0;
3755        int ret;
3756
3757        extent_changeset_init(&changeset);
3758        len = round_up(start + len, root->fs_info->sectorsize);
3759        start = round_down(start, root->fs_info->sectorsize);
3760
3761        ULIST_ITER_INIT(&uiter);
3762        while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3763                u64 range_start = unode->val;
3764                /* unode->aux is the inclusive end */
3765                u64 range_len = unode->aux - range_start + 1;
3766                u64 free_start;
3767                u64 free_len;
3768
3769                extent_changeset_release(&changeset);
3770
3771                /* Only free range in range [start, start + len) */
3772                if (range_start >= start + len ||
3773                    range_start + range_len <= start)
3774                        continue;
3775                free_start = max(range_start, start);
3776                free_len = min(start + len, range_start + range_len) -
3777                           free_start;
3778                /*
3779                 * TODO: Also modify reserved->ranges_reserved to reflect
3780                 * the modification.
3781                 *
3782                 * However, as long as we free qgroup reserved space according
3783                 * to EXTENT_QGROUP_RESERVED, we won't double free.
3784                 * So there is no need to rush.
3785                 */
3786                ret = clear_record_extent_bits(&inode->io_tree, free_start,
3787                                free_start + free_len - 1,
3788                                EXTENT_QGROUP_RESERVED, &changeset);
3789                if (ret < 0)
3790                        goto out;
3791                freed += changeset.bytes_changed;
3792        }
3793        btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3794                                  BTRFS_QGROUP_RSV_DATA);
3795        ret = freed;
3796out:
3797        extent_changeset_release(&changeset);
3798        return ret;
3799}
3800
3801static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
3802                        struct extent_changeset *reserved, u64 start, u64 len,
3803                        int free)
3804{
3805        struct extent_changeset changeset;
3806        int trace_op = QGROUP_RELEASE;
3807        int ret;
3808
3809        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
3810                return 0;
3811
3812        /* In release case, we shouldn't have @reserved */
3813        WARN_ON(!free && reserved);
3814        if (free && reserved)
3815                return qgroup_free_reserved_data(inode, reserved, start, len);
3816        extent_changeset_init(&changeset);
3817        ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
3818                                       EXTENT_QGROUP_RESERVED, &changeset);
3819        if (ret < 0)
3820                goto out;
3821
3822        if (free)
3823                trace_op = QGROUP_FREE;
3824        trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
3825                                        changeset.bytes_changed, trace_op);
3826        if (free)
3827                btrfs_qgroup_free_refroot(inode->root->fs_info,
3828                                inode->root->root_key.objectid,
3829                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3830        ret = changeset.bytes_changed;
3831out:
3832        extent_changeset_release(&changeset);
3833        return ret;
3834}
3835
3836/*
3837 * Free a reserved space range from io_tree and related qgroups
3838 *
3839 * Should be called when a range of pages gets invalidated before reaching
3840 * disk, or in the error cleanup case.
3841 * If @reserved is given, only the reserved range in [@start, @start + @len)
3842 * will be freed.
3843 *
3844 * For data written to disk, use btrfs_qgroup_release_data().
3845 *
3846 * NOTE: This function may sleep for memory allocation.
3847 */
3848int btrfs_qgroup_free_data(struct btrfs_inode *inode,
3849                        struct extent_changeset *reserved, u64 start, u64 len)
3850{
3851        return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3852}
3853
3854/*
3855 * Release a reserved space range from io_tree only.
3856 *
3857 * Should be called when a range of pages gets written to disk and the
3858 * corresponding FILE_EXTENT item is inserted into the corresponding root.
3859 *
3860 * Since the new qgroup accounting framework only updates qgroup numbers at
3861 * commit_transaction() time, its reserved space shouldn't be freed from
3862 * related qgroups.
3863 *
3864 * But we should release the range from the io_tree, to allow further writes
3865 * to be COWed.
3866 *
3867 * NOTE: This function may sleep for memory allocation.
3868 */
3869int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
3870{
3871        return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
3872}
3873
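    /*
     * Track per-root meta reservations (see sub_root_meta_rsv() below), so a
     * later free never exceeds what was actually recorded for the root.
     */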
3874static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3875                              enum btrfs_qgroup_rsv_type type)
3876{
3877        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3878            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3879                return;
3880        if (num_bytes == 0)
3881                return;
3882
3883        spin_lock(&root->qgroup_meta_rsv_lock);
3884        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
3885                root->qgroup_meta_rsv_prealloc += num_bytes;
3886        else
3887                root->qgroup_meta_rsv_pertrans += num_bytes;
3888        spin_unlock(&root->qgroup_meta_rsv_lock);
3889}
3890
3891static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3892                             enum btrfs_qgroup_rsv_type type)
3893{
3894        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3895            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3896                return 0;
3897        if (num_bytes == 0)
3898                return 0;
3899
3900        spin_lock(&root->qgroup_meta_rsv_lock);
3901        if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
3902                num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
3903                                  num_bytes);
3904                root->qgroup_meta_rsv_prealloc -= num_bytes;
3905        } else {
3906                num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
3907                                  num_bytes);
3908                root->qgroup_meta_rsv_pertrans -= num_bytes;
3909        }
3910        spin_unlock(&root->qgroup_meta_rsv_lock);
3911        return num_bytes;
3912}
3913
3914int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3915                              enum btrfs_qgroup_rsv_type type, bool enforce)
3916{
3917        struct btrfs_fs_info *fs_info = root->fs_info;
3918        int ret;
3919
3920        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3921            !is_fstree(root->root_key.objectid) || num_bytes == 0)
3922                return 0;
3923
3924        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3925        trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
3926        ret = qgroup_reserve(root, num_bytes, enforce, type);
3927        if (ret < 0)
3928                return ret;
3929        /*
3930         * Record what we have reserved into the root.
3931         *
3932         * This is to avoid a quota disabled->enabled underflow: in that
3933         * case we may try to free space we haven't reserved (since quota
3934         * was disabled), so by recording the reservation here we ensure a
3935         * later release won't underflow this number.
3936         */
3937        add_root_meta_rsv(root, num_bytes, type);
3938        return ret;
3939}
3940
3941int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3942                                enum btrfs_qgroup_rsv_type type, bool enforce)
3943{
3944        int ret;
3945
3946        ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3947        if (ret <= 0 && ret != -EDQUOT)
3948                return ret;
3949
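            /* We hit -EDQUOT, flush to reclaim qgroup space and retry once. */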
3950        ret = try_flush_qgroup(root);
3951        if (ret < 0)
3952                return ret;
3953        return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3954}
3955
3956void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
3957{
3958        struct btrfs_fs_info *fs_info = root->fs_info;
3959
3960        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3961            !is_fstree(root->root_key.objectid))
3962                return;
3963
3964        /* TODO: Update trace point to handle such free */
3965        trace_qgroup_meta_free_all_pertrans(root);
3966        /* Special value -1 means to free all reserved space */
3967        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
3968                                  BTRFS_QGROUP_RSV_META_PERTRANS);
3969}
3970
3971void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
3972                              enum btrfs_qgroup_rsv_type type)
3973{
3974        struct btrfs_fs_info *fs_info = root->fs_info;
3975
3976        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3977            !is_fstree(root->root_key.objectid))
3978                return;
3979
3980        /*
3981         * A reservation for META_PREALLOC can happen before quota is
3982         * enabled, which can lead to underflow.
3983         * Here ensure we only free what we really have reserved.
3984         */
3985        num_bytes = sub_root_meta_rsv(root, num_bytes, type);
3986        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3987        trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
3988        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
3989                                  num_bytes, type);
3990}
3991
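    /*
     * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS for
     * @ref_root and all of its ancestor qgroups.
     */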
3992static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
3993                                int num_bytes)
3994{
3995        struct btrfs_qgroup *qgroup;
3996        struct ulist_node *unode;
3997        struct ulist_iterator uiter;
3998        int ret = 0;
3999
4000        if (num_bytes == 0)
4001                return;
4002        if (!fs_info->quota_root)
4003                return;
4004
4005        spin_lock(&fs_info->qgroup_lock);
4006        qgroup = find_qgroup_rb(fs_info, ref_root);
4007        if (!qgroup)
4008                goto out;
4009        ulist_reinit(fs_info->qgroup_ulist);
4010        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
4011                       qgroup_to_aux(qgroup), GFP_ATOMIC);
4012        if (ret < 0)
4013                goto out;
4014        ULIST_ITER_INIT(&uiter);
4015        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
4016                struct btrfs_qgroup *qg;
4017                struct btrfs_qgroup_list *glist;
4018
4019                qg = unode_aux_to_qgroup(unode);
4020
4021                qgroup_rsv_release(fs_info, qg, num_bytes,
4022                                BTRFS_QGROUP_RSV_META_PREALLOC);
4023                qgroup_rsv_add(fs_info, qg, num_bytes,
4024                                BTRFS_QGROUP_RSV_META_PERTRANS);
4025                list_for_each_entry(glist, &qg->groups, next_group) {
4026                        ret = ulist_add(fs_info->qgroup_ulist,
4027                                        glist->group->qgroupid,
4028                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
4029                        if (ret < 0)
4030                                goto out;
4031                }
4032        }
4033out:
4034        spin_unlock(&fs_info->qgroup_lock);
4035}
4036
4037void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
4038{
4039        struct btrfs_fs_info *fs_info = root->fs_info;
4040
4041        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
4042            !is_fstree(root->root_key.objectid))
4043                return;
4044        /* Same as btrfs_qgroup_free_meta_prealloc() */
4045        num_bytes = sub_root_meta_rsv(root, num_bytes,
4046                                      BTRFS_QGROUP_RSV_META_PREALLOC);
4047        trace_qgroup_meta_convert(root, num_bytes);
4048        qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
4049}
4050
4051/*
4052 * Check for leaked qgroup reserved space, normally at inode destruction
4053 * time.
4054 */
4055void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
4056{
4057        struct extent_changeset changeset;
4058        struct ulist_node *unode;
4059        struct ulist_iterator iter;
4060        int ret;
4061
4062        extent_changeset_init(&changeset);
4063        ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
4064                        EXTENT_QGROUP_RESERVED, &changeset);
4065
4066        WARN_ON(ret < 0);
4067        if (WARN_ON(changeset.bytes_changed)) {
4068                ULIST_ITER_INIT(&iter);
4069                while ((unode = ulist_next(&changeset.range_changed, &iter))) {
4070                        btrfs_warn(inode->root->fs_info,
4071                "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4072                                btrfs_ino(inode), unode->val, unode->aux);
4073                }
4074                btrfs_qgroup_free_refroot(inode->root->fs_info,
4075                                inode->root->root_key.objectid,
4076                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
4077
4078        }
4079        extent_changeset_release(&changeset);
4080}
4081
4082void btrfs_qgroup_init_swapped_blocks(
4083        struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4084{
4085        int i;
4086
4087        spin_lock_init(&swapped_blocks->lock);
4088        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4089                swapped_blocks->blocks[i] = RB_ROOT;
4090        swapped_blocks->swapped = false;
4091}
4092
4093/*
4094 * Delete all swapped block records of @root.
4095 * Every record here means we skipped a full subtree scan for qgroup.
4096 *
4097 * Gets called when committing a transaction.
4098 */
4099void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4100{
4101        struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4102        int i;
4103
4104        swapped_blocks = &root->swapped_blocks;
4105
4106        spin_lock(&swapped_blocks->lock);
4107        if (!swapped_blocks->swapped)
4108                goto out;
4109        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4110                struct rb_root *cur_root = &swapped_blocks->blocks[i];
4111                struct btrfs_qgroup_swapped_block *entry;
4112                struct btrfs_qgroup_swapped_block *next;
4113
4114                rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4115                                                     node)
4116                        kfree(entry);
4117                swapped_blocks->blocks[i] = RB_ROOT;
4118        }
4119        swapped_blocks->swapped = false;
4120out:
4121        spin_unlock(&swapped_blocks->lock);
4122}
4123
4124/*
4125 * Add a subtree root record into @subvol_root.
4126 *
4127 * @subvol_root:        tree root of the subvolume tree that got swapped
4128 * @bg:                 block group under balance
4129 * @subvol_parent/slot: pointer to the subtree root in subvolume tree
4130 * @reloc_parent/slot:  pointer to the subtree root in reloc tree
4131 *                      BOTH POINTERS ARE BEFORE TREE SWAP
4132 * @last_snapshot:      last snapshot generation of the subvolume tree
4133 */
4134int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4135                struct btrfs_root *subvol_root,
4136                struct btrfs_block_group *bg,
4137                struct extent_buffer *subvol_parent, int subvol_slot,
4138                struct extent_buffer *reloc_parent, int reloc_slot,
4139                u64 last_snapshot)
4140{
4141        struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4142        struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4143        struct btrfs_qgroup_swapped_block *block;
4144        struct rb_node **cur;
4145        struct rb_node *parent = NULL;
4146        int level = btrfs_header_level(subvol_parent) - 1;
4147        int ret = 0;
4148
4149        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4150                return 0;
4151
4152        if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4153            btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4154                btrfs_err_rl(fs_info,
4155                "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4156                        __func__,
4157                        btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4158                        btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4159                return -EUCLEAN;
4160        }
4161
4162        block = kmalloc(sizeof(*block), GFP_NOFS);
4163        if (!block) {
4164                ret = -ENOMEM;
4165                goto out;
4166        }
4167
4168        /*
4169         * @reloc_parent/slot is still before swap, while @block is going to
4170         * record the bytenr after swap, so we do the swap here.
4171         */
4172        block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4173        block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4174                                                             reloc_slot);
4175        block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4176        block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4177                                                            subvol_slot);
4178        block->last_snapshot = last_snapshot;
4179        block->level = level;
4180
4181        /*
4182         * If we have bg == NULL, we're called from btrfs_recover_relocation()
4183         * and no one else can modify tree blocks, thus the qgroup numbers will
4184         * not change no matter the value of trace_leaf.
4185         */
4186        if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4187                block->trace_leaf = true;
4188        else
4189                block->trace_leaf = false;
4190        btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4191
4192        /* Insert @block into @blocks */
4193        spin_lock(&blocks->lock);
4194        cur = &blocks->blocks[level].rb_node;
4195        while (*cur) {
4196                struct btrfs_qgroup_swapped_block *entry;
4197
4198                parent = *cur;
4199                entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4200                                 node);
4201
4202                if (entry->subvol_bytenr < block->subvol_bytenr) {
4203                        cur = &(*cur)->rb_left;
4204                } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4205                        cur = &(*cur)->rb_right;
4206                } else {
4207                        if (entry->subvol_generation !=
4208                                        block->subvol_generation ||
4209                            entry->reloc_bytenr != block->reloc_bytenr ||
4210                            entry->reloc_generation !=
4211                                        block->reloc_generation) {
4212                                /*
4213                                 * A duplicate but mismatched entry was found.
4214                                 * Shouldn't happen.
4215                                 *
4216                                 * Marking the qgroup inconsistent should be
4217                                 * enough for end users.
4218                                 */
4219                                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4220                                ret = -EEXIST;
4221                        }
4222                        kfree(block);
4223                        goto out_unlock;
4224                }
4225        }
4226        rb_link_node(&block->node, parent, cur);
4227        rb_insert_color(&block->node, &blocks->blocks[level]);
4228        blocks->swapped = true;
4229out_unlock:
4230        spin_unlock(&blocks->lock);
4231out:
4232        if (ret < 0)
4233                fs_info->qgroup_flags |=
4234                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4235        return ret;
4236}
4237
4238/*
4239 * Check if the tree block is a subtree root, and if so, do the needed
4240 * delayed subtree tracing for qgroup.
4241 *
4242 * This is called during btrfs_cow_block().
4243 */
4244int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4245                                         struct btrfs_root *root,
4246                                         struct extent_buffer *subvol_eb)
4247{
4248        struct btrfs_fs_info *fs_info = root->fs_info;
4249        struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4250        struct btrfs_qgroup_swapped_block *block;
4251        struct extent_buffer *reloc_eb = NULL;
4252        struct rb_node *node;
4253        bool found = false;
4254        bool swapped = false;
4255        int level = btrfs_header_level(subvol_eb);
4256        int ret = 0;
4257        int i;
4258
4259        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4260                return 0;
4261        if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4262                return 0;
4263
4264        spin_lock(&blocks->lock);
4265        if (!blocks->swapped) {
4266                spin_unlock(&blocks->lock);
4267                return 0;
4268        }
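            /*
             * Search the swapped blocks tree at this level for a record
             * matching @subvol_eb's bytenr.
             */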
4269        node = blocks->blocks[level].rb_node;
4270
4271        while (node) {
4272                block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4273                if (block->subvol_bytenr < subvol_eb->start) {
4274                        node = node->rb_left;
4275                } else if (block->subvol_bytenr > subvol_eb->start) {
4276                        node = node->rb_right;
4277                } else {
4278                        found = true;
4279                        break;
4280                }
4281        }
4282        if (!found) {
4283                spin_unlock(&blocks->lock);
4284                goto out;
4285        }
4286        /* Found one, remove it from @blocks first and update blocks->swapped */
4287        rb_erase(&block->node, &blocks->blocks[level]);
4288        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4289                if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4290                        swapped = true;
4291                        break;
4292                }
4293        }
4294        blocks->swapped = swapped;
4295        spin_unlock(&blocks->lock);
4296
4297        /* Read out reloc subtree root */
4298        reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
4299                                   block->reloc_generation, block->level,
4300                                   &block->first_key);
4301        if (IS_ERR(reloc_eb)) {
4302                ret = PTR_ERR(reloc_eb);
4303                reloc_eb = NULL;
4304                goto free_out;
4305        }
4306        if (!extent_buffer_uptodate(reloc_eb)) {
4307                ret = -EIO;
4308                goto free_out;
4309        }
4310
4311        ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4312                        block->last_snapshot, block->trace_leaf);
4313free_out:
4314        kfree(block);
4315        free_extent_buffer(reloc_eb);
4316out:
4317        if (ret < 0) {
4318                btrfs_err_rl(fs_info,
4319                             "failed to account subtree at bytenr %llu: %d",
4320                             subvol_eb->start, ret);
4321                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4322        }
4323        return ret;
4324}
4325
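    /*
     * Free all qgroup extent records (and their old_roots ulists) still queued
     * on the transaction's delayed refs, e.g. when a transaction is cleaned up
     * after an error.
     */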
4326void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4327{
4328        struct btrfs_qgroup_extent_record *entry;
4329        struct btrfs_qgroup_extent_record *next;
4330        struct rb_root *root;
4331
4332        root = &trans->delayed_refs.dirty_extent_root;
4333        rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4334                ulist_free(entry->old_roots);
4335                kfree(entry);
4336        }
4337}
4338