linux/fs/btrfs/qgroup.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"

/* TODO XXX FIXME
 *  - subvol delete -> delete when ref goes to 0? delete limits also?
 *  - reorganize keys
 *  - compressed
 *  - sync
 *  - copy also limits on subvol creation
 *  - limit
 *  - caches for ulists
 *  - performance benchmarks
 *  - check all ioctl parameters
 */

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */

static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
        u64 ret = 0;
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                ret += qgroup->rsv.values[i];

        return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
        if (type == BTRFS_QGROUP_RSV_DATA)
                return "data";
        if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
                return "meta_pertrans";
        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
                return "meta_prealloc";
        return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
                           struct btrfs_qgroup *qgroup, u64 num_bytes,
                           enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
        qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
                               struct btrfs_qgroup *qgroup, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
        if (qgroup->rsv.values[type] >= num_bytes) {
                qgroup->rsv.values[type] -= num_bytes;
                return;
        }
#ifdef CONFIG_BTRFS_DEBUG
        WARN_RATELIMIT(1,
                "qgroup %llu %s reserved space underflow, have %llu to free %llu",
                qgroup->qgroupid, qgroup_rsv_type_str(type),
                qgroup->rsv.values[type], num_bytes);
#endif
        qgroup->rsv.values[type] = 0;
}

static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
                                     struct btrfs_qgroup *dest,
                                     struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
                                         struct btrfs_qgroup *dest,
                                         struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}

static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->old_refcnt < seq)
                qg->old_refcnt = seq;
        qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->new_refcnt < seq)
                qg->new_refcnt = seq;
        qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        if (qg->old_refcnt < seq)
                return 0;
        return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        if (qg->new_refcnt < seq)
                return 0;
        return qg->new_refcnt - seq;
}
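
/*
 * An illustrative sketch (hypothetical values): the seq trick above lets
 * an accounting pass skip resetting every qgroup between passes.  With
 * the current pass's sequence number seq == 100 and a qgroup untouched
 * since an older pass (qg->old_refcnt == 42 < seq):
 *
 *	btrfs_qgroup_update_old_refcnt(qg, 100, 1);	// old_refcnt = 101
 *	btrfs_qgroup_get_old_refcnt(qg, 100);		// returns 1
 *
 * Any refcnt below the current seq reads back as 0, so stale counts from
 * previous passes never leak into the current one.
 */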

/*
 * glue structure to represent the relations between qgroups.
 */
struct btrfs_qgroup_list {
        struct list_head next_group;
        struct list_head next_member;
        struct btrfs_qgroup *group;
        struct btrfs_qgroup *member;
};
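
/*
 * An illustrative sketch (hypothetical ids): making qgroup 0/257 a member
 * of qgroup 1/100 allocates exactly one btrfs_qgroup_list node that sits
 * on two lists at once, as add_relation_rb() below does:
 *
 *	list->group = parent;				// 1/100
 *	list->member = member;				// 0/257
 *	list_add_tail(&list->next_group, &member->groups);
 *	list_add_tail(&list->next_member, &parent->members);
 *
 * Walking member->groups then yields all parents of 0/257, and walking
 * parent->members yields all members of 1/100.
 */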

static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
        return (u64)(uintptr_t)qg;
}

static inline struct btrfs_qgroup *unode_aux_to_qgroup(struct ulist_node *n)
{
        return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}

static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
                                           u64 qgroupid)
{
        struct rb_node *n = fs_info->qgroup_tree.rb_node;
        struct btrfs_qgroup *qgroup;

        while (n) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                if (qgroup->qgroupid < qgroupid)
                        n = n->rb_left;
                else if (qgroup->qgroupid > qgroupid)
                        n = n->rb_right;
                else
                        return qgroup;
        }
        return NULL;
}
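
/*
 * Note the comparison direction in find_qgroup_rb(): larger qgroupids sit
 * in the left subtree, so an in-order walk (rb_first()/rb_next()) visits
 * qgroups in descending qgroupid order.  add_qgroup_rb() below uses the
 * same convention, which is what keeps lookups and insertions consistent.
 */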

/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
                                          u64 qgroupid)
{
        struct rb_node **p = &fs_info->qgroup_tree.rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_qgroup *qgroup;

        while (*p) {
                parent = *p;
                qgroup = rb_entry(parent, struct btrfs_qgroup, node);

                if (qgroup->qgroupid < qgroupid)
                        p = &(*p)->rb_left;
                else if (qgroup->qgroupid > qgroupid)
                        p = &(*p)->rb_right;
                else
                        return qgroup;
        }

        qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
        if (!qgroup)
                return ERR_PTR(-ENOMEM);

        qgroup->qgroupid = qgroupid;
        INIT_LIST_HEAD(&qgroup->groups);
        INIT_LIST_HEAD(&qgroup->members);
        INIT_LIST_HEAD(&qgroup->dirty);

        rb_link_node(&qgroup->node, parent, p);
        rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

        return qgroup;
}

static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
                            struct btrfs_qgroup *qgroup)
{
        struct btrfs_qgroup_list *list;

        btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
        list_del(&qgroup->dirty);
        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
                                        struct btrfs_qgroup_list, next_group);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }

        while (!list_empty(&qgroup->members)) {
                list = list_first_entry(&qgroup->members,
                                        struct btrfs_qgroup_list, next_member);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }
        kfree(qgroup);
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
        struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

        if (!qgroup)
                return -ENOENT;

        rb_erase(&qgroup->node, &fs_info->qgroup_tree);
        __del_qgroup_rb(fs_info, qgroup);
        return 0;
}

/* must be called with qgroup_lock held */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *member;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup_list *list;

        member = find_qgroup_rb(fs_info, memberid);
        parent = find_qgroup_rb(fs_info, parentid);
        if (!member || !parent)
                return -ENOENT;

        list = kzalloc(sizeof(*list), GFP_ATOMIC);
        if (!list)
                return -ENOMEM;

        list->group = parent;
        list->member = member;
        list_add_tail(&list->next_group, &member->groups);
        list_add_tail(&list->next_member, &parent->members);

        return 0;
}

/* must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *member;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup_list *list;

        member = find_qgroup_rb(fs_info, memberid);
        parent = find_qgroup_rb(fs_info, parentid);
        if (!member || !parent)
                return -ENOENT;

        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        list_del(&list->next_group);
                        list_del(&list->next_member);
                        kfree(list);
                        return 0;
                }
        }
        return -ENOENT;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
                               u64 rfer, u64 excl)
{
        struct btrfs_qgroup *qgroup;

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup)
                return -EINVAL;
        if (qgroup->rfer != rfer || qgroup->excl != excl)
                return -EINVAL;
        return 0;
}
#endif

/*
 * The full config is read in one go, only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path = NULL;
        struct extent_buffer *l;
        int slot;
        int ret = 0;
        u64 flags = 0;
        u64 rescan_progress = 0;

        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return 0;

        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
        if (!fs_info->qgroup_ulist) {
                ret = -ENOMEM;
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;
        /* default this to quota off, in case no status key is found */
        fs_info->qgroup_flags = 0;

        /*
         * pass 1: read status, all qgroup infos and limits
         */
        key.objectid = 0;
        key.type = 0;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
        if (ret)
                goto out;

        while (1) {
                struct btrfs_qgroup *qgroup;

                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
                        struct btrfs_qgroup_status_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_status_item);

                        if (btrfs_qgroup_status_version(l, ptr) !=
                            BTRFS_QGROUP_STATUS_VERSION) {
                                btrfs_err(fs_info,
                                 "old qgroup version, quota disabled");
                                goto out;
                        }
                        if (btrfs_qgroup_status_generation(l, ptr) !=
                            fs_info->generation) {
                                flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                                btrfs_err(fs_info,
                                        "qgroup generation mismatch, marked as inconsistent");
                        }
                        fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
                                                                          ptr);
                        rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
                        goto next1;
                }

                if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
                    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
                        goto next1;

                qgroup = find_qgroup_rb(fs_info, found_key.offset);
                if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
                    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
                        btrfs_err(fs_info, "inconsistent qgroup config");
                        flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                }
                if (!qgroup) {
                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
                        if (IS_ERR(qgroup)) {
                                ret = PTR_ERR(qgroup);
                                goto out;
                        }
                }
                ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
                if (ret < 0)
                        goto out;

                switch (found_key.type) {
                case BTRFS_QGROUP_INFO_KEY: {
                        struct btrfs_qgroup_info_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_info_item);
                        qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
                        qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
                        qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
                        qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
                        /* generation currently unused */
                        break;
                }
                case BTRFS_QGROUP_LIMIT_KEY: {
                        struct btrfs_qgroup_limit_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_limit_item);
                        qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
                        qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
                        qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
                        qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
                        qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
                        break;
                }
                }
next1:
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
        btrfs_release_path(path);

        /*
         * pass 2: read all qgroup relations
         */
        key.objectid = 0;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
        if (ret)
                goto out;
        while (1) {
                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
                        goto next2;

                if (found_key.objectid > found_key.offset) {
                        /* parent <- member, not needed to build config */
                        /* FIXME should we omit the key completely? */
                        goto next2;
                }

                ret = add_relation_rb(fs_info, found_key.objectid,
                                      found_key.offset);
                if (ret == -ENOENT) {
                        btrfs_warn(fs_info,
                                "orphan qgroup relation 0x%llx->0x%llx",
                                found_key.objectid, found_key.offset);
                        ret = 0;        /* ignore the error */
                }
                if (ret)
                        goto out;
next2:
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
out:
        btrfs_free_path(path);
        fs_info->qgroup_flags |= flags;
        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
                clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
                 ret >= 0)
                ret = qgroup_rescan_init(fs_info, rescan_progress, 0);

        if (ret < 0) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
                btrfs_sysfs_del_qgroups(fs_info);
        }

        return ret < 0 ? ret : 0;
}
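
/*
 * For reference, the quota tree items read by the two passes above use
 * the following key layout (all in fs_info->quota_root):
 *
 *	(0,   BTRFS_QGROUP_STATUS_KEY,   0)		global status item
 *	(0,   BTRFS_QGROUP_INFO_KEY,     qgroupid)	usage counters
 *	(0,   BTRFS_QGROUP_LIMIT_KEY,    qgroupid)	configured limits
 *	(src, BTRFS_QGROUP_RELATION_KEY, dst)		one relation direction
 *
 * Relations are stored once per direction, which is why pass 2 skips the
 * keys with objectid > offset.
 */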

/*
 * Called in close_ctree() when quota is still enabled.  This verifies we
 * don't leak any reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
        struct rb_node *node;
        bool ret = false;

        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return ret;
        /*
         * Since we're unmounting, there is no race and no need to grab the
         * qgroup lock.  And here we don't go post-order, to provide a more
         * user-friendly sorted result.
         */
        for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
                struct btrfs_qgroup *qgroup;
                int i;

                qgroup = rb_entry(node, struct btrfs_qgroup, node);
                for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
                        if (qgroup->rsv.values[i]) {
                                ret = true;
                                btrfs_warn(fs_info,
                "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
                                   btrfs_qgroup_level(qgroup->qgroupid),
                                   btrfs_qgroup_subvolid(qgroup->qgroupid),
                                   i, qgroup->rsv.values[i]);
                        }
                }
        }
        return ret;
}

/*
 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable();
 * the first two are in single-threaded paths. And for the third one, we have
 * set quota_root to be null with qgroup_lock held before, so it is safe to
 * clean up the in-memory structures without qgroup_lock held.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct rb_node *n;
        struct btrfs_qgroup *qgroup;

        while ((n = rb_first(&fs_info->qgroup_tree))) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                rb_erase(n, &fs_info->qgroup_tree);
                __del_qgroup_rb(fs_info, qgroup);
        }
        /*
         * We call btrfs_free_qgroup_config() when unmounting
         * filesystem and disabling quota, so we set qgroup_ulist
         * to be null here to avoid double free.
         */
        ulist_free(fs_info->qgroup_ulist);
        fs_info->qgroup_ulist = NULL;
        btrfs_sysfs_del_qgroups(fs_info);
}

static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = src;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = dst;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

        btrfs_mark_buffer_dirty(path->nodes[0]);

        btrfs_free_path(path);
        return ret;
}
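
/*
 * Note that btrfs_add_qgroup_relation() below calls this twice, once as
 * (src, dst) and once as (dst, src), so every relation is persisted in
 * both directions; __del_qgroup_relation() likewise removes both items.
 */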

static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = src;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = dst;

        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);
out:
        btrfs_free_path(path);
        return ret;
}

static int add_qgroup_item(struct btrfs_trans_handle *trans,
                           struct btrfs_root *quota_root, u64 qgroupid)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_qgroup_info_item *qgroup_info;
        struct btrfs_qgroup_limit_item *qgroup_limit;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        if (btrfs_is_testing(quota_root->fs_info))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroupid;

        /*
         * Avoid a transaction abort by catching -EEXIST here. In that
         * case, we proceed by re-initializing the existing structure
         * on disk.
         */

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*qgroup_info));
        if (ret && ret != -EEXIST)
                goto out;

        leaf = path->nodes[0];
        qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
                                 struct btrfs_qgroup_info_item);
        btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
        btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

        btrfs_mark_buffer_dirty(leaf);

        btrfs_release_path(path);

        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*qgroup_limit));
        if (ret && ret != -EEXIST)
                goto out;

        leaf = path->nodes[0];
        qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
                                  struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroupid;
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);
        if (ret)
                goto out;

        btrfs_release_path(path);

        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
                                    struct btrfs_qgroup *qgroup)
{
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_limit_item *qgroup_limit;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_LIMIT_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
        btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
        btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
        btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
        btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_qgroup *qgroup)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_info_item *qgroup_info;
        int ret;
        int slot;

        if (btrfs_is_testing(fs_info))
                return 0;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
        btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
        btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
        btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
        btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
        btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_status_item *ptr;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
        btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
        btrfs_set_qgroup_status_rescan(l, ptr,
                                fs_info->qgroup_rescan_progress.objectid);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *leaf = NULL;
        int ret;
        int nr = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->leave_spinning = 1;

        key.objectid = 0;
        key.offset = 0;
        key.type = 0;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        goto out;
                leaf = path->nodes[0];
                nr = btrfs_header_nritems(leaf);
                if (!nr)
                        break;
                /*
                 * Delete the leaves one by one, since the whole tree
                 * is going to be deleted.
                 */
                path->slots[0] = 0;
                ret = btrfs_del_items(trans, root, path, 0, nr);
                if (ret)
                        goto out;

                btrfs_release_path(path);
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *quota_root;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_path *path = NULL;
        struct btrfs_qgroup_status_item *ptr;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_qgroup *qgroup = NULL;
        struct btrfs_trans_handle *trans = NULL;
        struct ulist *ulist = NULL;
        int ret = 0;
        int slot;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (fs_info->quota_root)
                goto out;

        ulist = ulist_alloc(GFP_KERNEL);
        if (!ulist) {
                ret = -ENOMEM;
                goto out;
        }

        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;

        /*
         * Unlock qgroup_ioctl_lock before starting the transaction. This is to
         * avoid lock acquisition inversion problems (reported by lockdep) between
         * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
         * start a transaction.
         * After we started the transaction lock qgroup_ioctl_lock again and
         * check if someone else created the quota root in the meanwhile. If so,
         * just return success and release the transaction handle.
         *
         * Also we don't need to worry about someone else calling
         * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
         * that function returns 0 (success) when the sysfs entries already exist.
         */
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        /*
         * 1 for the quota root item
         * 1 for the BTRFS_QGROUP_STATUS item
         *
         * Yet we also need 2*n items for QGROUP_INFO/QGROUP_LIMIT items
         * per subvolume. However those are not currently reserved since
         * that would be a lot of overkill.
         */
        trans = btrfs_start_transaction(tree_root, 2);

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }

        if (fs_info->quota_root)
                goto out;

        fs_info->qgroup_ulist = ulist;
        ulist = NULL;

        /*
         * initially create the quota tree
         */
        quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
        if (IS_ERR(quota_root)) {
                ret = PTR_ERR(quota_root);
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                btrfs_abort_transaction(trans, ret);
                goto out_free_root;
        }

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*ptr));
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        leaf = path->nodes[0];
        ptr = btrfs_item_ptr(leaf, path->slots[0],
                                 struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
        btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
        fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
                                BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
        btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

        btrfs_mark_buffer_dirty(leaf);

        key.objectid = 0;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = 0;

        btrfs_release_path(path);
        ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
        if (ret > 0)
                goto out_add_root;
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        while (1) {
                slot = path->slots[0];
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, slot);

                if (found_key.type == BTRFS_ROOT_REF_KEY) {

                        /* Release locks on tree_root before we access quota_root */
                        btrfs_release_path(path);

                        ret = add_qgroup_item(trans, quota_root,
                                              found_key.offset);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }

                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
                        if (IS_ERR(qgroup)) {
                                ret = PTR_ERR(qgroup);
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        ret = btrfs_search_slot_for_read(tree_root, &found_key,
                                                         path, 1, 0);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        if (ret > 0) {
                                /*
                                 * Shouldn't happen, but in case it does we
                                 * don't need to do the btrfs_next_item, just
                                 * continue.
                                 */
                                continue;
                        }
                }
                ret = btrfs_next_item(tree_root, path);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_free_path;
                }
                if (ret)
                        break;
        }

out_add_root:
        btrfs_release_path(path);
        ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
        if (IS_ERR(qgroup)) {
                ret = PTR_ERR(qgroup);
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }
        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        ret = btrfs_commit_transaction(trans);
        trans = NULL;
        if (ret)
                goto out_free_path;

        /*
         * Set quota enabled flag after committing the transaction, to avoid
         * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
         * creation.
         */
        spin_lock(&fs_info->qgroup_lock);
        fs_info->quota_root = quota_root;
        set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        spin_unlock(&fs_info->qgroup_lock);

        ret = qgroup_rescan_init(fs_info, 0, 1);
        if (!ret) {
                qgroup_rescan_zero_tracking(fs_info);
                fs_info->qgroup_rescan_running = true;
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
        }

out_free_path:
        btrfs_free_path(path);
out_free_root:
        if (ret)
                btrfs_put_root(quota_root);
out:
        if (ret) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                btrfs_sysfs_del_qgroups(fs_info);
        }
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (ret && trans)
                btrfs_end_transaction(trans);
        else if (trans)
                ret = btrfs_end_transaction(trans);
        ulist_free(ulist);
        return ret;
}
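
/*
 * A minimal userspace sketch of how this path is reached (error handling
 * omitted; 'fd' is assumed to be an open descriptor on the filesystem):
 *
 *	struct btrfs_ioctl_quota_ctl_args args = {
 *		.cmd = BTRFS_QUOTA_CTL_ENABLE,	// or BTRFS_QUOTA_CTL_DISABLE
 *	};
 *
 *	ioctl(fd, BTRFS_IOC_QUOTA_CTL, &args);
 */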

int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *quota_root;
        struct btrfs_trans_handle *trans = NULL;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root)
                goto out;
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        /*
         * 1 For the root item
         *
         * We should also reserve enough items for the quota tree deletion in
         * btrfs_clean_quota_tree but this is not done.
         *
         * Also, we must always start a transaction without holding the mutex
         * qgroup_ioctl_lock, see btrfs_quota_enable().
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 1);

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }

        if (!fs_info->quota_root)
                goto out;

        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        btrfs_qgroup_wait_for_completion(fs_info, false);
        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
        spin_unlock(&fs_info->qgroup_lock);

        btrfs_free_qgroup_config(fs_info);

        ret = btrfs_clean_quota_tree(trans, quota_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = btrfs_del_root(trans, &quota_root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        list_del(&quota_root->dirty_list);

        btrfs_tree_lock(quota_root->node);
        btrfs_clean_tree_block(quota_root->node);
        btrfs_tree_unlock(quota_root->node);
        btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);

        btrfs_put_root(quota_root);

out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (ret && trans)
                btrfs_end_transaction(trans);
        else if (trans)
                ret = btrfs_end_transaction(trans);

        return ret;
}

static void qgroup_dirty(struct btrfs_fs_info *fs_info,
                         struct btrfs_qgroup *qgroup)
{
        if (list_empty(&qgroup->dirty))
                list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

/*
 * The easy accounting: we're updating a qgroup relationship where the child
 * qgroup only has exclusive extents.
 *
 * In this case, all exclusive extents are also exclusive for the parent, so
 * excl/rfer just get added/removed.
 *
 * The same applies to the qgroup reservation space, which must also be
 * added to/removed from the parent. Otherwise, when the child releases its
 * reservation space, the parent would underflow its reservation (in the
 * relationship-adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                                    struct ulist *tmp, u64 ref_root,
                                    struct btrfs_qgroup *src, int sign)
{
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup_list *glist;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        u64 num_bytes = src->excl;
        int ret = 0;

        qgroup = find_qgroup_rb(fs_info, ref_root);
        if (!qgroup)
                goto out;

        qgroup->rfer += sign * num_bytes;
        qgroup->rfer_cmpr += sign * num_bytes;

        WARN_ON(sign < 0 && qgroup->excl < num_bytes);
        qgroup->excl += sign * num_bytes;
        qgroup->excl_cmpr += sign * num_bytes;

        if (sign > 0)
                qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
        else
                qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

        qgroup_dirty(fs_info, qgroup);

        /* Get all of the parent groups that contain this qgroup */
        list_for_each_entry(glist, &qgroup->groups, next_group) {
                ret = ulist_add(tmp, glist->group->qgroupid,
                                qgroup_to_aux(glist->group), GFP_ATOMIC);
                if (ret < 0)
                        goto out;
        }

        /* Iterate all of the parents and adjust their reference counts */
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(tmp, &uiter))) {
                qgroup = unode_aux_to_qgroup(unode);
                qgroup->rfer += sign * num_bytes;
                qgroup->rfer_cmpr += sign * num_bytes;
                WARN_ON(sign < 0 && qgroup->excl < num_bytes);
                qgroup->excl += sign * num_bytes;
                if (sign > 0)
                        qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
                else
                        qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
                qgroup->excl_cmpr += sign * num_bytes;
                qgroup_dirty(fs_info, qgroup);

                /* Add any parents of the parents */
                list_for_each_entry(glist, &qgroup->groups, next_group) {
                        ret = ulist_add(tmp, glist->group->qgroupid,
                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
                        if (ret < 0)
                                goto out;
                }
        }
        ret = 0;
out:
        return ret;
}
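
/*
 * A worked example with hypothetical numbers: child qgroup 0/257 has
 * excl == rfer == 1 MiB and is being added (sign == +1) under parent
 * 1/100.  The parent's rfer/excl each grow by 1 MiB, the child's
 * per-type reservations are duplicated into the parent, and the same
 * +1 MiB then ripples up to every ancestor of 1/100 collected in 'tmp'.
 * On relation removal (sign == -1) the same amounts are subtracted.
 */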

/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, just updating all parents will be enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for a quick update; return >0 if a full rescan is needed
 * (and mark the INCONSISTENT flag).
 * Return < 0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
                                   struct ulist *tmp, u64 src, u64 dst,
                                   int sign)
{
        struct btrfs_qgroup *qgroup;
        int ret = 1;
        int err = 0;

        qgroup = find_qgroup_rb(fs_info, src);
        if (!qgroup)
                goto out;
        if (qgroup->excl == qgroup->rfer) {
                ret = 0;
                err = __qgroup_excl_accounting(fs_info, tmp, dst,
                                               qgroup, sign);
                if (err < 0) {
                        ret = err;
                        goto out;
                }
        }
out:
        if (ret)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        return ret;
}

int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                              u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        unsigned int nofs_flag;
        int ret = 0;

        /* Check the level of src and dst first */
        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                return -EINVAL;

        /* We hold a transaction handle open, must do a NOFS allocation. */
        nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }
        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        if (!member || !parent) {
                ret = -EINVAL;
                goto out;
        }

        /* First check whether such a qgroup relation already exists */
        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        ret = -EEXIST;
                        goto out;
                }
        }

        ret = add_qgroup_relation_item(trans, src, dst);
        if (ret)
                goto out;

        ret = add_qgroup_relation_item(trans, dst, src);
        if (ret) {
                del_qgroup_relation_item(trans, src, dst);
                goto out;
        }

        spin_lock(&fs_info->qgroup_lock);
        ret = add_relation_rb(fs_info, src, dst);
        if (ret < 0) {
                spin_unlock(&fs_info->qgroup_lock);
                goto out;
        }
        ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
        spin_unlock(&fs_info->qgroup_lock);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        ulist_free(tmp);
        return ret;
}
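
/*
 * A minimal userspace sketch (hypothetical ids, error handling omitted):
 * relations are created or removed through BTRFS_IOC_QGROUP_ASSIGN:
 *
 *	struct btrfs_ioctl_qgroup_assign_args args = {
 *		.assign = 1,			// 0 removes the relation
 *		.src = 257,			// member qgroup 0/257
 *		.dst = (1ULL << 48) | 100,	// parent qgroup 1/100
 *	};
 *
 *	ioctl(fd, BTRFS_IOC_QGROUP_ASSIGN, &args);
 */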

static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                                 u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        bool found = false;
        unsigned int nofs_flag;
        int ret = 0;
        int ret2;

        /* We hold a transaction handle open, must do a NOFS allocation. */
        nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;

        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }

        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        /*
         * If the parent/member pair doesn't exist, then only try to delete
         * the stale relation items.
         */
        if (!member || !parent)
                goto delete_item;

        /* First check whether such a qgroup relation actually exists */
        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        found = true;
                        break;
                }
        }

delete_item:
        ret = del_qgroup_relation_item(trans, src, dst);
        if (ret < 0 && ret != -ENOENT)
                goto out;
        ret2 = del_qgroup_relation_item(trans, dst, src);
        if (ret2 < 0 && ret2 != -ENOENT)
                goto out;

        /* At least one deletion succeeded, return 0 */
        if (!ret || !ret2)
                ret = 0;

        if (found) {
                spin_lock(&fs_info->qgroup_lock);
                del_relation_rb(fs_info, src, dst);
                ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
                spin_unlock(&fs_info->qgroup_lock);
        }
out:
        ulist_free(tmp);
        return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                              u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        ret = __del_qgroup_relation(trans, src, dst);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }
        quota_root = fs_info->quota_root;
        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (qgroup) {
                ret = -EEXIST;
                goto out;
        }

        ret = add_qgroup_item(trans, quota_root, qgroupid);
        if (ret)
                goto out;

        spin_lock(&fs_info->qgroup_lock);
        qgroup = add_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);

        if (IS_ERR(qgroup)) {
                ret = PTR_ERR(qgroup);
                goto out;
        }
        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}
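
/*
 * The matching userspace entry point is BTRFS_IOC_QGROUP_CREATE; a
 * minimal sketch (hypothetical id, error handling omitted):
 *
 *	struct btrfs_ioctl_qgroup_create_args args = {
 *		.create = 1,			// 0 removes the qgroup instead
 *		.qgroupid = (1ULL << 48) | 100,	// create qgroup 1/100
 *	};
 *
 *	ioctl(fd, BTRFS_IOC_QGROUP_CREATE, &args);
 */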

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup_list *list;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup) {
                ret = -ENOENT;
                goto out;
        }

        /* Check if there are no children of this qgroup */
        if (!list_empty(&qgroup->members)) {
                ret = -EBUSY;
                goto out;
        }

        ret = del_qgroup_item(trans, qgroupid);
        if (ret && ret != -ENOENT)
                goto out;

        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
                                        struct btrfs_qgroup_list, next_group);
                ret = __del_qgroup_relation(trans, qgroupid,
                                            list->group->qgroupid);
                if (ret)
                        goto out;
        }

        spin_lock(&fs_info->qgroup_lock);
        del_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
                       struct btrfs_qgroup_limit *limit)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *qgroup;
        int ret = 0;
1594        /* Sometimes we want to clear the limit on this qgroup.  To meet
1595         * this requirement, we treat -1 as a special value which tells
1596         * the kernel to clear the limit on this qgroup.
1597         */
1598        const u64 CLEAR_VALUE = -1;
1599
1600        mutex_lock(&fs_info->qgroup_ioctl_lock);
1601        if (!fs_info->quota_root) {
1602                ret = -ENOTCONN;
1603                goto out;
1604        }
1605
1606        qgroup = find_qgroup_rb(fs_info, qgroupid);
1607        if (!qgroup) {
1608                ret = -ENOENT;
1609                goto out;
1610        }
1611
1612        spin_lock(&fs_info->qgroup_lock);
1613        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
1614                if (limit->max_rfer == CLEAR_VALUE) {
1615                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1616                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1617                        qgroup->max_rfer = 0;
1618                } else {
1619                        qgroup->max_rfer = limit->max_rfer;
1620                }
1621        }
1622        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
1623                if (limit->max_excl == CLEAR_VALUE) {
1624                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1625                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1626                        qgroup->max_excl = 0;
1627                } else {
1628                        qgroup->max_excl = limit->max_excl;
1629                }
1630        }
1631        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
1632                if (limit->rsv_rfer == CLEAR_VALUE) {
1633                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1634                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1635                        qgroup->rsv_rfer = 0;
1636                } else {
1637                        qgroup->rsv_rfer = limit->rsv_rfer;
1638                }
1639        }
1640        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
1641                if (limit->rsv_excl == CLEAR_VALUE) {
1642                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1643                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1644                        qgroup->rsv_excl = 0;
1645                } else {
1646                        qgroup->rsv_excl = limit->rsv_excl;
1647                }
1648        }
1649        qgroup->lim_flags |= limit->flags;
1650
1651        spin_unlock(&fs_info->qgroup_lock);
1652
1653        ret = update_qgroup_limit_item(trans, qgroup);
1654        if (ret) {
1655                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1656                btrfs_info(fs_info, "unable to update quota limit for %llu",
1657                       qgroupid);
1658        }
1659
1660out:
1661        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1662        return ret;
1663}
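
/*
 * Usage sketch (an illustration, not part of the original file): clearing
 * a previously configured max_rfer limit via the special -1 CLEAR_VALUE
 * handled above.  Assumes a valid transaction handle.
 */
static inline int qgroup_example_clear_max_rfer(struct btrfs_trans_handle *trans,
                                                u64 qgroupid)
{
        struct btrfs_qgroup_limit lim = {
                .flags = BTRFS_QGROUP_LIMIT_MAX_RFER,
                .max_rfer = (u64)-1,    /* CLEAR_VALUE: drop the limit */
        };

        return btrfs_limit_qgroup(trans, qgroupid, &lim);
}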
1664
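/*
 * Insert @record into the per-transaction dirty extent record tree.
 *
 * Return 0 if @record was inserted (the tree takes ownership of it), or
 * 1 if a record for the same bytenr already exists, in which case the
 * caller must free @record (see btrfs_qgroup_trace_extent() below).
 */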
1665int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1666                                struct btrfs_delayed_ref_root *delayed_refs,
1667                                struct btrfs_qgroup_extent_record *record)
1668{
1669        struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1670        struct rb_node *parent_node = NULL;
1671        struct btrfs_qgroup_extent_record *entry;
1672        u64 bytenr = record->bytenr;
1673
1674        lockdep_assert_held(&delayed_refs->lock);
1675        trace_btrfs_qgroup_trace_extent(fs_info, record);
1676
1677        while (*p) {
1678                parent_node = *p;
1679                entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1680                                 node);
1681                if (bytenr < entry->bytenr) {
1682                        p = &(*p)->rb_left;
1683                } else if (bytenr > entry->bytenr) {
1684                        p = &(*p)->rb_right;
1685                } else {
1686                        if (record->data_rsv && !entry->data_rsv) {
1687                                entry->data_rsv = record->data_rsv;
1688                                entry->data_rsv_refroot =
1689                                        record->data_rsv_refroot;
1690                        }
1691                        return 1;
1692                }
1693        }
1694
1695        rb_link_node(&record->node, parent_node, p);
1696        rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1697        return 0;
1698}
1699
1700int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
1701                                   struct btrfs_qgroup_extent_record *qrecord)
1702{
1703        struct ulist *old_root;
1704        u64 bytenr = qrecord->bytenr;
1705        int ret;
1706
1707        ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
1708        if (ret < 0) {
1709                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1710                btrfs_warn(fs_info,
1711"error accounting new delayed refs extent (err code: %d), quota inconsistent",
1712                        ret);
1713                return 0;
1714        }
1715
1716        /*
1717         * We don't need to take the lock of
1718         * trans->transaction->delayed_refs here: the inserted qrecord
1719         * won't be deleted, and only qrecord->node may be modified (by a
1720         * new qrecord insert). So modifying qrecord->old_roots is safe
1721         * here.
1722         */
1723        qrecord->old_roots = old_root;
1724        return 0;
1725}
1726
1727int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1728                              u64 num_bytes, gfp_t gfp_flag)
1729{
1730        struct btrfs_fs_info *fs_info = trans->fs_info;
1731        struct btrfs_qgroup_extent_record *record;
1732        struct btrfs_delayed_ref_root *delayed_refs;
1733        int ret;
1734
1735        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
1736            bytenr == 0 || num_bytes == 0)
1737                return 0;
1738        record = kzalloc(sizeof(*record), gfp_flag);
1739        if (!record)
1740                return -ENOMEM;
1741
1742        delayed_refs = &trans->transaction->delayed_refs;
1743        record->bytenr = bytenr;
1744        record->num_bytes = num_bytes;
1745        record->old_roots = NULL;
1746
1747        spin_lock(&delayed_refs->lock);
1748        ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
1749        spin_unlock(&delayed_refs->lock);
1750        if (ret > 0) {
1751                kfree(record);
1752                return 0;
1753        }
1754        return btrfs_qgroup_trace_extent_post(fs_info, record);
1755}
1756
1757int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1758                                  struct extent_buffer *eb)
1759{
1760        struct btrfs_fs_info *fs_info = trans->fs_info;
1761        int nr = btrfs_header_nritems(eb);
1762        int i, extent_type, ret;
1763        struct btrfs_key key;
1764        struct btrfs_file_extent_item *fi;
1765        u64 bytenr, num_bytes;
1766
1767        /* We can be called directly from walk_up_proc() */
1768        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1769                return 0;
1770
1771        for (i = 0; i < nr; i++) {
1772                btrfs_item_key_to_cpu(eb, &key, i);
1773
1774                if (key.type != BTRFS_EXTENT_DATA_KEY)
1775                        continue;
1776
1777                fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
1778                /* Filter out non-qgroup-accountable extents */
1779                extent_type = btrfs_file_extent_type(eb, fi);
1780
1781                if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1782                        continue;
1783
1784                bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1785                if (!bytenr)
1786                        continue;
1787
1788                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1789
1790                ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
1791                                                GFP_NOFS);
1792                if (ret)
1793                        return ret;
1794        }
1795        cond_resched();
1796        return 0;
1797}
1798
1799/*
1800 * Walk up the tree from the bottom, freeing leaves and any interior
1801 * nodes which have had all slots visited. If a node (leaf or
1802 * interior) is freed, the node above it will have its slot
1803 * incremented. The root node will never be freed.
1804 *
1805 * At the end of this function, we should have a path which has all
1806 * slots incremented to the next position for a search. If we need to
1807 * read a new node it will be NULL and the node above it will have the
1808 * correct slot selected for a later read.
1809 *
1810 * If we increment the root node's slot counter past the number of
1811 * elements, 1 is returned to signal completion of the search.
1812 */
1813static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
1814{
1815        int level = 0;
1816        int nr, slot;
1817        struct extent_buffer *eb;
1818
1819        if (root_level == 0)
1820                return 1;
1821
1822        while (level <= root_level) {
1823                eb = path->nodes[level];
1824                nr = btrfs_header_nritems(eb);
1825                path->slots[level]++;
1826                slot = path->slots[level];
1827                if (slot >= nr || level == 0) {
1828                        /*
1829                         * Don't free the root - we will detect this
1830                         * condition after our loop and return a
1831                         * positive value for caller to stop walking the tree.
1832                         */
1833                        if (level != root_level) {
1834                                btrfs_tree_unlock_rw(eb, path->locks[level]);
1835                                path->locks[level] = 0;
1836
1837                                free_extent_buffer(eb);
1838                                path->nodes[level] = NULL;
1839                                path->slots[level] = 0;
1840                        }
1841                } else {
1842                        /*
1843                         * We have a valid slot to walk back down
1844                         * from. Stop here so caller can process these
1845                         * new nodes.
1846                         */
1847                        break;
1848                }
1849
1850                level++;
1851        }
1852
1853        eb = path->nodes[root_level];
1854        if (path->slots[root_level] >= btrfs_header_nritems(eb))
1855                return 1;
1856
1857        return 0;
1858}
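
/*
 * A minimal driver sketch (an illustration, assuming path->nodes[root_level]
 * is already populated, as in btrfs_qgroup_trace_subtree() below):
 *
 *      while (1) {
 *              ... walk down from path->nodes[root_level] to the leaf,
 *                  processing items along the way ...
 *              if (adjust_slots_upwards(path, root_level))
 *                      break;  // root slot ran past nritems, walk done
 *      }
 */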
1859
1860/*
1861 * Helper function to trace a subtree tree block swap.
1862 *
1863 * The swap will happen in highest tree block, but there may be a lot of
1864 * tree blocks involved.
1865 *
1866 * For example:
1867 *  OO = Old tree blocks
1868 *  NN = New tree blocks allocated during balance
1869 *
1870 *           File tree (257)                  Reloc tree for 257
1871 * L2              OO                                NN
1872 *               /    \                            /    \
1873 * L1          OO      OO (a)                    OO      NN (a)
1874 *            / \     / \                       / \     / \
1875 * L0       OO   OO OO   OO                   OO   OO NN   NN
1876 *                  (b)  (c)                          (b)  (c)
1877 *
1878 * When calling qgroup_trace_extent_swap(), we will pass:
1879 * @src_eb = OO(a)
1880 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1881 * @dst_level = 0
1882 * @root_level = 1
1883 *
1884 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1885 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1886 *
1887 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1888 *
1889 * 1) Tree search from @src_eb
1890 *    It should act as a simplified btrfs_search_slot().
1891 *    The key for search can be extracted from @dst_path->nodes[dst_level]
1892 *    (first key).
1893 *
1894 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1895 *    NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty.
1896 *    They should be marked during the previous (@dst_level = 1) iteration.
1897 *
1898 * 3) Mark file extents in leaves dirty
1899 *    We don't have a good way to pick out new file extents only.
1900 *    So we still follow the old method by scanning all file extents in
1901 *    the leaf.
1902 *
1903 * This function can free us from keeping two paths, thus later we only need
1904 * to care about how to iterate all new tree blocks in reloc tree.
1905 */
1906static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
1907                                    struct extent_buffer *src_eb,
1908                                    struct btrfs_path *dst_path,
1909                                    int dst_level, int root_level,
1910                                    bool trace_leaf)
1911{
1912        struct btrfs_key key;
1913        struct btrfs_path *src_path;
1914        struct btrfs_fs_info *fs_info = trans->fs_info;
1915        u32 nodesize = fs_info->nodesize;
1916        int cur_level = root_level;
1917        int ret;
1918
1919        BUG_ON(dst_level > root_level);
1920        /* Level mismatch */
1921        if (btrfs_header_level(src_eb) != root_level)
1922                return -EINVAL;
1923
1924        src_path = btrfs_alloc_path();
1925        if (!src_path) {
1926                ret = -ENOMEM;
1927                goto out;
1928        }
1929
1930        if (dst_level)
1931                btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1932        else
1933                btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1934
1935        /* For src_path */
1936        atomic_inc(&src_eb->refs);
1937        src_path->nodes[root_level] = src_eb;
1938        src_path->slots[root_level] = dst_path->slots[root_level];
1939        src_path->locks[root_level] = 0;
1940
1941        /* A simplified version of btrfs_search_slot() */
1942        while (cur_level >= dst_level) {
1943                struct btrfs_key src_key;
1944                struct btrfs_key dst_key;
1945
1946                if (src_path->nodes[cur_level] == NULL) {
1947                        struct btrfs_key first_key;
1948                        struct extent_buffer *eb;
1949                        int parent_slot;
1950                        u64 child_gen;
1951                        u64 child_bytenr;
1952
1953                        eb = src_path->nodes[cur_level + 1];
1954                        parent_slot = src_path->slots[cur_level + 1];
1955                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
1956                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
1957                        btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
1958
1959                        eb = read_tree_block(fs_info, child_bytenr, child_gen,
1960                                             cur_level, &first_key);
1961                        if (IS_ERR(eb)) {
1962                                ret = PTR_ERR(eb);
1963                                goto out;
1964                        } else if (!extent_buffer_uptodate(eb)) {
1965                                free_extent_buffer(eb);
1966                                ret = -EIO;
1967                                goto out;
1968                        }
1969
1970                        src_path->nodes[cur_level] = eb;
1971
1972                        btrfs_tree_read_lock(eb);
1973                        btrfs_set_lock_blocking_read(eb);
1974                        src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
1975                }
1976
1977                src_path->slots[cur_level] = dst_path->slots[cur_level];
1978                if (cur_level) {
1979                        btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
1980                                        &dst_key, dst_path->slots[cur_level]);
1981                        btrfs_node_key_to_cpu(src_path->nodes[cur_level],
1982                                        &src_key, src_path->slots[cur_level]);
1983                } else {
1984                        btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
1985                                        &dst_key, dst_path->slots[cur_level]);
1986                        btrfs_item_key_to_cpu(src_path->nodes[cur_level],
1987                                        &src_key, src_path->slots[cur_level]);
1988                }
1989                /* Content mismatch, something went wrong */
1990                if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
1991                        ret = -ENOENT;
1992                        goto out;
1993                }
1994                cur_level--;
1995        }
1996
1997        /*
1998         * Now both @dst_path and @src_path have been populated, record the tree
1999         * blocks for qgroup accounting.
2000         */
2001        ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2002                        nodesize, GFP_NOFS);
2003        if (ret < 0)
2004                goto out;
2005        ret = btrfs_qgroup_trace_extent(trans,
2006                        dst_path->nodes[dst_level]->start,
2007                        nodesize, GFP_NOFS);
2008        if (ret < 0)
2009                goto out;
2010
2011        /* Record leaf file extents */
2012        if (dst_level == 0 && trace_leaf) {
2013                ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2014                if (ret < 0)
2015                        goto out;
2016                ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2017        }
2018out:
2019        btrfs_free_path(src_path);
2020        return ret;
2021}
2022
2023/*
2024 * Helper function to do a recursive generation-aware depth-first search, to
2025 * locate all new tree blocks in a subtree of a reloc tree.
2026 *
2027 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2028 *         reloc tree
2029 * L2         NN (a)
2030 *          /    \
2031 * L1    OO        NN (b)
2032 *      /  \      /  \
2033 * L0  OO  OO    OO  NN
2034 *               (c) (d)
2035 * If we pass:
2036 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2037 * @cur_level = 1
2038 * @root_level = 1
2039 *
2040 * We will iterate through tree blocks NN(b) and NN(d) and inform the qgroup
2041 * code to trace those tree blocks along with their counterparts in the file
2042 * tree.  During the search, old tree blocks like OO(c) will be skipped, as
2043 * the tree block swap won't affect OO(c).
2044 */
2045static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
2046                                           struct extent_buffer *src_eb,
2047                                           struct btrfs_path *dst_path,
2048                                           int cur_level, int root_level,
2049                                           u64 last_snapshot, bool trace_leaf)
2050{
2051        struct btrfs_fs_info *fs_info = trans->fs_info;
2052        struct extent_buffer *eb;
2053        bool need_cleanup = false;
2054        int ret = 0;
2055        int i;
2056
2057        /* Level sanity check */
2058        if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2059            root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2060            root_level < cur_level) {
2061                btrfs_err_rl(fs_info,
2062                        "%s: bad levels, cur_level=%d root_level=%d",
2063                        __func__, cur_level, root_level);
2064                return -EUCLEAN;
2065        }
2066
2067        /* Read the tree block if needed */
2068        if (dst_path->nodes[cur_level] == NULL) {
2069                struct btrfs_key first_key;
2070                int parent_slot;
2071                u64 child_gen;
2072                u64 child_bytenr;
2073
2074                /*
2075                 * dst_path->nodes[root_level] must be initialized before
2076                 * calling this function.
2077                 */
2078                if (cur_level == root_level) {
2079                        btrfs_err_rl(fs_info,
2080        "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2081                                __func__, root_level, root_level, cur_level);
2082                        return -EUCLEAN;
2083                }
2084
2085                /*
2086                 * We need to get child blockptr/gen from parent before we can
2087                 * read it.
2088                 */
2089                eb = dst_path->nodes[cur_level + 1];
2090                parent_slot = dst_path->slots[cur_level + 1];
2091                child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2092                child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2093                btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2094
2095                /* This node is old, no need to trace */
2096                if (child_gen < last_snapshot)
2097                        goto out;
2098
2099                eb = read_tree_block(fs_info, child_bytenr, child_gen,
2100                                     cur_level, &first_key);
2101                if (IS_ERR(eb)) {
2102                        ret = PTR_ERR(eb);
2103                        goto out;
2104                } else if (!extent_buffer_uptodate(eb)) {
2105                        free_extent_buffer(eb);
2106                        ret = -EIO;
2107                        goto out;
2108                }
2109
2110                dst_path->nodes[cur_level] = eb;
2111                dst_path->slots[cur_level] = 0;
2112
2113                btrfs_tree_read_lock(eb);
2114                btrfs_set_lock_blocking_read(eb);
2115                dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
2116                need_cleanup = true;
2117        }
2118
2119        /* Now record this tree block and its counterpart for qgroups */
2120        ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2121                                       root_level, trace_leaf);
2122        if (ret < 0)
2123                goto cleanup;
2124
2125        eb = dst_path->nodes[cur_level];
2126
2127        if (cur_level > 0) {
2128                /* Iterate all child tree blocks */
2129                for (i = 0; i < btrfs_header_nritems(eb); i++) {
2130                        /* Skip old tree blocks as they won't be swapped */
2131                        if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2132                                continue;
2133                        dst_path->slots[cur_level] = i;
2134
2135                        /* Recursive call (at most 7 times) */
2136                        ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2137                                        dst_path, cur_level - 1, root_level,
2138                                        last_snapshot, trace_leaf);
2139                        if (ret < 0)
2140                                goto cleanup;
2141                }
2142        }
2143
2144cleanup:
2145        if (need_cleanup) {
2146                /* Clean up */
2147                btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2148                                     dst_path->locks[cur_level]);
2149                free_extent_buffer(dst_path->nodes[cur_level]);
2150                dst_path->nodes[cur_level] = NULL;
2151                dst_path->slots[cur_level] = 0;
2152                dst_path->locks[cur_level] = 0;
2153        }
2154out:
2155        return ret;
2156}
2157
2158static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2159                                struct extent_buffer *src_eb,
2160                                struct extent_buffer *dst_eb,
2161                                u64 last_snapshot, bool trace_leaf)
2162{
2163        struct btrfs_fs_info *fs_info = trans->fs_info;
2164        struct btrfs_path *dst_path = NULL;
2165        int level;
2166        int ret;
2167
2168        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2169                return 0;
2170
2171        /* Wrong parameter order */
2172        if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2173                btrfs_err_rl(fs_info,
2174                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2175                             btrfs_header_generation(src_eb),
2176                             btrfs_header_generation(dst_eb));
2177                return -EUCLEAN;
2178        }
2179
2180        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2181                ret = -EIO;
2182                goto out;
2183        }
2184
2185        level = btrfs_header_level(dst_eb);
2186        dst_path = btrfs_alloc_path();
2187        if (!dst_path) {
2188                ret = -ENOMEM;
2189                goto out;
2190        }
2191        /* For dst_path */
2192        atomic_inc(&dst_eb->refs);
2193        dst_path->nodes[level] = dst_eb;
2194        dst_path->slots[level] = 0;
2195        dst_path->locks[level] = 0;
2196
2197        /* Do the generation-aware depth-first search */
2198        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2199                                              level, last_snapshot, trace_leaf);
2200        if (ret < 0)
2201                goto out;
2202        ret = 0;
2203
2204out:
2205        btrfs_free_path(dst_path);
2206        if (ret < 0)
2207                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2208        return ret;
2209}
2210
2211int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2212                               struct extent_buffer *root_eb,
2213                               u64 root_gen, int root_level)
2214{
2215        struct btrfs_fs_info *fs_info = trans->fs_info;
2216        int ret = 0;
2217        int level;
2218        struct extent_buffer *eb = root_eb;
2219        struct btrfs_path *path = NULL;
2220
2221        BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2222        BUG_ON(root_eb == NULL);
2223
2224        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2225                return 0;
2226
2227        if (!extent_buffer_uptodate(root_eb)) {
2228                ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
2229                if (ret)
2230                        goto out;
2231        }
2232
2233        if (root_level == 0) {
2234                ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2235                goto out;
2236        }
2237
2238        path = btrfs_alloc_path();
2239        if (!path)
2240                return -ENOMEM;
2241
2242        /*
2243         * Walk down the tree.  Missing extent blocks are filled in as
2244         * we go. Metadata is accounted every time we read a new
2245         * extent block.
2246         *
2247         * When we reach a leaf, we account for file extent items in it,
2248         * walk back up the tree (adjusting slot pointers as we go)
2249         * and restart the search process.
2250         */
2251        atomic_inc(&root_eb->refs);     /* For path */
2252        path->nodes[root_level] = root_eb;
2253        path->slots[root_level] = 0;
2254        path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2255walk_down:
2256        level = root_level;
2257        while (level >= 0) {
2258                if (path->nodes[level] == NULL) {
2259                        struct btrfs_key first_key;
2260                        int parent_slot;
2261                        u64 child_gen;
2262                        u64 child_bytenr;
2263
2264                        /*
2265                         * We need to get child blockptr/gen from parent before
2266                         * we can read it.
2267                         */
2268                        eb = path->nodes[level + 1];
2269                        parent_slot = path->slots[level + 1];
2270                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2271                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2272                        btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2273
2274                        eb = read_tree_block(fs_info, child_bytenr, child_gen,
2275                                             level, &first_key);
2276                        if (IS_ERR(eb)) {
2277                                ret = PTR_ERR(eb);
2278                                goto out;
2279                        } else if (!extent_buffer_uptodate(eb)) {
2280                                free_extent_buffer(eb);
2281                                ret = -EIO;
2282                                goto out;
2283                        }
2284
2285                        path->nodes[level] = eb;
2286                        path->slots[level] = 0;
2287
2288                        btrfs_tree_read_lock(eb);
2289                        btrfs_set_lock_blocking_read(eb);
2290                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
2291
2292                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2293                                                        fs_info->nodesize,
2294                                                        GFP_NOFS);
2295                        if (ret)
2296                                goto out;
2297                }
2298
2299                if (level == 0) {
2300                        ret = btrfs_qgroup_trace_leaf_items(trans,
2301                                                            path->nodes[level]);
2302                        if (ret)
2303                                goto out;
2304
2305                        /* Nonzero return here means we completed our search */
2306                        ret = adjust_slots_upwards(path, root_level);
2307                        if (ret)
2308                                break;
2309
2310                        /* Restart search with new slots */
2311                        goto walk_down;
2312                }
2313
2314                level--;
2315        }
2316
2317        ret = 0;
2318out:
2319        btrfs_free_path(path);
2320
2321        return ret;
2322}
2323
2324#define UPDATE_NEW      0
2325#define UPDATE_OLD      1
2326/*
2327 * Walk all of the roots that point to the bytenr and adjust their refcnts.
2328 */
2329static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2330                                struct ulist *roots, struct ulist *tmp,
2331                                struct ulist *qgroups, u64 seq, int update_old)
2332{
2333        struct ulist_node *unode;
2334        struct ulist_iterator uiter;
2335        struct ulist_node *tmp_unode;
2336        struct ulist_iterator tmp_uiter;
2337        struct btrfs_qgroup *qg;
2338        int ret = 0;
2339
2340        if (!roots)
2341                return 0;
2342        ULIST_ITER_INIT(&uiter);
2343        while ((unode = ulist_next(roots, &uiter))) {
2344                qg = find_qgroup_rb(fs_info, unode->val);
2345                if (!qg)
2346                        continue;
2347
2348                ulist_reinit(tmp);
2349                ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2350                                GFP_ATOMIC);
2351                if (ret < 0)
2352                        return ret;
2353                ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2354                if (ret < 0)
2355                        return ret;
2356                ULIST_ITER_INIT(&tmp_uiter);
2357                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2358                        struct btrfs_qgroup_list *glist;
2359
2360                        qg = unode_aux_to_qgroup(tmp_unode);
2361                        if (update_old)
2362                                btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2363                        else
2364                                btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2365                        list_for_each_entry(glist, &qg->groups, next_group) {
2366                                ret = ulist_add(qgroups, glist->group->qgroupid,
2367                                                qgroup_to_aux(glist->group),
2368                                                GFP_ATOMIC);
2369                                if (ret < 0)
2370                                        return ret;
2371                                ret = ulist_add(tmp, glist->group->qgroupid,
2372                                                qgroup_to_aux(glist->group),
2373                                                GFP_ATOMIC);
2374                                if (ret < 0)
2375                                        return ret;
2376                        }
2377                }
2378        }
2379        return 0;
2380}
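
/*
 * Worked example (an illustration): suppose the old_roots ulist contains
 * subvolume 257, qgroup 0/257 is a member of 1/100, and 1/100 is a member
 * of 2/200.  A single UPDATE_OLD pass bumps the old refcnt of 0/257,
 * 1/100 and 2/200 by 1 at @seq, and adds all three to @qgroups so that
 * qgroup_update_counters() below can examine them.
 */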
2381
2382/*
2383 * Update qgroup rfer/excl counters.
2384 * Rfer update is easy, the code explains itself.
2385 *
2386 * Excl update is tricky, the update is split into 2 parts.
2387 * Part 1: Possible exclusive <-> sharing detect:
2388 *      |       A       |       !A      |
2389 *  -------------------------------------
2390 *  B   |       *       |       -       |
2391 *  -------------------------------------
2392 *  !B  |       +       |       **      |
2393 *  -------------------------------------
2394 *
2395 * Conditions:
2396 * A:   cur_old_roots < nr_old_roots    (not exclusive before)
2397 * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
2398 * B:   cur_new_roots < nr_new_roots    (not exclusive now)
2399 * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
2400 *
2401 * Results:
2402 * +: Possible sharing -> exclusive     -: Possible exclusive -> sharing
2403 * *: Definitely not changed.           **: Possible unchanged.
2404 *
2405 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0
2406 * case.
2407 *
2408 * To make the logic clear, we first use conditions A and B to split the
2409 * combinations into 4 results.
2410 *
2411 * Then, for results "+" and "-", check the old/new_roots == 0 cases, as in
2412 * those only one variable may be 0.
2413 *
2414 * Lastly, check result **: since two variables may be 0 there, split it
2415 * again (2x2).  This time we don't need to consider anything else, and the
2416 * code and logic are easy to understand now.
2417 */
2418static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2419                                  struct ulist *qgroups,
2420                                  u64 nr_old_roots,
2421                                  u64 nr_new_roots,
2422                                  u64 num_bytes, u64 seq)
2423{
2424        struct ulist_node *unode;
2425        struct ulist_iterator uiter;
2426        struct btrfs_qgroup *qg;
2427        u64 cur_new_count, cur_old_count;
2428
2429        ULIST_ITER_INIT(&uiter);
2430        while ((unode = ulist_next(qgroups, &uiter))) {
2431                bool dirty = false;
2432
2433                qg = unode_aux_to_qgroup(unode);
2434                cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2435                cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2436
2437                trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2438                                             cur_new_count);
2439
2440                /* Rfer update part */
2441                if (cur_old_count == 0 && cur_new_count > 0) {
2442                        qg->rfer += num_bytes;
2443                        qg->rfer_cmpr += num_bytes;
2444                        dirty = true;
2445                }
2446                if (cur_old_count > 0 && cur_new_count == 0) {
2447                        qg->rfer -= num_bytes;
2448                        qg->rfer_cmpr -= num_bytes;
2449                        dirty = true;
2450                }
2451
2452                /* Excl update part */
2453                /* Exclusive/none -> shared case */
2454                if (cur_old_count == nr_old_roots &&
2455                    cur_new_count < nr_new_roots) {
2456                        /* Exclusive -> shared */
2457                        if (cur_old_count != 0) {
2458                                qg->excl -= num_bytes;
2459                                qg->excl_cmpr -= num_bytes;
2460                                dirty = true;
2461                        }
2462                }
2463
2464                /* Shared -> exclusive/none case */
2465                if (cur_old_count < nr_old_roots &&
2466                    cur_new_count == nr_new_roots) {
2467                        /* Shared->exclusive */
2468                        if (cur_new_count != 0) {
2469                                qg->excl += num_bytes;
2470                                qg->excl_cmpr += num_bytes;
2471                                dirty = true;
2472                        }
2473                }
2474
2475                /* Exclusive/none -> exclusive/none case */
2476                if (cur_old_count == nr_old_roots &&
2477                    cur_new_count == nr_new_roots) {
2478                        if (cur_old_count == 0) {
2479                                /* None -> exclusive/none */
2480
2481                                if (cur_new_count != 0) {
2482                                        /* None -> exclusive */
2483                                        qg->excl += num_bytes;
2484                                        qg->excl_cmpr += num_bytes;
2485                                        dirty = true;
2486                                }
2487                                /* None -> none, nothing changed */
2488                        } else {
2489                                /* Exclusive -> exclusive/none */
2490
2491                                if (cur_new_count == 0) {
2492                                        /* Exclusive -> none */
2493                                        qg->excl -= num_bytes;
2494                                        qg->excl_cmpr -= num_bytes;
2495                                        dirty = true;
2496                                }
2497                                /* Exclusive -> exclusive, nothing changed */
2498                        }
2499                }
2500
2501                if (dirty)
2502                        qgroup_dirty(fs_info, qg);
2503        }
2504        return 0;
2505}
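
/*
 * Worked example (an illustration): with nr_old_roots = 2 and
 * nr_new_roots = 3, a qgroup with cur_old_count = 2 and cur_new_count = 2
 * was exclusive before (2 == 2) but is shared now (2 < 3).  That is the
 * "-" cell of the table above: excl/excl_cmpr shrink by num_bytes, while
 * rfer is untouched since the extent is referenced both before and after.
 */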
2506
2507/*
2508 * Check if @roots is potentially a list of fs tree roots
2509 *
2510 * Return 0 if it definitely contains no fs/subvol tree roots
2511 * Return 1 if it may contain fs/subvol tree roots (an empty list counts
2512 *          as well)
2513 */
2514static int maybe_fs_roots(struct ulist *roots)
2515{
2516        struct ulist_node *unode;
2517        struct ulist_iterator uiter;
2518
2519        /* Empty one, still possible for fs roots */
2520        if (!roots || roots->nnodes == 0)
2521                return 1;
2522
2523        ULIST_ITER_INIT(&uiter);
2524        unode = ulist_next(roots, &uiter);
2525        if (!unode)
2526                return 1;
2527
2528        /*
2529         * If it contains fs tree roots, then it must belong to fs/subvol
2530         * trees.
2531         * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2532         */
2533        return is_fstree(unode->val);
2534}
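
/*
 * Example (an illustration): a ulist whose first entry is subvolume root
 * 257 passes this check (is_fstree() returns 1), while one holding only
 * the extent tree root (objectid 2) fails it, letting callers bail out of
 * accounting early.
 */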
2535
2536int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2537                                u64 num_bytes, struct ulist *old_roots,
2538                                struct ulist *new_roots)
2539{
2540        struct btrfs_fs_info *fs_info = trans->fs_info;
2541        struct ulist *qgroups = NULL;
2542        struct ulist *tmp = NULL;
2543        u64 seq;
2544        u64 nr_new_roots = 0;
2545        u64 nr_old_roots = 0;
2546        int ret = 0;
2547
2548        /*
2549         * If quotas get disabled meanwhile, the resources need to be freed and
2550         * we can't just exit here.
2551         */
2552        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2553                goto out_free;
2554
2555        if (new_roots) {
2556                if (!maybe_fs_roots(new_roots))
2557                        goto out_free;
2558                nr_new_roots = new_roots->nnodes;
2559        }
2560        if (old_roots) {
2561                if (!maybe_fs_roots(old_roots))
2562                        goto out_free;
2563                nr_old_roots = old_roots->nnodes;
2564        }
2565
2566        /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2567        if (nr_old_roots == 0 && nr_new_roots == 0)
2568                goto out_free;
2569
2570        BUG_ON(!fs_info->quota_root);
2571
2572        trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2573                                        num_bytes, nr_old_roots, nr_new_roots);
2574
2575        qgroups = ulist_alloc(GFP_NOFS);
2576        if (!qgroups) {
2577                ret = -ENOMEM;
2578                goto out_free;
2579        }
2580        tmp = ulist_alloc(GFP_NOFS);
2581        if (!tmp) {
2582                ret = -ENOMEM;
2583                goto out_free;
2584        }
2585
2586        mutex_lock(&fs_info->qgroup_rescan_lock);
2587        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2588                if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2589                        mutex_unlock(&fs_info->qgroup_rescan_lock);
2590                        ret = 0;
2591                        goto out_free;
2592                }
2593        }
2594        mutex_unlock(&fs_info->qgroup_rescan_lock);
2595
2596        spin_lock(&fs_info->qgroup_lock);
2597        seq = fs_info->qgroup_seq;
2598
2599        /* Update old refcnts using old_roots */
2600        ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2601                                   UPDATE_OLD);
2602        if (ret < 0)
2603                goto out;
2604
2605        /* Update new refcnts using new_roots */
2606        ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2607                                   UPDATE_NEW);
2608        if (ret < 0)
2609                goto out;
2610
2611        qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2612                               num_bytes, seq);
2613
2614        /*
2615         * Bump qgroup_seq to avoid seq overlap
2616         */
2617        fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2618out:
2619        spin_unlock(&fs_info->qgroup_lock);
2620out_free:
2621        ulist_free(tmp);
2622        ulist_free(qgroups);
2623        ulist_free(old_roots);
2624        ulist_free(new_roots);
2625        return ret;
2626}
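
/*
 * Note on ownership: btrfs_qgroup_account_extent() always consumes
 * @old_roots and @new_roots (both are ulist_free()d on every path,
 * including the early exits), so callers such as
 * btrfs_qgroup_account_extents() below must not touch them afterwards.
 */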
2627
2628int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2629{
2630        struct btrfs_fs_info *fs_info = trans->fs_info;
2631        struct btrfs_qgroup_extent_record *record;
2632        struct btrfs_delayed_ref_root *delayed_refs;
2633        struct ulist *new_roots = NULL;
2634        struct rb_node *node;
2635        u64 num_dirty_extents = 0;
2636        u64 qgroup_to_skip;
2637        int ret = 0;
2638
2639        delayed_refs = &trans->transaction->delayed_refs;
2640        qgroup_to_skip = delayed_refs->qgroup_to_skip;
2641        while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2642                record = rb_entry(node, struct btrfs_qgroup_extent_record,
2643                                  node);
2644
2645                num_dirty_extents++;
2646                trace_btrfs_qgroup_account_extents(fs_info, record);
2647
2648                if (!ret) {
2649                        /*
2650                         * Old roots should have been searched when inserting the
2651                         * qgroup extent record
2652                         */
2653                        if (WARN_ON(!record->old_roots)) {
2654                                /* Search commit root to find old_roots */
2655                                ret = btrfs_find_all_roots(NULL, fs_info,
2656                                                record->bytenr, 0,
2657                                                &record->old_roots, false);
2658                                if (ret < 0)
2659                                        goto cleanup;
2660                        }
2661
2662                        /* Free the reserved data space */
2663                        btrfs_qgroup_free_refroot(fs_info,
2664                                        record->data_rsv_refroot,
2665                                        record->data_rsv,
2666                                        BTRFS_QGROUP_RSV_DATA);
2667                        /*
2668                         * Use SEQ_LAST as time_seq to do a special search that
2669                         * doesn't lock the tree or delayed_refs and searches the
2670                         * current root. It's safe inside commit_transaction().
2671                         */
2672                        ret = btrfs_find_all_roots(trans, fs_info,
2673                                record->bytenr, SEQ_LAST, &new_roots, false);
2674                        if (ret < 0)
2675                                goto cleanup;
2676                        if (qgroup_to_skip) {
2677                                ulist_del(new_roots, qgroup_to_skip, 0);
2678                                ulist_del(record->old_roots, qgroup_to_skip,
2679                                          0);
2680                        }
2681                        ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2682                                                          record->num_bytes,
2683                                                          record->old_roots,
2684                                                          new_roots);
2685                        record->old_roots = NULL;
2686                        new_roots = NULL;
2687                }
2688cleanup:
2689                ulist_free(record->old_roots);
2690                ulist_free(new_roots);
2691                new_roots = NULL;
2692                rb_erase(node, &delayed_refs->dirty_extent_root);
2693                kfree(record);
2694
2695        }
2696        trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2697                                       num_dirty_extents);
2698        return ret;
2699}
2700
2701/*
2702 * Called from commit_transaction(). Writes all changed qgroups to disk.
2703 */
2704int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2705{
2706        struct btrfs_fs_info *fs_info = trans->fs_info;
2707        int ret = 0;
2708
2709        if (!fs_info->quota_root)
2710                return ret;
2711
2712        spin_lock(&fs_info->qgroup_lock);
2713        while (!list_empty(&fs_info->dirty_qgroups)) {
2714                struct btrfs_qgroup *qgroup;
2715                qgroup = list_first_entry(&fs_info->dirty_qgroups,
2716                                          struct btrfs_qgroup, dirty);
2717                list_del_init(&qgroup->dirty);
2718                spin_unlock(&fs_info->qgroup_lock);
2719                ret = update_qgroup_info_item(trans, qgroup);
2720                if (ret)
2721                        fs_info->qgroup_flags |=
2722                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2723                ret = update_qgroup_limit_item(trans, qgroup);
2724                if (ret)
2725                        fs_info->qgroup_flags |=
2726                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2727                spin_lock(&fs_info->qgroup_lock);
2728        }
2729        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2730                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2731        else
2732                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2733        spin_unlock(&fs_info->qgroup_lock);
2734
2735        ret = update_qgroup_status_item(trans);
2736        if (ret)
2737                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2738
2739        return ret;
2740}
2741
2742/*
2743 * Copy the accounting information between qgroups. This is necessary
2744 * when a snapshot or a subvolume is created. Throwing an error will
2745 * cause a transaction abort so we take extra care here to only error
2746 * when a readonly fs is a reasonable outcome.
2747 */
2748int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2749                         u64 objectid, struct btrfs_qgroup_inherit *inherit)
2750{
2751        int ret = 0;
2752        int i;
2753        u64 *i_qgroups;
2754        bool committing = false;
2755        struct btrfs_fs_info *fs_info = trans->fs_info;
2756        struct btrfs_root *quota_root;
2757        struct btrfs_qgroup *srcgroup;
2758        struct btrfs_qgroup *dstgroup;
2759        bool need_rescan = false;
2760        u32 level_size = 0;
2761        u64 nums;
2762
2763        /*
2764         * There are only two callers of this function.
2765         *
2766         * One in create_subvol() in the ioctl context, which needs to hold
2767         * the qgroup_ioctl_lock.
2768         *
2769         * The other one in create_pending_snapshot() where no other qgroup
2770         * code can modify the fs as they all need to either start a new trans
2771         * or hold a trans handle, thus we don't need to hold
2772         * qgroup_ioctl_lock.
2773         * This would avoid long and complex lock chain and make lockdep happy.
2774         */
2775        spin_lock(&fs_info->trans_lock);
2776        if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2777                committing = true;
2778        spin_unlock(&fs_info->trans_lock);
2779
2780        if (!committing)
2781                mutex_lock(&fs_info->qgroup_ioctl_lock);
2782        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2783                goto out;
2784
2785        quota_root = fs_info->quota_root;
2786        if (!quota_root) {
2787                ret = -EINVAL;
2788                goto out;
2789        }
2790
2791        if (inherit) {
2792                i_qgroups = (u64 *)(inherit + 1);
2793                nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2794                       2 * inherit->num_excl_copies;
2795                for (i = 0; i < nums; ++i) {
2796                        srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2797
2798                        /*
2799                         * Zero out invalid groups so we can ignore
2800                         * them later.
2801                         */
2802                        if (!srcgroup ||
2803                            ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2804                                *i_qgroups = 0ULL;
2805
2806                        ++i_qgroups;
2807                }
2808        }
2809
2810        /*
2811         * create a tracking group for the subvol itself
2812         */
2813        ret = add_qgroup_item(trans, quota_root, objectid);
2814        if (ret)
2815                goto out;
2816
2817        /*
2818         * add qgroup to all inherited groups
2819         */
2820        if (inherit) {
2821                i_qgroups = (u64 *)(inherit + 1);
2822                for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
2823                        if (*i_qgroups == 0)
2824                                continue;
2825                        ret = add_qgroup_relation_item(trans, objectid,
2826                                                       *i_qgroups);
2827                        if (ret && ret != -EEXIST)
2828                                goto out;
2829                        ret = add_qgroup_relation_item(trans, *i_qgroups,
2830                                                       objectid);
2831                        if (ret && ret != -EEXIST)
2832                                goto out;
2833                }
2834                ret = 0;
2835        }
2836
2837
2838        spin_lock(&fs_info->qgroup_lock);
2839
2840        dstgroup = add_qgroup_rb(fs_info, objectid);
2841        if (IS_ERR(dstgroup)) {
2842                ret = PTR_ERR(dstgroup);
2843                goto unlock;
2844        }
2845
2846        if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2847                dstgroup->lim_flags = inherit->lim.flags;
2848                dstgroup->max_rfer = inherit->lim.max_rfer;
2849                dstgroup->max_excl = inherit->lim.max_excl;
2850                dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2851                dstgroup->rsv_excl = inherit->lim.rsv_excl;
2852
2853                ret = update_qgroup_limit_item(trans, dstgroup);
2854                if (ret) {
2855                        fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2856                        btrfs_info(fs_info,
2857                                   "unable to update quota limit for %llu",
2858                                   dstgroup->qgroupid);
2859                        goto unlock;
2860                }
2861        }
2862
2863        if (srcid) {
2864                srcgroup = find_qgroup_rb(fs_info, srcid);
2865                if (!srcgroup)
2866                        goto unlock;
2867
2868                /*
2869                 * We call inherit after we clone the root in order to make sure
2870                 * our counts don't go crazy, so at this point the only
2871                 * difference between the two roots should be the root node.
2872                 */
2873                level_size = fs_info->nodesize;
2874                dstgroup->rfer = srcgroup->rfer;
2875                dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2876                dstgroup->excl = level_size;
2877                dstgroup->excl_cmpr = level_size;
2878                srcgroup->excl = level_size;
2879                srcgroup->excl_cmpr = level_size;
2880
2881                /* inherit the limit info */
2882                dstgroup->lim_flags = srcgroup->lim_flags;
2883                dstgroup->max_rfer = srcgroup->max_rfer;
2884                dstgroup->max_excl = srcgroup->max_excl;
2885                dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2886                dstgroup->rsv_excl = srcgroup->rsv_excl;
2887
2888                qgroup_dirty(fs_info, dstgroup);
2889                qgroup_dirty(fs_info, srcgroup);
2890        }
2891
2892        if (!inherit)
2893                goto unlock;
2894
2895        i_qgroups = (u64 *)(inherit + 1);
2896        for (i = 0; i < inherit->num_qgroups; ++i) {
2897                if (*i_qgroups) {
2898                        ret = add_relation_rb(fs_info, objectid, *i_qgroups);
2899                        if (ret)
2900                                goto unlock;
2901                }
2902                ++i_qgroups;
2903
2904                /*
2905                 * If we're doing a snapshot, and adding the snapshot to a new
2906                 * qgroup, the numbers are guaranteed to be incorrect.
2907                 */
2908                if (srcid)
2909                        need_rescan = true;
2910        }
2911
2912        for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
2913                struct btrfs_qgroup *src;
2914                struct btrfs_qgroup *dst;
2915
2916                if (!i_qgroups[0] || !i_qgroups[1])
2917                        continue;
2918
2919                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2920                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2921
2922                if (!src || !dst) {
2923                        ret = -EINVAL;
2924                        goto unlock;
2925                }
2926
2927                dst->rfer = src->rfer - level_size;
2928                dst->rfer_cmpr = src->rfer_cmpr - level_size;
2929
2930                /* Manually tweaking numbers certainly needs a rescan */
2931                need_rescan = true;
2932        }
2933        for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
2934                struct btrfs_qgroup *src;
2935                struct btrfs_qgroup *dst;
2936
2937                if (!i_qgroups[0] || !i_qgroups[1])
2938                        continue;
2939
2940                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2941                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2942
2943                if (!src || !dst) {
2944                        ret = -EINVAL;
2945                        goto unlock;
2946                }
2947
2948                dst->excl = src->excl + level_size;
2949                dst->excl_cmpr = src->excl_cmpr + level_size;
2950                need_rescan = true;
2951        }
2952
2953unlock:
2954        spin_unlock(&fs_info->qgroup_lock);
2955        if (!ret)
2956                ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
2957out:
2958        if (!committing)
2959                mutex_unlock(&fs_info->qgroup_ioctl_lock);
2960        if (need_rescan)
2961                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2962        return ret;
2963}
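
/*
 * The i_qgroups walks above rely on the layout of struct btrfs_qgroup_inherit
 * (uapi/linux/btrfs.h): the fixed part of the structure is followed in memory
 * by num_qgroups qgroup ids, then num_ref_copies and num_excl_copies pairs of
 * (src, dst) ids, which is why the pointer advances by one entry in the
 * relation loop and by two entries in the copy loops.
 */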
2964
2965static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
2966{
2967        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
2968            qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
2969                return false;
2970
2971        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
2972            qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
2973                return false;
2974
2975        return true;
2976}
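
/*
 * For illustration, a worked example of the check above with made-up
 * numbers.  Assume a qgroup limited via BTRFS_QGROUP_LIMIT_MAX_RFER with:
 *
 *	qg->max_rfer         = 1048576	(1 MiB limit)
 *	qg->rfer             =  786432	(768 KiB already accounted)
 *	qgroup_rsv_total(qg) =  131072	(128 KiB only reserved so far)
 *
 * A new reservation of num_bytes = 262144 (256 KiB) gives
 * 786432 + 131072 + 262144 = 1179648 > 1048576, so the check fails and
 * qgroup_reserve() below returns -EDQUOT.  Outstanding reservations thus
 * count against the limit just like committed usage.
 */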
2977
2978static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
2979                          enum btrfs_qgroup_rsv_type type)
2980{
2981        struct btrfs_qgroup *qgroup;
2982        struct btrfs_fs_info *fs_info = root->fs_info;
2983        u64 ref_root = root->root_key.objectid;
2984        int ret = 0;
2985        struct ulist_node *unode;
2986        struct ulist_iterator uiter;
2987
2988        if (!is_fstree(ref_root))
2989                return 0;
2990
2991        if (num_bytes == 0)
2992                return 0;
2993
2994        if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
2995            capable(CAP_SYS_RESOURCE))
2996                enforce = false;
2997
2998        spin_lock(&fs_info->qgroup_lock);
2999        if (!fs_info->quota_root)
3000                goto out;
3001
3002        qgroup = find_qgroup_rb(fs_info, ref_root);
3003        if (!qgroup)
3004                goto out;
3005
3006        /*
3007         * In a first step, check all affected qgroups to see whether any
3008         * limit would be exceeded.
3009         */
3010        ulist_reinit(fs_info->qgroup_ulist);
3011        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3012                        qgroup_to_aux(qgroup), GFP_ATOMIC);
3013        if (ret < 0)
3014                goto out;
3015        ULIST_ITER_INIT(&uiter);
3016        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3017                struct btrfs_qgroup *qg;
3018                struct btrfs_qgroup_list *glist;
3019
3020                qg = unode_aux_to_qgroup(unode);
3021
3022                if (enforce && !qgroup_check_limits(qg, num_bytes)) {
3023                        ret = -EDQUOT;
3024                        goto out;
3025                }
3026
3027                list_for_each_entry(glist, &qg->groups, next_group) {
3028                        ret = ulist_add(fs_info->qgroup_ulist,
3029                                        glist->group->qgroupid,
3030                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3031                        if (ret < 0)
3032                                goto out;
3033                }
3034        }
3035        ret = 0;
3036        /*
3037         * No limits exceeded, now record the reservation into all qgroups.
3038         */
3039        ULIST_ITER_INIT(&uiter);
3040        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3041                struct btrfs_qgroup *qg;
3042
3043                qg = unode_aux_to_qgroup(unode);
3044
3045                qgroup_rsv_add(fs_info, qg, num_bytes, type);
3046        }
3047
3048out:
3049        spin_unlock(&fs_info->qgroup_lock);
3050        return ret;
3051}
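
/*
 * A minimal sketch of how the ulist walk above propagates a reservation
 * upwards, with a hypothetical hierarchy:
 *
 *	1/100  <---  0/257	(qgroup 0/257 is a member of 1/100)
 *
 * Reserving 16K against subvolume 257 seeds the ulist with qgroup 0/257;
 * iterating it queues 1/100 via the ->groups list, so both qgroups have
 * their limits checked first and then get the same 16K added to
 * rsv.values[type].
 */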
3052
3053/*
3054 * Free @num_bytes of reserved space with @type for a qgroup (normally a
3055 * level 0 qgroup).
3056 *
3057 * Will handle all higher level qgroups too.
3058 *
3059 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3060 * This special case is only used for META_PERTRANS type.
3061 */
3062void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3063                               u64 ref_root, u64 num_bytes,
3064                               enum btrfs_qgroup_rsv_type type)
3065{
3066        struct btrfs_qgroup *qgroup;
3067        struct ulist_node *unode;
3068        struct ulist_iterator uiter;
3069        int ret = 0;
3070
3071        if (!is_fstree(ref_root))
3072                return;
3073
3074        if (num_bytes == 0)
3075                return;
3076
3077        if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3078                WARN(1, "%s: Invalid type to free", __func__);
3079                return;
3080        }
3081        spin_lock(&fs_info->qgroup_lock);
3082
3083        if (!fs_info->quota_root)
3084                goto out;
3085
3086        qgroup = find_qgroup_rb(fs_info, ref_root);
3087        if (!qgroup)
3088                goto out;
3089
3090        if (num_bytes == (u64)-1)
3091                /*
3092                 * We're freeing all pertrans rsv, use the reserved value of
3093                 * the level 0 qgroup as the real num_bytes to free.
3094                 */
3095                num_bytes = qgroup->rsv.values[type];
3096
3097        ulist_reinit(fs_info->qgroup_ulist);
3098        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3099                        qgroup_to_aux(qgroup), GFP_ATOMIC);
3100        if (ret < 0)
3101                goto out;
3102        ULIST_ITER_INIT(&uiter);
3103        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3104                struct btrfs_qgroup *qg;
3105                struct btrfs_qgroup_list *glist;
3106
3107                qg = unode_aux_to_qgroup(unode);
3108
3109                qgroup_rsv_release(fs_info, qg, num_bytes, type);
3110
3111                list_for_each_entry(glist, &qg->groups, next_group) {
3112                        ret = ulist_add(fs_info->qgroup_ulist,
3113                                        glist->group->qgroupid,
3114                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3115                        if (ret < 0)
3116                                goto out;
3117                }
3118        }
3119
3120out:
3121        spin_unlock(&fs_info->qgroup_lock);
3122}
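
/*
 * A minimal usage sketch of the (u64)-1 special case documented above,
 * with a hypothetical subvolume id 257 (this is what
 * btrfs_qgroup_free_meta_all_pertrans() later in this file does for real):
 *
 *	btrfs_qgroup_free_refroot(fs_info, 257, (u64)-1,
 *				  BTRFS_QGROUP_RSV_META_PERTRANS);
 */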
3123
3124/*
3125 * Check if the leaf is the last leaf, which means all node pointers
3126 * are at their last position.
3127 */
3128static bool is_last_leaf(struct btrfs_path *path)
3129{
3130        int i;
3131
3132        for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3133                if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3134                        return false;
3135        }
3136        return true;
3137}
3138
3139/*
3140 * Returns < 0 on error, 0 when more leaves are to be scanned.
3141 * Returns 1 when done.
3142 */
3143static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3144                              struct btrfs_path *path)
3145{
3146        struct btrfs_fs_info *fs_info = trans->fs_info;
3147        struct btrfs_key found;
3148        struct extent_buffer *scratch_leaf = NULL;
3149        struct ulist *roots = NULL;
3150        u64 num_bytes;
3151        bool done;
3152        int slot;
3153        int ret;
3154
3155        mutex_lock(&fs_info->qgroup_rescan_lock);
3156        ret = btrfs_search_slot_for_read(fs_info->extent_root,
3157                                         &fs_info->qgroup_rescan_progress,
3158                                         path, 1, 0);
3159
3160        btrfs_debug(fs_info,
3161                "current progress key (%llu %u %llu), search_slot ret %d",
3162                fs_info->qgroup_rescan_progress.objectid,
3163                fs_info->qgroup_rescan_progress.type,
3164                fs_info->qgroup_rescan_progress.offset, ret);
3165
3166        if (ret) {
3167                /*
3168                 * The rescan is about to end, we will not be scanning any
3169                 * further blocks. We cannot unset the RESCAN flag here, because
3170                 * we want to commit the transaction if everything went well.
3171                 * To make the live accounting work in this phase, we set our
3172                 * scan progress pointer such that every real extent objectid
3173                 * will be smaller.
3174                 */
3175                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3176                btrfs_release_path(path);
3177                mutex_unlock(&fs_info->qgroup_rescan_lock);
3178                return ret;
3179        }
3180        done = is_last_leaf(path);
3181
3182        btrfs_item_key_to_cpu(path->nodes[0], &found,
3183                              btrfs_header_nritems(path->nodes[0]) - 1);
3184        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3185
3186        scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3187        if (!scratch_leaf) {
3188                ret = -ENOMEM;
3189                mutex_unlock(&fs_info->qgroup_rescan_lock);
3190                goto out;
3191        }
3192        slot = path->slots[0];
3193        btrfs_release_path(path);
3194        mutex_unlock(&fs_info->qgroup_rescan_lock);
3195
3196        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3197                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3198                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3199                    found.type != BTRFS_METADATA_ITEM_KEY)
3200                        continue;
3201                if (found.type == BTRFS_METADATA_ITEM_KEY)
3202                        num_bytes = fs_info->nodesize;
3203                else
3204                        num_bytes = found.offset;
3205
3206                ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3207                                           &roots, false);
3208                if (ret < 0)
3209                        goto out;
3210                /* For rescan, just pass old_roots as NULL */
3211                ret = btrfs_qgroup_account_extent(trans, found.objectid,
3212                                                  num_bytes, NULL, roots);
3213                if (ret < 0)
3214                        goto out;
3215        }
3216out:
3217        if (scratch_leaf)
3218                free_extent_buffer(scratch_leaf);
3219
3220        if (done && !ret) {
3221                ret = 1;
3222                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3223        }
3224        return ret;
3225}
3226
3227static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3228{
3229        struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3230                                                     qgroup_rescan_work);
3231        struct btrfs_path *path;
3232        struct btrfs_trans_handle *trans = NULL;
3233        int err = -ENOMEM;
3234        int ret = 0;
3235
3236        path = btrfs_alloc_path();
3237        if (!path)
3238                goto out;
3239        /*
3240         * Rescan should only search the commit root; any later difference
3241         * is recorded by qgroup accounting anyway.
3242         */
3243        path->search_commit_root = 1;
3244        path->skip_locking = 1;
3245
3246        err = 0;
3247        while (!err && !btrfs_fs_closing(fs_info)) {
3248                trans = btrfs_start_transaction(fs_info->fs_root, 0);
3249                if (IS_ERR(trans)) {
3250                        err = PTR_ERR(trans);
3251                        break;
3252                }
3253                if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3254                        err = -EINTR;
3255                } else {
3256                        err = qgroup_rescan_leaf(trans, path);
3257                }
3258                if (err > 0)
3259                        btrfs_commit_transaction(trans);
3260                else
3261                        btrfs_end_transaction(trans);
3262        }
3263
3264out:
3265        btrfs_free_path(path);
3266
3267        mutex_lock(&fs_info->qgroup_rescan_lock);
3268        if (err > 0 &&
3269            fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3270                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3271        } else if (err < 0) {
3272                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3273        }
3274        mutex_unlock(&fs_info->qgroup_rescan_lock);
3275
3276        /*
3277         * Only update the status item, since the previous part has already
3278         * updated the qgroup info.
3279         */
3280        trans = btrfs_start_transaction(fs_info->quota_root, 1);
3281        if (IS_ERR(trans)) {
3282                err = PTR_ERR(trans);
3283                trans = NULL;
3284                btrfs_err(fs_info,
3285                          "failed to start transaction for status update: %d",
3286                          err);
3287        }
3288
3289        mutex_lock(&fs_info->qgroup_rescan_lock);
3290        if (!btrfs_fs_closing(fs_info))
3291                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3292        if (trans) {
3293                ret = update_qgroup_status_item(trans);
3294                if (ret < 0) {
3295                        err = ret;
3296                        btrfs_err(fs_info, "failed to update qgroup status: %d",
3297                                  err);
3298                }
3299        }
3300        fs_info->qgroup_rescan_running = false;
3301        complete_all(&fs_info->qgroup_rescan_completion);
3302        mutex_unlock(&fs_info->qgroup_rescan_lock);
3303
3304        if (!trans)
3305                return;
3306
3307        btrfs_end_transaction(trans);
3308
3309        if (btrfs_fs_closing(fs_info)) {
3310                btrfs_info(fs_info, "qgroup scan paused");
3311        } else if (err >= 0) {
3312                btrfs_info(fs_info, "qgroup scan completed%s",
3313                        err > 0 ? " (inconsistency flag cleared)" : "");
3314        } else {
3315                btrfs_err(fs_info, "qgroup scan failed with %d", err);
3316        }
3317}
3318
3319/*
3320 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3321 * memory required for the rescan context.
3322 */
3323static int
3324qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3325                   int init_flags)
3326{
3327        int ret = 0;
3328
3329        if (!init_flags) {
3330                /* we're resuming qgroup rescan at mount time */
3331                if (!(fs_info->qgroup_flags &
3332                      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3333                        btrfs_warn(fs_info,
3334                        "qgroup rescan init failed, qgroup rescan is not queued");
3335                        ret = -EINVAL;
3336                } else if (!(fs_info->qgroup_flags &
3337                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3338                        btrfs_warn(fs_info,
3339                        "qgroup rescan init failed, qgroup is not enabled");
3340                        ret = -EINVAL;
3341                }
3342
3343                if (ret)
3344                        return ret;
3345        }
3346
3347        mutex_lock(&fs_info->qgroup_rescan_lock);
3348
3349        if (init_flags) {
3350                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3351                        btrfs_warn(fs_info,
3352                                   "qgroup rescan is already in progress");
3353                        ret = -EINPROGRESS;
3354                } else if (!(fs_info->qgroup_flags &
3355                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3356                        btrfs_warn(fs_info,
3357                        "qgroup rescan init failed, qgroup is not enabled");
3358                        ret = -EINVAL;
3359                }
3360
3361                if (ret) {
3362                        mutex_unlock(&fs_info->qgroup_rescan_lock);
3363                        return ret;
3364                }
3365                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3366        }
3367
3368        memset(&fs_info->qgroup_rescan_progress, 0,
3369                sizeof(fs_info->qgroup_rescan_progress));
3370        fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3371        init_completion(&fs_info->qgroup_rescan_completion);
3372        mutex_unlock(&fs_info->qgroup_rescan_lock);
3373
3374        btrfs_init_work(&fs_info->qgroup_rescan_work,
3375                        btrfs_qgroup_rescan_worker, NULL, NULL);
3376        return 0;
3377}
3378
3379static void
3380qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3381{
3382        struct rb_node *n;
3383        struct btrfs_qgroup *qgroup;
3384
3385        spin_lock(&fs_info->qgroup_lock);
3386        /* clear all current qgroup tracking information */
3387        for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3388                qgroup = rb_entry(n, struct btrfs_qgroup, node);
3389                qgroup->rfer = 0;
3390                qgroup->rfer_cmpr = 0;
3391                qgroup->excl = 0;
3392                qgroup->excl_cmpr = 0;
3393                qgroup_dirty(fs_info, qgroup);
3394        }
3395        spin_unlock(&fs_info->qgroup_lock);
3396}
3397
3398int
3399btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3400{
3401        int ret = 0;
3402        struct btrfs_trans_handle *trans;
3403
3404        ret = qgroup_rescan_init(fs_info, 0, 1);
3405        if (ret)
3406                return ret;
3407
3408        /*
3409         * We have set the rescan_progress to 0, which means no more
3410         * delayed refs will be accounted by btrfs_qgroup_account_ref.
3411         * However, btrfs_qgroup_account_ref may already be past its call
3412         * to btrfs_find_all_roots, in which case it would still do the
3413         * accounting.
3414         * To solve this, we're committing the transaction, which will
3415         * ensure we run all delayed refs and only after that, we are
3416         * going to clear all tracking information for a clean start.
3417         */
3418
3419        trans = btrfs_join_transaction(fs_info->fs_root);
3420        if (IS_ERR(trans)) {
3421                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3422                return PTR_ERR(trans);
3423        }
3424        ret = btrfs_commit_transaction(trans);
3425        if (ret) {
3426                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3427                return ret;
3428        }
3429
3430        qgroup_rescan_zero_tracking(fs_info);
3431
3432        mutex_lock(&fs_info->qgroup_rescan_lock);
3433        fs_info->qgroup_rescan_running = true;
3434        btrfs_queue_work(fs_info->qgroup_rescan_workers,
3435                         &fs_info->qgroup_rescan_work);
3436        mutex_unlock(&fs_info->qgroup_rescan_lock);
3437
3438        return 0;
3439}
3440
3441int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3442                                     bool interruptible)
3443{
3444        int running;
3445        int ret = 0;
3446
3447        mutex_lock(&fs_info->qgroup_rescan_lock);
3448        running = fs_info->qgroup_rescan_running;
3449        mutex_unlock(&fs_info->qgroup_rescan_lock);
3450
3451        if (!running)
3452                return 0;
3453
3454        if (interruptible)
3455                ret = wait_for_completion_interruptible(
3456                                        &fs_info->qgroup_rescan_completion);
3457        else
3458                wait_for_completion(&fs_info->qgroup_rescan_completion);
3459
3460        return ret;
3461}
3462
3463/*
3464 * This is only called from open_ctree() where we're still single threaded,
3465 * thus locking is omitted here.
3466 */
3467void
3468btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3469{
3470        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3471                mutex_lock(&fs_info->qgroup_rescan_lock);
3472                fs_info->qgroup_rescan_running = true;
3473                btrfs_queue_work(fs_info->qgroup_rescan_workers,
3474                                 &fs_info->qgroup_rescan_work);
3475                mutex_unlock(&fs_info->qgroup_rescan_lock);
3476        }
3477}
3478
3479#define rbtree_iterate_from_safe(node, next, start)                             \
3480       for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
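
/*
 * A minimal usage sketch for the macro above: @next is sampled before the
 * body runs, so the body may safely rb_erase() @node (but must not erase
 * @next):
 *
 *	struct rb_node *node;
 *	struct rb_node *next;
 *
 *	rbtree_iterate_from_safe(node, next, rb_first(&some_root)) {
 *		if (should_drop(node))
 *			rb_erase(node, &some_root);
 *	}
 *
 * where some_root and should_drop() are hypothetical.
 */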
3481
3482static int qgroup_unreserve_range(struct btrfs_inode *inode,
3483                                  struct extent_changeset *reserved, u64 start,
3484                                  u64 len)
3485{
3486        struct rb_node *node;
3487        struct rb_node *next;
3488        struct ulist_node *entry;
3489        int ret = 0;
3490
3491        node = reserved->range_changed.root.rb_node;
3492        if (!node)
3493                return 0;
3494        while (node) {
3495                entry = rb_entry(node, struct ulist_node, rb_node);
3496                if (entry->val < start)
3497                        node = node->rb_right;
3498                else
3499                        node = node->rb_left;
3500        }
3501
3502        if (entry->val > start && rb_prev(&entry->rb_node))
3503                entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3504                                 rb_node);
3505
3506        rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3507                u64 entry_start;
3508                u64 entry_end;
3509                u64 entry_len;
3510                int clear_ret;
3511
3512                entry = rb_entry(node, struct ulist_node, rb_node);
3513                entry_start = entry->val;
3514                entry_end = entry->aux;
3515                entry_len = entry_end - entry_start + 1;
3516
3517                if (entry_start >= start + len)
3518                        break;
3519                if (entry_start + entry_len <= start)
3520                        continue;
3521                /*
3522                 * Now the entry overlaps [start, start + len), revert the
3523                 * EXTENT_QGROUP_RESERVED bit.
3524                 */
3525                clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3526                                              entry_end, EXTENT_QGROUP_RESERVED);
3527                if (!ret && clear_ret < 0)
3528                        ret = clear_ret;
3529
3530                ulist_del(&reserved->range_changed, entry->val, entry->aux);
3531                if (likely(reserved->bytes_changed >= entry_len)) {
3532                        reserved->bytes_changed -= entry_len;
3533                } else {
3534                        WARN_ON(1);
3535                        reserved->bytes_changed = 0;
3536                }
3537        }
3538
3539        return ret;
3540}
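
/*
 * Note the granularity of the helper above: any recorded range overlapping
 * [start, start + len) is cleared and deleted as a whole.  With made-up
 * numbers: for a recorded entry covering [4K, 16K), unreserving [8K, 12K)
 * clears EXTENT_QGROUP_RESERVED for the entire [4K, 16K) and subtracts all
 * 12K from reserved->bytes_changed.
 */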
3541
3542/*
3543 * Try to free some space for qgroup.
3544 *
3545 * For qgroup, there are only 3 ways to free qgroup space:
3546 * - Flush nodatacow write
3547 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
3548 *   In theory, we should only flush nodatacow inodes, but it's not yet
3549 *   possible, so we need to flush the whole root.
3550 *
3551 * - Wait for ordered extents
3552 *   When ordered extents are finished, their reserved metadata is finally
3553 *   converted to pertrans status, which can be freed by a later
3554 *   transaction commit.
3555 *
3556 * - Commit transaction
3557 *   This would free the meta pertrans space.
3558 *   In theory this shouldn't provide much space, but any extra qgroup
3559 *   space is better than nothing.
3560 */
3561static int try_flush_qgroup(struct btrfs_root *root)
3562{
3563        struct btrfs_trans_handle *trans;
3564        int ret;
3565        bool can_commit = true;
3566
3567        /*
3568         * We don't want to run flush again and again, so if there is a running
3569         * one, just wait for it to finish and then exit directly.
3570         */
3571        if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3572                wait_event(root->qgroup_flush_wait,
3573                        !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3574                return 0;
3575        }
3576
3577        /*
3578         * If current process holds a transaction, we shouldn't flush, as we
3579         * assume all space reservation happens before a transaction handle is
3580         * held.
3581         *
3582         * But there are cases like btrfs_delayed_item_reserve_metadata() where
3583         * we try to reserve space with one transaction handle already held.
3584         * In that case we can't commit transaction, but at least try to end it
3585         * and hope the started data writes can free some space.
3586         */
3587        if (current->journal_info &&
3588            current->journal_info != BTRFS_SEND_TRANS_STUB)
3589                can_commit = false;
3590
3591        ret = btrfs_start_delalloc_snapshot(root);
3592        if (ret < 0)
3593                goto out;
3594        btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3595
3596        trans = btrfs_join_transaction(root);
3597        if (IS_ERR(trans)) {
3598                ret = PTR_ERR(trans);
3599                goto out;
3600        }
3601
3602        if (can_commit)
3603                ret = btrfs_commit_transaction(trans);
3604        else
3605                ret = btrfs_end_transaction(trans);
3606out:
3607        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3608        wake_up(&root->qgroup_flush_wait);
3609        return ret;
3610}
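
/*
 * A sketch of the retry pattern the reservation wrappers below build
 * around this function:
 *
 *	ret = reserve();
 *	if (ret == -EDQUOT) {
 *		try_flush_qgroup(root);
 *		ret = reserve();
 *	}
 *
 * where reserve() stands for qgroup_reserve_data() or qgroup_reserve_meta();
 * a second -EDQUOT is returned to the caller, as all three ways of freeing
 * qgroup space listed above have been tried by then.
 */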
3611
3612static int qgroup_reserve_data(struct btrfs_inode *inode,
3613                        struct extent_changeset **reserved_ret, u64 start,
3614                        u64 len)
3615{
3616        struct btrfs_root *root = inode->root;
3617        struct extent_changeset *reserved;
3618        bool new_reserved = false;
3619        u64 orig_reserved;
3620        u64 to_reserve;
3621        int ret;
3622
3623        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3624            !is_fstree(root->root_key.objectid) || len == 0)
3625                return 0;
3626
3627        /* @reserved parameter is mandatory for qgroup */
3628        if (WARN_ON(!reserved_ret))
3629                return -EINVAL;
3630        if (!*reserved_ret) {
3631                new_reserved = true;
3632                *reserved_ret = extent_changeset_alloc();
3633                if (!*reserved_ret)
3634                        return -ENOMEM;
3635        }
3636        reserved = *reserved_ret;
3637        /* Record already reserved space */
3638        orig_reserved = reserved->bytes_changed;
3639        ret = set_record_extent_bits(&inode->io_tree, start,
3640                        start + len - 1, EXTENT_QGROUP_RESERVED, reserved);
3641
3642        /* Newly reserved space */
3643        to_reserve = reserved->bytes_changed - orig_reserved;
3644        trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
3645                                        to_reserve, QGROUP_RESERVE);
3646        if (ret < 0)
3647                goto out;
3648        ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3649        if (ret < 0)
3650                goto cleanup;
3651
3652        return ret;
3653
3654cleanup:
3655        qgroup_unreserve_range(inode, reserved, start, len);
3656out:
3657        if (new_reserved) {
3658                extent_changeset_release(reserved);
3659                kfree(reserved);
3660                *reserved_ret = NULL;
3661        }
3662        return ret;
3663}
3664
3665/*
3666 * Reserve qgroup space for range [start, start + len).
3667 *
3668 * This function will either reserve space from related qgroups or do nothing
3669 * if the range is already reserved.
3670 *
3671 * Return 0 for successful reservation
3672 * Return <0 for error (including -EDQUOT)
3673 *
3674 * NOTE: This function may sleep for memory allocation, dirty page flushing and
3675 *       transaction commit. So the caller should not hold any dirty page locked.
3676 */
3677int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3678                        struct extent_changeset **reserved_ret, u64 start,
3679                        u64 len)
3680{
3681        int ret;
3682
3683        ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3684        if (ret <= 0 && ret != -EDQUOT)
3685                return ret;
3686
3687        ret = try_flush_qgroup(inode->root);
3688        if (ret < 0)
3689                return ret;
3690        return qgroup_reserve_data(inode, reserved_ret, start, len);
3691}
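
/*
 * A minimal caller sketch for the function above (do_the_write() is
 * hypothetical, the other symbols are real):
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, pos, nbytes);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_the_write(inode, pos, nbytes);
 *	if (ret < 0)
 *		btrfs_qgroup_free_data(inode, reserved, pos, nbytes);
 *	extent_changeset_free(reserved);
 *	return ret;
 *
 * On success the reservation is kept; once the data hits disk,
 * btrfs_qgroup_release_data() clears the io_tree bit while the qgroup
 * charge remains.
 */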
3692
3693/* Free ranges specified by @reserved, normally in error paths */
3694static int qgroup_free_reserved_data(struct btrfs_inode *inode,
3695                        struct extent_changeset *reserved, u64 start, u64 len)
3696{
3697        struct btrfs_root *root = inode->root;
3698        struct ulist_node *unode;
3699        struct ulist_iterator uiter;
3700        struct extent_changeset changeset;
3701        int freed = 0;
3702        int ret;
3703
3704        extent_changeset_init(&changeset);
3705        len = round_up(start + len, root->fs_info->sectorsize);
3706        start = round_down(start, root->fs_info->sectorsize);
3707
3708        ULIST_ITER_INIT(&uiter);
3709        while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3710                u64 range_start = unode->val;
3711                /* unode->aux is the inclusive end */
3712                u64 range_len = unode->aux - range_start + 1;
3713                u64 free_start;
3714                u64 free_len;
3715
3716                extent_changeset_release(&changeset);
3717
3718                /* Only free the part that falls in [start, start + len) */
3719                if (range_start >= start + len ||
3720                    range_start + range_len <= start)
3721                        continue;
3722                free_start = max(range_start, start);
3723                free_len = min(start + len, range_start + range_len) -
3724                           free_start;
3725                /*
3726                 * TODO: Also modify reserved->range_changed to reflect
3727                 * the modification.
3728                 *
3729                 * However, as long as we free qgroup reserved space according
3730                 * to EXTENT_QGROUP_RESERVED, we won't double free.
3731                 * So there is no need to rush.
3732                 */
3733                ret = clear_record_extent_bits(&inode->io_tree, free_start,
3734                                free_start + free_len - 1,
3735                                EXTENT_QGROUP_RESERVED, &changeset);
3736                if (ret < 0)
3737                        goto out;
3738                freed += changeset.bytes_changed;
3739        }
3740        btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3741                                  BTRFS_QGROUP_RSV_DATA);
3742        ret = freed;
3743out:
3744        extent_changeset_release(&changeset);
3745        return ret;
3746}
3747
3748static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
3749                        struct extent_changeset *reserved, u64 start, u64 len,
3750                        int free)
3751{
3752        struct extent_changeset changeset;
3753        int trace_op = QGROUP_RELEASE;
3754        int ret;
3755
3756        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
3757                return 0;
3758
3759        /* In the release case, we shouldn't have @reserved */
3760        WARN_ON(!free && reserved);
3761        if (free && reserved)
3762                return qgroup_free_reserved_data(inode, reserved, start, len);
3763        extent_changeset_init(&changeset);
3764        ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
3765                                       EXTENT_QGROUP_RESERVED, &changeset);
3766        if (ret < 0)
3767                goto out;
3768
3769        if (free)
3770                trace_op = QGROUP_FREE;
3771        trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
3772                                        changeset.bytes_changed, trace_op);
3773        if (free)
3774                btrfs_qgroup_free_refroot(inode->root->fs_info,
3775                                inode->root->root_key.objectid,
3776                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3777        ret = changeset.bytes_changed;
3778out:
3779        extent_changeset_release(&changeset);
3780        return ret;
3781}
3782
3783/*
3784 * Free a reserved space range from io_tree and related qgroups
3785 *
3786 * Should be called when a range of pages gets invalidated before reaching
3787 * disk, or for error cleanup.
3788 * If @reserved is given, only the reserved range in [@start, @start + @len)
3789 * will be freed.
3790 *
3791 * For data written to disk, use btrfs_qgroup_release_data().
3792 *
3793 * NOTE: This function may sleep for memory allocation.
3794 */
3795int btrfs_qgroup_free_data(struct btrfs_inode *inode,
3796                        struct extent_changeset *reserved, u64 start, u64 len)
3797{
3798        return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3799}
3800
3801/*
3802 * Release a reserved space range from io_tree only.
3803 *
3804 * Should be called when a range of pages gets written to disk and the
3805 * corresponding FILE_EXTENT item is inserted into the subvolume tree.
3806 *
3807 * Since the qgroup accounting framework only updates qgroup numbers at
3808 * commit_transaction() time, the reserved space shouldn't be freed from the
3809 * related qgroups.
3810 *
3811 * But we should release the range from the io_tree, to allow further writes
3812 * to be COWed.
3813 *
3814 * NOTE: This function may sleep for memory allocation.
3815 */
3816int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
3817{
3818        return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
3819}
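
/*
 * To summarize the two exit paths of a data reservation (sketch):
 *
 *	btrfs_qgroup_release_data(inode, start, len);
 *		data reached disk: keep the qgroup charge, only clear
 *		EXTENT_QGROUP_RESERVED so the range can be rewritten
 *
 *	btrfs_qgroup_free_data(inode, reserved, start, len);
 *		data never reached disk: clear the bit and give the bytes
 *		back to the qgroups via btrfs_qgroup_free_refroot()
 */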
3820
3821static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3822                              enum btrfs_qgroup_rsv_type type)
3823{
3824        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3825            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3826                return;
3827        if (num_bytes == 0)
3828                return;
3829
3830        spin_lock(&root->qgroup_meta_rsv_lock);
3831        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
3832                root->qgroup_meta_rsv_prealloc += num_bytes;
3833        else
3834                root->qgroup_meta_rsv_pertrans += num_bytes;
3835        spin_unlock(&root->qgroup_meta_rsv_lock);
3836}
3837
3838static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3839                             enum btrfs_qgroup_rsv_type type)
3840{
3841        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3842            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3843                return 0;
3844        if (num_bytes == 0)
3845                return 0;
3846
3847        spin_lock(&root->qgroup_meta_rsv_lock);
3848        if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
3849                num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
3850                                  num_bytes);
3851                root->qgroup_meta_rsv_prealloc -= num_bytes;
3852        } else {
3853                num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
3854                                  num_bytes);
3855                root->qgroup_meta_rsv_pertrans -= num_bytes;
3856        }
3857        spin_unlock(&root->qgroup_meta_rsv_lock);
3858        return num_bytes;
3859}
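
/*
 * For illustration, with made-up numbers: if root->qgroup_meta_rsv_prealloc
 * is 8192 and a caller asks to subtract 16384, the min_t() clamp above makes
 * sub_root_meta_rsv() return 8192 and zero the counter, so callers only ever
 * free what was actually recorded as reserved.
 */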
3860
3861static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3862                                enum btrfs_qgroup_rsv_type type, bool enforce)
3863{
3864        struct btrfs_fs_info *fs_info = root->fs_info;
3865        int ret;
3866
3867        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3868            !is_fstree(root->root_key.objectid) || num_bytes == 0)
3869                return 0;
3870
3871        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3872        trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
3873        ret = qgroup_reserve(root, num_bytes, enforce, type);
3874        if (ret < 0)
3875                return ret;
3876        /*
3877         * Record what we have reserved into root, to avoid an underflow
3878         * across a quota disable->enable cycle.
3879         *
3880         * In that case we may try to free space we haven't reserved
3881         * (since quota was disabled), so recording the reservation in the
3882         * root ensures a later release won't underflow this number.
3883         */
3884        add_root_meta_rsv(root, num_bytes, type);
3885        return ret;
3886}
3887
3888int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3889                                enum btrfs_qgroup_rsv_type type, bool enforce)
3890{
3891        int ret;
3892
3893        ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
3894        if (ret <= 0 && ret != -EDQUOT)
3895                return ret;
3896
3897        ret = try_flush_qgroup(root);
3898        if (ret < 0)
3899                return ret;
3900        return qgroup_reserve_meta(root, num_bytes, type, enforce);
3901}
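
/*
 * A minimal sketch of a metadata reservation round trip (operation_failed
 * is a hypothetical condition; nbytes is assumed nodesize aligned):
 *
 *	ret = __btrfs_qgroup_reserve_meta(root, nbytes,
 *					  BTRFS_QGROUP_RSV_META_PREALLOC, true);
 *	if (ret < 0)
 *		return ret;
 *	if (operation_failed)
 *		__btrfs_qgroup_free_meta(root, nbytes,
 *					 BTRFS_QGROUP_RSV_META_PREALLOC);
 *	else
 *		btrfs_qgroup_convert_reserved_meta(root, nbytes);
 */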
3902
3903void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
3904{
3905        struct btrfs_fs_info *fs_info = root->fs_info;
3906
3907        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3908            !is_fstree(root->root_key.objectid))
3909                return;
3910
3911        /* TODO: Update trace point to handle such free */
3912        trace_qgroup_meta_free_all_pertrans(root);
3913        /* Special value -1 means to free all reserved space */
3914        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
3915                                  BTRFS_QGROUP_RSV_META_PERTRANS);
3916}
3917
3918void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
3919                              enum btrfs_qgroup_rsv_type type)
3920{
3921        struct btrfs_fs_info *fs_info = root->fs_info;
3922
3923        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3924            !is_fstree(root->root_key.objectid))
3925                return;
3926
3927        /*
3928         * Reservation for META_PREALLOC can happen before quota is enabled,
3929         * which can lead to underflow.
3930         * Here, ensure we only free what we really have reserved.
3931         */
3932        num_bytes = sub_root_meta_rsv(root, num_bytes, type);
3933        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3934        trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
3935        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
3936                                  num_bytes, type);
3937}
3938
3939static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
3940                                int num_bytes)
3941{
3942        struct btrfs_qgroup *qgroup;
3943        struct ulist_node *unode;
3944        struct ulist_iterator uiter;
3945        int ret = 0;
3946
3947        if (num_bytes == 0)
3948                return;
3949        if (!fs_info->quota_root)
3950                return;
3951
3952        spin_lock(&fs_info->qgroup_lock);
3953        qgroup = find_qgroup_rb(fs_info, ref_root);
3954        if (!qgroup)
3955                goto out;
3956        ulist_reinit(fs_info->qgroup_ulist);
3957        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3958                       qgroup_to_aux(qgroup), GFP_ATOMIC);
3959        if (ret < 0)
3960                goto out;
3961        ULIST_ITER_INIT(&uiter);
3962        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3963                struct btrfs_qgroup *qg;
3964                struct btrfs_qgroup_list *glist;
3965
3966                qg = unode_aux_to_qgroup(unode);
3967
3968                qgroup_rsv_release(fs_info, qg, num_bytes,
3969                                BTRFS_QGROUP_RSV_META_PREALLOC);
3970                qgroup_rsv_add(fs_info, qg, num_bytes,
3971                                BTRFS_QGROUP_RSV_META_PERTRANS);
3972                list_for_each_entry(glist, &qg->groups, next_group) {
3973                        ret = ulist_add(fs_info->qgroup_ulist,
3974                                        glist->group->qgroupid,
3975                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3976                        if (ret < 0)
3977                                goto out;
3978                }
3979        }
3980out:
3981        spin_unlock(&fs_info->qgroup_lock);
3982}
3983
3984void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
3985{
3986        struct btrfs_fs_info *fs_info = root->fs_info;
3987
3988        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3989            !is_fstree(root->root_key.objectid))
3990                return;
3991        /* Same as btrfs_qgroup_free_meta_prealloc() */
3992        num_bytes = sub_root_meta_rsv(root, num_bytes,
3993                                      BTRFS_QGROUP_RSV_META_PREALLOC);
3994        trace_qgroup_meta_convert(root, num_bytes);
3995        qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
3996}
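
/*
 * The full lifecycle of a META_PREALLOC reservation in this file is thus
 * (sketch):
 *
 *	reserve:  __btrfs_qgroup_reserve_meta(..., META_PREALLOC, ...)
 *	success:  btrfs_qgroup_convert_reserved_meta()  PREALLOC -> PERTRANS
 *	failure:  __btrfs_qgroup_free_meta(..., META_PREALLOC)
 *	commit:   btrfs_qgroup_free_meta_all_pertrans() drops all PERTRANS
 */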
3997
3998/*
3999 * Check for leaked qgroup reserved space, normally at inode destroy
4000 * time.
4001 */
4002void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
4003{
4004        struct extent_changeset changeset;
4005        struct ulist_node *unode;
4006        struct ulist_iterator iter;
4007        int ret;
4008
4009        extent_changeset_init(&changeset);
4010        ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
4011                        EXTENT_QGROUP_RESERVED, &changeset);
4012
4013        WARN_ON(ret < 0);
4014        if (WARN_ON(changeset.bytes_changed)) {
4015                ULIST_ITER_INIT(&iter);
4016                while ((unode = ulist_next(&changeset.range_changed, &iter))) {
4017                        btrfs_warn(inode->root->fs_info,
4018                "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4019                                btrfs_ino(inode), unode->val, unode->aux);
4020                }
4021                btrfs_qgroup_free_refroot(inode->root->fs_info,
4022                                inode->root->root_key.objectid,
4023                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
4024
4025        }
4026        extent_changeset_release(&changeset);
4027}
4028
4029void btrfs_qgroup_init_swapped_blocks(
4030        struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4031{
4032        int i;
4033
4034        spin_lock_init(&swapped_blocks->lock);
4035        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4036                swapped_blocks->blocks[i] = RB_ROOT;
4037        swapped_blocks->swapped = false;
4038}
4039
4040/*
4041 * Delete all swapped block records of @root.
4042 * Every record here means we skipped a full subtree scan for qgroup.
4043 *
4044 * Gets called when committing one transaction.
4045 */
4046void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4047{
4048        struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4049        int i;
4050
4051        swapped_blocks = &root->swapped_blocks;
4052
4053        spin_lock(&swapped_blocks->lock);
4054        if (!swapped_blocks->swapped)
4055                goto out;
4056        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4057                struct rb_root *cur_root = &swapped_blocks->blocks[i];
4058                struct btrfs_qgroup_swapped_block *entry;
4059                struct btrfs_qgroup_swapped_block *next;
4060
4061                rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4062                                                     node)
4063                        kfree(entry);
4064                swapped_blocks->blocks[i] = RB_ROOT;
4065        }
4066        swapped_blocks->swapped = false;
4067out:
4068        spin_unlock(&swapped_blocks->lock);
4069}
4070
4071/*
4072 * Add a swapped subtree root record into @subvol_root.
4073 *
4074 * @subvol_root:        tree root of the subvolume tree getting swapped
4075 * @bg:                 block group under balance
4076 * @subvol_parent/slot: pointer to the subtree root in subvolume tree
4077 * @reloc_parent/slot:  pointer to the subtree root in reloc tree
4078 *                      BOTH POINTERS ARE BEFORE TREE SWAP
4079 * @last_snapshot:      last snapshot generation of the subvolume tree
4080 */
4081int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4082                struct btrfs_root *subvol_root,
4083                struct btrfs_block_group *bg,
4084                struct extent_buffer *subvol_parent, int subvol_slot,
4085                struct extent_buffer *reloc_parent, int reloc_slot,
4086                u64 last_snapshot)
4087{
4088        struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4089        struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4090        struct btrfs_qgroup_swapped_block *block;
4091        struct rb_node **cur;
4092        struct rb_node *parent = NULL;
4093        int level = btrfs_header_level(subvol_parent) - 1;
4094        int ret = 0;
4095
4096        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4097                return 0;
4098
4099        if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4100            btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4101                btrfs_err_rl(fs_info,
4102                "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4103                        __func__,
4104                        btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4105                        btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4106                return -EUCLEAN;
4107        }
4108
4109        block = kmalloc(sizeof(*block), GFP_NOFS);
4110        if (!block) {
4111                ret = -ENOMEM;
4112                goto out;
4113        }
4114
4115        /*
4116         * @reloc_parent/slot is still before swap, while @block is going to
4117         * record the bytenr after swap, so we do the swap here.
4118         */
4119        block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4120        block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4121                                                             reloc_slot);
4122        block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4123        block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4124                                                            subvol_slot);
4125        block->last_snapshot = last_snapshot;
4126        block->level = level;
4127
4128        /*
4129         * If we have bg == NULL, we're called from btrfs_recover_relocation(),
4130         * where no one else can modify tree blocks, thus the qgroup numbers
4131         * will not change no matter the value of trace_leaf.
4132         */
4133        if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4134                block->trace_leaf = true;
4135        else
4136                block->trace_leaf = false;
4137        btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4138
4139        /* Insert @block into @blocks */
4140        spin_lock(&blocks->lock);
4141        cur = &blocks->blocks[level].rb_node;
4142        while (*cur) {
4143                struct btrfs_qgroup_swapped_block *entry;
4144
4145                parent = *cur;
4146                entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4147                                 node);
4148
4149                if (entry->subvol_bytenr < block->subvol_bytenr) {
4150                        cur = &(*cur)->rb_left;
4151                } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4152                        cur = &(*cur)->rb_right;
4153                } else {
4154                        if (entry->subvol_generation !=
4155                                        block->subvol_generation ||
4156                            entry->reloc_bytenr != block->reloc_bytenr ||
4157                            entry->reloc_generation !=
4158                                        block->reloc_generation) {
4159                                /*
4160                                 * Duplicated but mismatched entry found.
4161                                 * Shouldn't happen.
4162                                 *
4163                                 * Marking qgroup inconsistent should be enough
4164                                 * for end users.
4165                                 */
4166                                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4167                                ret = -EEXIST;
4168                        }
4169                        kfree(block);
4170                        goto out_unlock;
4171                }
4172        }
4173        rb_link_node(&block->node, parent, cur);
4174        rb_insert_color(&block->node, &blocks->blocks[level]);
4175        blocks->swapped = true;
4176out_unlock:
4177        spin_unlock(&blocks->lock);
4178out:
4179        if (ret < 0)
4180                fs_info->qgroup_flags |=
4181                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4182        return ret;
4183}
4184
4185/*
4186 * Check if the tree block is a subtree root, and if so do the needed
4187 * delayed subtree trace for qgroup.
4188 *
4189 * This is called during btrfs_cow_block().
4190 */
4191int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4192                                         struct btrfs_root *root,
4193                                         struct extent_buffer *subvol_eb)
4194{
4195        struct btrfs_fs_info *fs_info = root->fs_info;
4196        struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4197        struct btrfs_qgroup_swapped_block *block;
4198        struct extent_buffer *reloc_eb = NULL;
4199        struct rb_node *node;
4200        bool found = false;
4201        bool swapped = false;
4202        int level = btrfs_header_level(subvol_eb);
4203        int ret = 0;
4204        int i;
4205
4206        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4207                return 0;
4208        if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4209                return 0;
4210
4211        spin_lock(&blocks->lock);
4212        if (!blocks->swapped) {
4213                spin_unlock(&blocks->lock);
4214                return 0;
4215        }
4216        node = blocks->blocks[level].rb_node;
4217
4218        while (node) {
4219                block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4220                if (block->subvol_bytenr < subvol_eb->start) {
4221                        node = node->rb_left;
4222                } else if (block->subvol_bytenr > subvol_eb->start) {
4223                        node = node->rb_right;
4224                } else {
4225                        found = true;
4226                        break;
4227                }
4228        }
4229        if (!found) {
4230                spin_unlock(&blocks->lock);
4231                goto out;
4232        }
4233        /* Found one, remove it from @blocks first and update blocks->swapped */
4234        rb_erase(&block->node, &blocks->blocks[level]);
4235        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4236                if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4237                        swapped = true;
4238                        break;
4239                }
4240        }
4241        blocks->swapped = swapped;
4242        spin_unlock(&blocks->lock);
4243
4244        /* Read out reloc subtree root */
4245        reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
4246                                   block->reloc_generation, block->level,
4247                                   &block->first_key);
4248        if (IS_ERR(reloc_eb)) {
4249                ret = PTR_ERR(reloc_eb);
4250                reloc_eb = NULL;
4251                goto free_out;
4252        }
4253        if (!extent_buffer_uptodate(reloc_eb)) {
4254                ret = -EIO;
4255                goto free_out;
4256        }
4257
4258        ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4259                        block->last_snapshot, block->trace_leaf);
4260free_out:
4261        kfree(block);
4262        free_extent_buffer(reloc_eb);
4263out:
4264        if (ret < 0) {
4265                btrfs_err_rl(fs_info,
4266                             "failed to account subtree at bytenr %llu: %d",
4267                             subvol_eb->start, ret);
4268                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4269        }
4270        return ret;
4271}
4272
4273void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4274{
4275        struct btrfs_qgroup_extent_record *entry;
4276        struct btrfs_qgroup_extent_record *next;
4277        struct rb_root *root;
4278
4279        root = &trans->delayed_refs.dirty_extent_root;
4280        rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4281                ulist_free(entry->old_roots);
4282                kfree(entry);
4283        }
4284}
4285