linux/fs/btrfs/qgroup.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sizes.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"


/* TODO XXX FIXME
 *  - subvol delete -> delete when ref goes to 0? delete limits also?
 *  - reorganize keys
 *  - compressed
 *  - sync
 *  - copy also limits on subvol creation
 *  - limit
 *  - caches for ulists
 *  - performance benchmarks
 *  - check all ioctl parameters
 */
/*
 * Helpers to access qgroup reservations
 *
 * Callers should ensure the lock context and type are valid
 */

static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
        u64 ret = 0;
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                ret += qgroup->rsv.values[i];

        return ret;
}
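
/*
 * Worked example for the helper above (illustrative only, not part of the
 * original file): if a qgroup currently holds
 *
 *      rsv.values[BTRFS_QGROUP_RSV_DATA]          = SZ_64K
 *      rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] = SZ_16K
 *      rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] = SZ_16K
 *
 * then qgroup_rsv_total() returns 96KiB, the sum that limit checks elsewhere
 * in the qgroup code weigh against the configured limits.
 */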

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
        if (type == BTRFS_QGROUP_RSV_DATA)
                return "data";
        if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
                return "meta_pertrans";
        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
                return "meta_prealloc";
        return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
                           struct btrfs_qgroup *qgroup, u64 num_bytes,
                           enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
        qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
                               struct btrfs_qgroup *qgroup, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
        if (qgroup->rsv.values[type] >= num_bytes) {
                qgroup->rsv.values[type] -= num_bytes;
                return;
        }
#ifdef CONFIG_BTRFS_DEBUG
        WARN_RATELIMIT(1,
                "qgroup %llu %s reserved space underflow, have %llu to free %llu",
                qgroup->qgroupid, qgroup_rsv_type_str(type),
                qgroup->rsv.values[type], num_bytes);
#endif
        qgroup->rsv.values[type] = 0;
}
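
/*
 * Usage sketch (illustrative, assuming the caller holds the appropriate
 * lock as the helpers above require): reservations are symmetric, so every
 * qgroup_rsv_add() should eventually be paired with a qgroup_rsv_release()
 * of the same type:
 *
 *      qgroup_rsv_add(fs_info, qgroup, SZ_4K, BTRFS_QGROUP_RSV_DATA);
 *      ... the extent is written, or the operation aborts ...
 *      qgroup_rsv_release(fs_info, qgroup, SZ_4K, BTRFS_QGROUP_RSV_DATA);
 *
 * Releasing more than was reserved does not wrap around: the counter is
 * clamped to 0 and, under CONFIG_BTRFS_DEBUG, a rate-limited underflow
 * warning is printed.
 */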

static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
                                     struct btrfs_qgroup *dest,
                                     struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
                                         struct btrfs_qgroup *dest,
                                         struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}

static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->old_refcnt < seq)
                qg->old_refcnt = seq;
        qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->new_refcnt < seq)
                qg->new_refcnt = seq;
        qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        if (qg->old_refcnt < seq)
                return 0;
        return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        if (qg->new_refcnt < seq)
                return 0;
        return qg->new_refcnt - seq;
}
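
/*
 * Worked example of the seq-based refcounts above (illustrative only): the
 * counters are never reset between accounting rounds.  Instead each round
 * picks a fresh sequence number, and any stored value below that seq is
 * treated as zero.  With seq = 100:
 *
 *      btrfs_qgroup_update_old_refcnt(qg, 100, 1);     // old_refcnt -> 101
 *      btrfs_qgroup_update_old_refcnt(qg, 100, 1);     // old_refcnt -> 102
 *      btrfs_qgroup_get_old_refcnt(qg, 100);           // returns 2
 *
 * A later round with seq = 200 starts from an effective count of 0 again
 * without having to touch the stored value first.
 */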

/*
 * Glue structure to represent a relation between two qgroups; one is
 * allocated per parent/member pair and linked into both qgroups' lists.
 */
struct btrfs_qgroup_list {
        struct list_head next_group;
        struct list_head next_member;
        struct btrfs_qgroup *group;
        struct btrfs_qgroup *member;
};
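
/*
 * Illustration (hypothetical IDs, not code from this file): for a relation
 * "qgroup 1/100 contains subvolume qgroup 0/257", one btrfs_qgroup_list is
 * allocated with ->group pointing at 1/100 and ->member at 0/257.  Its
 * next_group field is linked into the member's ->groups list and its
 * next_member field into the parent's ->members list, so the same
 * allocation is reachable from both sides:
 *
 *      list_for_each_entry(glist, &member->groups, next_group)
 *              // each glist->group is a direct parent of member
 *
 * The hierarchy walkers below rely on exactly this traversal.
 */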

static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
        return (u64)(uintptr_t)qg;
}

static inline struct btrfs_qgroup *unode_aux_to_qgroup(struct ulist_node *n)
{
        return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}
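
/*
 * The two casts above form a round trip: a qgroup pointer is stuffed into
 * the u64 aux value of a ulist node when walking hierarchies, e.g. (sketch,
 * mirroring the walkers further down in this file):
 *
 *      ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
 *      ...
 *      qg = unode_aux_to_qgroup(unode);        // back to the pointer
 *
 * This only works because aux values are stored, never interpreted, by the
 * ulist code itself.
 */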

static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
                                           u64 qgroupid)
{
        struct rb_node *n = fs_info->qgroup_tree.rb_node;
        struct btrfs_qgroup *qgroup;

        while (n) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                if (qgroup->qgroupid < qgroupid)
                        n = n->rb_left;
                else if (qgroup->qgroupid > qgroupid)
                        n = n->rb_right;
                else
                        return qgroup;
        }
        return NULL;
}

/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
                                          u64 qgroupid)
{
        struct rb_node **p = &fs_info->qgroup_tree.rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_qgroup *qgroup;

        while (*p) {
                parent = *p;
                qgroup = rb_entry(parent, struct btrfs_qgroup, node);

                if (qgroup->qgroupid < qgroupid)
                        p = &(*p)->rb_left;
                else if (qgroup->qgroupid > qgroupid)
                        p = &(*p)->rb_right;
                else
                        return qgroup;
        }

        qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
        if (!qgroup)
                return ERR_PTR(-ENOMEM);

        qgroup->qgroupid = qgroupid;
        INIT_LIST_HEAD(&qgroup->groups);
        INIT_LIST_HEAD(&qgroup->members);
        INIT_LIST_HEAD(&qgroup->dirty);

        rb_link_node(&qgroup->node, parent, p);
        rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

        return qgroup;
}

static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
{
        struct btrfs_qgroup_list *list;

        list_del(&qgroup->dirty);
        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
                                        struct btrfs_qgroup_list, next_group);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }

        while (!list_empty(&qgroup->members)) {
                list = list_first_entry(&qgroup->members,
                                        struct btrfs_qgroup_list, next_member);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }
        kfree(qgroup);
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
        struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

        if (!qgroup)
                return -ENOENT;

        rb_erase(&qgroup->node, &fs_info->qgroup_tree);
        __del_qgroup_rb(qgroup);
        return 0;
}

/* must be called with qgroup_lock held */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *member;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup_list *list;

        member = find_qgroup_rb(fs_info, memberid);
        parent = find_qgroup_rb(fs_info, parentid);
        if (!member || !parent)
                return -ENOENT;

        list = kzalloc(sizeof(*list), GFP_ATOMIC);
        if (!list)
                return -ENOMEM;

        list->group = parent;
        list->member = member;
        list_add_tail(&list->next_group, &member->groups);
        list_add_tail(&list->next_member, &parent->members);

        return 0;
}

/* must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *member;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup_list *list;

        member = find_qgroup_rb(fs_info, memberid);
        parent = find_qgroup_rb(fs_info, parentid);
        if (!member || !parent)
                return -ENOENT;

        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        list_del(&list->next_group);
                        list_del(&list->next_member);
                        kfree(list);
                        return 0;
                }
        }
        return -ENOENT;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
                               u64 rfer, u64 excl)
{
        struct btrfs_qgroup *qgroup;

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup)
                return -EINVAL;
        if (qgroup->rfer != rfer || qgroup->excl != excl)
                return -EINVAL;
        return 0;
}
#endif

/*
 * The full config is read in one go; this is only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path = NULL;
        struct extent_buffer *l;
        int slot;
        int ret = 0;
        u64 flags = 0;
        u64 rescan_progress = 0;

        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return 0;

        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
        if (!fs_info->qgroup_ulist) {
                ret = -ENOMEM;
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /* default this to quota off, in case no status key is found */
        fs_info->qgroup_flags = 0;

        /*
         * pass 1: read status, all qgroup infos and limits
         */
        key.objectid = 0;
        key.type = 0;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
        if (ret)
                goto out;

        while (1) {
                struct btrfs_qgroup *qgroup;

                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
                        struct btrfs_qgroup_status_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_status_item);

                        if (btrfs_qgroup_status_version(l, ptr) !=
                            BTRFS_QGROUP_STATUS_VERSION) {
                                btrfs_err(fs_info,
                                 "old qgroup version, quota disabled");
                                goto out;
                        }
                        if (btrfs_qgroup_status_generation(l, ptr) !=
                            fs_info->generation) {
                                flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                                btrfs_err(fs_info,
                                        "qgroup generation mismatch, marked as inconsistent");
                        }
                        fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
                                                                          ptr);
                        rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
                        goto next1;
                }

                if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
                    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
                        goto next1;

                qgroup = find_qgroup_rb(fs_info, found_key.offset);
                if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
                    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
                        btrfs_err(fs_info, "inconsistent qgroup config");
                        flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                }
                if (!qgroup) {
                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
                        if (IS_ERR(qgroup)) {
                                ret = PTR_ERR(qgroup);
                                goto out;
                        }
                }
                switch (found_key.type) {
                case BTRFS_QGROUP_INFO_KEY: {
                        struct btrfs_qgroup_info_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_info_item);
                        qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
                        qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
                        qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
                        qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
                        /* generation currently unused */
                        break;
                }
                case BTRFS_QGROUP_LIMIT_KEY: {
                        struct btrfs_qgroup_limit_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_limit_item);
                        qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
                        qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
                        qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
                        qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
                        qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
                        break;
                }
                }
next1:
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
        btrfs_release_path(path);

        /*
         * pass 2: read all qgroup relations
         */
        key.objectid = 0;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
        if (ret)
                goto out;
        while (1) {
                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
                        goto next2;

                if (found_key.objectid > found_key.offset) {
                        /* parent <- member, not needed to build config */
                        /* FIXME should we omit the key completely? */
                        goto next2;
                }

                ret = add_relation_rb(fs_info, found_key.objectid,
                                      found_key.offset);
                if (ret == -ENOENT) {
                        btrfs_warn(fs_info,
                                "orphan qgroup relation 0x%llx->0x%llx",
                                found_key.objectid, found_key.offset);
                        ret = 0;        /* ignore the error */
                }
                if (ret)
                        goto out;
next2:
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
out:
        fs_info->qgroup_flags |= flags;
        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
                clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
                 ret >= 0)
                ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
        btrfs_free_path(path);

        if (ret < 0) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
        }

        return ret < 0 ? ret : 0;
}
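
/*
 * Summary of the on-disk quota tree layout read above (derived from this
 * function and the item update helpers below):
 *
 *      (0,      BTRFS_QGROUP_STATUS_KEY,   0)        one status item
 *      (0,      BTRFS_QGROUP_INFO_KEY,     qgroupid) usage counters
 *      (0,      BTRFS_QGROUP_LIMIT_KEY,    qgroupid) limits
 *      (member, BTRFS_QGROUP_RELATION_KEY, parent)   plus the mirrored
 *      (parent, BTRFS_QGROUP_RELATION_KEY, member)   item
 *
 * Pass 2 only consumes the member->parent direction (objectid < offset);
 * the mirrored item is skipped, see the FIXME above.
 */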

/*
 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable();
 * the first two are single-threaded paths. And for the third one, we have set
 * quota_root to be NULL with qgroup_lock held beforehand, so it is safe to
 * clean up the in-memory structures without taking the qgroup_lock.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct rb_node *n;
        struct btrfs_qgroup *qgroup;

        while ((n = rb_first(&fs_info->qgroup_tree))) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                rb_erase(n, &fs_info->qgroup_tree);
                __del_qgroup_rb(qgroup);
        }
        /*
         * We call btrfs_free_qgroup_config() both when unmounting the
         * filesystem and when disabling quota, so set qgroup_ulist to NULL
         * here to avoid a double free.
         */
        ulist_free(fs_info->qgroup_ulist);
        fs_info->qgroup_ulist = NULL;
}

static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = src;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = dst;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

        btrfs_mark_buffer_dirty(path->nodes[0]);

        btrfs_free_path(path);
        return ret;
}

static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = src;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = dst;

        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);
out:
        btrfs_free_path(path);
        return ret;
}

static int add_qgroup_item(struct btrfs_trans_handle *trans,
                           struct btrfs_root *quota_root, u64 qgroupid)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_qgroup_info_item *qgroup_info;
        struct btrfs_qgroup_limit_item *qgroup_limit;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        if (btrfs_is_testing(quota_root->fs_info))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroupid;

        /*
         * Avoid a transaction abort by catching -EEXIST here. In that
         * case, we proceed by re-initializing the existing structure
         * on disk.
         */

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*qgroup_info));
        if (ret && ret != -EEXIST)
                goto out;

        leaf = path->nodes[0];
        qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_qgroup_info_item);
        btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
        btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

        btrfs_mark_buffer_dirty(leaf);

        btrfs_release_path(path);

        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*qgroup_limit));
        if (ret && ret != -EEXIST)
                goto out;

        leaf = path->nodes[0];
        qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
                                      struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroupid;
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);
        if (ret)
                goto out;

        btrfs_release_path(path);

        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
                                    struct btrfs_qgroup *qgroup)
{
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_limit_item *qgroup_limit;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_LIMIT_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
        btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
        btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
        btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
        btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_qgroup *qgroup)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_info_item *qgroup_info;
        int ret;
        int slot;

        if (btrfs_is_testing(fs_info))
                return 0;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
        btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
        btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
        btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
        btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
        btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_status_item *ptr;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
        btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
        btrfs_set_qgroup_status_rescan(l, ptr,
                                fs_info->qgroup_rescan_progress.objectid);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *leaf = NULL;
        int ret;
        int nr = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        path->leave_spinning = 1;

        key.objectid = 0;
        key.offset = 0;
        key.type = 0;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        goto out;
                leaf = path->nodes[0];
                nr = btrfs_header_nritems(leaf);
                if (!nr)
                        break;
                /*
                 * Delete each leaf's items in one batch, since the
                 * whole tree is going to be deleted anyway.
                 */
                path->slots[0] = 0;
                ret = btrfs_del_items(trans, root, path, 0, nr);
                if (ret)
                        goto out;

                btrfs_release_path(path);
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *quota_root;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_path *path = NULL;
        struct btrfs_qgroup_status_item *ptr;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_qgroup *qgroup = NULL;
        struct btrfs_trans_handle *trans = NULL;
        int ret = 0;
        int slot;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (fs_info->quota_root)
                goto out;

        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
        if (!fs_info->qgroup_ulist) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * 1 for the quota root item
         * 1 for the BTRFS_QGROUP_STATUS item
         *
         * Yet we also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
         * per subvolume. However those are not currently reserved since it
         * would be a lot of overkill.
         */
        trans = btrfs_start_transaction(tree_root, 2);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }

        /*
         * initially create the quota tree
         */
        quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
        if (IS_ERR(quota_root)) {
                ret = PTR_ERR(quota_root);
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                btrfs_abort_transaction(trans, ret);
                goto out_free_root;
        }

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*ptr));
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        leaf = path->nodes[0];
        ptr = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
        btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
        fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
                                BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
        btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

        btrfs_mark_buffer_dirty(leaf);

        key.objectid = 0;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = 0;

        btrfs_release_path(path);
        ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
        if (ret > 0)
                goto out_add_root;
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        while (1) {
                slot = path->slots[0];
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, slot);

                if (found_key.type == BTRFS_ROOT_REF_KEY) {
                        ret = add_qgroup_item(trans, quota_root,
                                              found_key.offset);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }

                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
                        if (IS_ERR(qgroup)) {
                                ret = PTR_ERR(qgroup);
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                }
                ret = btrfs_next_item(tree_root, path);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_free_path;
                }
                if (ret)
                        break;
        }

out_add_root:
        btrfs_release_path(path);
        ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
        if (IS_ERR(qgroup)) {
                ret = PTR_ERR(qgroup);
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        ret = btrfs_commit_transaction(trans);
        trans = NULL;
        if (ret)
                goto out_free_path;

        /*
         * Set quota enabled flag after committing the transaction, to avoid
         * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
         * creation.
         */
        spin_lock(&fs_info->qgroup_lock);
        fs_info->quota_root = quota_root;
        set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        spin_unlock(&fs_info->qgroup_lock);

        ret = qgroup_rescan_init(fs_info, 0, 1);
        if (!ret) {
                qgroup_rescan_zero_tracking(fs_info);
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
        }

out_free_path:
        btrfs_free_path(path);
out_free_root:
        if (ret) {
                free_extent_buffer(quota_root->node);
                free_extent_buffer(quota_root->commit_root);
                kfree(quota_root);
        }
out:
        if (ret) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                if (trans)
                        btrfs_end_transaction(trans);
        }
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *quota_root;
        struct btrfs_trans_handle *trans = NULL;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root)
                goto out;

        /*
         * 1 for the root item
         *
         * We should also reserve enough items for the quota tree deletion in
         * btrfs_clean_quota_tree but this is not done.
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 1);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                goto out;
        }

        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        btrfs_qgroup_wait_for_completion(fs_info, false);
        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
        spin_unlock(&fs_info->qgroup_lock);

        btrfs_free_qgroup_config(fs_info);

        ret = btrfs_clean_quota_tree(trans, quota_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto end_trans;
        }

        ret = btrfs_del_root(trans, &quota_root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto end_trans;
        }

        list_del(&quota_root->dirty_list);

        btrfs_tree_lock(quota_root->node);
        btrfs_clean_tree_block(quota_root->node);
        btrfs_tree_unlock(quota_root->node);
        btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);

        free_extent_buffer(quota_root->node);
        free_extent_buffer(quota_root->commit_root);
        kfree(quota_root);

end_trans:
        ret = btrfs_end_transaction(trans);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

static void qgroup_dirty(struct btrfs_fs_info *fs_info,
                         struct btrfs_qgroup *qgroup)
{
        if (list_empty(&qgroup->dirty))
                list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}
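
/*
 * Note (explanatory, not from the original file): the dirty list is how
 * in-memory counter changes reach disk.  A qgroup put on
 * fs_info->dirty_qgroups here is picked up later in the commit path and
 * written back through update_qgroup_info_item()/update_qgroup_limit_item();
 * the list_empty() check keeps a qgroup from being queued twice.
 */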

/*
 * The easy accounting case: we're updating a qgroup relationship whose child
 * qgroup only has exclusive extents.
 *
 * In this case, all of the child's exclusive extents are also exclusive for
 * the parent, so excl/rfer just get added/removed.
 *
 * The same holds for qgroup reservation space, which must also be added
 * to/removed from the parent; otherwise, when the child later releases
 * reservation space, the parent would underflow its reservation (in the
 * relationship-adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                                    struct ulist *tmp, u64 ref_root,
                                    struct btrfs_qgroup *src, int sign)
{
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup_list *glist;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        u64 num_bytes = src->excl;
        int ret = 0;

        qgroup = find_qgroup_rb(fs_info, ref_root);
        if (!qgroup)
                goto out;

        qgroup->rfer += sign * num_bytes;
        qgroup->rfer_cmpr += sign * num_bytes;

        WARN_ON(sign < 0 && qgroup->excl < num_bytes);
        qgroup->excl += sign * num_bytes;
        qgroup->excl_cmpr += sign * num_bytes;

        if (sign > 0)
                qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
        else
                qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

        qgroup_dirty(fs_info, qgroup);

        /* Get all of the parent groups that contain this qgroup */
        list_for_each_entry(glist, &qgroup->groups, next_group) {
                ret = ulist_add(tmp, glist->group->qgroupid,
                                qgroup_to_aux(glist->group), GFP_ATOMIC);
                if (ret < 0)
                        goto out;
        }

        /* Iterate all of the parents and adjust their reference counts */
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(tmp, &uiter))) {
                qgroup = unode_aux_to_qgroup(unode);
                qgroup->rfer += sign * num_bytes;
                qgroup->rfer_cmpr += sign * num_bytes;
                WARN_ON(sign < 0 && qgroup->excl < num_bytes);
                qgroup->excl += sign * num_bytes;
                if (sign > 0)
                        qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
                else
                        qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
                qgroup->excl_cmpr += sign * num_bytes;
                qgroup_dirty(fs_info, qgroup);

                /* Add any parents of the parents */
                list_for_each_entry(glist, &qgroup->groups, next_group) {
                        ret = ulist_add(tmp, glist->group->qgroupid,
                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
                        if (ret < 0)
                                goto out;
                }
        }
        ret = 0;
out:
        return ret;
}
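
/*
 * Worked example for the function above (hypothetical IDs and sizes): assume
 * child qgroup 0/257 has excl == rfer == 1MiB and is added (sign = +1) to
 * parent 1/100, which is itself a member of 2/200.  Then:
 *
 *      1/100: rfer += 1MiB, excl += 1MiB, reservations inherited from src
 *      2/200: the same adjustment, discovered via the ulist walk of ->groups
 *
 * With sign = -1 (relation removal) the same amounts are subtracted, and
 * the WARN_ON() guards against excl underflowing.
 */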

/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, updating all parents is enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update, return >0 when a full rescan is needed
 * (the INCONSISTENT flag is set in that case).
 * Return <0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
                                   struct ulist *tmp, u64 src, u64 dst,
                                   int sign)
{
        struct btrfs_qgroup *qgroup;
        int ret = 1;
        int err = 0;

        qgroup = find_qgroup_rb(fs_info, src);
        if (!qgroup)
                goto out;
        if (qgroup->excl == qgroup->rfer) {
                ret = 0;
                err = __qgroup_excl_accounting(fs_info, tmp, dst,
                                               qgroup, sign);
                if (err < 0) {
                        ret = err;
                        goto out;
                }
        }
out:
        if (ret)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        return ret;
}

int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                              u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        int ret = 0;

        /* Check the level of src and dst first */
        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                return -EINVAL;

        tmp = ulist_alloc(GFP_KERNEL);
        if (!tmp)
                return -ENOMEM;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        quota_root = fs_info->quota_root;
        if (!quota_root) {
                ret = -EINVAL;
                goto out;
        }
        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        if (!member || !parent) {
                ret = -EINVAL;
                goto out;
        }

        /* Check if such a qgroup relation exists first */
        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        ret = -EEXIST;
                        goto out;
                }
        }

        ret = add_qgroup_relation_item(trans, src, dst);
        if (ret)
                goto out;

        ret = add_qgroup_relation_item(trans, dst, src);
        if (ret) {
                del_qgroup_relation_item(trans, src, dst);
                goto out;
        }

        spin_lock(&fs_info->qgroup_lock);
        ret = add_relation_rb(fs_info, src, dst);
        if (ret < 0) {
                spin_unlock(&fs_info->qgroup_lock);
                goto out;
        }
        ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
        spin_unlock(&fs_info->qgroup_lock);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        ulist_free(tmp);
        return ret;
}
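
/*
 * Usage sketch for the function above (hypothetical IDs, illustration only):
 *
 *      // make level-1 qgroup 1/100 a parent of subvolume qgroup 0/257
 *      u64 member = 257;
 *      u64 parent = (1ULL << BTRFS_QGROUP_LEVEL_SHIFT) | 100;
 *
 *      ret = btrfs_add_qgroup_relation(trans, member, parent);
 *
 * The level check above enforces src level < dst level, i.e. members can
 * only be assigned to higher-level qgroups, which keeps the hierarchy
 * acyclic.
 */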

static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                                 u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        int ret = 0;
        int err;

        tmp = ulist_alloc(GFP_KERNEL);
        if (!tmp)
                return -ENOMEM;

        quota_root = fs_info->quota_root;
        if (!quota_root) {
                ret = -EINVAL;
                goto out;
        }

        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        if (!member || !parent) {
                ret = -EINVAL;
                goto out;
        }

        /* Check if such a qgroup relation exists first */
        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent)
                        goto exist;
        }
        ret = -ENOENT;
        goto out;
exist:
        ret = del_qgroup_relation_item(trans, src, dst);
        err = del_qgroup_relation_item(trans, dst, src);
        if (err && !ret)
                ret = err;

        spin_lock(&fs_info->qgroup_lock);
        del_relation_rb(fs_info, src, dst);
        ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
        spin_unlock(&fs_info->qgroup_lock);
out:
        ulist_free(tmp);
        return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                              u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        ret = __del_qgroup_relation(trans, src, dst);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        quota_root = fs_info->quota_root;
        if (!quota_root) {
                ret = -EINVAL;
                goto out;
        }
        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (qgroup) {
                ret = -EEXIST;
                goto out;
        }

        ret = add_qgroup_item(trans, quota_root, qgroupid);
        if (ret)
                goto out;

        spin_lock(&fs_info->qgroup_lock);
        qgroup = add_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);

        if (IS_ERR(qgroup))
                ret = PTR_ERR(qgroup);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup_list *list;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        quota_root = fs_info->quota_root;
        if (!quota_root) {
                ret = -EINVAL;
                goto out;
        }

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup) {
                ret = -ENOENT;
                goto out;
        }

        /* Check if there are no children of this qgroup */
        if (!list_empty(&qgroup->members)) {
                ret = -EBUSY;
                goto out;
        }

        ret = del_qgroup_item(trans, qgroupid);
        if (ret && ret != -ENOENT)
                goto out;

        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
                                        struct btrfs_qgroup_list, next_group);
                ret = __del_qgroup_relation(trans, qgroupid,
                                            list->group->qgroupid);
                if (ret)
                        goto out;
        }

        spin_lock(&fs_info->qgroup_lock);
        del_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
                       struct btrfs_qgroup_limit *limit)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        int ret = 0;
        /*
         * Sometimes we would want to clear the limit on this qgroup.
         * To meet this requirement, we treat -1 as a special value
         * which tells the kernel to clear the limit on this qgroup.
         */
        const u64 CLEAR_VALUE = -1;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        quota_root = fs_info->quota_root;
        if (!quota_root) {
                ret = -EINVAL;
                goto out;
        }

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup) {
                ret = -ENOENT;
                goto out;
        }

        spin_lock(&fs_info->qgroup_lock);
        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
                if (limit->max_rfer == CLEAR_VALUE) {
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
                        qgroup->max_rfer = 0;
                } else {
                        qgroup->max_rfer = limit->max_rfer;
                }
        }
        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
                if (limit->max_excl == CLEAR_VALUE) {
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
                        qgroup->max_excl = 0;
                } else {
                        qgroup->max_excl = limit->max_excl;
                }
        }
        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
                if (limit->rsv_rfer == CLEAR_VALUE) {
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
                        qgroup->rsv_rfer = 0;
                } else {
                        qgroup->rsv_rfer = limit->rsv_rfer;
                }
        }
        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
                if (limit->rsv_excl == CLEAR_VALUE) {
                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
                        qgroup->rsv_excl = 0;
                } else {
                        qgroup->rsv_excl = limit->rsv_excl;
                }
        }
        qgroup->lim_flags |= limit->flags;

        spin_unlock(&fs_info->qgroup_lock);

        ret = update_qgroup_limit_item(trans, qgroup);
        if (ret) {
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                btrfs_info(fs_info, "unable to update quota limit for %llu",
                           qgroupid);
        }

out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}
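
/*
 * Example of the CLEAR_VALUE convention above (illustrative only): to drop
 * a previously configured referenced-bytes limit, the caller passes ~0ULL:
 *
 *      struct btrfs_qgroup_limit limit = {
 *              .flags = BTRFS_QGROUP_LIMIT_MAX_RFER,
 *              .max_rfer = (u64)-1,    // CLEAR_VALUE: remove the limit
 *      };
 *      ret = btrfs_limit_qgroup(trans, qgroupid, &limit);
 *
 * Any field whose flag is not set in limit->flags is left untouched.
 */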

int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
                                struct btrfs_delayed_ref_root *delayed_refs,
                                struct btrfs_qgroup_extent_record *record)
{
        struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
        struct rb_node *parent_node = NULL;
        struct btrfs_qgroup_extent_record *entry;
        u64 bytenr = record->bytenr;

        lockdep_assert_held(&delayed_refs->lock);
        trace_btrfs_qgroup_trace_extent(fs_info, record);

        while (*p) {
                parent_node = *p;
                entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
                                 node);
                if (bytenr < entry->bytenr) {
                        p = &(*p)->rb_left;
                } else if (bytenr > entry->bytenr) {
                        p = &(*p)->rb_right;
                } else {
                        if (record->data_rsv && !entry->data_rsv) {
                                entry->data_rsv = record->data_rsv;
                                entry->data_rsv_refroot =
                                        record->data_rsv_refroot;
                        }
                        return 1;
                }
        }

        rb_link_node(&record->node, parent_node, p);
        rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
        return 0;
}
1565
1566int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
1567                                   struct btrfs_qgroup_extent_record *qrecord)
1568{
1569        struct ulist *old_root;
1570        u64 bytenr = qrecord->bytenr;
1571        int ret;
1572
1573        ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
1574        if (ret < 0) {
1575                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1576                btrfs_warn(fs_info,
1577"error accounting new delayed refs extent (err code: %d), quota inconsistent",
1578                        ret);
1579                return 0;
1580        }
1581
1582        /*
1583         * Here we don't need to take the lock of
1584         * trans->transaction->delayed_refs, since the inserted qrecord won't
1585         * be deleted; only qrecord->node may be modified (by new inserts).
1586         *
1587         * So modifying qrecord->old_roots is safe here.
1588         */
1589        qrecord->old_roots = old_root;
1590        return 0;
1591}
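/*
 * Note: with a NULL trans handle, the btrfs_find_all_roots() call above
 * searches commit roots, so the "old" owner set reflects the state before
 * this transaction started; the matching "new" roots are only resolved
 * much later, in btrfs_qgroup_account_extents(), using SEQ_LAST against
 * the current root.
 */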
1592
1593int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1594                              u64 num_bytes, gfp_t gfp_flag)
1595{
1596        struct btrfs_fs_info *fs_info = trans->fs_info;
1597        struct btrfs_qgroup_extent_record *record;
1598        struct btrfs_delayed_ref_root *delayed_refs;
1599        int ret;
1600
1601        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
1602            || bytenr == 0 || num_bytes == 0)
1603                return 0;
1604        record = kzalloc(sizeof(*record), gfp_flag);
1605        if (!record)
1606                return -ENOMEM;
1607
1608        delayed_refs = &trans->transaction->delayed_refs;
1609        record->bytenr = bytenr;
1610        record->num_bytes = num_bytes;
1611        record->old_roots = NULL;
1612
1613        spin_lock(&delayed_refs->lock);
1614        ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
1615        spin_unlock(&delayed_refs->lock);
1616        if (ret > 0) {
1617                kfree(record);
1618                return 0;
1619        }
1620        return btrfs_qgroup_trace_extent_post(fs_info, record);
1621}
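/*
 * Illustrative sketch of the allocation/dedup contract above, for a
 * hypothetical caller tracing one metadata block (names follow the
 * surrounding code):
 *
 *	record = kzalloc(sizeof(*record), GFP_NOFS);
 *	if (!record)
 *		return -ENOMEM;
 *	record->bytenr = eb->start;
 *	record->num_bytes = fs_info->nodesize;
 *
 *	spin_lock(&delayed_refs->lock);
 *	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
 *	spin_unlock(&delayed_refs->lock);
 *	if (ret > 0)
 *		kfree(record);
 *
 * A return of 1 from the _nolock() helper means the bytenr was already in
 * the dirty extent tree, so the caller, not the tree, still owns @record
 * and must free it.
 */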
1622
1623int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1624                                  struct extent_buffer *eb)
1625{
1626        struct btrfs_fs_info *fs_info = trans->fs_info;
1627        int nr = btrfs_header_nritems(eb);
1628        int i, extent_type, ret;
1629        struct btrfs_key key;
1630        struct btrfs_file_extent_item *fi;
1631        u64 bytenr, num_bytes;
1632
1633        /* We can be called directly from walk_up_proc() */
1634        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1635                return 0;
1636
1637        for (i = 0; i < nr; i++) {
1638                btrfs_item_key_to_cpu(eb, &key, i);
1639
1640                if (key.type != BTRFS_EXTENT_DATA_KEY)
1641                        continue;
1642
1643                fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
1644                /* Filter out non-qgroup-accountable extents */
1645                extent_type = btrfs_file_extent_type(eb, fi);
1646
1647                if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1648                        continue;
1649
1650                bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1651                if (!bytenr)
1652                        continue;
1653
1654                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1655
1656                ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
1657                                                GFP_NOFS);
1658                if (ret)
1659                        return ret;
1660        }
1661        cond_resched();
1662        return 0;
1663}
1664
1665/*
1666 * Walk up the tree from the bottom, freeing leaves and any interior
1667 * nodes which have had all slots visited. If a node (leaf or
1668 * interior) is freed, the node above it will have its slot
1669 * incremented. The root node will never be freed.
1670 *
1671 * At the end of this function, we should have a path which has all
1672 * slots incremented to the next position for a search. If we need to
1673 * read a new node it will be NULL and the node above it will have the
1674 * correct slot selected for a later read.
1675 *
1676 * If we increment the root node's slot counter past the number of
1677 * elements, 1 is returned to signal completion of the search.
1678 */
1679static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
1680{
1681        int level = 0;
1682        int nr, slot;
1683        struct extent_buffer *eb;
1684
1685        if (root_level == 0)
1686                return 1;
1687
1688        while (level <= root_level) {
1689                eb = path->nodes[level];
1690                nr = btrfs_header_nritems(eb);
1691                path->slots[level]++;
1692                slot = path->slots[level];
1693                if (slot >= nr || level == 0) {
1694                        /*
1695                         * Don't free the root -  we will detect this
1696                         * condition after our loop and return a
1697                         * positive value for caller to stop walking the tree.
1698                         */
1699                        if (level != root_level) {
1700                                btrfs_tree_unlock_rw(eb, path->locks[level]);
1701                                path->locks[level] = 0;
1702
1703                                free_extent_buffer(eb);
1704                                path->nodes[level] = NULL;
1705                                path->slots[level] = 0;
1706                        }
1707                } else {
1708                        /*
1709                         * We have a valid slot to walk back down
1710                         * from. Stop here so caller can process these
1711                         * new nodes.
1712                         */
1713                        break;
1714                }
1715
1716                level++;
1717        }
1718
1719        eb = path->nodes[root_level];
1720        if (path->slots[root_level] >= btrfs_header_nritems(eb))
1721                return 1;
1722
1723        return 0;
1724}
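/*
 * Worked example: with root_level == 2, after a leaf has been fully
 * processed the walk-up proceeds as follows:
 *
 *	level 0: released unconditionally (slot >= nr || level == 0);
 *	         nodes[0] = NULL, slots[0] = 0
 *	level 1: ++slots[1]; if it is still < nritems we stop here, and the
 *	         caller walks back down, reading the child at the new slot
 *	level 2: reached only once level 1 is exhausted as well; when
 *	         slots[2] runs past nritems the final check returns 1
 *
 * The root node itself is never unlocked or freed by this helper.
 */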
1725
1726/*
1727 * Helper function to trace a subtree tree block swap.
1728 *
1729 * The swap will happen in highest tree block, but there may be a lot of
1730 * tree blocks involved.
1731 *
1732 * For example:
1733 *  OO = Old tree blocks
1734 *  NN = New tree blocks allocated during balance
1735 *
1736 *           File tree (257)                  Reloc tree for 257
1737 * L2              OO                                NN
1738 *               /    \                            /    \
1739 * L1          OO      OO (a)                    OO      NN (a)
1740 *            / \     / \                       / \     / \
1741 * L0       OO   OO OO   OO                   OO   OO NN   NN
1742 *                  (b)  (c)                          (b)  (c)
1743 *
1744 * When calling qgroup_trace_extent_swap(), we will pass:
1745 * @src_eb = OO(a)
1746 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1747 * @dst_level = 0
1748 * @root_level = 1
1749 *
1750 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1751 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1752 *
1753 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1754 *
1755 * 1) Tree search from @src_eb
1756 *    It acts as a simplified btrfs_search_slot().
1757 *    The search key can be extracted from @dst_path->nodes[dst_level]
1758 *    (its first key).
1759 *
1760 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1761 *    NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty;
1762 *    they should have been marked during the previous (@dst_level = 1) pass.
1763 *
1764 * 3) Mark file extents in leaves dirty
1765 *    We don't have a good way to pick out new file extents only.
1766 *    So we still follow the old method of scanning all file extents in
1767 *    the leaf.
1768 *
1769 * This function frees us from keeping two paths, so later we only need
1770 * to care about how to iterate over all new tree blocks in the reloc tree.
1771 */
1772static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
1773                                    struct extent_buffer *src_eb,
1774                                    struct btrfs_path *dst_path,
1775                                    int dst_level, int root_level,
1776                                    bool trace_leaf)
1777{
1778        struct btrfs_key key;
1779        struct btrfs_path *src_path;
1780        struct btrfs_fs_info *fs_info = trans->fs_info;
1781        u32 nodesize = fs_info->nodesize;
1782        int cur_level = root_level;
1783        int ret;
1784
1785        BUG_ON(dst_level > root_level);
1786        /* Level mismatch */
1787        if (btrfs_header_level(src_eb) != root_level)
1788                return -EINVAL;
1789
1790        src_path = btrfs_alloc_path();
1791        if (!src_path) {
1792                ret = -ENOMEM;
1793                goto out;
1794        }
1795
1796        if (dst_level)
1797                btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1798        else
1799                btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1800
1801        /* For src_path */
1802        extent_buffer_get(src_eb);
1803        src_path->nodes[root_level] = src_eb;
1804        src_path->slots[root_level] = dst_path->slots[root_level];
1805        src_path->locks[root_level] = 0;
1806
1807        /* A simplified version of btrfs_search_slot() */
1808        while (cur_level >= dst_level) {
1809                struct btrfs_key src_key;
1810                struct btrfs_key dst_key;
1811
1812                if (src_path->nodes[cur_level] == NULL) {
1813                        struct btrfs_key first_key;
1814                        struct extent_buffer *eb;
1815                        int parent_slot;
1816                        u64 child_gen;
1817                        u64 child_bytenr;
1818
1819                        eb = src_path->nodes[cur_level + 1];
1820                        parent_slot = src_path->slots[cur_level + 1];
1821                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
1822                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
1823                        btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
1824
1825                        eb = read_tree_block(fs_info, child_bytenr, child_gen,
1826                                             cur_level, &first_key);
1827                        if (IS_ERR(eb)) {
1828                                ret = PTR_ERR(eb);
1829                                goto out;
1830                        } else if (!extent_buffer_uptodate(eb)) {
1831                                free_extent_buffer(eb);
1832                                ret = -EIO;
1833                                goto out;
1834                        }
1835
1836                        src_path->nodes[cur_level] = eb;
1837
1838                        btrfs_tree_read_lock(eb);
1839                        btrfs_set_lock_blocking_read(eb);
1840                        src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
1841                }
1842
1843                src_path->slots[cur_level] = dst_path->slots[cur_level];
1844                if (cur_level) {
1845                        btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
1846                                        &dst_key, dst_path->slots[cur_level]);
1847                        btrfs_node_key_to_cpu(src_path->nodes[cur_level],
1848                                        &src_key, src_path->slots[cur_level]);
1849                } else {
1850                        btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
1851                                        &dst_key, dst_path->slots[cur_level]);
1852                        btrfs_item_key_to_cpu(src_path->nodes[cur_level],
1853                                        &src_key, src_path->slots[cur_level]);
1854                }
1855                /* Content mismatch, something went wrong */
1856                if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
1857                        ret = -ENOENT;
1858                        goto out;
1859                }
1860                cur_level--;
1861        }
1862
1863        /*
1864         * Now both @dst_path and @src_path have been populated, record the tree
1865         * blocks for qgroup accounting.
1866         */
1867        ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
1868                        nodesize, GFP_NOFS);
1869        if (ret < 0)
1870                goto out;
1871        ret = btrfs_qgroup_trace_extent(trans,
1872                        dst_path->nodes[dst_level]->start,
1873                        nodesize, GFP_NOFS);
1874        if (ret < 0)
1875                goto out;
1876
1877        /* Record leaf file extents */
1878        if (dst_level == 0 && trace_leaf) {
1879                ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
1880                if (ret < 0)
1881                        goto out;
1882                ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
1883        }
1884out:
1885        btrfs_free_path(src_path);
1886        return ret;
1887}
1888
1889/*
1890 * Helper function to do recursive generation-aware depth-first search, to
1891 * locate all new tree blocks in a subtree of reloc tree.
1892 *
1893 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
1894 *         reloc tree
1895 * L2         NN (a)
1896 *          /    \
1897 * L1    OO        NN (b)
1898 *      /  \      /  \
1899 * L0  OO  OO    OO  NN
1900 *               (c) (d)
1901 * If we pass:
1902 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
1903 * @cur_level = 1
1904 * @root_level = 1
1905 *
1906 * We will iterate through tree blocks NN(b) and NN(d) and inform qgroup to
1907 * trace those tree blocks along with their counterparts in the file tree.
1908 * During the search, old tree blocks such as OO(c) are skipped, as a tree
1909 * block swap won't affect them.
1910 */
1911static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
1912                                           struct extent_buffer *src_eb,
1913                                           struct btrfs_path *dst_path,
1914                                           int cur_level, int root_level,
1915                                           u64 last_snapshot, bool trace_leaf)
1916{
1917        struct btrfs_fs_info *fs_info = trans->fs_info;
1918        struct extent_buffer *eb;
1919        bool need_cleanup = false;
1920        int ret = 0;
1921        int i;
1922
1923        /* Level sanity check */
1924        if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
1925            root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
1926            root_level < cur_level) {
1927                btrfs_err_rl(fs_info,
1928                        "%s: bad levels, cur_level=%d root_level=%d",
1929                        __func__, cur_level, root_level);
1930                return -EUCLEAN;
1931        }
1932
1933        /* Read the tree block if needed */
1934        if (dst_path->nodes[cur_level] == NULL) {
1935                struct btrfs_key first_key;
1936                int parent_slot;
1937                u64 child_gen;
1938                u64 child_bytenr;
1939
1940                /*
1941                 * dst_path->nodes[root_level] must be initialized before
1942                 * calling this function.
1943                 */
1944                if (cur_level == root_level) {
1945                        btrfs_err_rl(fs_info,
1946        "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
1947                                __func__, root_level, root_level, cur_level);
1948                        return -EUCLEAN;
1949                }
1950
1951                /*
1952                 * We need to get child blockptr/gen from parent before we can
1953                 * read it.
1954                 */
1955                eb = dst_path->nodes[cur_level + 1];
1956                parent_slot = dst_path->slots[cur_level + 1];
1957                child_bytenr = btrfs_node_blockptr(eb, parent_slot);
1958                child_gen = btrfs_node_ptr_generation(eb, parent_slot);
1959                btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
1960
1961                /* This node is old, no need to trace */
1962                if (child_gen < last_snapshot)
1963                        goto out;
1964
1965                eb = read_tree_block(fs_info, child_bytenr, child_gen,
1966                                     cur_level, &first_key);
1967                if (IS_ERR(eb)) {
1968                        ret = PTR_ERR(eb);
1969                        goto out;
1970                } else if (!extent_buffer_uptodate(eb)) {
1971                        free_extent_buffer(eb);
1972                        ret = -EIO;
1973                        goto out;
1974                }
1975
1976                dst_path->nodes[cur_level] = eb;
1977                dst_path->slots[cur_level] = 0;
1978
1979                btrfs_tree_read_lock(eb);
1980                btrfs_set_lock_blocking_read(eb);
1981                dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
1982                need_cleanup = true;
1983        }
1984
1985        /* Now record this tree block and its counter part for qgroups */
1986        ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
1987                                       root_level, trace_leaf);
1988        if (ret < 0)
1989                goto cleanup;
1990
1991        eb = dst_path->nodes[cur_level];
1992
1993        if (cur_level > 0) {
1994                /* Iterate all child tree blocks */
1995                for (i = 0; i < btrfs_header_nritems(eb); i++) {
1996                        /* Skip old tree blocks as they won't be swapped */
1997                        if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
1998                                continue;
1999                        dst_path->slots[cur_level] = i;
2000
2001                        /* Recursive call (at most 7 times) */
2002                        ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2003                                        dst_path, cur_level - 1, root_level,
2004                                        last_snapshot, trace_leaf);
2005                        if (ret < 0)
2006                                goto cleanup;
2007                }
2008        }
2009
2010cleanup:
2011        if (need_cleanup) {
2012                /* Clean up */
2013                btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2014                                     dst_path->locks[cur_level]);
2015                free_extent_buffer(dst_path->nodes[cur_level]);
2016                dst_path->nodes[cur_level] = NULL;
2017                dst_path->slots[cur_level] = 0;
2018                dst_path->locks[cur_level] = 0;
2019        }
2020out:
2021        return ret;
2022}
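/*
 * Walking the example from the comment above: a call with
 * dst_path = [ nodes[1] = NN(b) ] and cur_level == root_level == 1 runs
 * roughly as:
 *
 *	level 1: NN(b) and its file tree counterpart are traced
 *	  slot 0: child_gen < last_snapshot, OO(c) is skipped entirely
 *	  slot 1: recurse at level 0, which reads NN(d), traces it and its
 *	          counterpart (plus leaf file extents when @trace_leaf)
 *
 * So only subtrees rooted in blocks newer than @last_snapshot are ever
 * read from disk.
 */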
2023
2024static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2025                                struct extent_buffer *src_eb,
2026                                struct extent_buffer *dst_eb,
2027                                u64 last_snapshot, bool trace_leaf)
2028{
2029        struct btrfs_fs_info *fs_info = trans->fs_info;
2030        struct btrfs_path *dst_path = NULL;
2031        int level;
2032        int ret;
2033
2034        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2035                return 0;
2036
2037        /* Wrong parameter order */
2038        if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2039                btrfs_err_rl(fs_info,
2040                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2041                             btrfs_header_generation(src_eb),
2042                             btrfs_header_generation(dst_eb));
2043                return -EUCLEAN;
2044        }
2045
2046        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2047                ret = -EIO;
2048                goto out;
2049        }
2050
2051        level = btrfs_header_level(dst_eb);
2052        dst_path = btrfs_alloc_path();
2053        if (!dst_path) {
2054                ret = -ENOMEM;
2055                goto out;
2056        }
2057        /* For dst_path */
2058        extent_buffer_get(dst_eb);
2059        dst_path->nodes[level] = dst_eb;
2060        dst_path->slots[level] = 0;
2061        dst_path->locks[level] = 0;
2062
2063        /* Do the generation-aware depth-first search */
2064        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2065                                              level, last_snapshot, trace_leaf);
2066        if (ret < 0)
2067                goto out;
2068        ret = 0;
2069
2070out:
2071        btrfs_free_path(dst_path);
2072        if (ret < 0)
2073                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2074        return ret;
2075}
2076
2077int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2078                               struct extent_buffer *root_eb,
2079                               u64 root_gen, int root_level)
2080{
2081        struct btrfs_fs_info *fs_info = trans->fs_info;
2082        int ret = 0;
2083        int level;
2084        struct extent_buffer *eb = root_eb;
2085        struct btrfs_path *path = NULL;
2086
2087        BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2088        BUG_ON(root_eb == NULL);
2089
2090        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2091                return 0;
2092
2093        if (!extent_buffer_uptodate(root_eb)) {
2094                ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
2095                if (ret)
2096                        goto out;
2097        }
2098
2099        if (root_level == 0) {
2100                ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2101                goto out;
2102        }
2103
2104        path = btrfs_alloc_path();
2105        if (!path)
2106                return -ENOMEM;
2107
2108        /*
2109         * Walk down the tree.  Missing extent blocks are filled in as
2110         * we go. Metadata is accounted every time we read a new
2111         * extent block.
2112         *
2113         * When we reach a leaf, we account for file extent items in it,
2114         * walk back up the tree (adjusting slot pointers as we go)
2115         * and restart the search process.
2116         */
2117        extent_buffer_get(root_eb); /* For path */
2118        path->nodes[root_level] = root_eb;
2119        path->slots[root_level] = 0;
2120        path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2121walk_down:
2122        level = root_level;
2123        while (level >= 0) {
2124                if (path->nodes[level] == NULL) {
2125                        struct btrfs_key first_key;
2126                        int parent_slot;
2127                        u64 child_gen;
2128                        u64 child_bytenr;
2129
2130                        /*
2131                         * We need to get child blockptr/gen from parent before
2132                         * we can read it.
2133                         */
2134                        eb = path->nodes[level + 1];
2135                        parent_slot = path->slots[level + 1];
2136                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2137                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2138                        btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
2139
2140                        eb = read_tree_block(fs_info, child_bytenr, child_gen,
2141                                             level, &first_key);
2142                        if (IS_ERR(eb)) {
2143                                ret = PTR_ERR(eb);
2144                                goto out;
2145                        } else if (!extent_buffer_uptodate(eb)) {
2146                                free_extent_buffer(eb);
2147                                ret = -EIO;
2148                                goto out;
2149                        }
2150
2151                        path->nodes[level] = eb;
2152                        path->slots[level] = 0;
2153
2154                        btrfs_tree_read_lock(eb);
2155                        btrfs_set_lock_blocking_read(eb);
2156                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
2157
2158                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2159                                                        fs_info->nodesize,
2160                                                        GFP_NOFS);
2161                        if (ret)
2162                                goto out;
2163                }
2164
2165                if (level == 0) {
2166                        ret = btrfs_qgroup_trace_leaf_items(trans,
2167                                                            path->nodes[level]);
2168                        if (ret)
2169                                goto out;
2170
2171                        /* Nonzero return here means we completed our search */
2172                        ret = adjust_slots_upwards(path, root_level);
2173                        if (ret)
2174                                break;
2175
2176                        /* Restart search with new slots */
2177                        goto walk_down;
2178                }
2179
2180                level--;
2181        }
2182
2183        ret = 0;
2184out:
2185        btrfs_free_path(path);
2186
2187        return ret;
2188}
2189
2190#define UPDATE_NEW      0
2191#define UPDATE_OLD      1
2192/*
2193 * Walk all of the roots that point to the bytenr and adjust their refcnts.
2194 */
2195static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2196                                struct ulist *roots, struct ulist *tmp,
2197                                struct ulist *qgroups, u64 seq, int update_old)
2198{
2199        struct ulist_node *unode;
2200        struct ulist_iterator uiter;
2201        struct ulist_node *tmp_unode;
2202        struct ulist_iterator tmp_uiter;
2203        struct btrfs_qgroup *qg;
2204        int ret = 0;
2205
2206        if (!roots)
2207                return 0;
2208        ULIST_ITER_INIT(&uiter);
2209        while ((unode = ulist_next(roots, &uiter))) {
2210                qg = find_qgroup_rb(fs_info, unode->val);
2211                if (!qg)
2212                        continue;
2213
2214                ulist_reinit(tmp);
2215                ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2216                                GFP_ATOMIC);
2217                if (ret < 0)
2218                        return ret;
2219                ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2220                if (ret < 0)
2221                        return ret;
2222                ULIST_ITER_INIT(&tmp_uiter);
2223                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2224                        struct btrfs_qgroup_list *glist;
2225
2226                        qg = unode_aux_to_qgroup(tmp_unode);
2227                        if (update_old)
2228                                btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2229                        else
2230                                btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2231                        list_for_each_entry(glist, &qg->groups, next_group) {
2232                                ret = ulist_add(qgroups, glist->group->qgroupid,
2233                                                qgroup_to_aux(glist->group),
2234                                                GFP_ATOMIC);
2235                                if (ret < 0)
2236                                        return ret;
2237                                ret = ulist_add(tmp, glist->group->qgroupid,
2238                                                qgroup_to_aux(glist->group),
2239                                                GFP_ATOMIC);
2240                                if (ret < 0)
2241                                        return ret;
2242                        }
2243                }
2244        }
2245        return 0;
2246}
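/*
 * Illustrative walk-through (hypothetical ids): if @roots = { 257 } and
 * qgroup 0/257 is a member of 1/100, one UPDATE_OLD call does:
 *
 *	0/257: old refcnt at @seq += 1, added to @qgroups and @tmp
 *	1/100: reached via 0/257->groups, old refcnt += 1, added as well
 *
 * Every qgroup that directly or indirectly contains a root in @roots thus
 * ends up in @qgroups with its per-seq refcnt bumped once per contained
 * root.
 */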
2247
2248/*
2249 * Update qgroup rfer/excl counters.
2250 * The rfer update is easy; the code explains itself.
2251 *
2252 * The excl update is tricky; it is split into 2 parts.
2253 * Part 1: Possible exclusive <-> sharing detect:
2254 *      |       A       |       !A      |
2255 *  -------------------------------------
2256 *  B   |       *       |       -       |
2257 *  -------------------------------------
2258 *  !B  |       +       |       **      |
2259 *  -------------------------------------
2260 *
2261 * Conditions:
2262 * A:   cur_old_roots < nr_old_roots    (not exclusive before)
2263 * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
2264 * B:   cur_new_roots < nr_new_roots    (not exclusive now)
2265 * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
2266 *
2267 * Results:
2268 * +: Possible sharing -> exclusive     -: Possible exclusive -> sharing
2269 * *: Definitely not changed.           **: Possible unchanged.
2270 *
2271 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0
2272 * case.
2273 *
2274 * To make the logic clear, we first use conditions A and B to split the
2275 * combinations into 4 results.
2276 *
2277 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
2278 * them only one variant may be 0.
2279 *
2280 * Lastly, check result **: since both variants may be 0 there, split it
2281 * again (2x2). This time we don't need to consider anything else; the
2282 * code and logic are easy to follow now.
2283 */
2284static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2285                                  struct ulist *qgroups,
2286                                  u64 nr_old_roots,
2287                                  u64 nr_new_roots,
2288                                  u64 num_bytes, u64 seq)
2289{
2290        struct ulist_node *unode;
2291        struct ulist_iterator uiter;
2292        struct btrfs_qgroup *qg;
2293        u64 cur_new_count, cur_old_count;
2294
2295        ULIST_ITER_INIT(&uiter);
2296        while ((unode = ulist_next(qgroups, &uiter))) {
2297                bool dirty = false;
2298
2299                qg = unode_aux_to_qgroup(unode);
2300                cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2301                cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2302
2303                trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2304                                             cur_new_count);
2305
2306                /* Rfer update part */
2307                if (cur_old_count == 0 && cur_new_count > 0) {
2308                        qg->rfer += num_bytes;
2309                        qg->rfer_cmpr += num_bytes;
2310                        dirty = true;
2311                }
2312                if (cur_old_count > 0 && cur_new_count == 0) {
2313                        qg->rfer -= num_bytes;
2314                        qg->rfer_cmpr -= num_bytes;
2315                        dirty = true;
2316                }
2317
2318                /* Excl update part */
2319                /* Exclusive/none -> shared case */
2320                if (cur_old_count == nr_old_roots &&
2321                    cur_new_count < nr_new_roots) {
2322                        /* Exclusive -> shared */
2323                        if (cur_old_count != 0) {
2324                                qg->excl -= num_bytes;
2325                                qg->excl_cmpr -= num_bytes;
2326                                dirty = true;
2327                        }
2328                }
2329
2330                /* Shared -> exclusive/none case */
2331                if (cur_old_count < nr_old_roots &&
2332                    cur_new_count == nr_new_roots) {
2333                        /* Shared->exclusive */
2334                        if (cur_new_count != 0) {
2335                                qg->excl += num_bytes;
2336                                qg->excl_cmpr += num_bytes;
2337                                dirty = true;
2338                        }
2339                }
2340
2341                /* Exclusive/none -> exclusive/none case */
2342                if (cur_old_count == nr_old_roots &&
2343                    cur_new_count == nr_new_roots) {
2344                        if (cur_old_count == 0) {
2345                                /* None -> exclusive/none */
2346
2347                                if (cur_new_count != 0) {
2348                                        /* None -> exclusive */
2349                                        qg->excl += num_bytes;
2350                                        qg->excl_cmpr += num_bytes;
2351                                        dirty = true;
2352                                }
2353                                /* None -> none, nothing changed */
2354                        } else {
2355                                /* Exclusive -> exclusive/none */
2356
2357                                if (cur_new_count == 0) {
2358                                        /* Exclusive -> none */
2359                                        qg->excl -= num_bytes;
2360                                        qg->excl_cmpr -= num_bytes;
2361                                        dirty = true;
2362                                }
2363                                /* Exclusive -> exclusive, nothing changed */
2364                        }
2365                }
2366
2367                if (dirty)
2368                        qgroup_dirty(fs_info, qg);
2369        }
2370        return 0;
2371}
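/*
 * Numeric example: suppose a 16K extent was owned only by subvolume 257
 * (qgroup 0/257) and a snapshot under an unrelated qgroup just added a
 * second reference, so nr_old_roots = 1, nr_new_roots = 2 and, for 0/257,
 * cur_old_count = 1, cur_new_count = 1:
 *
 *	rfer: both counts are non-zero, so rfer is unchanged
 *	excl: cur_old_count == nr_old_roots and cur_new_count < nr_new_roots
 *	      is the "-" cell of the table: exclusive -> shared, so
 *	      excl -= 16K and the qgroup is marked dirty
 *
 * The new owner's qgroup sees cur_old_count = 0, cur_new_count = 1 and
 * gains rfer += 16K but no excl, since the extent is now shared.
 */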
2372
2373/*
2374 * Check if @roots is potentially a list of fs tree roots
2375 *
2376 * Return 0 if it is definitely not an fs/subvol tree roots ulist
2377 * Return 1 if there may be fs/subvol tree roots in the list (an empty list
2378 *          counts as possible as well)
2379 */
2380static int maybe_fs_roots(struct ulist *roots)
2381{
2382        struct ulist_node *unode;
2383        struct ulist_iterator uiter;
2384
2385        /* Empty one, still possible for fs roots */
2386        if (!roots || roots->nnodes == 0)
2387                return 1;
2388
2389        ULIST_ITER_INIT(&uiter);
2390        unode = ulist_next(roots, &uiter);
2391        if (!unode)
2392                return 1;
2393
2394        /*
2395         * If it contains fs tree roots, then it must belong to fs/subvol
2396         * trees.
2397         * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2398         */
2399        return is_fstree(unode->val);
2400}
2401
2402int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2403                                u64 num_bytes, struct ulist *old_roots,
2404                                struct ulist *new_roots)
2405{
2406        struct btrfs_fs_info *fs_info = trans->fs_info;
2407        struct ulist *qgroups = NULL;
2408        struct ulist *tmp = NULL;
2409        u64 seq;
2410        u64 nr_new_roots = 0;
2411        u64 nr_old_roots = 0;
2412        int ret = 0;
2413
2414        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2415                return 0;
2416
2417        if (new_roots) {
2418                if (!maybe_fs_roots(new_roots))
2419                        goto out_free;
2420                nr_new_roots = new_roots->nnodes;
2421        }
2422        if (old_roots) {
2423                if (!maybe_fs_roots(old_roots))
2424                        goto out_free;
2425                nr_old_roots = old_roots->nnodes;
2426        }
2427
2428        /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2429        if (nr_old_roots == 0 && nr_new_roots == 0)
2430                goto out_free;
2431
2432        BUG_ON(!fs_info->quota_root);
2433
2434        trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2435                                        num_bytes, nr_old_roots, nr_new_roots);
2436
2437        qgroups = ulist_alloc(GFP_NOFS);
2438        if (!qgroups) {
2439                ret = -ENOMEM;
2440                goto out_free;
2441        }
2442        tmp = ulist_alloc(GFP_NOFS);
2443        if (!tmp) {
2444                ret = -ENOMEM;
2445                goto out_free;
2446        }
2447
2448        mutex_lock(&fs_info->qgroup_rescan_lock);
2449        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2450                if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2451                        mutex_unlock(&fs_info->qgroup_rescan_lock);
2452                        ret = 0;
2453                        goto out_free;
2454                }
2455        }
2456        mutex_unlock(&fs_info->qgroup_rescan_lock);
2457
2458        spin_lock(&fs_info->qgroup_lock);
2459        seq = fs_info->qgroup_seq;
2460
2461        /* Update old refcnts using old_roots */
2462        ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2463                                   UPDATE_OLD);
2464        if (ret < 0)
2465                goto out;
2466
2467        /* Update new refcnts using new_roots */
2468        ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2469                                   UPDATE_NEW);
2470        if (ret < 0)
2471                goto out;
2472
2473        qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2474                               num_bytes, seq);
2475
2476        /*
2477         * Bump qgroup_seq to avoid seq overlap
2478         */
2479        fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2480out:
2481        spin_unlock(&fs_info->qgroup_lock);
2482out_free:
2483        ulist_free(tmp);
2484        ulist_free(qgroups);
2485        ulist_free(old_roots);
2486        ulist_free(new_roots);
2487        return ret;
2488}
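/*
 * Note on the seq bump above: the per-qgroup refcnt helpers
 * (btrfs_qgroup_update_old_refcnt() and friends) store counters as
 * "seq + count" and treat any stored value below the current seq as 0.
 * Since a single round can add at most max(nr_old_roots, nr_new_roots) to
 * a counter, advancing qgroup_seq by that maximum plus one guarantees the
 * next round cannot mistake this round's counters for its own, with no
 * per-qgroup cleanup pass needed.
 */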
2489
2490int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2491{
2492        struct btrfs_fs_info *fs_info = trans->fs_info;
2493        struct btrfs_qgroup_extent_record *record;
2494        struct btrfs_delayed_ref_root *delayed_refs;
2495        struct ulist *new_roots = NULL;
2496        struct rb_node *node;
2497        u64 num_dirty_extents = 0;
2498        u64 qgroup_to_skip;
2499        int ret = 0;
2500
2501        delayed_refs = &trans->transaction->delayed_refs;
2502        qgroup_to_skip = delayed_refs->qgroup_to_skip;
2503        while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2504                record = rb_entry(node, struct btrfs_qgroup_extent_record,
2505                                  node);
2506
2507                num_dirty_extents++;
2508                trace_btrfs_qgroup_account_extents(fs_info, record);
2509
2510                if (!ret) {
2511                        /*
2512                         * Old roots should have been searched when inserting
2513                         * the qgroup extent record
2514                         */
2515                        if (WARN_ON(!record->old_roots)) {
2516                                /* Search commit root to find old_roots */
2517                                ret = btrfs_find_all_roots(NULL, fs_info,
2518                                                record->bytenr, 0,
2519                                                &record->old_roots, false);
2520                                if (ret < 0)
2521                                        goto cleanup;
2522                        }
2523
2524                        /* Free the reserved data space */
2525                        btrfs_qgroup_free_refroot(fs_info,
2526                                        record->data_rsv_refroot,
2527                                        record->data_rsv,
2528                                        BTRFS_QGROUP_RSV_DATA);
2529                        /*
2530                         * Use SEQ_LAST as time_seq to do a special search which
2531                         * doesn't lock the tree or delayed_refs and searches
2532                         * the current root. It's safe inside commit_transaction().
2533                         */
2534                        ret = btrfs_find_all_roots(trans, fs_info,
2535                                record->bytenr, SEQ_LAST, &new_roots, false);
2536                        if (ret < 0)
2537                                goto cleanup;
2538                        if (qgroup_to_skip) {
2539                                ulist_del(new_roots, qgroup_to_skip, 0);
2540                                ulist_del(record->old_roots, qgroup_to_skip,
2541                                          0);
2542                        }
2543                        ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2544                                                          record->num_bytes,
2545                                                          record->old_roots,
2546                                                          new_roots);
2547                        record->old_roots = NULL;
2548                        new_roots = NULL;
2549                }
2550cleanup:
2551                ulist_free(record->old_roots);
2552                ulist_free(new_roots);
2553                new_roots = NULL;
2554                rb_erase(node, &delayed_refs->dirty_extent_root);
2555                kfree(record);
2556
2557        }
2558        trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2559                                       num_dirty_extents);
2560        return ret;
2561}
2562
2563/*
2564 * Called from commit_transaction(). Writes all changed qgroups to disk.
2565 */
2566int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2567{
2568        struct btrfs_fs_info *fs_info = trans->fs_info;
2569        struct btrfs_root *quota_root = fs_info->quota_root;
2570        int ret = 0;
2571
2572        if (!quota_root)
2573                return ret;
2574
2575        spin_lock(&fs_info->qgroup_lock);
2576        while (!list_empty(&fs_info->dirty_qgroups)) {
2577                struct btrfs_qgroup *qgroup;
2578                qgroup = list_first_entry(&fs_info->dirty_qgroups,
2579                                          struct btrfs_qgroup, dirty);
2580                list_del_init(&qgroup->dirty);
2581                spin_unlock(&fs_info->qgroup_lock);
2582                ret = update_qgroup_info_item(trans, qgroup);
2583                if (ret)
2584                        fs_info->qgroup_flags |=
2585                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2586                ret = update_qgroup_limit_item(trans, qgroup);
2587                if (ret)
2588                        fs_info->qgroup_flags |=
2589                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2590                spin_lock(&fs_info->qgroup_lock);
2591        }
2592        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2593                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2594        else
2595                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2596        spin_unlock(&fs_info->qgroup_lock);
2597
2598        ret = update_qgroup_status_item(trans);
2599        if (ret)
2600                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2601
2602        return ret;
2603}
2604
2605/*
2606 * Copy the accounting information between qgroups. This is necessary
2607 * when a snapshot or a subvolume is created. Throwing an error will
2608 * cause a transaction abort, so we take extra care here to only error
2609 * out when a read-only fs is a reasonable outcome.
2610 */
2611int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2612                         u64 objectid, struct btrfs_qgroup_inherit *inherit)
2613{
2614        int ret = 0;
2615        int i;
2616        u64 *i_qgroups;
2617        bool committing = false;
2618        struct btrfs_fs_info *fs_info = trans->fs_info;
2619        struct btrfs_root *quota_root;
2620        struct btrfs_qgroup *srcgroup;
2621        struct btrfs_qgroup *dstgroup;
2622        u32 level_size = 0;
2623        u64 nums;
2624
2625        /*
2626         * There are only two callers of this function.
2627         *
2628         * One in create_subvol() in the ioctl context, which needs to hold
2629         * the qgroup_ioctl_lock.
2630         *
2631         * The other one is in create_pending_snapshot(), where no other qgroup
2632         * code can modify the fs, as they all need to either start a new trans
2633         * or hold a trans handle, thus we don't need to hold the
2634         * qgroup_ioctl_lock.
2635         * This avoids a long and complex lock chain and makes lockdep happy.
2636         */
2637        spin_lock(&fs_info->trans_lock);
2638        if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2639                committing = true;
2640        spin_unlock(&fs_info->trans_lock);
2641
2642        if (!committing)
2643                mutex_lock(&fs_info->qgroup_ioctl_lock);
2644        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2645                goto out;
2646
2647        quota_root = fs_info->quota_root;
2648        if (!quota_root) {
2649                ret = -EINVAL;
2650                goto out;
2651        }
2652
2653        if (inherit) {
2654                i_qgroups = (u64 *)(inherit + 1);
2655                nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2656                       2 * inherit->num_excl_copies;
2657                for (i = 0; i < nums; ++i) {
2658                        srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2659
2660                        /*
2661                         * Zero out invalid groups so we can ignore
2662                         * them later.
2663                         */
2664                        if (!srcgroup ||
2665                            ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2666                                *i_qgroups = 0ULL;
2667
2668                        ++i_qgroups;
2669                }
2670        }
2671
2672        /*
2673         * create a tracking group for the subvol itself
2674         */
2675        ret = add_qgroup_item(trans, quota_root, objectid);
2676        if (ret)
2677                goto out;
2678
2679        /*
2680         * add qgroup to all inherited groups
2681         */
2682        if (inherit) {
2683                i_qgroups = (u64 *)(inherit + 1);
2684                for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
2685                        if (*i_qgroups == 0)
2686                                continue;
2687                        ret = add_qgroup_relation_item(trans, objectid,
2688                                                       *i_qgroups);
2689                        if (ret && ret != -EEXIST)
2690                                goto out;
2691                        ret = add_qgroup_relation_item(trans, *i_qgroups,
2692                                                       objectid);
2693                        if (ret && ret != -EEXIST)
2694                                goto out;
2695                }
2696                ret = 0;
2697        }
2698
2699
2700        spin_lock(&fs_info->qgroup_lock);
2701
2702        dstgroup = add_qgroup_rb(fs_info, objectid);
2703        if (IS_ERR(dstgroup)) {
2704                ret = PTR_ERR(dstgroup);
2705                goto unlock;
2706        }
2707
2708        if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2709                dstgroup->lim_flags = inherit->lim.flags;
2710                dstgroup->max_rfer = inherit->lim.max_rfer;
2711                dstgroup->max_excl = inherit->lim.max_excl;
2712                dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2713                dstgroup->rsv_excl = inherit->lim.rsv_excl;
2714
2715                ret = update_qgroup_limit_item(trans, dstgroup);
2716                if (ret) {
2717                        fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2718                        btrfs_info(fs_info,
2719                                   "unable to update quota limit for %llu",
2720                                   dstgroup->qgroupid);
2721                        goto unlock;
2722                }
2723        }
2724
2725        if (srcid) {
2726                srcgroup = find_qgroup_rb(fs_info, srcid);
2727                if (!srcgroup)
2728                        goto unlock;
2729
2730                /*
2731                 * We call inherit after we clone the root in order to make sure
2732                 * our counts don't go crazy, so at this point the only
2733                 * difference between the two roots should be the root node.
2734                 */
2735                level_size = fs_info->nodesize;
2736                dstgroup->rfer = srcgroup->rfer;
2737                dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2738                dstgroup->excl = level_size;
2739                dstgroup->excl_cmpr = level_size;
2740                srcgroup->excl = level_size;
2741                srcgroup->excl_cmpr = level_size;
2742
2743                /* inherit the limit info */
2744                dstgroup->lim_flags = srcgroup->lim_flags;
2745                dstgroup->max_rfer = srcgroup->max_rfer;
2746                dstgroup->max_excl = srcgroup->max_excl;
2747                dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2748                dstgroup->rsv_excl = srcgroup->rsv_excl;
2749
2750                qgroup_dirty(fs_info, dstgroup);
2751                qgroup_dirty(fs_info, srcgroup);
2752        }
2753
2754        if (!inherit)
2755                goto unlock;
2756
2757        i_qgroups = (u64 *)(inherit + 1);
2758        for (i = 0; i < inherit->num_qgroups; ++i) {
2759                if (*i_qgroups) {
2760                        ret = add_relation_rb(fs_info, objectid, *i_qgroups);
2761                        if (ret)
2762                                goto unlock;
2763                }
2764                ++i_qgroups;
2765        }
2766
2767        for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
2768                struct btrfs_qgroup *src;
2769                struct btrfs_qgroup *dst;
2770
2771                if (!i_qgroups[0] || !i_qgroups[1])
2772                        continue;
2773
2774                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2775                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2776
2777                if (!src || !dst) {
2778                        ret = -EINVAL;
2779                        goto unlock;
2780                }
2781
2782                dst->rfer = src->rfer - level_size;
2783                dst->rfer_cmpr = src->rfer_cmpr - level_size;
2784        }
2785        for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
2786                struct btrfs_qgroup *src;
2787                struct btrfs_qgroup *dst;
2788
2789                if (!i_qgroups[0] || !i_qgroups[1])
2790                        continue;
2791
2792                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2793                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2794
2795                if (!src || !dst) {
2796                        ret = -EINVAL;
2797                        goto unlock;
2798                }
2799
2800                dst->excl = src->excl + level_size;
2801                dst->excl_cmpr = src->excl_cmpr + level_size;
2802        }
2803
2804unlock:
2805        spin_unlock(&fs_info->qgroup_lock);
2806out:
2807        if (!committing)
2808                mutex_unlock(&fs_info->qgroup_ioctl_lock);
2809        return ret;
2810}
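/*
 * Layout of the flexible u64 array consumed above (ids are hypothetical;
 * the "qgroupid >> 48" check means only qgroups of a level above the new
 * subvolume survive validation):
 *
 *	struct btrfs_qgroup_inherit inherit = {
 *		.num_qgroups     = 1,	single qgroupids to join
 *		.num_ref_copies  = 0,	(src, dst) pairs for rfer copies
 *		.num_excl_copies = 0,	(src, dst) pairs for excl copies
 *	};
 *	u64 args[] = { (1ULL << 48) | 100 };	join qgroup 1/100
 *
 * The singles come first, then the ref-copy pairs, then the excl-copy
 * pairs, matching nums = num_qgroups + 2 * num_ref_copies +
 * 2 * num_excl_copies in the validation loop.
 */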
2811
2812/*
2813 * Two thresholds for committing the transaction in advance.
2814 *
2815 * For RATIO, the threshold is 1/RATIO of the configured limit.
2816 * For SIZE, the threshold is an absolute size in bytes.
2817 */
2818#define QGROUP_FREE_RATIO               32
2819#define QGROUP_FREE_SIZE                SZ_32M
2820static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
2821                                const struct btrfs_qgroup *qg, u64 num_bytes)
2822{
2823        u64 free;
2824        u64 threshold;
2825
2826        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
2827            qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
2828                return false;
2829
2830        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
2831            qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
2832                return false;
2833
2834        /*
2835         * Even if we passed the check, it's better to see whether the
2836         * meta_pertrans reservation is pushing us near the limit.
2837         * If there is too much pertrans reservation or it's near the limit,
2838         * try committing the transaction to free some, via transaction_kthread.
2839         */
2840        if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
2841                              BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
2842                if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
2843                        free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
2844                        threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
2845                                          QGROUP_FREE_SIZE);
2846                } else {
2847                        free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
2848                        threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
2849                                          QGROUP_FREE_SIZE);
2850                }
2851
2852                /*
2853                 * Use transaction_kthread to commit the transaction, so we
2854                 * don't need to worry about nested transactions or lock context.
2855                 */
2856                if (free < threshold)
2857                        btrfs_commit_transaction_locksafe(fs_info);
2858        }
2859
2860        return true;
2861}
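/*
 * Numeric example: for a qgroup with max_excl = 4G, excl = 3G and 900M of
 * outstanding reservations:
 *
 *	free      = 4G - 900M - 3G       = 124M
 *	threshold = min(4G / 32, SZ_32M) = 32M
 *
 * free > threshold, so no early commit is requested; once free drops
 * below 32M, transaction_kthread is asked to commit and recover the
 * pertrans reservations.
 */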
2862
2863static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
2864                          enum btrfs_qgroup_rsv_type type)
2865{
2866        struct btrfs_root *quota_root;
2867        struct btrfs_qgroup *qgroup;
2868        struct btrfs_fs_info *fs_info = root->fs_info;
2869        u64 ref_root = root->root_key.objectid;
2870        int ret = 0;
2871        struct ulist_node *unode;
2872        struct ulist_iterator uiter;
2873
2874        if (!is_fstree(ref_root))
2875                return 0;
2876
2877        if (num_bytes == 0)
2878                return 0;
2879
2880        if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
2881            capable(CAP_SYS_RESOURCE))
2882                enforce = false;
2883
2884        spin_lock(&fs_info->qgroup_lock);
2885        quota_root = fs_info->quota_root;
2886        if (!quota_root)
2887                goto out;
2888
2889        qgroup = find_qgroup_rb(fs_info, ref_root);
2890        if (!qgroup)
2891                goto out;
2892
2893        /*
2894         * As a first step, we check all affected qgroups to see whether any
2895         * limits would be exceeded.
2896         */
2897        ulist_reinit(fs_info->qgroup_ulist);
2898        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
2899                        qgroup_to_aux(qgroup), GFP_ATOMIC);
2900        if (ret < 0)
2901                goto out;
2902        ULIST_ITER_INIT(&uiter);
2903        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2904                struct btrfs_qgroup *qg;
2905                struct btrfs_qgroup_list *glist;
2906
2907                qg = unode_aux_to_qgroup(unode);
2908
2909                if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
2910                        ret = -EDQUOT;
2911                        goto out;
2912                }
2913
2914                list_for_each_entry(glist, &qg->groups, next_group) {
2915                        ret = ulist_add(fs_info->qgroup_ulist,
2916                                        glist->group->qgroupid,
2917                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
2918                        if (ret < 0)
2919                                goto out;
2920                }
2921        }
2922        ret = 0;
2923        /*
2924         * No limits exceeded, now record the reservation in all qgroups.
2925         */
2926        ULIST_ITER_INIT(&uiter);
2927        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2928                struct btrfs_qgroup *qg;
2929
2930                qg = unode_aux_to_qgroup(unode);
2931
2932                qgroup_rsv_add(fs_info, qg, num_bytes, type);
2933        }
2934
2935out:
2936        spin_unlock(&fs_info->qgroup_lock);
2937        return ret;
2938}
2939
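/*
 * Minimal usage sketch for qgroup_reserve() above, assuming a
 * hypothetical hierarchy where level 0 qgroup 0/257 is a member of
 * 1/100. The ulist walk visits both qgroups, so a single call:
 *
 *	ret = qgroup_reserve(root, SZ_1M, true, BTRFS_QGROUP_RSV_DATA);
 *
 * either charges 1MiB to both 0/257 and 1/100, or charges neither and
 * returns -EDQUOT if any limit along the chain would be exceeded.
 */
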
2940/*
2941 * Free @num_bytes of reserved space with @type for the qgroup (normally a
2942 * level 0 qgroup).
2943 *
2944 * Will handle all higher level qgroups too.
2945 *
2946 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
2947 * This special case is only used for the META_PERTRANS type.
2948 */
2949void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
2950                               u64 ref_root, u64 num_bytes,
2951                               enum btrfs_qgroup_rsv_type type)
2952{
2953        struct btrfs_root *quota_root;
2954        struct btrfs_qgroup *qgroup;
2955        struct ulist_node *unode;
2956        struct ulist_iterator uiter;
2957        int ret = 0;
2958
2959        if (!is_fstree(ref_root))
2960                return;
2961
2962        if (num_bytes == 0)
2963                return;
2964
2965        if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
2966                WARN(1, "%s: Invalid type to free", __func__);
2967                return;
2968        }
2969        spin_lock(&fs_info->qgroup_lock);
2970
2971        quota_root = fs_info->quota_root;
2972        if (!quota_root)
2973                goto out;
2974
2975        qgroup = find_qgroup_rb(fs_info, ref_root);
2976        if (!qgroup)
2977                goto out;
2978
2979        if (num_bytes == (u64)-1)
2980                /*
2981                 * We're freeing all pertrans rsv, get reserved value from
2982                 * level 0 qgroup as real num_bytes to free.
2983                 */
2984                num_bytes = qgroup->rsv.values[type];
2985
2986        ulist_reinit(fs_info->qgroup_ulist);
2987        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
2988                        qgroup_to_aux(qgroup), GFP_ATOMIC);
2989        if (ret < 0)
2990                goto out;
2991        ULIST_ITER_INIT(&uiter);
2992        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
2993                struct btrfs_qgroup *qg;
2994                struct btrfs_qgroup_list *glist;
2995
2996                qg = unode_aux_to_qgroup(unode);
2997
2998                qgroup_rsv_release(fs_info, qg, num_bytes, type);
2999
3000                list_for_each_entry(glist, &qg->groups, next_group) {
3001                        ret = ulist_add(fs_info->qgroup_ulist,
3002                                        glist->group->qgroupid,
3003                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3004                        if (ret < 0)
3005                                goto out;
3006                }
3007        }
3008
3009out:
3010        spin_unlock(&fs_info->qgroup_lock);
3011}
3012
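/*
 * Example of the (u64)-1 special case, matching what
 * btrfs_qgroup_free_meta_all_pertrans() below does:
 *
 *	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
 *				  (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS);
 *
 * The amount to free is read from the level 0 qgroup's pertrans rsv,
 * then released from that qgroup and all of its ancestors.
 */
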
3013/*
3014 * Check if the leaf is the last leaf, which means all node pointers
3015 * along the path are at their last position.
3016 */
3017static bool is_last_leaf(struct btrfs_path *path)
3018{
3019        int i;
3020
3021        for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3022                if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3023                        return false;
3024        }
3025        return true;
3026}
3027
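/*
 * For example (hypothetical 2-level tree): with path->nodes[1] holding
 * 121 pointers and path->slots[1] == 120, every level above the leaf is
 * at its last slot, so the loop above reports this leaf as the last one.
 */
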
3028/*
3029 * Returns < 0 on error, 0 when more leaves are to be scanned,
3030 * and 1 when done.
3031 */
3032static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3033                              struct btrfs_path *path)
3034{
3035        struct btrfs_fs_info *fs_info = trans->fs_info;
3036        struct btrfs_key found;
3037        struct extent_buffer *scratch_leaf = NULL;
3038        struct ulist *roots = NULL;
3039        u64 num_bytes;
3040        bool done;
3041        int slot;
3042        int ret;
3043
3044        mutex_lock(&fs_info->qgroup_rescan_lock);
3045        ret = btrfs_search_slot_for_read(fs_info->extent_root,
3046                                         &fs_info->qgroup_rescan_progress,
3047                                         path, 1, 0);
3048
3049        btrfs_debug(fs_info,
3050                "current progress key (%llu %u %llu), search_slot ret %d",
3051                fs_info->qgroup_rescan_progress.objectid,
3052                fs_info->qgroup_rescan_progress.type,
3053                fs_info->qgroup_rescan_progress.offset, ret);
3054
3055        if (ret) {
3056                /*
3057                 * The rescan is about to end, we will not be scanning any
3058                 * further blocks. We cannot unset the RESCAN flag here, because
3059                 * we want to commit the transaction if everything went well.
3060                 * To make the live accounting work in this phase, we set our
3061                 * scan progress pointer such that every real extent objectid
3062                 * will be smaller.
3063                 */
3064                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3065                btrfs_release_path(path);
3066                mutex_unlock(&fs_info->qgroup_rescan_lock);
3067                return ret;
3068        }
3069        done = is_last_leaf(path);
3070
3071        btrfs_item_key_to_cpu(path->nodes[0], &found,
3072                              btrfs_header_nritems(path->nodes[0]) - 1);
3073        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3074
3075        scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3076        if (!scratch_leaf) {
3077                ret = -ENOMEM;
3078                mutex_unlock(&fs_info->qgroup_rescan_lock);
3079                goto out;
3080        }
3081        slot = path->slots[0];
3082        btrfs_release_path(path);
3083        mutex_unlock(&fs_info->qgroup_rescan_lock);
3084
3085        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3086                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3087                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3088                    found.type != BTRFS_METADATA_ITEM_KEY)
3089                        continue;
3090                if (found.type == BTRFS_METADATA_ITEM_KEY)
3091                        num_bytes = fs_info->nodesize;
3092                else
3093                        num_bytes = found.offset;
3094
3095                ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3096                                           &roots, false);
3097                if (ret < 0)
3098                        goto out;
3099                /* For rescan, just pass old_roots as NULL */
3100                ret = btrfs_qgroup_account_extent(trans, found.objectid,
3101                                                  num_bytes, NULL, roots);
3102                if (ret < 0)
3103                        goto out;
3104        }
3105out:
3106        if (scratch_leaf)
3107                free_extent_buffer(scratch_leaf);
3108
3109        if (done && !ret) {
3110                ret = 1;
3111                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3112        }
3113        return ret;
3114}
3115
3116static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3117{
3118        struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3119                                                     qgroup_rescan_work);
3120        struct btrfs_path *path;
3121        struct btrfs_trans_handle *trans = NULL;
3122        int err = -ENOMEM;
3123        int ret = 0;
3124
3125        path = btrfs_alloc_path();
3126        if (!path)
3127                goto out;
3128        /*
3129         * Rescan should only search the commit root; any later difference
3130         * will be recorded by the qgroup accounting.
3131         */
3132        path->search_commit_root = 1;
3133        path->skip_locking = 1;
3134
3135        err = 0;
3136        while (!err && !btrfs_fs_closing(fs_info)) {
3137                trans = btrfs_start_transaction(fs_info->fs_root, 0);
3138                if (IS_ERR(trans)) {
3139                        err = PTR_ERR(trans);
3140                        break;
3141                }
3142                if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3143                        err = -EINTR;
3144                } else {
3145                        err = qgroup_rescan_leaf(trans, path);
3146                }
3147                if (err > 0)
3148                        btrfs_commit_transaction(trans);
3149                else
3150                        btrfs_end_transaction(trans);
3151        }
3152
3153out:
3154        btrfs_free_path(path);
3155
3156        mutex_lock(&fs_info->qgroup_rescan_lock);
3157        if (!btrfs_fs_closing(fs_info))
3158                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3159
3160        if (err > 0 &&
3161            fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3162                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3163        } else if (err < 0) {
3164                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3165        }
3166        mutex_unlock(&fs_info->qgroup_rescan_lock);
3167
3168        /*
3169         * Only update the status item, since the previous part has already
3170         * updated the qgroup info.
3171         */
3172        trans = btrfs_start_transaction(fs_info->quota_root, 1);
3173        if (IS_ERR(trans)) {
3174                err = PTR_ERR(trans);
3175                btrfs_err(fs_info,
3176                          "failed to start transaction for status update: %d",
3177                          err);
3178                goto done;
3179        }
3180        ret = update_qgroup_status_item(trans);
3181        if (ret < 0) {
3182                err = ret;
3183                btrfs_err(fs_info, "failed to update qgroup status: %d", err);
3184        }
3185        btrfs_end_transaction(trans);
3186
3187        if (btrfs_fs_closing(fs_info)) {
3188                btrfs_info(fs_info, "qgroup scan paused");
3189        } else if (err >= 0) {
3190                btrfs_info(fs_info, "qgroup scan completed%s",
3191                        err > 0 ? " (inconsistency flag cleared)" : "");
3192        } else {
3193                btrfs_err(fs_info, "qgroup scan failed with %d", err);
3194        }
3195
3196done:
3197        mutex_lock(&fs_info->qgroup_rescan_lock);
3198        fs_info->qgroup_rescan_running = false;
3199        mutex_unlock(&fs_info->qgroup_rescan_lock);
3200        complete_all(&fs_info->qgroup_rescan_completion);
3201}
3202
3203/*
3204 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3205 * memory required for the rescan context.
3206 */
3207static int
3208qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3209                   int init_flags)
3210{
3211        int ret = 0;
3212
3213        if (!init_flags) {
3214                /* we're resuming qgroup rescan at mount time */
3215                if (!(fs_info->qgroup_flags &
3216                      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3217                        btrfs_warn(fs_info,
3218                        "qgroup rescan init failed, qgroup rescan is not queued");
3219                        ret = -EINVAL;
3220                } else if (!(fs_info->qgroup_flags &
3221                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3222                        btrfs_warn(fs_info,
3223                        "qgroup rescan init failed, qgroup is not enabled");
3224                        ret = -EINVAL;
3225                }
3226
3227                if (ret)
3228                        return ret;
3229        }
3230
3231        mutex_lock(&fs_info->qgroup_rescan_lock);
3232        spin_lock(&fs_info->qgroup_lock);
3233
3234        if (init_flags) {
3235                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3236                        btrfs_warn(fs_info,
3237                                   "qgroup rescan is already in progress");
3238                        ret = -EINPROGRESS;
3239                } else if (!(fs_info->qgroup_flags &
3240                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3241                        btrfs_warn(fs_info,
3242                        "qgroup rescan init failed, qgroup is not enabled");
3243                        ret = -EINVAL;
3244                }
3245
3246                if (ret) {
3247                        spin_unlock(&fs_info->qgroup_lock);
3248                        mutex_unlock(&fs_info->qgroup_rescan_lock);
3249                        return ret;
3250                }
3251                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3252        }
3253
3254        memset(&fs_info->qgroup_rescan_progress, 0,
3255                sizeof(fs_info->qgroup_rescan_progress));
3256        fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3257        init_completion(&fs_info->qgroup_rescan_completion);
3258        fs_info->qgroup_rescan_running = true;
3259
3260        spin_unlock(&fs_info->qgroup_lock);
3261        mutex_unlock(&fs_info->qgroup_rescan_lock);
3262
3263        memset(&fs_info->qgroup_rescan_work, 0,
3264               sizeof(fs_info->qgroup_rescan_work));
3265        btrfs_init_work(&fs_info->qgroup_rescan_work,
3266                        btrfs_qgroup_rescan_helper,
3267                        btrfs_qgroup_rescan_worker, NULL, NULL);
3268        return 0;
3269}
3270
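/*
 * Sketch of the two ways this is entered (the resume call mirrors how
 * the on-disk qgroup status is read back at mount time):
 *
 *	ret = qgroup_rescan_init(fs_info, 0, 1);		(fresh user-requested rescan)
 *	ret = qgroup_rescan_init(fs_info, rescan_progress, 0);	(resume at mount)
 */
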
3271static void
3272qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3273{
3274        struct rb_node *n;
3275        struct btrfs_qgroup *qgroup;
3276
3277        spin_lock(&fs_info->qgroup_lock);
3278        /* clear all current qgroup tracking information */
3279        for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3280                qgroup = rb_entry(n, struct btrfs_qgroup, node);
3281                qgroup->rfer = 0;
3282                qgroup->rfer_cmpr = 0;
3283                qgroup->excl = 0;
3284                qgroup->excl_cmpr = 0;
3285                qgroup_dirty(fs_info, qgroup);
3286        }
3287        spin_unlock(&fs_info->qgroup_lock);
3288}
3289
3290int
3291btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3292{
3293        int ret = 0;
3294        struct btrfs_trans_handle *trans;
3295
3296        ret = qgroup_rescan_init(fs_info, 0, 1);
3297        if (ret)
3298                return ret;
3299
3300        /*
3301         * We have set the rescan_progress to 0, which means no more
3302         * delayed refs will be accounted by btrfs_qgroup_account_ref.
3303         * However, btrfs_qgroup_account_ref may be running right after
3304         * its call to btrfs_find_all_roots, in which case it would still
3305         * do the accounting.
3306         * To solve this, we're committing the transaction, which will
3307         * ensure we run all delayed refs and only after that, we are
3308         * going to clear all tracking information for a clean start.
3309         */
3310
3311        trans = btrfs_join_transaction(fs_info->fs_root);
3312        if (IS_ERR(trans)) {
3313                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3314                return PTR_ERR(trans);
3315        }
3316        ret = btrfs_commit_transaction(trans);
3317        if (ret) {
3318                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3319                return ret;
3320        }
3321
3322        qgroup_rescan_zero_tracking(fs_info);
3323
3324        btrfs_queue_work(fs_info->qgroup_rescan_workers,
3325                         &fs_info->qgroup_rescan_work);
3326
3327        return 0;
3328}
3329
3330int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3331                                     bool interruptible)
3332{
3333        int running;
3334        int ret = 0;
3335
3336        mutex_lock(&fs_info->qgroup_rescan_lock);
3337        spin_lock(&fs_info->qgroup_lock);
3338        running = fs_info->qgroup_rescan_running;
3339        spin_unlock(&fs_info->qgroup_lock);
3340        mutex_unlock(&fs_info->qgroup_rescan_lock);
3341
3342        if (!running)
3343                return 0;
3344
3345        if (interruptible)
3346                ret = wait_for_completion_interruptible(
3347                                        &fs_info->qgroup_rescan_completion);
3348        else
3349                wait_for_completion(&fs_info->qgroup_rescan_completion);
3350
3351        return ret;
3352}
3353
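/*
 * Illustrative call, e.g. from an ioctl handler that should react to
 * signals (unmount-style callers would pass false instead and wait
 * unconditionally):
 *
 *	ret = btrfs_qgroup_wait_for_completion(fs_info, true);
 */
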
3354/*
3355 * This is only called from open_ctree() where we're still single threaded,
3356 * thus locking is omitted here.
3357 */
3358void
3359btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3360{
3361        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
3362                btrfs_queue_work(fs_info->qgroup_rescan_workers,
3363                                 &fs_info->qgroup_rescan_work);
3364}
3365
3366/*
3367 * Reserve qgroup space for range [start, start + len).
3368 *
3369 * This function will either reserve space from related qgroups or do
3370 * nothing if the range is already reserved.
3371 *
3372 * Return 0 for successful reservation
3373 * Return <0 for error (including -EDQUOT)
3374 *
3375 * NOTE: This function may sleep for memory allocation.
3376 *       If btrfs_qgroup_reserve_data() is called multiple times with the
3377 *       same @reserved, the caller must ensure that when an error happens
3378 *       it's OK to free *ALL* reserved space.
3379 */
3380int btrfs_qgroup_reserve_data(struct inode *inode,
3381                        struct extent_changeset **reserved_ret, u64 start,
3382                        u64 len)
3383{
3384        struct btrfs_root *root = BTRFS_I(inode)->root;
3385        struct ulist_node *unode;
3386        struct ulist_iterator uiter;
3387        struct extent_changeset *reserved;
3388        u64 orig_reserved;
3389        u64 to_reserve;
3390        int ret;
3391
3392        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3393            !is_fstree(root->root_key.objectid) || len == 0)
3394                return 0;
3395
3396        /* @reserved parameter is mandatory for qgroup */
3397        if (WARN_ON(!reserved_ret))
3398                return -EINVAL;
3399        if (!*reserved_ret) {
3400                *reserved_ret = extent_changeset_alloc();
3401                if (!*reserved_ret)
3402                        return -ENOMEM;
3403        }
3404        reserved = *reserved_ret;
3405        /* Record already reserved space */
3406        orig_reserved = reserved->bytes_changed;
3407        ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
3408                        start + len - 1, EXTENT_QGROUP_RESERVED, reserved);
3409
3410        /* Newly reserved space */
3411        to_reserve = reserved->bytes_changed - orig_reserved;
3412        trace_btrfs_qgroup_reserve_data(inode, start, len,
3413                                        to_reserve, QGROUP_RESERVE);
3414        if (ret < 0)
3415                goto cleanup;
3416        ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3417        if (ret < 0)
3418                goto cleanup;
3419
3420        return ret;
3421
3422cleanup:
3423        /* cleanup *ALL* already reserved ranges */
3424        ULIST_ITER_INIT(&uiter);
3425        while ((unode = ulist_next(&reserved->range_changed, &uiter)))
3426                clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
3427                                 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
3428        extent_changeset_release(reserved);
3429        return ret;
3430}
3431
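/*
 * Sketch of the expected calling pattern (hypothetical write path, not
 * a verbatim caller):
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, pos, count);
 *	if (ret < 0)
 *		goto out;	(the cleanup path frees *ALL* of @reserved)
 *	...
 *	extent_changeset_free(reserved);
 */
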
3432/* Free ranges specified by @reserved, normally in the error path */
3433static int qgroup_free_reserved_data(struct inode *inode,
3434                        struct extent_changeset *reserved, u64 start, u64 len)
3435{
3436        struct btrfs_root *root = BTRFS_I(inode)->root;
3437        struct ulist_node *unode;
3438        struct ulist_iterator uiter;
3439        struct extent_changeset changeset;
3440        int freed = 0;
3441        int ret;
3442
3443        extent_changeset_init(&changeset);
3444        len = round_up(start + len, root->fs_info->sectorsize);
3445        start = round_down(start, root->fs_info->sectorsize);
3446
3447        ULIST_ITER_INIT(&uiter);
3448        while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3449                u64 range_start = unode->val;
3450                /* unode->aux is the inclusive end */
3451                u64 range_len = unode->aux - range_start + 1;
3452                u64 free_start;
3453                u64 free_len;
3454
3455                extent_changeset_release(&changeset);
3456
3457                /* Only free range in range [start, start + len) */
3458                if (range_start >= start + len ||
3459                    range_start + range_len <= start)
3460                        continue;
3461                free_start = max(range_start, start);
3462                free_len = min(start + len, range_start + range_len) -
3463                           free_start;
3464                /*
3465                 * TODO: Also modify reserved->ranges_reserved to reflect
3466                 * the modification.
3467                 *
3468                 * However, as long as we free the qgroup reservation according
3469                 * to EXTENT_QGROUP_RESERVED, we won't double free.
3470                 * So there is no need to rush.
3471                 */
3472                ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
3473                                free_start, free_start + free_len - 1,
3474                                EXTENT_QGROUP_RESERVED, &changeset);
3475                if (ret < 0)
3476                        goto out;
3477                freed += changeset.bytes_changed;
3478        }
3479        btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3480                                  BTRFS_QGROUP_RSV_DATA);
3481        ret = freed;
3482out:
3483        extent_changeset_release(&changeset);
3484        return ret;
3485}
3486
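/*
 * The clamping above is plain interval intersection. For example, with
 * hypothetical numbers: freeing [8K, 24K) against a recorded range of
 * [16K, 32K) gives free_start = max(16K, 8K) = 16K and
 * free_len = min(24K, 32K) - 16K = 8K, so only the overlapping 8KiB of
 * reservation is returned to the qgroups.
 */
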
3487static int __btrfs_qgroup_release_data(struct inode *inode,
3488                        struct extent_changeset *reserved, u64 start, u64 len,
3489                        int free)
3490{
3491        struct extent_changeset changeset;
3492        int trace_op = QGROUP_RELEASE;
3493        int ret;
3494
3495        if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
3496                      &BTRFS_I(inode)->root->fs_info->flags))
3497                return 0;
3498
3499        /* In release case, we shouldn't have @reserved */
3500        WARN_ON(!free && reserved);
3501        if (free && reserved)
3502                return qgroup_free_reserved_data(inode, reserved, start, len);
3503        extent_changeset_init(&changeset);
3504        ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
3505                        start + len - 1, EXTENT_QGROUP_RESERVED, &changeset);
3506        if (ret < 0)
3507                goto out;
3508
3509        if (free)
3510                trace_op = QGROUP_FREE;
3511        trace_btrfs_qgroup_release_data(inode, start, len,
3512                                        changeset.bytes_changed, trace_op);
3513        if (free)
3514                btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
3515                                BTRFS_I(inode)->root->root_key.objectid,
3516                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3517        ret = changeset.bytes_changed;
3518out:
3519        extent_changeset_release(&changeset);
3520        return ret;
3521}
3522
3523/*
3524 * Free a reserved space range from io_tree and related qgroups.
3525 *
3526 * Should be called when a range of pages gets invalidated before reaching
3527 * disk, or for the error cleanup case.
3528 * If @reserved is given, only the reserved range in [@start, @start + @len)
3529 * will be freed.
3530 *
3531 * For data written to disk, use btrfs_qgroup_release_data().
3532 *
3533 * NOTE: This function may sleep for memory allocation.
3534 */
3535int btrfs_qgroup_free_data(struct inode *inode,
3536                        struct extent_changeset *reserved, u64 start, u64 len)
3537{
3538        return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3539}
3540
3541/*
3542 * Release a reserved space range from io_tree only.
3543 *
3544 * Should be called when a range of pages gets written to disk and the
3545 * corresponding FILE_EXTENT is inserted into the corresponding root.
3546 *
3547 * Since the new qgroup accounting framework will only update qgroup numbers
3548 * at commit_transaction() time, its reserved space shouldn't be freed from
3549 * related qgroups.
3550 *
3551 * But we should release the range from io_tree, to allow further writes to
3552 * be COWed.
3553 *
3554 * NOTE: This function may sleep for memory allocation.
3555 */
3556int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
3557{
3558        return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
3559}
3560
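/*
 * Putting the data hooks together, a simplified sketch of the intended
 * lifecycle (not a verbatim caller):
 *
 *	btrfs_qgroup_reserve_data(inode, &reserved, start, len);
 *		(buffered write path reserves quota)
 *	btrfs_qgroup_release_data(inode, start, len);
 *		(data hit disk, keep the rsv until transaction commit)
 *	btrfs_qgroup_free_data(inode, reserved, start, len);
 *		(error or page invalidation, return the rsv immediately)
 */
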
3561static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3562                              enum btrfs_qgroup_rsv_type type)
3563{
3564        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3565            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3566                return;
3567        if (num_bytes == 0)
3568                return;
3569
3570        spin_lock(&root->qgroup_meta_rsv_lock);
3571        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
3572                root->qgroup_meta_rsv_prealloc += num_bytes;
3573        else
3574                root->qgroup_meta_rsv_pertrans += num_bytes;
3575        spin_unlock(&root->qgroup_meta_rsv_lock);
3576}
3577
3578static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3579                             enum btrfs_qgroup_rsv_type type)
3580{
3581        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3582            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3583                return 0;
3584        if (num_bytes == 0)
3585                return 0;
3586
3587        spin_lock(&root->qgroup_meta_rsv_lock);
3588        if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
3589                num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
3590                                  num_bytes);
3591                root->qgroup_meta_rsv_prealloc -= num_bytes;
3592        } else {
3593                num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
3594                                  num_bytes);
3595                root->qgroup_meta_rsv_pertrans -= num_bytes;
3596        }
3597        spin_unlock(&root->qgroup_meta_rsv_lock);
3598        return num_bytes;
3599}
3600
3601int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3602                                enum btrfs_qgroup_rsv_type type, bool enforce)
3603{
3604        struct btrfs_fs_info *fs_info = root->fs_info;
3605        int ret;
3606
3607        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3608            !is_fstree(root->root_key.objectid) || num_bytes == 0)
3609                return 0;
3610
3611        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3612        trace_qgroup_meta_reserve(root, type, (s64)num_bytes);
3613        ret = qgroup_reserve(root, num_bytes, enforce, type);
3614        if (ret < 0)
3615                return ret;
3616        /*
3617         * Record what we have reserved into the root.
3618         *
3619         * This avoids an underflow across a quota disabled->enabled cycle:
3620         * in that case we may try to free space we haven't reserved
3621         * (since quota was disabled), so record what we reserved into the
3622         * root and ensure later releases won't underflow this number.
3623         */
3624        add_root_meta_rsv(root, num_bytes, type);
3625        return ret;
3626}
3627
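/*
 * Typical use is through thin wrappers (sketched from qgroup.h; see the
 * header for the exact definitions):
 *
 *	btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, enforce);
 *		(survives transaction commit, must be freed or converted)
 *	btrfs_qgroup_reserve_meta_pertrans(root, num_bytes, enforce);
 *		(freed wholesale when the transaction commits)
 */
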
3628void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
3629{
3630        struct btrfs_fs_info *fs_info = root->fs_info;
3631
3632        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3633            !is_fstree(root->root_key.objectid))
3634                return;
3635
3636        /* TODO: Update trace point to handle such free */
3637        trace_qgroup_meta_free_all_pertrans(root);
3638        /* Special value -1 means to free all reserved space */
3639        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
3640                                  BTRFS_QGROUP_RSV_META_PERTRANS);
3641}
3642
3643void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
3644                              enum btrfs_qgroup_rsv_type type)
3645{
3646        struct btrfs_fs_info *fs_info = root->fs_info;
3647
3648        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3649            !is_fstree(root->root_key.objectid))
3650                return;
3651
3652        /*
3653         * A reservation for META_PREALLOC can happen before quota is
3654         * enabled, which can lead to underflow.
3655         * Here, ensure we will only free what we have really reserved.
3656         */
3657        num_bytes = sub_root_meta_rsv(root, num_bytes, type);
3658        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3659        trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
3660        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
3661                                  num_bytes, type);
3662}
3663
3664static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
3665                                int num_bytes)
3666{
3667        struct btrfs_root *quota_root = fs_info->quota_root;
3668        struct btrfs_qgroup *qgroup;
3669        struct ulist_node *unode;
3670        struct ulist_iterator uiter;
3671        int ret = 0;
3672
3673        if (num_bytes == 0)
3674                return;
3675        if (!quota_root)
3676                return;
3677
3678        spin_lock(&fs_info->qgroup_lock);
3679        qgroup = find_qgroup_rb(fs_info, ref_root);
3680        if (!qgroup)
3681                goto out;
3682        ulist_reinit(fs_info->qgroup_ulist);
3683        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3684                       qgroup_to_aux(qgroup), GFP_ATOMIC);
3685        if (ret < 0)
3686                goto out;
3687        ULIST_ITER_INIT(&uiter);
3688        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3689                struct btrfs_qgroup *qg;
3690                struct btrfs_qgroup_list *glist;
3691
3692                qg = unode_aux_to_qgroup(unode);
3693
3694                qgroup_rsv_release(fs_info, qg, num_bytes,
3695                                BTRFS_QGROUP_RSV_META_PREALLOC);
3696                qgroup_rsv_add(fs_info, qg, num_bytes,
3697                                BTRFS_QGROUP_RSV_META_PERTRANS);
3698                list_for_each_entry(glist, &qg->groups, next_group) {
3699                        ret = ulist_add(fs_info->qgroup_ulist,
3700                                        glist->group->qgroupid,
3701                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3702                        if (ret < 0)
3703                                goto out;
3704                }
3705        }
3706out:
3707        spin_unlock(&fs_info->qgroup_lock);
3708}
3709
3710void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
3711{
3712        struct btrfs_fs_info *fs_info = root->fs_info;
3713
3714        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3715            !is_fstree(root->root_key.objectid))
3716                return;
3717        /* Same as btrfs_qgroup_free_meta_prealloc() */
3718        num_bytes = sub_root_meta_rsv(root, num_bytes,
3719                                      BTRFS_QGROUP_RSV_META_PREALLOC);
3720        trace_qgroup_meta_convert(root, num_bytes);
3721        qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
3722}
3723
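/*
 * Conversion sketch (hypothetical caller): metadata is reserved as
 * PREALLOC before the blocks are used; once they are dirtied in the
 * current transaction the reservation is flipped to PERTRANS so the
 * per-transaction cleanup reclaims it at commit time.
 *
 *	btrfs_qgroup_reserve_meta_prealloc(root, nbytes, true);
 *	...	(tree blocks actually get allocated and dirtied)
 *	btrfs_qgroup_convert_reserved_meta(root, nbytes);
 */
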
3724/*
3725 * Check for leaked qgroup reserved space, normally at inode destroy
3726 * time.
3727 */
3728void btrfs_qgroup_check_reserved_leak(struct inode *inode)
3729{
3730        struct extent_changeset changeset;
3731        struct ulist_node *unode;
3732        struct ulist_iterator iter;
3733        int ret;
3734
3735        extent_changeset_init(&changeset);
3736        ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
3737                        EXTENT_QGROUP_RESERVED, &changeset);
3738
3739        WARN_ON(ret < 0);
3740        if (WARN_ON(changeset.bytes_changed)) {
3741                ULIST_ITER_INIT(&iter);
3742                while ((unode = ulist_next(&changeset.range_changed, &iter))) {
3743                        btrfs_warn(BTRFS_I(inode)->root->fs_info,
3744                                "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
3745                                inode->i_ino, unode->val, unode->aux);
3746                }
3747                btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
3748                                BTRFS_I(inode)->root->root_key.objectid,
3749                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3750
3751        }
3752        extent_changeset_release(&changeset);
3753}
3754
3755void btrfs_qgroup_init_swapped_blocks(
3756        struct btrfs_qgroup_swapped_blocks *swapped_blocks)
3757{
3758        int i;
3759
3760        spin_lock_init(&swapped_blocks->lock);
3761        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
3762                swapped_blocks->blocks[i] = RB_ROOT;
3763        swapped_blocks->swapped = false;
3764}
3765
3766/*
3767 * Delete all swapped blocks record of @root.
3768 * Every record here means we skipped a full subtree scan for qgroup.
3769 *
3770 * Gets called when committing one transaction.
3771 */
3772void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
3773{
3774        struct btrfs_qgroup_swapped_blocks *swapped_blocks;
3775        int i;
3776
3777        swapped_blocks = &root->swapped_blocks;
3778
3779        spin_lock(&swapped_blocks->lock);
3780        if (!swapped_blocks->swapped)
3781                goto out;
3782        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3783                struct rb_root *cur_root = &swapped_blocks->blocks[i];
3784                struct btrfs_qgroup_swapped_block *entry;
3785                struct btrfs_qgroup_swapped_block *next;
3786
3787                rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
3788                                                     node)
3789                        kfree(entry);
3790                swapped_blocks->blocks[i] = RB_ROOT;
3791        }
3792        swapped_blocks->swapped = false;
3793out:
3794        spin_unlock(&swapped_blocks->lock);
3795}
3796
3797/*
3798 * Add subtree roots record into @subvol_root.
3799 *
3800 * @subvol_root:        tree root of the subvolume tree that gets swapped
3801 * @bg:                 block group under balance
3802 * @subvol_parent/slot: pointer to the subtree root in subvolume tree
3803 * @reloc_parent/slot:  pointer to the subtree root in reloc tree
3804 *                      BOTH POINTERS ARE BEFORE TREE SWAP
3805 * @last_snapshot:      last snapshot generation of the subvolume tree
3806 */
3807int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
3808                struct btrfs_root *subvol_root,
3809                struct btrfs_block_group_cache *bg,
3810                struct extent_buffer *subvol_parent, int subvol_slot,
3811                struct extent_buffer *reloc_parent, int reloc_slot,
3812                u64 last_snapshot)
3813{
3814        struct btrfs_fs_info *fs_info = subvol_root->fs_info;
3815        struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
3816        struct btrfs_qgroup_swapped_block *block;
3817        struct rb_node **cur;
3818        struct rb_node *parent = NULL;
3819        int level = btrfs_header_level(subvol_parent) - 1;
3820        int ret = 0;
3821
3822        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3823                return 0;
3824
3825        if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
3826            btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
3827                btrfs_err_rl(fs_info,
3828                "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
3829                        __func__,
3830                        btrfs_node_ptr_generation(subvol_parent, subvol_slot),
3831                        btrfs_node_ptr_generation(reloc_parent, reloc_slot));
3832                return -EUCLEAN;
3833        }
3834
3835        block = kmalloc(sizeof(*block), GFP_NOFS);
3836        if (!block) {
3837                ret = -ENOMEM;
3838                goto out;
3839        }
3840
3841        /*
3842         * @reloc_parent/slot is still before swap, while @block is going to
3843         * record the bytenr after swap, so we do the swap here.
3844         */
3845        block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
3846        block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
3847                                                             reloc_slot);
3848        block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
3849        block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
3850                                                            subvol_slot);
3851        block->last_snapshot = last_snapshot;
3852        block->level = level;
3853
3854        /*
3855         * If bg == NULL, we're called from btrfs_recover_relocation();
3856         * no one else can modify tree blocks, thus the qgroup numbers will
3857         * not change no matter the value of trace_leaf.
3858         */
3859        if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
3860                block->trace_leaf = true;
3861        else
3862                block->trace_leaf = false;
3863        btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
3864
3865        /* Insert @block into @blocks */
3866        spin_lock(&blocks->lock);
3867        cur = &blocks->blocks[level].rb_node;
3868        while (*cur) {
3869                struct btrfs_qgroup_swapped_block *entry;
3870
3871                parent = *cur;
3872                entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
3873                                 node);
3874
3875                if (entry->subvol_bytenr < block->subvol_bytenr) {
3876                        cur = &(*cur)->rb_left;
3877                } else if (entry->subvol_bytenr > block->subvol_bytenr) {
3878                        cur = &(*cur)->rb_right;
3879                } else {
3880                        if (entry->subvol_generation !=
3881                                        block->subvol_generation ||
3882                            entry->reloc_bytenr != block->reloc_bytenr ||
3883                            entry->reloc_generation !=
3884                                        block->reloc_generation) {
3885                                /*
3886                                 * Duplicated but mismatch entry found.
3887                                 * Shouldn't happen.
3888                                 *
3889                                 * Marking qgroup inconsistent should be enough
3890                                 * for end users.
3891                                 */
3892                                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
3893                                ret = -EEXIST;
3894                        }
3895                        kfree(block);
3896                        goto out_unlock;
3897                }
3898        }
3899        rb_link_node(&block->node, parent, cur);
3900        rb_insert_color(&block->node, &blocks->blocks[level]);
3901        blocks->swapped = true;
3902out_unlock:
3903        spin_unlock(&blocks->lock);
3904out:
3905        if (ret < 0)
3906                fs_info->qgroup_flags |=
3907                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3908        return ret;
3909}
3910
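/*
 * Note on the rb-tree ordering above: records are keyed by
 * subvol_bytenr, i.e. the bytenr the block will have *after* the swap,
 * which is exactly what btrfs_qgroup_trace_subtree_after_cow() below
 * looks up when the subvolume block is later COWed.
 */
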
3911/*
3912 * Check if the tree block is a subtree root, and if so do the needed
3913 * delayed subtree trace for qgroup.
3914 *
3915 * This is called during btrfs_cow_block().
3916 */
3917int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
3918                                         struct btrfs_root *root,
3919                                         struct extent_buffer *subvol_eb)
3920{
3921        struct btrfs_fs_info *fs_info = root->fs_info;
3922        struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
3923        struct btrfs_qgroup_swapped_block *block;
3924        struct extent_buffer *reloc_eb = NULL;
3925        struct rb_node *node;
3926        bool found = false;
3927        bool swapped = false;
3928        int level = btrfs_header_level(subvol_eb);
3929        int ret = 0;
3930        int i;
3931
3932        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
3933                return 0;
3934        if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
3935                return 0;
3936
3937        spin_lock(&blocks->lock);
3938        if (!blocks->swapped) {
3939                spin_unlock(&blocks->lock);
3940                return 0;
3941        }
3942        node = blocks->blocks[level].rb_node;
3943
3944        while (node) {
3945                block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
3946                if (block->subvol_bytenr < subvol_eb->start) {
3947                        node = node->rb_left;
3948                } else if (block->subvol_bytenr > subvol_eb->start) {
3949                        node = node->rb_right;
3950                } else {
3951                        found = true;
3952                        break;
3953                }
3954        }
3955        if (!found) {
3956                spin_unlock(&blocks->lock);
3957                goto out;
3958        }
3959        /* Found one, remove it from @blocks first and update blocks->swapped */
3960        rb_erase(&block->node, &blocks->blocks[level]);
3961        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3962                if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
3963                        swapped = true;
3964                        break;
3965                }
3966        }
3967        blocks->swapped = swapped;
3968        spin_unlock(&blocks->lock);
3969
3970        /* Read out reloc subtree root */
3971        reloc_eb = read_tree_block(fs_info, block->reloc_bytenr,
3972                                   block->reloc_generation, block->level,
3973                                   &block->first_key);
3974        if (IS_ERR(reloc_eb)) {
3975                ret = PTR_ERR(reloc_eb);
3976                reloc_eb = NULL;
3977                goto free_out;
3978        }
3979        if (!extent_buffer_uptodate(reloc_eb)) {
3980                ret = -EIO;
3981                goto free_out;
3982        }
3983
3984        ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
3985                        block->last_snapshot, block->trace_leaf);
3986free_out:
3987        kfree(block);
3988        free_extent_buffer(reloc_eb);
3989out:
3990        if (ret < 0) {
3991                btrfs_err_rl(fs_info,
3992                             "failed to account subtree at bytenr %llu: %d",
3993                             subvol_eb->start, ret);
3994                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3995        }
3996        return ret;
3997}
3998