linux/fs/btrfs/qgroup.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"

/* TODO XXX FIXME
 *  - subvol delete -> delete when ref goes to 0? delete limits also?
 *  - reorganize keys
 *  - compressed
 *  - sync
 *  - copy also limits on subvol creation
 *  - limit
 *  - caches for ulists
 *  - performance benchmarks
 *  - check all ioctl parameters
 */

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */

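/*
 * Sum the reserved space of all reservation types (data, meta_pertrans,
 * meta_prealloc) of one qgroup.
 */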
static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
        u64 ret = 0;
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                ret += qgroup->rsv.values[i];

        return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
        if (type == BTRFS_QGROUP_RSV_DATA)
                return "data";
        if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
                return "meta_pertrans";
        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
                return "meta_prealloc";
        return NULL;
}
#endif

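/* Account @num_bytes of newly reserved space of @type to @qgroup. */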
static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
                           struct btrfs_qgroup *qgroup, u64 num_bytes,
                           enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
        qgroup->rsv.values[type] += num_bytes;
}

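/*
 * Release @num_bytes of reserved space of @type from @qgroup.
 *
 * If the qgroup has less reserved space than we are asked to release, clamp
 * the counter to 0 instead of underflowing, and warn on debug builds so the
 * imbalance can be tracked down.
 */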
static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
                               struct btrfs_qgroup *qgroup, u64 num_bytes,
                               enum btrfs_qgroup_rsv_type type)
{
        trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
        if (qgroup->rsv.values[type] >= num_bytes) {
                qgroup->rsv.values[type] -= num_bytes;
                return;
        }
#ifdef CONFIG_BTRFS_DEBUG
        WARN_RATELIMIT(1,
                "qgroup %llu %s reserved space underflow, have %llu to free %llu",
                qgroup->qgroupid, qgroup_rsv_type_str(type),
                qgroup->rsv.values[type], num_bytes);
#endif
        qgroup->rsv.values[type] = 0;
}

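/*
 * Add/release every reservation type of @src to/from @dest.  Used by the
 * exclusive accounting quick path below when a parent inherits or drops the
 * reservations of a child qgroup.
 */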
static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
                                     struct btrfs_qgroup *dest,
                                     struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
                                         struct btrfs_qgroup *dest,
                                         struct btrfs_qgroup *src)
{
        int i;

        for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
                qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}

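/*
 * The old/new refcnt updates below use a sequence number trick so the
 * counters don't have to be reset before every accounting run: any refcnt
 * below the current @seq is stale and counts as zero.  For example, with
 * seq == 100 and a stale old_refcnt == 97, an update by +1 first snaps the
 * counter to 100 and then increments it to 101, so the getter reports
 * 101 - 100 = 1.
 */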
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->old_refcnt < seq)
                qg->old_refcnt = seq;
        qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
                                           int mod)
{
        if (qg->new_refcnt < seq)
                qg->new_refcnt = seq;
        qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        if (qg->old_refcnt < seq)
                return 0;
        return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
        if (qg->new_refcnt < seq)
                return 0;
        return qg->new_refcnt - seq;
}

/*
 * Glue structure to represent the relations between qgroups.  Each instance
 * links one member (child) to one group (parent): @next_group chains it on
 * the member's ->groups list, @next_member chains it on the parent's
 * ->members list.
 */
struct btrfs_qgroup_list {
        struct list_head next_group;
        struct list_head next_member;
        struct btrfs_qgroup *group;
        struct btrfs_qgroup *member;
};

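/*
 * Helpers to stash a struct btrfs_qgroup pointer in the u64 aux member of a
 * ulist node and to get it back, so qgroup iteration can reuse the generic
 * ulist machinery.
 */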
static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
        return (u64)(uintptr_t)qg;
}

static inline struct btrfs_qgroup *unode_aux_to_qgroup(struct ulist_node *n)
{
        return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}

static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
                   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
                                           u64 qgroupid)
{
        struct rb_node *n = fs_info->qgroup_tree.rb_node;
        struct btrfs_qgroup *qgroup;

        while (n) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                if (qgroup->qgroupid < qgroupid)
                        n = n->rb_left;
                else if (qgroup->qgroupid > qgroupid)
                        n = n->rb_right;
                else
                        return qgroup;
        }
        return NULL;
}

/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
                                          u64 qgroupid)
{
        struct rb_node **p = &fs_info->qgroup_tree.rb_node;
        struct rb_node *parent = NULL;
        struct btrfs_qgroup *qgroup;

        while (*p) {
                parent = *p;
                qgroup = rb_entry(parent, struct btrfs_qgroup, node);

                if (qgroup->qgroupid < qgroupid)
                        p = &(*p)->rb_left;
                else if (qgroup->qgroupid > qgroupid)
                        p = &(*p)->rb_right;
                else
                        return qgroup;
        }

        qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
        if (!qgroup)
                return ERR_PTR(-ENOMEM);

        qgroup->qgroupid = qgroupid;
        INIT_LIST_HEAD(&qgroup->groups);
        INIT_LIST_HEAD(&qgroup->members);
        INIT_LIST_HEAD(&qgroup->dirty);

        rb_link_node(&qgroup->node, parent, p);
        rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

        return qgroup;
}

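/*
 * Unlink @qgroup from all of its parent and member relation lists and free
 * the glue entries.  The qgroup itself is not freed here; callers do that
 * separately where needed.
 */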
static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
                            struct btrfs_qgroup *qgroup)
{
        struct btrfs_qgroup_list *list;

        list_del(&qgroup->dirty);
        while (!list_empty(&qgroup->groups)) {
                list = list_first_entry(&qgroup->groups,
                                        struct btrfs_qgroup_list, next_group);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }

        while (!list_empty(&qgroup->members)) {
                list = list_first_entry(&qgroup->members,
                                        struct btrfs_qgroup_list, next_member);
                list_del(&list->next_group);
                list_del(&list->next_member);
                kfree(list);
        }
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
        struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

        if (!qgroup)
                return -ENOENT;

        rb_erase(&qgroup->node, &fs_info->qgroup_tree);
        __del_qgroup_rb(fs_info, qgroup);
        return 0;
}

/* must be called with qgroup_lock held */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *member;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup_list *list;

        member = find_qgroup_rb(fs_info, memberid);
        parent = find_qgroup_rb(fs_info, parentid);
        if (!member || !parent)
                return -ENOENT;

        list = kzalloc(sizeof(*list), GFP_ATOMIC);
        if (!list)
                return -ENOMEM;

        list->group = parent;
        list->member = member;
        list_add_tail(&list->next_group, &member->groups);
        list_add_tail(&list->next_member, &parent->members);

        return 0;
}

/* must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
                           u64 memberid, u64 parentid)
{
        struct btrfs_qgroup *member;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup_list *list;

        member = find_qgroup_rb(fs_info, memberid);
        parent = find_qgroup_rb(fs_info, parentid);
        if (!member || !parent)
                return -ENOENT;

        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        list_del(&list->next_group);
                        list_del(&list->next_member);
                        kfree(list);
                        return 0;
                }
        }
        return -ENOENT;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
                               u64 rfer, u64 excl)
{
        struct btrfs_qgroup *qgroup;

        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (!qgroup)
                return -EINVAL;
        if (qgroup->rfer != rfer || qgroup->excl != excl)
                return -EINVAL;
        return 0;
}
#endif

/*
 * The full config is read in one go; this is only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path = NULL;
        struct extent_buffer *l;
        int slot;
        int ret = 0;
        u64 flags = 0;
        u64 rescan_progress = 0;

        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return 0;

        fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
        if (!fs_info->qgroup_ulist) {
                ret = -ENOMEM;
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;
        /* default this to quota off, in case no status key is found */
        fs_info->qgroup_flags = 0;

        /*
         * pass 1: read status, all qgroup infos and limits
         */
        key.objectid = 0;
        key.type = 0;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
        if (ret)
                goto out;

        while (1) {
                struct btrfs_qgroup *qgroup;

                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
                        struct btrfs_qgroup_status_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_status_item);

                        if (btrfs_qgroup_status_version(l, ptr) !=
                            BTRFS_QGROUP_STATUS_VERSION) {
                                btrfs_err(fs_info,
                                 "old qgroup version, quota disabled");
                                goto out;
                        }
                        if (btrfs_qgroup_status_generation(l, ptr) !=
                            fs_info->generation) {
                                flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                                btrfs_err(fs_info,
                                        "qgroup generation mismatch, marked as inconsistent");
                        }
                        fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
                                                                          ptr);
                        rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
                        goto next1;
                }

                if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
                    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
                        goto next1;

                qgroup = find_qgroup_rb(fs_info, found_key.offset);
                if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
                    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
                        btrfs_err(fs_info, "inconsistent qgroup config");
                        flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                }
                if (!qgroup) {
                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
                        if (IS_ERR(qgroup)) {
                                ret = PTR_ERR(qgroup);
                                goto out;
                        }
                }
                ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
                if (ret < 0)
                        goto out;

                switch (found_key.type) {
                case BTRFS_QGROUP_INFO_KEY: {
                        struct btrfs_qgroup_info_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_info_item);
                        qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
                        qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
                        qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
                        qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
                        /* generation currently unused */
                        break;
                }
                case BTRFS_QGROUP_LIMIT_KEY: {
                        struct btrfs_qgroup_limit_item *ptr;

                        ptr = btrfs_item_ptr(l, slot,
                                             struct btrfs_qgroup_limit_item);
                        qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
                        qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
                        qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
                        qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
                        qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
                        break;
                }
                }
next1:
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
        btrfs_release_path(path);

        /*
         * pass 2: read all qgroup relations
         */
        key.objectid = 0;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = 0;
        ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
        if (ret)
                goto out;
        while (1) {
                slot = path->slots[0];
                l = path->nodes[0];
                btrfs_item_key_to_cpu(l, &found_key, slot);

                if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
                        goto next2;

                if (found_key.objectid > found_key.offset) {
                        /* parent <- member, not needed to build config */
                        /* FIXME should we omit the key completely? */
                        goto next2;
                }

                ret = add_relation_rb(fs_info, found_key.objectid,
                                      found_key.offset);
                if (ret == -ENOENT) {
                        btrfs_warn(fs_info,
                                "orphan qgroup relation 0x%llx->0x%llx",
                                found_key.objectid, found_key.offset);
                        ret = 0;        /* ignore the error */
                }
                if (ret)
                        goto out;
next2:
                ret = btrfs_next_item(quota_root, path);
                if (ret < 0)
                        goto out;
                if (ret)
                        break;
        }
out:
        btrfs_free_path(path);
        fs_info->qgroup_flags |= flags;
        if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
                clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
                 ret >= 0)
                ret = qgroup_rescan_init(fs_info, rescan_progress, 0);

        if (ret < 0) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
                btrfs_sysfs_del_qgroups(fs_info);
        }

        return ret < 0 ? ret : 0;
}

/*
 * Called in close_ctree() when quota is still enabled.  This verifies we don't
 * leak some reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
        struct rb_node *node;
        bool ret = false;

        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
                return ret;
        /*
         * Since we're unmounting, there is no race and no need to grab qgroup
         * lock.  And here we don't go post-order to provide a more
         * user-friendly sorted result.
         */
        for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
                struct btrfs_qgroup *qgroup;
                int i;

                qgroup = rb_entry(node, struct btrfs_qgroup, node);
                for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
                        if (qgroup->rsv.values[i]) {
                                ret = true;
                                btrfs_warn(fs_info,
                "qgroup %hu/%llu has unreleased space, type %d rsv %llu",
                                   btrfs_qgroup_level(qgroup->qgroupid),
                                   btrfs_qgroup_subvolid(qgroup->qgroupid),
                                   i, qgroup->rsv.values[i]);
                        }
                }
        }
        return ret;
}

/*
 * This is called from close_ctree(), open_ctree() or btrfs_quota_disable().
 * The first two are single-threaded paths, and for the third one we have
 * already set quota_root to NULL with qgroup_lock held, so it is safe to
 * clean up the in-memory structures without holding qgroup_lock.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
        struct rb_node *n;
        struct btrfs_qgroup *qgroup;

        while ((n = rb_first(&fs_info->qgroup_tree))) {
                qgroup = rb_entry(n, struct btrfs_qgroup, node);
                rb_erase(n, &fs_info->qgroup_tree);
                __del_qgroup_rb(fs_info, qgroup);
                btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
                kfree(qgroup);
        }
        /*
         * We call btrfs_free_qgroup_config() both when unmounting the
         * filesystem and when disabling quota, so set qgroup_ulist to NULL
         * here to avoid a double free.
         */
        ulist_free(fs_info->qgroup_ulist);
        fs_info->qgroup_ulist = NULL;
        btrfs_sysfs_del_qgroups(fs_info);
}

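/*
 * Relation items are stored in the quota tree twice, once as (src, dst) and
 * once as (dst, src), so the relation can be found from either end.  The
 * config reading code above relies on this and only uses the keys with
 * objectid <= offset to build the in-memory relations.
 */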
static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = src;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = dst;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

        btrfs_mark_buffer_dirty(path->nodes[0]);

        btrfs_free_path(path);
        return ret;
}

static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
                                    u64 dst)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = src;
        key.type = BTRFS_QGROUP_RELATION_KEY;
        key.offset = dst;

        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);
out:
        btrfs_free_path(path);
        return ret;
}

static int add_qgroup_item(struct btrfs_trans_handle *trans,
                           struct btrfs_root *quota_root, u64 qgroupid)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_qgroup_info_item *qgroup_info;
        struct btrfs_qgroup_limit_item *qgroup_limit;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        if (btrfs_is_testing(quota_root->fs_info))
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroupid;

        /*
         * Avoid a transaction abort by catching -EEXIST here. In that
         * case, we proceed by re-initializing the existing structure
         * on disk.
         */

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*qgroup_info));
        if (ret && ret != -EEXIST)
                goto out;

        leaf = path->nodes[0];
        qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
                                 struct btrfs_qgroup_info_item);
        btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
        btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
        btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

        btrfs_mark_buffer_dirty(leaf);

        btrfs_release_path(path);

        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*qgroup_limit));
        if (ret && ret != -EEXIST)
                goto out;

        leaf = path->nodes[0];
        qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
                                  struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
        btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        int ret;
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroupid;
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);
        if (ret)
                goto out;

        btrfs_release_path(path);

        key.type = BTRFS_QGROUP_LIMIT_KEY;
        ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, quota_root, path);

out:
        btrfs_free_path(path);
        return ret;
}

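/*
 * Write the in-memory limits of @qgroup back to its on-disk
 * BTRFS_QGROUP_LIMIT_KEY item.  Returns -ENOENT if the item is missing from
 * the quota tree.
 */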
static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
                                    struct btrfs_qgroup *qgroup)
{
        struct btrfs_root *quota_root = trans->fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_limit_item *qgroup_limit;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_LIMIT_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
        btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
        btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
        btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
        btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
        btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_qgroup *qgroup)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_info_item *qgroup_info;
        int ret;
        int slot;

        if (btrfs_is_testing(fs_info))
                return 0;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_INFO_KEY;
        key.offset = qgroup->qgroupid;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
        btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
        btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
        btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
        btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
        btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

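/*
 * Sync fs_info->qgroup_flags and the current rescan progress into the
 * on-disk BTRFS_QGROUP_STATUS_KEY item.
 */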
static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root = fs_info->quota_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *l;
        struct btrfs_qgroup_status_item *ptr;
        int ret;
        int slot;

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;

        if (ret)
                goto out;

        l = path->nodes[0];
        slot = path->slots[0];
        ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
        btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
        btrfs_set_qgroup_status_rescan(l, ptr,
                                fs_info->qgroup_rescan_progress.objectid);

        btrfs_mark_buffer_dirty(l);

out:
        btrfs_free_path(path);
        return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        struct extent_buffer *leaf = NULL;
        int ret;
        int nr = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = 0;
        key.offset = 0;
        key.type = 0;

        while (1) {
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        goto out;
                leaf = path->nodes[0];
                nr = btrfs_header_nritems(leaf);
                if (!nr)
                        break;
                /*
                 * Delete the leaf one by one since the whole tree is
                 * going to be deleted.
                 */
                path->slots[0] = 0;
                ret = btrfs_del_items(trans, root, path, 0, nr);
                if (ret)
                        goto out;

                btrfs_release_path(path);
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}


int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *quota_root;
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct btrfs_path *path = NULL;
        struct btrfs_qgroup_status_item *ptr;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_qgroup *qgroup = NULL;
        struct btrfs_trans_handle *trans = NULL;
        struct ulist *ulist = NULL;
        int ret = 0;
        int slot;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (fs_info->quota_root)
                goto out;

        ulist = ulist_alloc(GFP_KERNEL);
        if (!ulist) {
                ret = -ENOMEM;
                goto out;
        }

        ret = btrfs_sysfs_add_qgroups(fs_info);
        if (ret < 0)
                goto out;

        /*
         * Unlock qgroup_ioctl_lock before starting the transaction. This is to
         * avoid lock acquisition inversion problems (reported by lockdep) between
         * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
         * start a transaction.
         * After we start the transaction, lock qgroup_ioctl_lock again and
         * check if someone else created the quota root in the meanwhile. If so,
         * just return success and release the transaction handle.
         *
         * Also we don't need to worry about someone else calling
         * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
         * that function returns 0 (success) when the sysfs entries already exist.
         */
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        /*
         * 1 for the quota root item
         * 1 for the BTRFS_QGROUP_STATUS item
         *
         * We also need 2 items (a QGROUP_INFO and a QGROUP_LIMIT item) per
         * subvolume. However those are not currently reserved since it
         * would be a lot of overkill.
         */
        trans = btrfs_start_transaction(tree_root, 2);

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }

        if (fs_info->quota_root)
                goto out;

        fs_info->qgroup_ulist = ulist;
        ulist = NULL;

        /*
         * initially create the quota tree
         */
        quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
        if (IS_ERR(quota_root)) {
                ret = PTR_ERR(quota_root);
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                btrfs_abort_transaction(trans, ret);
                goto out_free_root;
        }

        key.objectid = 0;
        key.type = BTRFS_QGROUP_STATUS_KEY;
        key.offset = 0;

        ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
                                      sizeof(*ptr));
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        leaf = path->nodes[0];
        ptr = btrfs_item_ptr(leaf, path->slots[0],
                                 struct btrfs_qgroup_status_item);
        btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
        btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
        fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
                                BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
        btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

        btrfs_mark_buffer_dirty(leaf);

        key.objectid = 0;
        key.type = BTRFS_ROOT_REF_KEY;
        key.offset = 0;

        btrfs_release_path(path);
        ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
        if (ret > 0)
                goto out_add_root;
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        while (1) {
                slot = path->slots[0];
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, slot);

                if (found_key.type == BTRFS_ROOT_REF_KEY) {

                        /* Release locks on tree_root before we access quota_root */
                        btrfs_release_path(path);

                        ret = add_qgroup_item(trans, quota_root,
                                              found_key.offset);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }

                        qgroup = add_qgroup_rb(fs_info, found_key.offset);
                        if (IS_ERR(qgroup)) {
                                ret = PTR_ERR(qgroup);
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        ret = btrfs_search_slot_for_read(tree_root, &found_key,
                                                         path, 1, 0);
                        if (ret < 0) {
                                btrfs_abort_transaction(trans, ret);
                                goto out_free_path;
                        }
                        if (ret > 0) {
                                /*
                                 * Shouldn't happen, but in case it does we
                                 * don't need to do the btrfs_next_item, just
                                 * continue.
                                 */
                                continue;
                        }
                }
                ret = btrfs_next_item(tree_root, path);
                if (ret < 0) {
                        btrfs_abort_transaction(trans, ret);
                        goto out_free_path;
                }
                if (ret)
                        break;
        }

out_add_root:
        btrfs_release_path(path);
        ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
        if (IS_ERR(qgroup)) {
                ret = PTR_ERR(qgroup);
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }
        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
        if (ret < 0) {
                btrfs_abort_transaction(trans, ret);
                goto out_free_path;
        }

        ret = btrfs_commit_transaction(trans);
        trans = NULL;
        if (ret)
                goto out_free_path;

        /*
         * Set quota enabled flag after committing the transaction, to avoid
         * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
         * creation.
         */
        spin_lock(&fs_info->qgroup_lock);
        fs_info->quota_root = quota_root;
        set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        spin_unlock(&fs_info->qgroup_lock);

        ret = qgroup_rescan_init(fs_info, 0, 1);
        if (!ret) {
                qgroup_rescan_zero_tracking(fs_info);
                fs_info->qgroup_rescan_running = true;
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
        }

out_free_path:
        btrfs_free_path(path);
out_free_root:
        if (ret)
                btrfs_put_root(quota_root);
out:
        if (ret) {
                ulist_free(fs_info->qgroup_ulist);
                fs_info->qgroup_ulist = NULL;
                btrfs_sysfs_del_qgroups(fs_info);
        }
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (ret && trans)
                btrfs_end_transaction(trans);
        else if (trans)
                ret = btrfs_end_transaction(trans);
        ulist_free(ulist);
        return ret;
}


int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
        struct btrfs_root *quota_root;
        struct btrfs_trans_handle *trans = NULL;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root)
                goto out;
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        /*
         * 1 for the root item
         *
         * We should also reserve enough items for the quota tree deletion in
         * btrfs_clean_quota_tree but this is not done.
         *
         * Also, we must always start a transaction without holding the mutex
         * qgroup_ioctl_lock, see btrfs_quota_enable().
         */
        trans = btrfs_start_transaction(fs_info->tree_root, 1);

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
                trans = NULL;
                goto out;
        }

        if (!fs_info->quota_root)
                goto out;

        clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
        btrfs_qgroup_wait_for_completion(fs_info, false);
        spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
        spin_unlock(&fs_info->qgroup_lock);

        btrfs_free_qgroup_config(fs_info);

        ret = btrfs_clean_quota_tree(trans, quota_root);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        ret = btrfs_del_root(trans, &quota_root->root_key);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                goto out;
        }

        list_del(&quota_root->dirty_list);

        btrfs_tree_lock(quota_root->node);
        btrfs_clean_tree_block(quota_root->node);
        btrfs_tree_unlock(quota_root->node);
        btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);

        btrfs_put_root(quota_root);

out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        if (ret && trans)
                btrfs_end_transaction(trans);
        else if (trans)
                ret = btrfs_end_transaction(trans);

        return ret;
}

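/*
 * Queue @qgroup on fs_info->dirty_qgroups so its on-disk items get updated
 * later; the list_empty() check makes repeated calls idempotent.
 */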
static void qgroup_dirty(struct btrfs_fs_info *fs_info,
                         struct btrfs_qgroup *qgroup)
{
        if (list_empty(&qgroup->dirty))
                list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

/*
 * The easy accounting: we are updating the relationship of a qgroup whose
 * child qgroup only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for the parent,
 * so excl/rfer just get added/removed.
 *
 * The same applies to qgroup reservation space, which must also be added
 * to/removed from the parent; otherwise, when the child releases its
 * reservation, the parent would underflow its own reservation (in the
 * relationship-adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
                                    struct ulist *tmp, u64 ref_root,
                                    struct btrfs_qgroup *src, int sign)
{
        struct btrfs_qgroup *qgroup;
        struct btrfs_qgroup_list *glist;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        u64 num_bytes = src->excl;
        int ret = 0;

        qgroup = find_qgroup_rb(fs_info, ref_root);
        if (!qgroup)
                goto out;

        qgroup->rfer += sign * num_bytes;
        qgroup->rfer_cmpr += sign * num_bytes;

        WARN_ON(sign < 0 && qgroup->excl < num_bytes);
        qgroup->excl += sign * num_bytes;
        qgroup->excl_cmpr += sign * num_bytes;

        if (sign > 0)
                qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
        else
                qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

        qgroup_dirty(fs_info, qgroup);

        /* Get all of the parent groups that contain this qgroup */
        list_for_each_entry(glist, &qgroup->groups, next_group) {
                ret = ulist_add(tmp, glist->group->qgroupid,
                                qgroup_to_aux(glist->group), GFP_ATOMIC);
                if (ret < 0)
                        goto out;
        }

        /* Iterate all of the parents and adjust their reference counts */
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(tmp, &uiter))) {
                qgroup = unode_aux_to_qgroup(unode);
                qgroup->rfer += sign * num_bytes;
                qgroup->rfer_cmpr += sign * num_bytes;
                WARN_ON(sign < 0 && qgroup->excl < num_bytes);
                qgroup->excl += sign * num_bytes;
                if (sign > 0)
                        qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
                else
                        qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
                qgroup->excl_cmpr += sign * num_bytes;
                qgroup_dirty(fs_info, qgroup);

                /* Add any parents of the parents */
                list_for_each_entry(glist, &qgroup->groups, next_group) {
                        ret = ulist_add(tmp, glist->group->qgroupid,
                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
                        if (ret < 0)
                                goto out;
                }
        }
        ret = 0;
out:
        return ret;
}

/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, just updating all parents is enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update, return >0 for a full rescan being needed
 * and the INCONSISTENT flag being set.
 * Return < 0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
                                   struct ulist *tmp, u64 src, u64 dst,
                                   int sign)
{
        struct btrfs_qgroup *qgroup;
        int ret = 1;
        int err = 0;

        qgroup = find_qgroup_rb(fs_info, src);
        if (!qgroup)
                goto out;
        if (qgroup->excl == qgroup->rfer) {
                ret = 0;
                err = __qgroup_excl_accounting(fs_info, tmp, dst,
                                               qgroup, sign);
                if (err < 0) {
                        ret = err;
                        goto out;
                }
        }
out:
        if (ret)
                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
        return ret;
}

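/*
 * Make the qgroup @src a member (child) of the qgroup @dst.  @dst must be at
 * a higher level than @src; both the on-disk relation items and the
 * in-memory rbtree lists are updated, and the quick accounting path is
 * attempted for the new relationship.
 */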
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                              u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        unsigned int nofs_flag;
        int ret = 0;

        /* Check the level of src and dst first */
        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                return -EINVAL;

        /* We hold a transaction handle open, must do a NOFS allocation. */
        nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }
        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        if (!member || !parent) {
                ret = -EINVAL;
                goto out;
        }

        /* Check if such a qgroup relation exists already */
        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        ret = -EEXIST;
                        goto out;
                }
        }

        ret = add_qgroup_relation_item(trans, src, dst);
        if (ret)
                goto out;

        ret = add_qgroup_relation_item(trans, dst, src);
        if (ret) {
                del_qgroup_relation_item(trans, src, dst);
                goto out;
        }

        spin_lock(&fs_info->qgroup_lock);
        ret = add_relation_rb(fs_info, src, dst);
        if (ret < 0) {
                spin_unlock(&fs_info->qgroup_lock);
                goto out;
        }
        ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
        spin_unlock(&fs_info->qgroup_lock);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        ulist_free(tmp);
        return ret;
}


static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                                 u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_qgroup *parent;
        struct btrfs_qgroup *member;
        struct btrfs_qgroup_list *list;
        struct ulist *tmp;
        bool found = false;
        unsigned int nofs_flag;
        int ret = 0;
        int ret2;

        /* We hold a transaction handle open, must do a NOFS allocation. */
        nofs_flag = memalloc_nofs_save();
        tmp = ulist_alloc(GFP_KERNEL);
        memalloc_nofs_restore(nofs_flag);
        if (!tmp)
                return -ENOMEM;

        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }

        member = find_qgroup_rb(fs_info, src);
        parent = find_qgroup_rb(fs_info, dst);
        /*
         * If the parent/member pair doesn't exist, then only try to delete
         * the dead relation items.
         */
        if (!member || !parent)
                goto delete_item;

        /* Check if such a qgroup relation exists already */
        list_for_each_entry(list, &member->groups, next_group) {
                if (list->group == parent) {
                        found = true;
                        break;
                }
        }

delete_item:
        ret = del_qgroup_relation_item(trans, src, dst);
        if (ret < 0 && ret != -ENOENT)
                goto out;
        ret2 = del_qgroup_relation_item(trans, dst, src);
        if (ret2 < 0 && ret2 != -ENOENT)
                goto out;

        /* At least one deletion succeeded, return 0 */
        if (!ret || !ret2)
                ret = 0;

        if (found) {
                spin_lock(&fs_info->qgroup_lock);
                del_relation_rb(fs_info, src, dst);
                ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
                spin_unlock(&fs_info->qgroup_lock);
        }
out:
        ulist_free(tmp);
        return ret;
}


int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
                              u64 dst)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        ret = __del_qgroup_relation(trans, src, dst);
        mutex_unlock(&fs_info->qgroup_ioctl_lock);

        return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        int ret = 0;

        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root) {
                ret = -ENOTCONN;
                goto out;
        }
        quota_root = fs_info->quota_root;
        qgroup = find_qgroup_rb(fs_info, qgroupid);
        if (qgroup) {
                ret = -EEXIST;
                goto out;
        }

        ret = add_qgroup_item(trans, quota_root, qgroupid);
        if (ret)
                goto out;

        spin_lock(&fs_info->qgroup_lock);
        qgroup = add_qgroup_rb(fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);

        if (IS_ERR(qgroup)) {
                ret = PTR_ERR(qgroup);
                goto out;
        }
        ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
        mutex_unlock(&fs_info->qgroup_ioctl_lock);
        return ret;
}

1540
1541int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
1542{
1543        struct btrfs_fs_info *fs_info = trans->fs_info;
1544        struct btrfs_qgroup *qgroup;
1545        struct btrfs_qgroup_list *list;
1546        int ret = 0;
1547
1548        mutex_lock(&fs_info->qgroup_ioctl_lock);
1549        if (!fs_info->quota_root) {
1550                ret = -ENOTCONN;
1551                goto out;
1552        }
1553
1554        qgroup = find_qgroup_rb(fs_info, qgroupid);
1555        if (!qgroup) {
1556                ret = -ENOENT;
1557                goto out;
1558        }
1559
1560        /* Check that this qgroup has no children */
1561        if (!list_empty(&qgroup->members)) {
1562                ret = -EBUSY;
1563                goto out;
1564        }
1565
1566        ret = del_qgroup_item(trans, qgroupid);
1567        if (ret && ret != -ENOENT)
1568                goto out;
1569
1570        while (!list_empty(&qgroup->groups)) {
1571                list = list_first_entry(&qgroup->groups,
1572                                        struct btrfs_qgroup_list, next_group);
1573                ret = __del_qgroup_relation(trans, qgroupid,
1574                                            list->group->qgroupid);
1575                if (ret)
1576                        goto out;
1577        }
1578
1579        spin_lock(&fs_info->qgroup_lock);
1580        del_qgroup_rb(fs_info, qgroupid);
1581        spin_unlock(&fs_info->qgroup_lock);
1582
1583        /*
1584         * Remove the qgroup from sysfs now without holding the qgroup_lock
1585         * spinlock, since the sysfs_remove_group() function needs to take
1586         * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
1587         */
1588        btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
1589        kfree(qgroup);
1590out:
1591        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1592        return ret;
1593}
1594
1595int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
1596                       struct btrfs_qgroup_limit *limit)
1597{
1598        struct btrfs_fs_info *fs_info = trans->fs_info;
1599        struct btrfs_qgroup *qgroup;
1600        int ret = 0;
1601        /*
1602         * Sometimes we want to clear the limit on a qgroup. To support that,
1603         * treat -1 as a special value telling the kernel to clear the limit.
1604         */
1605        const u64 CLEAR_VALUE = -1;
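
        /*
         * Illustrative example (not from the original source): to clear only
         * the referenced-bytes limit, a caller would pass something like
         *
         *      struct btrfs_qgroup_limit lim = {
         *              .flags    = BTRFS_QGROUP_LIMIT_MAX_RFER,
         *              .max_rfer = (u64)-1,
         *      };
         *
         * and the CLEAR_VALUE branch below then drops the flag from lim_flags
         * and resets max_rfer to 0.
         */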
1606
1607        mutex_lock(&fs_info->qgroup_ioctl_lock);
1608        if (!fs_info->quota_root) {
1609                ret = -ENOTCONN;
1610                goto out;
1611        }
1612
1613        qgroup = find_qgroup_rb(fs_info, qgroupid);
1614        if (!qgroup) {
1615                ret = -ENOENT;
1616                goto out;
1617        }
1618
1619        spin_lock(&fs_info->qgroup_lock);
1620        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
1621                if (limit->max_rfer == CLEAR_VALUE) {
1622                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1623                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
1624                        qgroup->max_rfer = 0;
1625                } else {
1626                        qgroup->max_rfer = limit->max_rfer;
1627                }
1628        }
1629        if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
1630                if (limit->max_excl == CLEAR_VALUE) {
1631                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1632                        limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
1633                        qgroup->max_excl = 0;
1634                } else {
1635                        qgroup->max_excl = limit->max_excl;
1636                }
1637        }
1638        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
1639                if (limit->rsv_rfer == CLEAR_VALUE) {
1640                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1641                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
1642                        qgroup->rsv_rfer = 0;
1643                } else {
1644                        qgroup->rsv_rfer = limit->rsv_rfer;
1645                }
1646        }
1647        if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
1648                if (limit->rsv_excl == CLEAR_VALUE) {
1649                        qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1650                        limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
1651                        qgroup->rsv_excl = 0;
1652                } else {
1653                        qgroup->rsv_excl = limit->rsv_excl;
1654                }
1655        }
1656        qgroup->lim_flags |= limit->flags;
1657
1658        spin_unlock(&fs_info->qgroup_lock);
1659
1660        ret = update_qgroup_limit_item(trans, qgroup);
1661        if (ret) {
1662                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1663                btrfs_info(fs_info, "unable to update quota limit for %llu",
1664                       qgroupid);
1665        }
1666
1667out:
1668        mutex_unlock(&fs_info->qgroup_ioctl_lock);
1669        return ret;
1670}
1671
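/*
 * Insert @record into delayed_refs->dirty_extent_root, keyed by bytenr.
 *
 * Return 0 if @record was inserted (the tree takes ownership of it), or 1 if
 * a record for the same bytenr already exists; in that case a data
 * reservation carried by @record is transferred to the existing entry if
 * needed, and the caller must free @record itself, as
 * btrfs_qgroup_trace_extent() does.
 */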
1672int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
1673                                struct btrfs_delayed_ref_root *delayed_refs,
1674                                struct btrfs_qgroup_extent_record *record)
1675{
1676        struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1677        struct rb_node *parent_node = NULL;
1678        struct btrfs_qgroup_extent_record *entry;
1679        u64 bytenr = record->bytenr;
1680
1681        lockdep_assert_held(&delayed_refs->lock);
1682        trace_btrfs_qgroup_trace_extent(fs_info, record);
1683
1684        while (*p) {
1685                parent_node = *p;
1686                entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
1687                                 node);
1688                if (bytenr < entry->bytenr) {
1689                        p = &(*p)->rb_left;
1690                } else if (bytenr > entry->bytenr) {
1691                        p = &(*p)->rb_right;
1692                } else {
1693                        if (record->data_rsv && !entry->data_rsv) {
1694                                entry->data_rsv = record->data_rsv;
1695                                entry->data_rsv_refroot =
1696                                        record->data_rsv_refroot;
1697                        }
1698                        return 1;
1699                }
1700        }
1701
1702        rb_link_node(&record->node, parent_node, p);
1703        rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1704        return 0;
1705}
1706
1707int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
1708                                   struct btrfs_qgroup_extent_record *qrecord)
1709{
1710        struct ulist *old_root;
1711        u64 bytenr = qrecord->bytenr;
1712        int ret;
1713
1714        /*
1715         * We are always called in a context where we are already holding a
1716         * transaction handle. Often we are called when adding a data delayed
1717         * reference from btrfs_truncate_inode_items() (truncating or unlinking),
1718         * in which case we will be holding a write lock on extent buffer from a
1719         * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
1720         * acquire fs_info->commit_root_sem, because that is a higher level lock
1721         * that must be acquired before locking any extent buffers.
1722         *
1723         * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
1724         * but we can't pass it a non-NULL transaction handle, because otherwise
1725         * it would not use commit roots and would lock extent buffers, causing
1726         * a deadlock if it ends up trying to read lock the same extent buffer
1727         * that was previously write locked at btrfs_truncate_inode_items().
1728         *
1729         * So pass a NULL transaction handle to btrfs_find_all_roots() and
1730         * explicitly tell it to not acquire the commit_root_sem - if we are
1731         * holding a transaction handle we don't need its protection.
1732         */
1733        ASSERT(trans != NULL);
1734
1735        ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
1736                                   true);
1737        if (ret < 0) {
1738                trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1739                btrfs_warn(trans->fs_info,
1740"error accounting new delayed refs extent (err code: %d), quota inconsistent",
1741                        ret);
1742                return 0;
1743        }
1744
1745        /*
1746         * Here we don't need to take the lock of
1747         * trans->transaction->delayed_refs, since the inserted qrecord won't
1748         * be deleted; only qrecord->node may be modified (by a new insert).
1749         *
1750         * So modifying qrecord->old_roots is safe here
1751         */
1752        qrecord->old_roots = old_root;
1753        return 0;
1754}
1755
1756int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
1757                              u64 num_bytes, gfp_t gfp_flag)
1758{
1759        struct btrfs_fs_info *fs_info = trans->fs_info;
1760        struct btrfs_qgroup_extent_record *record;
1761        struct btrfs_delayed_ref_root *delayed_refs;
1762        int ret;
1763
1764        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
1765            || bytenr == 0 || num_bytes == 0)
1766                return 0;
1767        record = kzalloc(sizeof(*record), gfp_flag);
1768        if (!record)
1769                return -ENOMEM;
1770
1771        delayed_refs = &trans->transaction->delayed_refs;
1772        record->bytenr = bytenr;
1773        record->num_bytes = num_bytes;
1774        record->old_roots = NULL;
1775
1776        spin_lock(&delayed_refs->lock);
1777        ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
1778        spin_unlock(&delayed_refs->lock);
1779        if (ret > 0) {
1780                kfree(record);
1781                return 0;
1782        }
1783        return btrfs_qgroup_trace_extent_post(trans, record);
1784}
1785
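/*
 * Trace all file extent items in leaf @eb for qgroup accounting. Inline
 * extents and holes (disk_bytenr == 0) occupy no separate disk extent, so
 * they are skipped below.
 */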
1786int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
1787                                  struct extent_buffer *eb)
1788{
1789        struct btrfs_fs_info *fs_info = trans->fs_info;
1790        int nr = btrfs_header_nritems(eb);
1791        int i, extent_type, ret;
1792        struct btrfs_key key;
1793        struct btrfs_file_extent_item *fi;
1794        u64 bytenr, num_bytes;
1795
1796        /* We can be called directly from walk_up_proc() */
1797        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
1798                return 0;
1799
1800        for (i = 0; i < nr; i++) {
1801                btrfs_item_key_to_cpu(eb, &key, i);
1802
1803                if (key.type != BTRFS_EXTENT_DATA_KEY)
1804                        continue;
1805
1806                fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
1807                /* Filter out non-qgroup-accountable extents */
1808                extent_type = btrfs_file_extent_type(eb, fi);
1809
1810                if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1811                        continue;
1812
1813                bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
1814                if (!bytenr)
1815                        continue;
1816
1817                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
1818
1819                ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
1820                                                GFP_NOFS);
1821                if (ret)
1822                        return ret;
1823        }
1824        cond_resched();
1825        return 0;
1826}
1827
1828/*
1829 * Walk up the tree from the bottom, freeing leaves and any interior
1830 * nodes which have had all slots visited. If a node (leaf or
1831 * interior) is freed, the node above it will have its slot
1832 * incremented. The root node will never be freed.
1833 *
1834 * At the end of this function, we should have a path which has all
1835 * slots incremented to the next position for a search. If we need to
1836 * read a new node it will be NULL and the node above it will have the
1837 * correct slot selected for a later read.
1838 *
1839 * If we increment the root node's slot counter past the number of
1840 * elements, 1 is returned to signal completion of the search.
1841 */
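/*
 * For instance (an illustration, assuming root_level == 2): if the walk sits
 * on the last slot of a leaf, the leaf's slot increments past nritems, so the
 * leaf is unlocked and freed; level 1's slot is then incremented in turn, and
 * if that slot is still valid we stop there with nodes[0] == NULL, ready for
 * the caller to read in the next leaf.
 */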
1842static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
1843{
1844        int level = 0;
1845        int nr, slot;
1846        struct extent_buffer *eb;
1847
1848        if (root_level == 0)
1849                return 1;
1850
1851        while (level <= root_level) {
1852                eb = path->nodes[level];
1853                nr = btrfs_header_nritems(eb);
1854                path->slots[level]++;
1855                slot = path->slots[level];
1856                if (slot >= nr || level == 0) {
1857                        /*
1858                         * Don't free the root - we will detect this
1859                         * condition after our loop and return a
1860                         * positive value for the caller to stop walking the tree.
1861                         */
1862                        if (level != root_level) {
1863                                btrfs_tree_unlock_rw(eb, path->locks[level]);
1864                                path->locks[level] = 0;
1865
1866                                free_extent_buffer(eb);
1867                                path->nodes[level] = NULL;
1868                                path->slots[level] = 0;
1869                        }
1870                } else {
1871                        /*
1872                         * We have a valid slot to walk back down
1873                         * from. Stop here so caller can process these
1874                         * new nodes.
1875                         */
1876                        break;
1877                }
1878
1879                level++;
1880        }
1881
1882        eb = path->nodes[root_level];
1883        if (path->slots[root_level] >= btrfs_header_nritems(eb))
1884                return 1;
1885
1886        return 0;
1887}
1888
1889/*
1890 * Helper function to trace a subtree tree block swap.
1891 *
1892 * The swap will happen at the highest tree block, but there may be a lot of
1893 * tree blocks involved.
1894 *
1895 * For example:
1896 *  OO = Old tree blocks
1897 *  NN = New tree blocks allocated during balance
1898 *
1899 *           File tree (257)                  Reloc tree for 257
1900 * L2              OO                                NN
1901 *               /    \                            /    \
1902 * L1          OO      OO (a)                    OO      NN (a)
1903 *            / \     / \                       / \     / \
1904 * L0       OO   OO OO   OO                   OO   OO NN   NN
1905 *                  (b)  (c)                          (b)  (c)
1906 *
1907 * When calling qgroup_trace_extent_swap(), we will pass:
1908 * @src_eb = OO(a)
1909 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
1910 * @dst_level = 0
1911 * @root_level = 1
1912 *
1913 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
1914 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
1915 *
1916 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
1917 *
1918 * 1) Tree search from @src_eb
1919 *    It acts as a simplified btrfs_search_slot().
1920 *    The key for search can be extracted from @dst_path->nodes[dst_level]
1921 *    (first key).
1922 *
1923 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
1924 *    NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty.
1925 *    They should be marked during previous (@dst_level = 1) iteration.
1926 *
1927 * 3) Mark file extents in leaves dirty
1928 *    We don't have a good way to pick out only the new file extents,
1929 *    so we still follow the old method of scanning all file extents in
1930 *    the leaf.
1931 *
1932 * This function frees us from keeping two paths, thus later we only need
1933 * to care about how to iterate all new tree blocks in the reloc tree.
1934 */
1935static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
1936                                    struct extent_buffer *src_eb,
1937                                    struct btrfs_path *dst_path,
1938                                    int dst_level, int root_level,
1939                                    bool trace_leaf)
1940{
1941        struct btrfs_key key;
1942        struct btrfs_path *src_path;
1943        struct btrfs_fs_info *fs_info = trans->fs_info;
1944        u32 nodesize = fs_info->nodesize;
1945        int cur_level = root_level;
1946        int ret;
1947
1948        BUG_ON(dst_level > root_level);
1949        /* Level mismatch */
1950        if (btrfs_header_level(src_eb) != root_level)
1951                return -EINVAL;
1952
1953        src_path = btrfs_alloc_path();
1954        if (!src_path) {
1955                ret = -ENOMEM;
1956                goto out;
1957        }
1958
1959        if (dst_level)
1960                btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1961        else
1962                btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
1963
1964        /* For src_path */
1965        atomic_inc(&src_eb->refs);
1966        src_path->nodes[root_level] = src_eb;
1967        src_path->slots[root_level] = dst_path->slots[root_level];
1968        src_path->locks[root_level] = 0;
1969
1970        /* A simplified version of btrfs_search_slot() */
1971        while (cur_level >= dst_level) {
1972                struct btrfs_key src_key;
1973                struct btrfs_key dst_key;
1974
1975                if (src_path->nodes[cur_level] == NULL) {
1976                        struct extent_buffer *eb;
1977                        int parent_slot;
1978
1979                        eb = src_path->nodes[cur_level + 1];
1980                        parent_slot = src_path->slots[cur_level + 1];
1981
1982                        eb = btrfs_read_node_slot(eb, parent_slot);
1983                        if (IS_ERR(eb)) {
1984                                ret = PTR_ERR(eb);
1985                                goto out;
1986                        }
1987
1988                        src_path->nodes[cur_level] = eb;
1989
1990                        btrfs_tree_read_lock(eb);
1991                        src_path->locks[cur_level] = BTRFS_READ_LOCK;
1992                }
1993
1994                src_path->slots[cur_level] = dst_path->slots[cur_level];
1995                if (cur_level) {
1996                        btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
1997                                        &dst_key, dst_path->slots[cur_level]);
1998                        btrfs_node_key_to_cpu(src_path->nodes[cur_level],
1999                                        &src_key, src_path->slots[cur_level]);
2000                } else {
2001                        btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
2002                                        &dst_key, dst_path->slots[cur_level]);
2003                        btrfs_item_key_to_cpu(src_path->nodes[cur_level],
2004                                        &src_key, src_path->slots[cur_level]);
2005                }
2006                /* Content mismatch, something went wrong */
2007                if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
2008                        ret = -ENOENT;
2009                        goto out;
2010                }
2011                cur_level--;
2012        }
2013
2014        /*
2015         * Now both @dst_path and @src_path have been populated, record the tree
2016         * blocks for qgroup accounting.
2017         */
2018        ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
2019                        nodesize, GFP_NOFS);
2020        if (ret < 0)
2021                goto out;
2022        ret = btrfs_qgroup_trace_extent(trans,
2023                        dst_path->nodes[dst_level]->start,
2024                        nodesize, GFP_NOFS);
2025        if (ret < 0)
2026                goto out;
2027
2028        /* Record leaf file extents */
2029        if (dst_level == 0 && trace_leaf) {
2030                ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
2031                if (ret < 0)
2032                        goto out;
2033                ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
2034        }
2035out:
2036        btrfs_free_path(src_path);
2037        return ret;
2038}
2039
2040/*
2041 * Helper function to do a recursive generation-aware depth-first search, to
2042 * locate all new tree blocks in a subtree of reloc tree.
2043 *
2044 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
2045 *         reloc tree
2046 * L2         NN (a)
2047 *          /    \
2048 * L1    OO        NN (b)
2049 *      /  \      /  \
2050 * L0  OO  OO    OO  NN
2051 *               (c) (d)
2052 * If we pass:
2053 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
2054 * @cur_level = 1
2055 * @root_level = 1
2056 *
2057 * We will iterate through tree blocks NN(b) and NN(d) and inform qgroups to
2058 * trace those tree blocks along with their counterparts in the file tree.
2059 * During the search, old tree blocks like OO(c) are skipped, as a tree block
2060 * swap won't affect OO(c).
2061 */
2062static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans,
2063                                           struct extent_buffer *src_eb,
2064                                           struct btrfs_path *dst_path,
2065                                           int cur_level, int root_level,
2066                                           u64 last_snapshot, bool trace_leaf)
2067{
2068        struct btrfs_fs_info *fs_info = trans->fs_info;
2069        struct extent_buffer *eb;
2070        bool need_cleanup = false;
2071        int ret = 0;
2072        int i;
2073
2074        /* Level sanity check */
2075        if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
2076            root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
2077            root_level < cur_level) {
2078                btrfs_err_rl(fs_info,
2079                        "%s: bad levels, cur_level=%d root_level=%d",
2080                        __func__, cur_level, root_level);
2081                return -EUCLEAN;
2082        }
2083
2084        /* Read the tree block if needed */
2085        if (dst_path->nodes[cur_level] == NULL) {
2086                int parent_slot;
2087                u64 child_gen;
2088
2089                /*
2090                 * dst_path->nodes[root_level] must be initialized before
2091                 * calling this function.
2092                 */
2093                if (cur_level == root_level) {
2094                        btrfs_err_rl(fs_info,
2095        "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
2096                                __func__, root_level, root_level, cur_level);
2097                        return -EUCLEAN;
2098                }
2099
2100                /*
2101                 * We need to get child blockptr/gen from parent before we can
2102                 * read it.
2103                 */
2104                eb = dst_path->nodes[cur_level + 1];
2105                parent_slot = dst_path->slots[cur_level + 1];
2106                child_gen = btrfs_node_ptr_generation(eb, parent_slot);
2107
2108                /* This node is old, no need to trace */
2109                if (child_gen < last_snapshot)
2110                        goto out;
2111
2112                eb = btrfs_read_node_slot(eb, parent_slot);
2113                if (IS_ERR(eb)) {
2114                        ret = PTR_ERR(eb);
2115                        goto out;
2116                }
2117
2118                dst_path->nodes[cur_level] = eb;
2119                dst_path->slots[cur_level] = 0;
2120
2121                btrfs_tree_read_lock(eb);
2122                dst_path->locks[cur_level] = BTRFS_READ_LOCK;
2123                need_cleanup = true;
2124        }
2125
2126        /* Now record this tree block and its counterpart for qgroups */
2127        ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
2128                                       root_level, trace_leaf);
2129        if (ret < 0)
2130                goto cleanup;
2131
2132        eb = dst_path->nodes[cur_level];
2133
2134        if (cur_level > 0) {
2135                /* Iterate all child tree blocks */
2136                for (i = 0; i < btrfs_header_nritems(eb); i++) {
2137                        /* Skip old tree blocks as they won't be swapped */
2138                        if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
2139                                continue;
2140                        dst_path->slots[cur_level] = i;
2141
2142                        /* Recursive call (at most 7 times) */
2143                        ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
2144                                        dst_path, cur_level - 1, root_level,
2145                                        last_snapshot, trace_leaf);
2146                        if (ret < 0)
2147                                goto cleanup;
2148                }
2149        }
2150
2151cleanup:
2152        if (need_cleanup) {
2153                /* Clean up */
2154                btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
2155                                     dst_path->locks[cur_level]);
2156                free_extent_buffer(dst_path->nodes[cur_level]);
2157                dst_path->nodes[cur_level] = NULL;
2158                dst_path->slots[cur_level] = 0;
2159                dst_path->locks[cur_level] = 0;
2160        }
2161out:
2162        return ret;
2163}
2164
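/*
 * Trace both subtrees involved in a tree block swap done by relocation,
 * where @src_eb is the subtree root in the file tree and @dst_eb its
 * counterpart in the reloc tree (see the diagram above
 * qgroup_trace_extent_swap()). Only blocks newer than @last_snapshot can
 * have been swapped, so only those are walked by
 * qgroup_trace_new_subtree_blocks().
 */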
2165static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
2166                                struct extent_buffer *src_eb,
2167                                struct extent_buffer *dst_eb,
2168                                u64 last_snapshot, bool trace_leaf)
2169{
2170        struct btrfs_fs_info *fs_info = trans->fs_info;
2171        struct btrfs_path *dst_path = NULL;
2172        int level;
2173        int ret;
2174
2175        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2176                return 0;
2177
2178        /* Wrong parameter order */
2179        if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
2180                btrfs_err_rl(fs_info,
2181                "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
2182                             btrfs_header_generation(src_eb),
2183                             btrfs_header_generation(dst_eb));
2184                return -EUCLEAN;
2185        }
2186
2187        if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
2188                ret = -EIO;
2189                goto out;
2190        }
2191
2192        level = btrfs_header_level(dst_eb);
2193        dst_path = btrfs_alloc_path();
2194        if (!dst_path) {
2195                ret = -ENOMEM;
2196                goto out;
2197        }
2198        /* For dst_path */
2199        atomic_inc(&dst_eb->refs);
2200        dst_path->nodes[level] = dst_eb;
2201        dst_path->slots[level] = 0;
2202        dst_path->locks[level] = 0;
2203
2204        /* Do the generation-aware depth-first search */
2205        ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
2206                                              level, last_snapshot, trace_leaf);
2207        if (ret < 0)
2208                goto out;
2209        ret = 0;
2210
2211out:
2212        btrfs_free_path(dst_path);
2213        if (ret < 0)
2214                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2215        return ret;
2216}
2217
2218int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
2219                               struct extent_buffer *root_eb,
2220                               u64 root_gen, int root_level)
2221{
2222        struct btrfs_fs_info *fs_info = trans->fs_info;
2223        int ret = 0;
2224        int level;
2225        struct extent_buffer *eb = root_eb;
2226        struct btrfs_path *path = NULL;
2227
2228        BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
2229        BUG_ON(root_eb == NULL);
2230
2231        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2232                return 0;
2233
2234        if (!extent_buffer_uptodate(root_eb)) {
2235                ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
2236                if (ret)
2237                        goto out;
2238        }
2239
2240        if (root_level == 0) {
2241                ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
2242                goto out;
2243        }
2244
2245        path = btrfs_alloc_path();
2246        if (!path)
2247                return -ENOMEM;
2248
2249        /*
2250         * Walk down the tree.  Missing extent blocks are filled in as
2251         * we go. Metadata is accounted every time we read a new
2252         * extent block.
2253         *
2254         * When we reach a leaf, we account for file extent items in it,
2255         * walk back up the tree (adjusting slot pointers as we go)
2256         * and restart the search process.
2257         */
2258        atomic_inc(&root_eb->refs);     /* For path */
2259        path->nodes[root_level] = root_eb;
2260        path->slots[root_level] = 0;
2261        path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
2262walk_down:
2263        level = root_level;
2264        while (level >= 0) {
2265                if (path->nodes[level] == NULL) {
2266                        int parent_slot;
2267                        u64 child_bytenr;
2268
2269                        /*
2270                         * We need to get child blockptr from parent before we
2271                         * can read it.
2272                         */
2273                        eb = path->nodes[level + 1];
2274                        parent_slot = path->slots[level + 1];
2275                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
2276
2277                        eb = btrfs_read_node_slot(eb, parent_slot);
2278                        if (IS_ERR(eb)) {
2279                                ret = PTR_ERR(eb);
2280                                goto out;
2281                        }
2282
2283                        path->nodes[level] = eb;
2284                        path->slots[level] = 0;
2285
2286                        btrfs_tree_read_lock(eb);
2287                        path->locks[level] = BTRFS_READ_LOCK;
2288
2289                        ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
2290                                                        fs_info->nodesize,
2291                                                        GFP_NOFS);
2292                        if (ret)
2293                                goto out;
2294                }
2295
2296                if (level == 0) {
2297                        ret = btrfs_qgroup_trace_leaf_items(trans,
2298                                                            path->nodes[level]);
2299                        if (ret)
2300                                goto out;
2301
2302                        /* Nonzero return here means we completed our search */
2303                        ret = adjust_slots_upwards(path, root_level);
2304                        if (ret)
2305                                break;
2306
2307                        /* Restart search with new slots */
2308                        goto walk_down;
2309                }
2310
2311                level--;
2312        }
2313
2314        ret = 0;
2315out:
2316        btrfs_free_path(path);
2317
2318        return ret;
2319}
2320
2321#define UPDATE_NEW      0
2322#define UPDATE_OLD      1
2323/*
2324 * Walk all of the roots that point to the bytenr and adjust their refcnts.
2325 */
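/*
 * For example (an illustrative hierarchy): with qgroup 0/257 assigned to
 * 1/100, a @roots entry of 257 bumps the refcnt of 0/257 and then, via the
 * next_group lists, of 1/100 and any higher-level parents. The @tmp ulist
 * ensures each qgroup is visited only once per root even when parent chains
 * converge.
 */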
2326static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
2327                                struct ulist *roots, struct ulist *tmp,
2328                                struct ulist *qgroups, u64 seq, int update_old)
2329{
2330        struct ulist_node *unode;
2331        struct ulist_iterator uiter;
2332        struct ulist_node *tmp_unode;
2333        struct ulist_iterator tmp_uiter;
2334        struct btrfs_qgroup *qg;
2335        int ret = 0;
2336
2337        if (!roots)
2338                return 0;
2339        ULIST_ITER_INIT(&uiter);
2340        while ((unode = ulist_next(roots, &uiter))) {
2341                qg = find_qgroup_rb(fs_info, unode->val);
2342                if (!qg)
2343                        continue;
2344
2345                ulist_reinit(tmp);
2346                ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg),
2347                                GFP_ATOMIC);
2348                if (ret < 0)
2349                        return ret;
2350                ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC);
2351                if (ret < 0)
2352                        return ret;
2353                ULIST_ITER_INIT(&tmp_uiter);
2354                while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
2355                        struct btrfs_qgroup_list *glist;
2356
2357                        qg = unode_aux_to_qgroup(tmp_unode);
2358                        if (update_old)
2359                                btrfs_qgroup_update_old_refcnt(qg, seq, 1);
2360                        else
2361                                btrfs_qgroup_update_new_refcnt(qg, seq, 1);
2362                        list_for_each_entry(glist, &qg->groups, next_group) {
2363                                ret = ulist_add(qgroups, glist->group->qgroupid,
2364                                                qgroup_to_aux(glist->group),
2365                                                GFP_ATOMIC);
2366                                if (ret < 0)
2367                                        return ret;
2368                                ret = ulist_add(tmp, glist->group->qgroupid,
2369                                                qgroup_to_aux(glist->group),
2370                                                GFP_ATOMIC);
2371                                if (ret < 0)
2372                                        return ret;
2373                        }
2374                }
2375        }
2376        return 0;
2377}
2378
2379/*
2380 * Update qgroup rfer/excl counters.
2381 * Rfer update is easy, the code explains itself.
2382 *
2383 * Excl update is tricky, the update is split into 2 parts.
2384 * Part 1: Possible exclusive <-> sharing detect:
2385 *      |       A       |       !A      |
2386 *  -------------------------------------
2387 *  B   |       *       |       -       |
2388 *  -------------------------------------
2389 *  !B  |       +       |       **      |
2390 *  -------------------------------------
2391 *
2392 * Conditions:
2393 * A:   cur_old_roots < nr_old_roots    (not exclusive before)
2394 * !A:  cur_old_roots == nr_old_roots   (possible exclusive before)
2395 * B:   cur_new_roots < nr_new_roots    (not exclusive now)
2396 * !B:  cur_new_roots == nr_new_roots   (possible exclusive now)
2397 *
2398 * Results:
2399 * +: Possible sharing -> exclusive     -: Possible exclusive -> sharing
2400 * *: Definitely not changed.           **: Possible unchanged.
2401 *
2402 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0 case.
2403 *
2404 * To make the logic clear, we first use conditions A and B to split the
2405 * combinations into 4 results.
2406 *
2407 * Then, for results "+" and "-", check the old/new_roots == 0 case; in
2408 * those cases only one variant may be 0.
2409 *
2410 * Lastly, check result **. Since two variants may be 0 there, split it
2411 * again (2x2).
2412 * This time we don't need to consider anything else; the code and logic
2413 * are easy to follow now.
2414 */
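/*
 * Worked example (illustrative): an extent exclusive to subvolume 257 gets
 * reflinked into 258, so nr_old_roots == 1 and nr_new_roots == 2. For qgroup
 * 0/257, cur_old_count == nr_old_roots (!A) and cur_new_count < nr_new_roots
 * (B), the "-" cell: its excl drops by num_bytes while rfer is unchanged.
 * For qgroup 0/258, cur_old_count == 0 (A) and cur_new_count == 1 (B), the
 * "*" cell: excl is untouched, but since the extent is newly referenced its
 * rfer grows by num_bytes.
 */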
2415static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
2416                                  struct ulist *qgroups,
2417                                  u64 nr_old_roots,
2418                                  u64 nr_new_roots,
2419                                  u64 num_bytes, u64 seq)
2420{
2421        struct ulist_node *unode;
2422        struct ulist_iterator uiter;
2423        struct btrfs_qgroup *qg;
2424        u64 cur_new_count, cur_old_count;
2425
2426        ULIST_ITER_INIT(&uiter);
2427        while ((unode = ulist_next(qgroups, &uiter))) {
2428                bool dirty = false;
2429
2430                qg = unode_aux_to_qgroup(unode);
2431                cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
2432                cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
2433
2434                trace_qgroup_update_counters(fs_info, qg, cur_old_count,
2435                                             cur_new_count);
2436
2437                /* Rfer update part */
2438                if (cur_old_count == 0 && cur_new_count > 0) {
2439                        qg->rfer += num_bytes;
2440                        qg->rfer_cmpr += num_bytes;
2441                        dirty = true;
2442                }
2443                if (cur_old_count > 0 && cur_new_count == 0) {
2444                        qg->rfer -= num_bytes;
2445                        qg->rfer_cmpr -= num_bytes;
2446                        dirty = true;
2447                }
2448
2449                /* Excl update part */
2450                /* Exclusive/none -> shared case */
2451                if (cur_old_count == nr_old_roots &&
2452                    cur_new_count < nr_new_roots) {
2453                        /* Exclusive -> shared */
2454                        if (cur_old_count != 0) {
2455                                qg->excl -= num_bytes;
2456                                qg->excl_cmpr -= num_bytes;
2457                                dirty = true;
2458                        }
2459                }
2460
2461                /* Shared -> exclusive/none case */
2462                if (cur_old_count < nr_old_roots &&
2463                    cur_new_count == nr_new_roots) {
2464                        /* Shared->exclusive */
2465                        if (cur_new_count != 0) {
2466                                qg->excl += num_bytes;
2467                                qg->excl_cmpr += num_bytes;
2468                                dirty = true;
2469                        }
2470                }
2471
2472                /* Exclusive/none -> exclusive/none case */
2473                if (cur_old_count == nr_old_roots &&
2474                    cur_new_count == nr_new_roots) {
2475                        if (cur_old_count == 0) {
2476                                /* None -> exclusive/none */
2477
2478                                if (cur_new_count != 0) {
2479                                        /* None -> exclusive */
2480                                        qg->excl += num_bytes;
2481                                        qg->excl_cmpr += num_bytes;
2482                                        dirty = true;
2483                                }
2484                                /* None -> none, nothing changed */
2485                        } else {
2486                                /* Exclusive -> exclusive/none */
2487
2488                                if (cur_new_count == 0) {
2489                                        /* Exclusive -> none */
2490                                        qg->excl -= num_bytes;
2491                                        qg->excl_cmpr -= num_bytes;
2492                                        dirty = true;
2493                                }
2494                                /* Exclusive -> exclusive, nothing changed */
2495                        }
2496                }
2497
2498                if (dirty)
2499                        qgroup_dirty(fs_info, qg);
2500        }
2501        return 0;
2502}
2503
2504/*
2505 * Check if @roots could be a list of fs tree roots
2506 *
2507 * Return 0 if the ulist definitely contains no fs/subvol tree roots
2508 * Return 1 if the list may contain fs/subvol tree roots (an empty list
2509 *          counts as well)
2510 */
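/*
 * E.g. (illustrative): a ulist whose first entry is the extent tree's
 * objectid returns 0, while one starting with a subvolume id such as 257
 * returns 1.
 */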
2511static int maybe_fs_roots(struct ulist *roots)
2512{
2513        struct ulist_node *unode;
2514        struct ulist_iterator uiter;
2515
2516        /* Empty one, still possible for fs roots */
2517        if (!roots || roots->nnodes == 0)
2518                return 1;
2519
2520        ULIST_ITER_INIT(&uiter);
2521        unode = ulist_next(roots, &uiter);
2522        if (!unode)
2523                return 1;
2524
2525        /*
2526         * If it contains fs tree roots, then it must belong to fs/subvol
2527         * trees.
2528         * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
2529         */
2530        return is_fstree(unode->val);
2531}
2532
2533int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
2534                                u64 num_bytes, struct ulist *old_roots,
2535                                struct ulist *new_roots)
2536{
2537        struct btrfs_fs_info *fs_info = trans->fs_info;
2538        struct ulist *qgroups = NULL;
2539        struct ulist *tmp = NULL;
2540        u64 seq;
2541        u64 nr_new_roots = 0;
2542        u64 nr_old_roots = 0;
2543        int ret = 0;
2544
2545        /*
2546         * If quotas get disabled meanwhile, the resources need to be freed and
2547         * we can't just exit here.
2548         */
2549        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2550                goto out_free;
2551
2552        if (new_roots) {
2553                if (!maybe_fs_roots(new_roots))
2554                        goto out_free;
2555                nr_new_roots = new_roots->nnodes;
2556        }
2557        if (old_roots) {
2558                if (!maybe_fs_roots(old_roots))
2559                        goto out_free;
2560                nr_old_roots = old_roots->nnodes;
2561        }
2562
2563        /* Quick exit, either not fs tree roots, or won't affect any qgroup */
2564        if (nr_old_roots == 0 && nr_new_roots == 0)
2565                goto out_free;
2566
2567        BUG_ON(!fs_info->quota_root);
2568
2569        trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
2570                                        num_bytes, nr_old_roots, nr_new_roots);
2571
2572        qgroups = ulist_alloc(GFP_NOFS);
2573        if (!qgroups) {
2574                ret = -ENOMEM;
2575                goto out_free;
2576        }
2577        tmp = ulist_alloc(GFP_NOFS);
2578        if (!tmp) {
2579                ret = -ENOMEM;
2580                goto out_free;
2581        }
2582
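        /*
         * If a rescan is in progress and has not passed this bytenr yet,
         * skip accounting it here; the rescan worker will account the extent
         * once its progress reaches it.
         */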
2583        mutex_lock(&fs_info->qgroup_rescan_lock);
2584        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
2585                if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
2586                        mutex_unlock(&fs_info->qgroup_rescan_lock);
2587                        ret = 0;
2588                        goto out_free;
2589                }
2590        }
2591        mutex_unlock(&fs_info->qgroup_rescan_lock);
2592
2593        spin_lock(&fs_info->qgroup_lock);
2594        seq = fs_info->qgroup_seq;
2595
2596        /* Update old refcnts using old_roots */
2597        ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
2598                                   UPDATE_OLD);
2599        if (ret < 0)
2600                goto out;
2601
2602        /* Update new refcnts using new_roots */
2603        ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
2604                                   UPDATE_NEW);
2605        if (ret < 0)
2606                goto out;
2607
2608        qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
2609                               num_bytes, seq);
2610
2611        /*
2612         * Bump qgroup_seq past the refcnts recorded above to avoid seq overlap.
2613         */
2614        fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
2615out:
2616        spin_unlock(&fs_info->qgroup_lock);
2617out_free:
2618        ulist_free(tmp);
2619        ulist_free(qgroups);
2620        ulist_free(old_roots);
2621        ulist_free(new_roots);
2622        return ret;
2623}
2624
2625int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
2626{
2627        struct btrfs_fs_info *fs_info = trans->fs_info;
2628        struct btrfs_qgroup_extent_record *record;
2629        struct btrfs_delayed_ref_root *delayed_refs;
2630        struct ulist *new_roots = NULL;
2631        struct rb_node *node;
2632        u64 num_dirty_extents = 0;
2633        u64 qgroup_to_skip;
2634        int ret = 0;
2635
2636        delayed_refs = &trans->transaction->delayed_refs;
2637        qgroup_to_skip = delayed_refs->qgroup_to_skip;
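        /*
         * When nonzero, qgroup_to_skip names a root whose references must be
         * left out of both the old and new root sets for this transaction
         * (see the ulist_del() calls below).
         */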
2638        while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
2639                record = rb_entry(node, struct btrfs_qgroup_extent_record,
2640                                  node);
2641
2642                num_dirty_extents++;
2643                trace_btrfs_qgroup_account_extents(fs_info, record);
2644
2645                if (!ret) {
2646                        /*
2647                         * Old roots should have been searched when the
2648                         * qgroup extent record was inserted
2649                         */
2650                        if (WARN_ON(!record->old_roots)) {
2651                                /* Search commit root to find old_roots */
2652                                ret = btrfs_find_all_roots(NULL, fs_info,
2653                                                record->bytenr, 0,
2654                                                &record->old_roots, false);
2655                                if (ret < 0)
2656                                        goto cleanup;
2657                        }
2658
2659                        /* Free the reserved data space */
2660                        btrfs_qgroup_free_refroot(fs_info,
2661                                        record->data_rsv_refroot,
2662                                        record->data_rsv,
2663                                        BTRFS_QGROUP_RSV_DATA);
2664                        /*
2665                         * Use BTRFS_SEQ_LAST as time_seq to do a special search
2666                         * which doesn't lock the tree or delayed_refs and searches
2667                         * the current root. It's safe inside commit_transaction().
2668                         */
2669                        ret = btrfs_find_all_roots(trans, fs_info,
2670                           record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
2671                        if (ret < 0)
2672                                goto cleanup;
2673                        if (qgroup_to_skip) {
2674                                ulist_del(new_roots, qgroup_to_skip, 0);
2675                                ulist_del(record->old_roots, qgroup_to_skip,
2676                                          0);
2677                        }
2678                        ret = btrfs_qgroup_account_extent(trans, record->bytenr,
2679                                                          record->num_bytes,
2680                                                          record->old_roots,
2681                                                          new_roots);
2682                        record->old_roots = NULL;
2683                        new_roots = NULL;
2684                }
2685cleanup:
2686                ulist_free(record->old_roots);
2687                ulist_free(new_roots);
2688                new_roots = NULL;
2689                rb_erase(node, &delayed_refs->dirty_extent_root);
2690                kfree(record);
2691
2692        }
2693        trace_qgroup_num_dirty_extents(fs_info, trans->transid,
2694                                       num_dirty_extents);
2695        return ret;
2696}
2697
2698/*
2699 * Called from commit_transaction(). Writes all changed qgroups to disk.
2700 */
2701int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
2702{
2703        struct btrfs_fs_info *fs_info = trans->fs_info;
2704        int ret = 0;
2705
2706        if (!fs_info->quota_root)
2707                return ret;
2708
2709        spin_lock(&fs_info->qgroup_lock);
2710        while (!list_empty(&fs_info->dirty_qgroups)) {
2711                struct btrfs_qgroup *qgroup;
2712                qgroup = list_first_entry(&fs_info->dirty_qgroups,
2713                                          struct btrfs_qgroup, dirty);
2714                list_del_init(&qgroup->dirty);
2715                spin_unlock(&fs_info->qgroup_lock);
2716                ret = update_qgroup_info_item(trans, qgroup);
2717                if (ret)
2718                        fs_info->qgroup_flags |=
2719                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2720                ret = update_qgroup_limit_item(trans, qgroup);
2721                if (ret)
2722                        fs_info->qgroup_flags |=
2723                                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2724                spin_lock(&fs_info->qgroup_lock);
2725        }
2726        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2727                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
2728        else
2729                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
2730        spin_unlock(&fs_info->qgroup_lock);
2731
2732        ret = update_qgroup_status_item(trans);
2733        if (ret)
2734                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2735
2736        return ret;
2737}
2738
2739/*
2740 * Copy the accounting information between qgroups. This is necessary
2741 * when a snapshot or a subvolume is created. Throwing an error will
2742 * cause a transaction abort, so we take extra care here to only error
2743 * when a readonly fs is a reasonable outcome.
2744 */
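/*
 * Layout note (inferred from the walks below): the variable-length u64 array
 * following struct btrfs_qgroup_inherit holds, in order,
 *
 *      num_qgroups       qgroupids the new qgroup is assigned to
 *      num_ref_copies    (src, dst) qgroupid pairs for rfer copies
 *      num_excl_copies   (src, dst) qgroupid pairs for excl copies
 *
 * hence nums = num_qgroups + 2 * num_ref_copies + 2 * num_excl_copies.
 */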
2745int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
2746                         u64 objectid, struct btrfs_qgroup_inherit *inherit)
2747{
2748        int ret = 0;
2749        int i;
2750        u64 *i_qgroups;
2751        bool committing = false;
2752        struct btrfs_fs_info *fs_info = trans->fs_info;
2753        struct btrfs_root *quota_root;
2754        struct btrfs_qgroup *srcgroup;
2755        struct btrfs_qgroup *dstgroup;
2756        bool need_rescan = false;
2757        u32 level_size = 0;
2758        u64 nums;
2759
2760        /*
2761         * There are only two callers of this function.
2762         *
2763         * One in create_subvol() in the ioctl context, which needs to hold
2764         * the qgroup_ioctl_lock.
2765         *
2766         * The other one in create_pending_snapshot(), where no other qgroup
2767         * code can modify the fs since everyone must either start a new trans
2768         * or hold a trans handle, thus we don't need to hold the
2769         * qgroup_ioctl_lock.
2770         * This avoids a long and complex lock chain and makes lockdep happy.
2771         */
2772        spin_lock(&fs_info->trans_lock);
2773        if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
2774                committing = true;
2775        spin_unlock(&fs_info->trans_lock);
2776
2777        if (!committing)
2778                mutex_lock(&fs_info->qgroup_ioctl_lock);
2779        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
2780                goto out;
2781
2782        quota_root = fs_info->quota_root;
2783        if (!quota_root) {
2784                ret = -EINVAL;
2785                goto out;
2786        }
2787
2788        if (inherit) {
2789                i_qgroups = (u64 *)(inherit + 1);
2790                nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
2791                       2 * inherit->num_excl_copies;
2792                for (i = 0; i < nums; ++i) {
2793                        srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
2794
2795                        /*
2796                         * Zero out invalid groups so we can ignore
2797                         * them later.
2798                         */
2799                        if (!srcgroup ||
2800                            ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
2801                                *i_qgroups = 0ULL;
2802
2803                        ++i_qgroups;
2804                }
2805        }
2806
2807        /*
2808         * create a tracking group for the subvol itself
2809         */
2810        ret = add_qgroup_item(trans, quota_root, objectid);
2811        if (ret)
2812                goto out;
2813
2814        /*
2815         * add qgroup to all inherited groups
2816         */
2817        if (inherit) {
2818                i_qgroups = (u64 *)(inherit + 1);
2819                for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
2820                        if (*i_qgroups == 0)
2821                                continue;
2822                        ret = add_qgroup_relation_item(trans, objectid,
2823                                                       *i_qgroups);
2824                        if (ret && ret != -EEXIST)
2825                                goto out;
2826                        ret = add_qgroup_relation_item(trans, *i_qgroups,
2827                                                       objectid);
2828                        if (ret && ret != -EEXIST)
2829                                goto out;
2830                }
2831                ret = 0;
2832        }
2833
2834
2835        spin_lock(&fs_info->qgroup_lock);
2836
2837        dstgroup = add_qgroup_rb(fs_info, objectid);
2838        if (IS_ERR(dstgroup)) {
2839                ret = PTR_ERR(dstgroup);
2840                goto unlock;
2841        }
2842
2843        if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2844                dstgroup->lim_flags = inherit->lim.flags;
2845                dstgroup->max_rfer = inherit->lim.max_rfer;
2846                dstgroup->max_excl = inherit->lim.max_excl;
2847                dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2848                dstgroup->rsv_excl = inherit->lim.rsv_excl;
2849
2850                ret = update_qgroup_limit_item(trans, dstgroup);
2851                if (ret) {
2852                        fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2853                        btrfs_info(fs_info,
2854                                   "unable to update quota limit for %llu",
2855                                   dstgroup->qgroupid);
2856                        goto unlock;
2857                }
2858        }
2859
2860        if (srcid) {
2861                srcgroup = find_qgroup_rb(fs_info, srcid);
2862                if (!srcgroup)
2863                        goto unlock;
2864
2865                /*
2866                 * We call inherit after we clone the root in order to make sure
2867                 * our counts don't go crazy, so at this point the only
2868                 * difference between the two roots should be the root node.
2869                 */
2870                level_size = fs_info->nodesize;
2871                dstgroup->rfer = srcgroup->rfer;
2872                dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
2873                dstgroup->excl = level_size;
2874                dstgroup->excl_cmpr = level_size;
2875                srcgroup->excl = level_size;
2876                srcgroup->excl_cmpr = level_size;
2877
2878                /* inherit the limit info */
2879                dstgroup->lim_flags = srcgroup->lim_flags;
2880                dstgroup->max_rfer = srcgroup->max_rfer;
2881                dstgroup->max_excl = srcgroup->max_excl;
2882                dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2883                dstgroup->rsv_excl = srcgroup->rsv_excl;
2884
2885                qgroup_dirty(fs_info, dstgroup);
2886                qgroup_dirty(fs_info, srcgroup);
2887        }
2888
2889        if (!inherit)
2890                goto unlock;
2891
2892        i_qgroups = (u64 *)(inherit + 1);
2893        for (i = 0; i < inherit->num_qgroups; ++i) {
2894                if (*i_qgroups) {
2895                        ret = add_relation_rb(fs_info, objectid, *i_qgroups);
2896                        if (ret)
2897                                goto unlock;
2898                }
2899                ++i_qgroups;
2900
2901                /*
2902                 * If we're doing a snapshot, and adding the snapshot to a new
2903                 * qgroup, the numbers are guaranteed to be incorrect.
2904                 */
2905                if (srcid)
2906                        need_rescan = true;
2907        }
2908
2909        for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
2910                struct btrfs_qgroup *src;
2911                struct btrfs_qgroup *dst;
2912
2913                if (!i_qgroups[0] || !i_qgroups[1])
2914                        continue;
2915
2916                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2917                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2918
2919                if (!src || !dst) {
2920                        ret = -EINVAL;
2921                        goto unlock;
2922                }
2923
2924                dst->rfer = src->rfer - level_size;
2925                dst->rfer_cmpr = src->rfer_cmpr - level_size;
2926
2927                /* Manually tweaking numbers certainly needs a rescan */
2928                need_rescan = true;
2929        }
2930        for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
2931                struct btrfs_qgroup *src;
2932                struct btrfs_qgroup *dst;
2933
2934                if (!i_qgroups[0] || !i_qgroups[1])
2935                        continue;
2936
2937                src = find_qgroup_rb(fs_info, i_qgroups[0]);
2938                dst = find_qgroup_rb(fs_info, i_qgroups[1]);
2939
2940                if (!src || !dst) {
2941                        ret = -EINVAL;
2942                        goto unlock;
2943                }
2944
2945                dst->excl = src->excl + level_size;
2946                dst->excl_cmpr = src->excl_cmpr + level_size;
2947                need_rescan = true;
2948        }
2949
2950unlock:
2951        spin_unlock(&fs_info->qgroup_lock);
2952        if (!ret)
2953                ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
2954out:
2955        if (!committing)
2956                mutex_unlock(&fs_info->qgroup_ioctl_lock);
2957        if (need_rescan)
2958                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2959        return ret;
2960}
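/*
 * Illustrative call site (a sketch; the real caller lives outside this file):
 * snapshot creation passes the source subvolume id as @srcid so the counters
 * above can be cloned from the source qgroup:
 *
 *	ret = btrfs_qgroup_inherit(trans, root->root_key.objectid,
 *				   objectid, inherit);
 */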
2961
2962static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
2963{
2964        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
2965            qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
2966                return false;
2967
2968        if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
2969            qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
2970                return false;
2971
2972        return true;
2973}
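/*
 * Worked example (illustrative): with max_rfer = 1M, rfer = 900K and 100K
 * already tracked by qgroup_rsv_total(), a further 64K request gives
 * 100K + 900K + 64K > 1M, so the check above fails and qgroup_reserve()
 * below returns -EDQUOT to the caller.
 */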
2974
2975static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
2976                          enum btrfs_qgroup_rsv_type type)
2977{
2978        struct btrfs_qgroup *qgroup;
2979        struct btrfs_fs_info *fs_info = root->fs_info;
2980        u64 ref_root = root->root_key.objectid;
2981        int ret = 0;
2982        struct ulist_node *unode;
2983        struct ulist_iterator uiter;
2984
2985        if (!is_fstree(ref_root))
2986                return 0;
2987
2988        if (num_bytes == 0)
2989                return 0;
2990
2991        if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
2992            capable(CAP_SYS_RESOURCE))
2993                enforce = false;
2994
2995        spin_lock(&fs_info->qgroup_lock);
2996        if (!fs_info->quota_root)
2997                goto out;
2998
2999        qgroup = find_qgroup_rb(fs_info, ref_root);
3000        if (!qgroup)
3001                goto out;
3002
3003        /*
3004         * In the first step, check all affected qgroups to see whether any
3005         * limit would be exceeded.
3006         */
3007        ulist_reinit(fs_info->qgroup_ulist);
3008        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3009                        qgroup_to_aux(qgroup), GFP_ATOMIC);
3010        if (ret < 0)
3011                goto out;
3012        ULIST_ITER_INIT(&uiter);
3013        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3014                struct btrfs_qgroup *qg;
3015                struct btrfs_qgroup_list *glist;
3016
3017                qg = unode_aux_to_qgroup(unode);
3018
3019                if (enforce && !qgroup_check_limits(qg, num_bytes)) {
3020                        ret = -EDQUOT;
3021                        goto out;
3022                }
3023
3024                list_for_each_entry(glist, &qg->groups, next_group) {
3025                        ret = ulist_add(fs_info->qgroup_ulist,
3026                                        glist->group->qgroupid,
3027                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3028                        if (ret < 0)
3029                                goto out;
3030                }
3031        }
3032        ret = 0;
3033        /*
3034         * No limit exceeded, now record the reservation into all qgroups.
3035         */
3036        ULIST_ITER_INIT(&uiter);
3037        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3038                struct btrfs_qgroup *qg;
3039
3040                qg = unode_aux_to_qgroup(unode);
3041
3042                qgroup_rsv_add(fs_info, qg, num_bytes, type);
3043        }
3044
3045out:
3046        spin_unlock(&fs_info->qgroup_lock);
3047        return ret;
3048}
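/*
 * Illustrative walk (a sketch): for a hierarchy where 0/257 is assigned to
 * 1/100 and 1/100 to 2/200, reserving 64K against root 257 fills the ulist
 * with {0/257, 1/100, 2/200}; -EDQUOT is returned if any of them fails
 * qgroup_check_limits(), and only when all pass is the 64K added to the
 * rsv of every qgroup in the closure.
 */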
3049
3050/*
3051 * Free @num_bytes of reserved space with @type for a qgroup (normally a
3052 * level 0 qgroup).
3053 *
3054 * Will handle all higher level qgroups too.
3055 *
3056 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
3057 * This special case is only used for the META_PERTRANS type.
3058 */
3059void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
3060                               u64 ref_root, u64 num_bytes,
3061                               enum btrfs_qgroup_rsv_type type)
3062{
3063        struct btrfs_qgroup *qgroup;
3064        struct ulist_node *unode;
3065        struct ulist_iterator uiter;
3066        int ret = 0;
3067
3068        if (!is_fstree(ref_root))
3069                return;
3070
3071        if (num_bytes == 0)
3072                return;
3073
3074        if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) {
3075                WARN(1, "%s: Invalid type to free", __func__);
3076                return;
3077        }
3078        spin_lock(&fs_info->qgroup_lock);
3079
3080        if (!fs_info->quota_root)
3081                goto out;
3082
3083        qgroup = find_qgroup_rb(fs_info, ref_root);
3084        if (!qgroup)
3085                goto out;
3086
3087        if (num_bytes == (u64)-1)
3088                /*
3089                 * We're freeing all pertrans rsv; get the reserved value from
3090                 * the level 0 qgroup as the real num_bytes to free.
3091                 */
3092                num_bytes = qgroup->rsv.values[type];
3093
3094        ulist_reinit(fs_info->qgroup_ulist);
3095        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3096                        qgroup_to_aux(qgroup), GFP_ATOMIC);
3097        if (ret < 0)
3098                goto out;
3099        ULIST_ITER_INIT(&uiter);
3100        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3101                struct btrfs_qgroup *qg;
3102                struct btrfs_qgroup_list *glist;
3103
3104                qg = unode_aux_to_qgroup(unode);
3105
3106                qgroup_rsv_release(fs_info, qg, num_bytes, type);
3107
3108                list_for_each_entry(glist, &qg->groups, next_group) {
3109                        ret = ulist_add(fs_info->qgroup_ulist,
3110                                        glist->group->qgroupid,
3111                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3112                        if (ret < 0)
3113                                goto out;
3114                }
3115        }
3116
3117out:
3118        spin_unlock(&fs_info->qgroup_lock);
3119}
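/*
 * Example (see btrfs_qgroup_free_meta_all_pertrans() later in this file):
 * the (u64)-1 special case drops every PERTRANS byte of a root:
 *
 *	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
 *				  BTRFS_QGROUP_RSV_META_PERTRANS);
 */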
3120
3121/*
3122 * Check if the leaf is the last leaf, which means all node pointers
3123 * are at their last position.
3124 */
3125static bool is_last_leaf(struct btrfs_path *path)
3126{
3127        int i;
3128
3129        for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
3130                if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
3131                        return false;
3132        }
3133        return true;
3134}
3135
3136/*
3137 * Returns < 0 on error, 0 when more leaves are to be scanned.
3138 * Returns 1 when done.
3139 */
3140static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
3141                              struct btrfs_path *path)
3142{
3143        struct btrfs_fs_info *fs_info = trans->fs_info;
3144        struct btrfs_key found;
3145        struct extent_buffer *scratch_leaf = NULL;
3146        struct ulist *roots = NULL;
3147        u64 num_bytes;
3148        bool done;
3149        int slot;
3150        int ret;
3151
3152        mutex_lock(&fs_info->qgroup_rescan_lock);
3153        ret = btrfs_search_slot_for_read(fs_info->extent_root,
3154                                         &fs_info->qgroup_rescan_progress,
3155                                         path, 1, 0);
3156
3157        btrfs_debug(fs_info,
3158                "current progress key (%llu %u %llu), search_slot ret %d",
3159                fs_info->qgroup_rescan_progress.objectid,
3160                fs_info->qgroup_rescan_progress.type,
3161                fs_info->qgroup_rescan_progress.offset, ret);
3162
3163        if (ret) {
3164                /*
3165                 * The rescan is about to end, we will not be scanning any
3166                 * further blocks. We cannot unset the RESCAN flag here, because
3167                 * we want to commit the transaction if everything went well.
3168                 * To make the live accounting work in this phase, we set our
3169                 * scan progress pointer such that every real extent objectid
3170                 * will be smaller.
3171                 */
3172                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3173                btrfs_release_path(path);
3174                mutex_unlock(&fs_info->qgroup_rescan_lock);
3175                return ret;
3176        }
3177        done = is_last_leaf(path);
3178
3179        btrfs_item_key_to_cpu(path->nodes[0], &found,
3180                              btrfs_header_nritems(path->nodes[0]) - 1);
3181        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
3182
3183        scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
3184        if (!scratch_leaf) {
3185                ret = -ENOMEM;
3186                mutex_unlock(&fs_info->qgroup_rescan_lock);
3187                goto out;
3188        }
3189        slot = path->slots[0];
3190        btrfs_release_path(path);
3191        mutex_unlock(&fs_info->qgroup_rescan_lock);
3192
3193        for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
3194                btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
3195                if (found.type != BTRFS_EXTENT_ITEM_KEY &&
3196                    found.type != BTRFS_METADATA_ITEM_KEY)
3197                        continue;
3198                if (found.type == BTRFS_METADATA_ITEM_KEY)
3199                        num_bytes = fs_info->nodesize;
3200                else
3201                        num_bytes = found.offset;
3202
3203                ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
3204                                           &roots, false);
3205                if (ret < 0)
3206                        goto out;
3207                /* For rescan, just pass old_roots as NULL */
3208                ret = btrfs_qgroup_account_extent(trans, found.objectid,
3209                                                  num_bytes, NULL, roots);
3210                if (ret < 0)
3211                        goto out;
3212        }
3213out:
3214        if (scratch_leaf)
3215                free_extent_buffer(scratch_leaf);
3216
3217        if (done && !ret) {
3218                ret = 1;
3219                fs_info->qgroup_rescan_progress.objectid = (u64)-1;
3220        }
3221        return ret;
3222}
3223
3224static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
3225{
3226        return btrfs_fs_closing(fs_info) ||
3227                test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
3228}
3229
3230static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
3231{
3232        struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
3233                                                     qgroup_rescan_work);
3234        struct btrfs_path *path;
3235        struct btrfs_trans_handle *trans = NULL;
3236        int err = -ENOMEM;
3237        int ret = 0;
3238        bool stopped = false;
3239
3240        path = btrfs_alloc_path();
3241        if (!path)
3242                goto out;
3243        /*
3244         * Rescan should only search the commit root, and any later difference
3245         * should be recorded by the qgroup accounting.
3246         */
3247        path->search_commit_root = 1;
3248        path->skip_locking = 1;
3249
3250        err = 0;
3251        while (!err && !(stopped = rescan_should_stop(fs_info))) {
3252                trans = btrfs_start_transaction(fs_info->fs_root, 0);
3253                if (IS_ERR(trans)) {
3254                        err = PTR_ERR(trans);
3255                        break;
3256                }
3257                if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
3258                        err = -EINTR;
3259                } else {
3260                        err = qgroup_rescan_leaf(trans, path);
3261                }
3262                if (err > 0)
3263                        btrfs_commit_transaction(trans);
3264                else
3265                        btrfs_end_transaction(trans);
3266        }
3267
3268out:
3269        btrfs_free_path(path);
3270
3271        mutex_lock(&fs_info->qgroup_rescan_lock);
3272        if (err > 0 &&
3273            fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
3274                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3275        } else if (err < 0) {
3276                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
3277        }
3278        mutex_unlock(&fs_info->qgroup_rescan_lock);
3279
3280        /*
3281         * Only update the status item, since the previous part has already
3282         * updated the qgroup info.
3283         */
3284        trans = btrfs_start_transaction(fs_info->quota_root, 1);
3285        if (IS_ERR(trans)) {
3286                err = PTR_ERR(trans);
3287                trans = NULL;
3288                btrfs_err(fs_info,
3289                          "failed to start transaction for status update: %d",
3290                          err);
3291        }
3292
3293        mutex_lock(&fs_info->qgroup_rescan_lock);
3294        if (!stopped)
3295                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3296        if (trans) {
3297                ret = update_qgroup_status_item(trans);
3298                if (ret < 0) {
3299                        err = ret;
3300                        btrfs_err(fs_info, "failed to update qgroup status: %d",
3301                                  err);
3302                }
3303        }
3304        fs_info->qgroup_rescan_running = false;
3305        complete_all(&fs_info->qgroup_rescan_completion);
3306        mutex_unlock(&fs_info->qgroup_rescan_lock);
3307
3308        if (!trans)
3309                return;
3310
3311        btrfs_end_transaction(trans);
3312
3313        if (stopped) {
3314                btrfs_info(fs_info, "qgroup scan paused");
3315        } else if (err >= 0) {
3316                btrfs_info(fs_info, "qgroup scan completed%s",
3317                        err > 0 ? " (inconsistency flag cleared)" : "");
3318        } else {
3319                btrfs_err(fs_info, "qgroup scan failed with %d", err);
3320        }
3321}
3322
3323/*
3324 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
3325 * memory required for the rescan context.
3326 */
3327static int
3328qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
3329                   int init_flags)
3330{
3331        int ret = 0;
3332
3333        if (!init_flags) {
3334                /* we're resuming qgroup rescan at mount time */
3335                if (!(fs_info->qgroup_flags &
3336                      BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
3337                        btrfs_warn(fs_info,
3338                        "qgroup rescan init failed, qgroup rescan is not queued");
3339                        ret = -EINVAL;
3340                } else if (!(fs_info->qgroup_flags &
3341                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3342                        btrfs_warn(fs_info,
3343                        "qgroup rescan init failed, qgroup is not enabled");
3344                        ret = -EINVAL;
3345                }
3346
3347                if (ret)
3348                        return ret;
3349        }
3350
3351        mutex_lock(&fs_info->qgroup_rescan_lock);
3352
3353        if (init_flags) {
3354                if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3355                        btrfs_warn(fs_info,
3356                                   "qgroup rescan is already in progress");
3357                        ret = -EINPROGRESS;
3358                } else if (!(fs_info->qgroup_flags &
3359                             BTRFS_QGROUP_STATUS_FLAG_ON)) {
3360                        btrfs_warn(fs_info,
3361                        "qgroup rescan init failed, qgroup is not enabled");
3362                        ret = -EINVAL;
3363                }
3364
3365                if (ret) {
3366                        mutex_unlock(&fs_info->qgroup_rescan_lock);
3367                        return ret;
3368                }
3369                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3370        }
3371
3372        memset(&fs_info->qgroup_rescan_progress, 0,
3373                sizeof(fs_info->qgroup_rescan_progress));
3374        fs_info->qgroup_rescan_progress.objectid = progress_objectid;
3375        init_completion(&fs_info->qgroup_rescan_completion);
3376        mutex_unlock(&fs_info->qgroup_rescan_lock);
3377
3378        btrfs_init_work(&fs_info->qgroup_rescan_work,
3379                        btrfs_qgroup_rescan_worker, NULL, NULL);
3380        return 0;
3381}
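/*
 * Illustrative callers (a sketch): a fresh rescan passes a non-zero
 * init_flags so the RESCAN status flag is set here, while a mount-time
 * resume passes 0 so the flags read from disk are validated instead:
 *
 *	ret = qgroup_rescan_init(fs_info, 0, 1);        (new rescan)
 *	ret = qgroup_rescan_init(fs_info, progress, 0); (resume at mount)
 */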
3382
3383static void
3384qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
3385{
3386        struct rb_node *n;
3387        struct btrfs_qgroup *qgroup;
3388
3389        spin_lock(&fs_info->qgroup_lock);
3390        /* clear all current qgroup tracking information */
3391        for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) {
3392                qgroup = rb_entry(n, struct btrfs_qgroup, node);
3393                qgroup->rfer = 0;
3394                qgroup->rfer_cmpr = 0;
3395                qgroup->excl = 0;
3396                qgroup->excl_cmpr = 0;
3397                qgroup_dirty(fs_info, qgroup);
3398        }
3399        spin_unlock(&fs_info->qgroup_lock);
3400}
3401
3402int
3403btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
3404{
3405        int ret = 0;
3406        struct btrfs_trans_handle *trans;
3407
3408        ret = qgroup_rescan_init(fs_info, 0, 1);
3409        if (ret)
3410                return ret;
3411
3412        /*
3413         * We have set the rescan_progress to 0, which means no more
3414         * delayed refs will be accounted by btrfs_qgroup_account_ref.
3415         * However, btrfs_qgroup_account_ref may already be past its call
3416         * to btrfs_find_all_roots, in which case it would still do the
3417         * accounting.
3418         * To solve this, we commit the transaction, which ensures all
3419         * delayed refs are run, and only after that do we clear all
3420         * tracking information for a clean start.
3421         */
3422
3423        trans = btrfs_join_transaction(fs_info->fs_root);
3424        if (IS_ERR(trans)) {
3425                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3426                return PTR_ERR(trans);
3427        }
3428        ret = btrfs_commit_transaction(trans);
3429        if (ret) {
3430                fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
3431                return ret;
3432        }
3433
3434        qgroup_rescan_zero_tracking(fs_info);
3435
3436        mutex_lock(&fs_info->qgroup_rescan_lock);
3437        fs_info->qgroup_rescan_running = true;
3438        btrfs_queue_work(fs_info->qgroup_rescan_workers,
3439                         &fs_info->qgroup_rescan_work);
3440        mutex_unlock(&fs_info->qgroup_rescan_lock);
3441
3442        return 0;
3443}
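/*
 * Illustrative trigger (a sketch): user space normally reaches this through
 * the quota rescan ioctl, e.g. "btrfs quota rescan <mnt>", and can then
 * block in btrfs_qgroup_wait_for_completion() below until the scan ends.
 */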
3444
3445int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
3446                                     bool interruptible)
3447{
3448        int running;
3449        int ret = 0;
3450
3451        mutex_lock(&fs_info->qgroup_rescan_lock);
3452        running = fs_info->qgroup_rescan_running;
3453        mutex_unlock(&fs_info->qgroup_rescan_lock);
3454
3455        if (!running)
3456                return 0;
3457
3458        if (interruptible)
3459                ret = wait_for_completion_interruptible(
3460                                        &fs_info->qgroup_rescan_completion);
3461        else
3462                wait_for_completion(&fs_info->qgroup_rescan_completion);
3463
3464        return ret;
3465}
3466
3467/*
3468 * This is only called from open_ctree() where we're still single threaded,
3469 * thus locking is omitted here.
3470 */
3471void
3472btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
3473{
3474        if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3475                mutex_lock(&fs_info->qgroup_rescan_lock);
3476                fs_info->qgroup_rescan_running = true;
3477                btrfs_queue_work(fs_info->qgroup_rescan_workers,
3478                                 &fs_info->qgroup_rescan_work);
3479                mutex_unlock(&fs_info->qgroup_rescan_lock);
3480        }
3481}
3482
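/*
 * Iterate an rbtree starting from @node = @start; @next is sampled before
 * each loop body runs, so the current @node may be erased safely during
 * iteration.
 */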
3483#define rbtree_iterate_from_safe(node, next, start)                             \
3484       for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
3485
3486static int qgroup_unreserve_range(struct btrfs_inode *inode,
3487                                  struct extent_changeset *reserved, u64 start,
3488                                  u64 len)
3489{
3490        struct rb_node *node;
3491        struct rb_node *next;
3492        struct ulist_node *entry;
3493        int ret = 0;
3494
3495        node = reserved->range_changed.root.rb_node;
3496        if (!node)
3497                return 0;
3498        while (node) {
3499                entry = rb_entry(node, struct ulist_node, rb_node);
3500                if (entry->val < start)
3501                        node = node->rb_right;
3502                else
3503                        node = node->rb_left;
3504        }
3505
3506        if (entry->val > start && rb_prev(&entry->rb_node))
3507                entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
3508                                 rb_node);
3509
3510        rbtree_iterate_from_safe(node, next, &entry->rb_node) {
3511                u64 entry_start;
3512                u64 entry_end;
3513                u64 entry_len;
3514                int clear_ret;
3515
3516                entry = rb_entry(node, struct ulist_node, rb_node);
3517                entry_start = entry->val;
3518                entry_end = entry->aux;
3519                entry_len = entry_end - entry_start + 1;
3520
3521                if (entry_start >= start + len)
3522                        break;
3523                if (entry_start + entry_len <= start)
3524                        continue;
3525                /*
3526                 * Now the entry is in [start, start + len), revert the
3527                 * EXTENT_QGROUP_RESERVED bit.
3528                 */
3529                clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
3530                                              entry_end, EXTENT_QGROUP_RESERVED);
3531                if (!ret && clear_ret < 0)
3532                        ret = clear_ret;
3533
3534                ulist_del(&reserved->range_changed, entry->val, entry->aux);
3535                if (likely(reserved->bytes_changed >= entry_len)) {
3536                        reserved->bytes_changed -= entry_len;
3537                } else {
3538                        WARN_ON(1);
3539                        reserved->bytes_changed = 0;
3540                }
3541        }
3542
3543        return ret;
3544}
3545
3546/*
3547 * Try to free some space for qgroup.
3548 *
3549 * For qgroup, there are only 3 ways to free qgroup space:
3550 * - Flush nodatacow write
3551 *   Any nodatacow write will free its reserved data space at run_delalloc_range().
3552 *   In theory, we should only flush nodatacow inodes, but it's not yet
3553 *   possible, so we need to flush the whole root.
3554 *
3555 * - Wait for ordered extents
3556 *   When ordered extents are finished, their reserved metadata is finally
3557 *   converted to PERTRANS status, which can be freed by a later transaction
3558 *   commit.
3559 *
3560 * - Commit transaction
3561 *   This would free the META_PERTRANS space.
3562 *   In theory this shouldn't provide much space, but every bit of extra
3563 *   qgroup space helps.
3564 */
3565static int try_flush_qgroup(struct btrfs_root *root)
3566{
3567        struct btrfs_trans_handle *trans;
3568        int ret;
3569
3570        /* Can't hold an open transaction or we run the risk of deadlocking. */
3571        ASSERT(current->journal_info == NULL);
3572        if (WARN_ON(current->journal_info))
3573                return 0;
3574
3575        /*
3576         * We don't want to run flush again and again, so if there is a
3577         * running one, wait for it to finish instead of starting a new one.
3578         */
3579        if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
3580                wait_event(root->qgroup_flush_wait,
3581                        !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
3582                return 0;
3583        }
3584
3585        ret = btrfs_start_delalloc_snapshot(root, true);
3586        if (ret < 0)
3587                goto out;
3588        btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
3589
3590        trans = btrfs_join_transaction(root);
3591        if (IS_ERR(trans)) {
3592                ret = PTR_ERR(trans);
3593                goto out;
3594        }
3595
3596        ret = btrfs_commit_transaction(trans);
3597out:
3598        clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
3599        wake_up(&root->qgroup_flush_wait);
3600        return ret;
3601}
3602
3603static int qgroup_reserve_data(struct btrfs_inode *inode,
3604                        struct extent_changeset **reserved_ret, u64 start,
3605                        u64 len)
3606{
3607        struct btrfs_root *root = inode->root;
3608        struct extent_changeset *reserved;
3609        bool new_reserved = false;
3610        u64 orig_reserved;
3611        u64 to_reserve;
3612        int ret;
3613
3614        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
3615            !is_fstree(root->root_key.objectid) || len == 0)
3616                return 0;
3617
3618        /* @reserved parameter is mandatory for qgroup */
3619        if (WARN_ON(!reserved_ret))
3620                return -EINVAL;
3621        if (!*reserved_ret) {
3622                new_reserved = true;
3623                *reserved_ret = extent_changeset_alloc();
3624                if (!*reserved_ret)
3625                        return -ENOMEM;
3626        }
3627        reserved = *reserved_ret;
3628        /* Record already reserved space */
3629        orig_reserved = reserved->bytes_changed;
3630        ret = set_record_extent_bits(&inode->io_tree, start,
3631                        start + len - 1, EXTENT_QGROUP_RESERVED, reserved);
3632
3633        /* Newly reserved space */
3634        to_reserve = reserved->bytes_changed - orig_reserved;
3635        trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
3636                                        to_reserve, QGROUP_RESERVE);
3637        if (ret < 0)
3638                goto out;
3639        ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
3640        if (ret < 0)
3641                goto cleanup;
3642
3643        return ret;
3644
3645cleanup:
3646        qgroup_unreserve_range(inode, reserved, start, len);
3647out:
3648        if (new_reserved) {
3649                extent_changeset_free(reserved);
3650                *reserved_ret = NULL;
3651        }
3652        return ret;
3653}
3654
3655/*
3656 * Reserve qgroup space for range [start, start + len).
3657 *
3658 * This function will either reserve space from related qgroups or do nothing
3659 * if the range is already reserved.
3660 *
3661 * Return 0 for successful reservation
3662 * Return <0 for error (including -EDQUOT)
3663 *
3664 * NOTE: This function may sleep for memory allocation, dirty page flushing
3665 *       and transaction commit, so the caller must not hold any locked dirty page.
3666 */
3667int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
3668                        struct extent_changeset **reserved_ret, u64 start,
3669                        u64 len)
3670{
3671        int ret;
3672
3673        ret = qgroup_reserve_data(inode, reserved_ret, start, len);
3674        if (ret <= 0 && ret != -EDQUOT)
3675                return ret;
3676
3677        ret = try_flush_qgroup(inode->root);
3678        if (ret < 0)
3679                return ret;
3680        return qgroup_reserve_data(inode, reserved_ret, start, len);
3681}
3682
3683/* Free ranges specified by @reserved, normally in the error path */
3684static int qgroup_free_reserved_data(struct btrfs_inode *inode,
3685                        struct extent_changeset *reserved, u64 start, u64 len)
3686{
3687        struct btrfs_root *root = inode->root;
3688        struct ulist_node *unode;
3689        struct ulist_iterator uiter;
3690        struct extent_changeset changeset;
3691        int freed = 0;
3692        int ret;
3693
3694        extent_changeset_init(&changeset);
3695        len = round_up(start + len, root->fs_info->sectorsize);
3696        start = round_down(start, root->fs_info->sectorsize);
3697
3698        ULIST_ITER_INIT(&uiter);
3699        while ((unode = ulist_next(&reserved->range_changed, &uiter))) {
3700                u64 range_start = unode->val;
3701                /* unode->aux is the inclusive end */
3702                u64 range_len = unode->aux - range_start + 1;
3703                u64 free_start;
3704                u64 free_len;
3705
3706                extent_changeset_release(&changeset);
3707
3708                /* Only free the part of the range within [start, start + len) */
3709                if (range_start >= start + len ||
3710                    range_start + range_len <= start)
3711                        continue;
3712                free_start = max(range_start, start);
3713                free_len = min(start + len, range_start + range_len) -
3714                           free_start;
3715                /*
3716                 * TODO: Also modify reserved->ranges_reserved to reflect
3717                 * the modification.
3718                 *
3719                 * However, as long as we free qgroup reserved space according
3720                 * to EXTENT_QGROUP_RESERVED, we won't double free.
3721                 * So there is no need to rush.
3722                 */
3723                ret = clear_record_extent_bits(&inode->io_tree, free_start,
3724                                free_start + free_len - 1,
3725                                EXTENT_QGROUP_RESERVED, &changeset);
3726                if (ret < 0)
3727                        goto out;
3728                freed += changeset.bytes_changed;
3729        }
3730        btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
3731                                  BTRFS_QGROUP_RSV_DATA);
3732        ret = freed;
3733out:
3734        extent_changeset_release(&changeset);
3735        return ret;
3736}
3737
3738static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
3739                        struct extent_changeset *reserved, u64 start, u64 len,
3740                        int free)
3741{
3742        struct extent_changeset changeset;
3743        int trace_op = QGROUP_RELEASE;
3744        int ret;
3745
3746        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags))
3747                return 0;
3748
3749        /* In the release case, we shouldn't have @reserved */
3750        WARN_ON(!free && reserved);
3751        if (free && reserved)
3752                return qgroup_free_reserved_data(inode, reserved, start, len);
3753        extent_changeset_init(&changeset);
3754        ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
3755                                       EXTENT_QGROUP_RESERVED, &changeset);
3756        if (ret < 0)
3757                goto out;
3758
3759        if (free)
3760                trace_op = QGROUP_FREE;
3761        trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len,
3762                                        changeset.bytes_changed, trace_op);
3763        if (free)
3764                btrfs_qgroup_free_refroot(inode->root->fs_info,
3765                                inode->root->root_key.objectid,
3766                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
3767        ret = changeset.bytes_changed;
3768out:
3769        extent_changeset_release(&changeset);
3770        return ret;
3771}
3772
3773/*
3774 * Free a reserved space range from io_tree and related qgroups
3775 *
3776 * Should be called when a range of pages gets invalidated before reaching
3777 * disk, or for the error cleanup case.
3778 * If @reserved is given, only the reserved range in [@start, @start + @len)
3779 * will be freed.
3780 *
3781 * For data written to disk, use btrfs_qgroup_release_data().
3782 *
3783 * NOTE: This function may sleep for memory allocation.
3784 */
3785int btrfs_qgroup_free_data(struct btrfs_inode *inode,
3786                        struct extent_changeset *reserved, u64 start, u64 len)
3787{
3788        return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
3789}
3790
3791/*
3792 * Release a reserved space range from io_tree only.
3793 *
3794 * Should be called when a range of pages gets written to disk and the
3795 * corresponding FILE_EXTENT item is inserted into the corresponding root.
3796 *
3797 * Since the qgroup accounting framework only updates qgroup numbers at
3798 * commit_transaction() time, the reserved space shouldn't be freed from
3799 * related qgroups.
3800 *
3801 * But we should release the range from io_tree, to allow further write to be
3802 * COWed.
3803 *
3804 * NOTE: This function may sleep for memory allocation.
3805 */
3806int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
3807{
3808        return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
3809}
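/*
 * Illustrative usage (a sketch, not lifted from a real call site; do_write()
 * is a hypothetical stand-in): a data write path pairs the reservation with
 * either a free (error or page invalidation) or a release (extent written):
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_write(inode, start, len);
 *	if (ret < 0)
 *		btrfs_qgroup_free_data(inode, reserved, start, len);
 *	else
 *		btrfs_qgroup_release_data(inode, start, len);
 *	extent_changeset_free(reserved);
 */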
3810
3811static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3812                              enum btrfs_qgroup_rsv_type type)
3813{
3814        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3815            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3816                return;
3817        if (num_bytes == 0)
3818                return;
3819
3820        spin_lock(&root->qgroup_meta_rsv_lock);
3821        if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
3822                root->qgroup_meta_rsv_prealloc += num_bytes;
3823        else
3824                root->qgroup_meta_rsv_pertrans += num_bytes;
3825        spin_unlock(&root->qgroup_meta_rsv_lock);
3826}
3827
3828static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
3829                             enum btrfs_qgroup_rsv_type type)
3830{
3831        if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
3832            type != BTRFS_QGROUP_RSV_META_PERTRANS)
3833                return 0;
3834        if (num_bytes == 0)
3835                return 0;
3836
3837        spin_lock(&root->qgroup_meta_rsv_lock);
3838        if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
3839                num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
3840                                  num_bytes);
3841                root->qgroup_meta_rsv_prealloc -= num_bytes;
3842        } else {
3843                num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
3844                                  num_bytes);
3845                root->qgroup_meta_rsv_pertrans -= num_bytes;
3846        }
3847        spin_unlock(&root->qgroup_meta_rsv_lock);
3848        return num_bytes;
3849}
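/*
 * Worked example (illustrative): if qgroup_meta_rsv_prealloc is 16K because
 * quota was enabled only after some reservations had been made, a request
 * to subtract 32K is clamped to 16K, and that clamped value is what later
 * gets freed from the qgroups, avoiding underflow.
 */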
3850
3851int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3852                              enum btrfs_qgroup_rsv_type type, bool enforce)
3853{
3854        struct btrfs_fs_info *fs_info = root->fs_info;
3855        int ret;
3856
3857        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3858            !is_fstree(root->root_key.objectid) || num_bytes == 0)
3859                return 0;
3860
3861        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3862        trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
3863        ret = qgroup_reserve(root, num_bytes, enforce, type);
3864        if (ret < 0)
3865                return ret;
3866        /*
3867         * Record what we have reserved into root, to avoid underflow across
3868         * a quota disable->enable cycle.
3869         *
3870         * In that case, we may try to free space we haven't reserved
3871         * (since quota was disabled), so record what we reserved into root
3872         * and ensure a later release won't underflow this number.
3873         */
3874        add_root_meta_rsv(root, num_bytes, type);
3875        return ret;
3876}
3877
3878int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
3879                                enum btrfs_qgroup_rsv_type type, bool enforce)
3880{
3881        int ret;
3882
3883        ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3884        if (ret <= 0 && ret != -EDQUOT)
3885                return ret;
3886
3887        ret = try_flush_qgroup(root);
3888        if (ret < 0)
3889                return ret;
3890        return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
3891}
3892
3893void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
3894{
3895        struct btrfs_fs_info *fs_info = root->fs_info;
3896
3897        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3898            !is_fstree(root->root_key.objectid))
3899                return;
3900
3901        /* TODO: Update trace point to handle such free */
3902        trace_qgroup_meta_free_all_pertrans(root);
3903        /* Special value -1 means to free all reserved space */
3904        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
3905                                  BTRFS_QGROUP_RSV_META_PERTRANS);
3906}
3907
3908void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
3909                              enum btrfs_qgroup_rsv_type type)
3910{
3911        struct btrfs_fs_info *fs_info = root->fs_info;
3912
3913        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3914            !is_fstree(root->root_key.objectid))
3915                return;
3916
3917        /*
3918         * Reservation for META_PREALLOC can happen before quota is enabled,
3919         * which can lead to underflow.
3920         * Here we ensure we only free what we really have reserved.
3921         */
3922        num_bytes = sub_root_meta_rsv(root, num_bytes, type);
3923        BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
3924        trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
3925        btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
3926                                  num_bytes, type);
3927}
3928
3929static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
3930                                int num_bytes)
3931{
3932        struct btrfs_qgroup *qgroup;
3933        struct ulist_node *unode;
3934        struct ulist_iterator uiter;
3935        int ret = 0;
3936
3937        if (num_bytes == 0)
3938                return;
3939        if (!fs_info->quota_root)
3940                return;
3941
3942        spin_lock(&fs_info->qgroup_lock);
3943        qgroup = find_qgroup_rb(fs_info, ref_root);
3944        if (!qgroup)
3945                goto out;
3946        ulist_reinit(fs_info->qgroup_ulist);
3947        ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid,
3948                       qgroup_to_aux(qgroup), GFP_ATOMIC);
3949        if (ret < 0)
3950                goto out;
3951        ULIST_ITER_INIT(&uiter);
3952        while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
3953                struct btrfs_qgroup *qg;
3954                struct btrfs_qgroup_list *glist;
3955
3956                qg = unode_aux_to_qgroup(unode);
3957
3958                qgroup_rsv_release(fs_info, qg, num_bytes,
3959                                BTRFS_QGROUP_RSV_META_PREALLOC);
3960                qgroup_rsv_add(fs_info, qg, num_bytes,
3961                                BTRFS_QGROUP_RSV_META_PERTRANS);
3962                list_for_each_entry(glist, &qg->groups, next_group) {
3963                        ret = ulist_add(fs_info->qgroup_ulist,
3964                                        glist->group->qgroupid,
3965                                        qgroup_to_aux(glist->group), GFP_ATOMIC);
3966                        if (ret < 0)
3967                                goto out;
3968                }
3969        }
3970out:
3971        spin_unlock(&fs_info->qgroup_lock);
3972}
3973
3974void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
3975{
3976        struct btrfs_fs_info *fs_info = root->fs_info;
3977
3978        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
3979            !is_fstree(root->root_key.objectid))
3980                return;
3981        /* Same as btrfs_qgroup_free_meta_prealloc() */
3982        num_bytes = sub_root_meta_rsv(root, num_bytes,
3983                                      BTRFS_QGROUP_RSV_META_PREALLOC);
3984        trace_qgroup_meta_convert(root, num_bytes);
3985        qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
3986}
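/*
 * Illustrative meta flow (a sketch): a PREALLOC reservation made before a
 * tree operation is either converted to PERTRANS once the operation is
 * guaranteed to happen in this transaction, or freed again on error:
 *
 *	ret = btrfs_qgroup_reserve_meta(root, bytes,
 *					BTRFS_QGROUP_RSV_META_PREALLOC, true);
 *	...on success:
 *	btrfs_qgroup_convert_reserved_meta(root, bytes);
 *	...on error:
 *	__btrfs_qgroup_free_meta(root, bytes, BTRFS_QGROUP_RSV_META_PREALLOC);
 */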
3987
3988/*
3989 * Check for leaked qgroup reserved space, normally at inode destruction
3990 * time.
3991 */
3992void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
3993{
3994        struct extent_changeset changeset;
3995        struct ulist_node *unode;
3996        struct ulist_iterator iter;
3997        int ret;
3998
3999        extent_changeset_init(&changeset);
4000        ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
4001                        EXTENT_QGROUP_RESERVED, &changeset);
4002
4003        WARN_ON(ret < 0);
4004        if (WARN_ON(changeset.bytes_changed)) {
4005                ULIST_ITER_INIT(&iter);
4006                while ((unode = ulist_next(&changeset.range_changed, &iter))) {
4007                        btrfs_warn(inode->root->fs_info,
4008                "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
4009                                btrfs_ino(inode), unode->val, unode->aux);
4010                }
4011                btrfs_qgroup_free_refroot(inode->root->fs_info,
4012                                inode->root->root_key.objectid,
4013                                changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
4014
4015        }
4016        extent_changeset_release(&changeset);
4017}
4018
4019void btrfs_qgroup_init_swapped_blocks(
4020        struct btrfs_qgroup_swapped_blocks *swapped_blocks)
4021{
4022        int i;
4023
4024        spin_lock_init(&swapped_blocks->lock);
4025        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
4026                swapped_blocks->blocks[i] = RB_ROOT;
4027        swapped_blocks->swapped = false;
4028}
4029
4030/*
4031 * Delete all swapped block records of @root.
4032 * Every record here means we skipped a full subtree scan for qgroup.
4033 *
4034 * Gets called when committing one transaction.
4035 */
4036void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
4037{
4038        struct btrfs_qgroup_swapped_blocks *swapped_blocks;
4039        int i;
4040
4041        swapped_blocks = &root->swapped_blocks;
4042
4043        spin_lock(&swapped_blocks->lock);
4044        if (!swapped_blocks->swapped)
4045                goto out;
4046        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4047                struct rb_root *cur_root = &swapped_blocks->blocks[i];
4048                struct btrfs_qgroup_swapped_block *entry;
4049                struct btrfs_qgroup_swapped_block *next;
4050
4051                rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
4052                                                     node)
4053                        kfree(entry);
4054                swapped_blocks->blocks[i] = RB_ROOT;
4055        }
4056        swapped_blocks->swapped = false;
4057out:
4058        spin_unlock(&swapped_blocks->lock);
4059}
4060
4061/*
4062 * Add a subtree root record into @subvol_root.
4063 *
4064 * @subvol_root:        tree root of the subvolume tree that gets swapped
4065 * @bg:                 block group under balance
4066 * @subvol_parent/slot: pointer to the subtree root in subvolume tree
4067 * @reloc_parent/slot:  pointer to the subtree root in reloc tree
4068 *                      BOTH POINTERS ARE BEFORE TREE SWAP
4069 * @last_snapshot:      last snapshot generation of the subvolume tree
4070 */
4071int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
4072                struct btrfs_root *subvol_root,
4073                struct btrfs_block_group *bg,
4074                struct extent_buffer *subvol_parent, int subvol_slot,
4075                struct extent_buffer *reloc_parent, int reloc_slot,
4076                u64 last_snapshot)
4077{
4078        struct btrfs_fs_info *fs_info = subvol_root->fs_info;
4079        struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
4080        struct btrfs_qgroup_swapped_block *block;
4081        struct rb_node **cur;
4082        struct rb_node *parent = NULL;
4083        int level = btrfs_header_level(subvol_parent) - 1;
4084        int ret = 0;
4085
4086        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4087                return 0;
4088
4089        if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
4090            btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
4091                btrfs_err_rl(fs_info,
4092                "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
4093                        __func__,
4094                        btrfs_node_ptr_generation(subvol_parent, subvol_slot),
4095                        btrfs_node_ptr_generation(reloc_parent, reloc_slot));
4096                return -EUCLEAN;
4097        }
4098
4099        block = kmalloc(sizeof(*block), GFP_NOFS);
4100        if (!block) {
4101                ret = -ENOMEM;
4102                goto out;
4103        }
4104
4105        /*
4106         * @reloc_parent/slot is still before swap, while @block is going to
4107         * record the bytenr after swap, so we do the swap here.
4108         */
4109        block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
4110        block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
4111                                                             reloc_slot);
4112        block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
4113        block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
4114                                                            subvol_slot);
4115        block->last_snapshot = last_snapshot;
4116        block->level = level;
4117
4118        /*
4119         * If we have bg == NULL, we're called from btrfs_recover_relocation()
4120         * and no one else can modify tree blocks, thus the qgroup numbers
4121         * will not change no matter the value of trace_leaf.
4122         */
4123        if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
4124                block->trace_leaf = true;
4125        else
4126                block->trace_leaf = false;
4127        btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);
4128
4129        /* Insert @block into @blocks */
4130        spin_lock(&blocks->lock);
4131        cur = &blocks->blocks[level].rb_node;
4132        while (*cur) {
4133                struct btrfs_qgroup_swapped_block *entry;
4134
4135                parent = *cur;
4136                entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
4137                                 node);
4138
4139                if (entry->subvol_bytenr < block->subvol_bytenr) {
4140                        cur = &(*cur)->rb_left;
4141                } else if (entry->subvol_bytenr > block->subvol_bytenr) {
4142                        cur = &(*cur)->rb_right;
4143                } else {
4144                        if (entry->subvol_generation !=
4145                                        block->subvol_generation ||
4146                            entry->reloc_bytenr != block->reloc_bytenr ||
4147                            entry->reloc_generation !=
4148                                        block->reloc_generation) {
4149                                /*
4150                                 * Duplicate but mismatched entry found.
4151                                 * Shouldn't happen.
4152                                 *
4153                                 * Marking qgroup inconsistent should be enough
4154                                 * for end users.
4155                                 */
4156                                WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
4157                                ret = -EEXIST;
4158                        }
4159                        kfree(block);
4160                        goto out_unlock;
4161                }
4162        }
4163        rb_link_node(&block->node, parent, cur);
4164        rb_insert_color(&block->node, &blocks->blocks[level]);
4165        blocks->swapped = true;
4166out_unlock:
4167        spin_unlock(&blocks->lock);
4168out:
4169        if (ret < 0)
4170                fs_info->qgroup_flags |=
4171                        BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4172        return ret;
4173}
4174
4175/*
4176 * Check if the tree block is a subtree root, and if so do the needed
4177 * delayed subtree trace for qgroup.
4178 *
4179 * This is called during btrfs_cow_block().
4180 */
4181int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
4182                                         struct btrfs_root *root,
4183                                         struct extent_buffer *subvol_eb)
4184{
4185        struct btrfs_fs_info *fs_info = root->fs_info;
4186        struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
4187        struct btrfs_qgroup_swapped_block *block;
4188        struct extent_buffer *reloc_eb = NULL;
4189        struct rb_node *node;
4190        bool found = false;
4191        bool swapped = false;
4192        int level = btrfs_header_level(subvol_eb);
4193        int ret = 0;
4194        int i;
4195
4196        if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
4197                return 0;
4198        if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
4199                return 0;
4200
4201        spin_lock(&blocks->lock);
4202        if (!blocks->swapped) {
4203                spin_unlock(&blocks->lock);
4204                return 0;
4205        }
4206        node = blocks->blocks[level].rb_node;
4207
4208        while (node) {
4209                block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
4210                if (block->subvol_bytenr < subvol_eb->start) {
4211                        node = node->rb_left;
4212                } else if (block->subvol_bytenr > subvol_eb->start) {
4213                        node = node->rb_right;
4214                } else {
4215                        found = true;
4216                        break;
4217                }
4218        }
4219        if (!found) {
4220                spin_unlock(&blocks->lock);
4221                goto out;
4222        }
4223        /* Found one, remove it from @blocks first and update blocks->swapped */
4224        rb_erase(&block->node, &blocks->blocks[level]);
4225        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4226                if (RB_EMPTY_ROOT(&blocks->blocks[i])) {
4227                        swapped = true;
4228                        break;
4229                }
4230        }
4231        blocks->swapped = swapped;
4232        spin_unlock(&blocks->lock);
4233
4234        /* Read out reloc subtree root */
4235        reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
4236                                   block->reloc_generation, block->level,
4237                                   &block->first_key);
4238        if (IS_ERR(reloc_eb)) {
4239                ret = PTR_ERR(reloc_eb);
4240                reloc_eb = NULL;
4241                goto free_out;
4242        }
4243        if (!extent_buffer_uptodate(reloc_eb)) {
4244                ret = -EIO;
4245                goto free_out;
4246        }
4247
4248        ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
4249                        block->last_snapshot, block->trace_leaf);
4250free_out:
4251        kfree(block);
4252        free_extent_buffer(reloc_eb);
4253out:
4254        if (ret < 0) {
4255                btrfs_err_rl(fs_info,
4256                             "failed to account subtree at bytenr %llu: %d",
4257                             subvol_eb->start, ret);
4258                fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
4259        }
4260        return ret;
4261}
4262
4263void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
4264{
4265        struct btrfs_qgroup_extent_record *entry;
4266        struct btrfs_qgroup_extent_record *next;
4267        struct rb_root *root;
4268
4269        root = &trans->delayed_refs.dirty_extent_root;
4270        rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
4271                ulist_free(entry->old_roots);
4272                kfree(entry);
4273        }
4274}
4275