linux/fs/btrfs/space-info.c
// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "math.h"

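/*
 * Return the total number of bytes in use in this space_info, optionally
 * including bytes_may_use (outstanding reservations).
 */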
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
                          bool may_use_included)
{
        ASSERT(s_info);
        return s_info->bytes_used + s_info->bytes_reserved +
                s_info->bytes_pinned + s_info->bytes_readonly +
                (may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

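/* Human-readable name for a block group type, used as the sysfs directory name. */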
static const char *alloc_name(u64 flags)
{
        switch (flags) {
        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
                return "mixed";
        case BTRFS_BLOCK_GROUP_METADATA:
                return "metadata";
        case BTRFS_BLOCK_GROUP_DATA:
                return "data";
        case BTRFS_BLOCK_GROUP_SYSTEM:
                return "system";
        default:
                WARN_ON(1);
                return "invalid-combination";
        }
}

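/*
 * Allocate and initialize a new space_info for the given block group type,
 * add it to fs_info->space_info and publish it in sysfs.
 */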
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
        struct btrfs_space_info *space_info;
        int i;
        int ret;

        space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
        if (!space_info)
                return -ENOMEM;

        ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
                                  GFP_KERNEL);
        if (ret) {
                kfree(space_info);
                return ret;
        }

        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                INIT_LIST_HEAD(&space_info->block_groups[i]);
        init_rwsem(&space_info->groups_sem);
        spin_lock_init(&space_info->lock);
        space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
        init_waitqueue_head(&space_info->wait);
        INIT_LIST_HEAD(&space_info->ro_bgs);
        INIT_LIST_HEAD(&space_info->tickets);
        INIT_LIST_HEAD(&space_info->priority_tickets);

        ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
                                   info->space_info_kobj, "%s",
                                   alloc_name(space_info->flags));
        if (ret) {
                kobject_put(&space_info->kobj);
                return ret;
        }

        list_add_rcu(&space_info->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = space_info;

        return ret;
}

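/*
 * Create the initial space_infos at mount time: system, plus either a mixed
 * metadata+data space_info or separate metadata and data ones, depending on
 * the MIXED_GROUPS incompat feature.
 */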
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
        int mixed = 0;
        int ret;

        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return -EINVAL;

        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        flags = BTRFS_BLOCK_GROUP_SYSTEM;
        ret = create_space_info(fs_info, flags);
        if (ret)
                goto out;

        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
                ret = create_space_info(fs_info, flags);
                if (ret)
                        goto out;

                flags = BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        }
out:
        return ret;
}

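/*
 * Account a newly added block group's bytes in the matching space_info and
 * return that space_info to the caller.
 */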
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
                             u64 total_bytes, u64 bytes_used,
                             u64 bytes_readonly,
                             struct btrfs_space_info **space_info)
{
        struct btrfs_space_info *found;
        int factor;

        factor = btrfs_bg_type_to_factor(flags);

        found = btrfs_find_space_info(info, flags);
        ASSERT(found);
        spin_lock(&found->lock);
        found->total_bytes += total_bytes;
        found->disk_total += total_bytes * factor;
        found->bytes_used += bytes_used;
        found->disk_used += bytes_used * factor;
        found->bytes_readonly += bytes_readonly;
        if (total_bytes > 0)
                found->full = 0;
        btrfs_space_info_add_new_bytes(info, found,
                                       total_bytes - bytes_used -
                                       bytes_readonly);
        spin_unlock(&found->lock);
        *space_info = found;
}

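/* Look up the space_info that tracks the given block group type flags. */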
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
                                               u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

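/*
 * The amount of free space we want to keep around on behalf of the global
 * block reserve: twice its current size.
 */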
static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
        return (global->size << 1);
}

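/*
 * Decide whether a metadata (or system) reservation of @bytes may exceed the
 * currently allocated chunk space, based on how much unallocated device space
 * is left and how aggressively we are allowed to flush.
 */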
static int can_overcommit(struct btrfs_fs_info *fs_info,
                          struct btrfs_space_info *space_info, u64 bytes,
                          enum btrfs_reserve_flush_enum flush,
                          bool system_chunk)
{
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 profile;
        u64 space_size;
        u64 avail;
        u64 used;
        int factor;

        /* Don't overcommit when in mixed mode. */
        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                return 0;

        if (system_chunk)
                profile = btrfs_system_alloc_profile(fs_info);
        else
                profile = btrfs_metadata_alloc_profile(fs_info);

        used = btrfs_space_info_used(space_info, false);

        /*
         * We only want to allow over committing if we have lots of actual space
         * free, but if we don't have enough space to handle the global reserve
         * space then we could end up having a real enospc problem when trying
         * to allocate a chunk or some other such important allocation.
         */
        spin_lock(&global_rsv->lock);
        space_size = calc_global_rsv_need_space(global_rsv);
        spin_unlock(&global_rsv->lock);
        if (used + space_size >= space_info->total_bytes)
                return 0;

        used += space_info->bytes_may_use;

        avail = atomic64_read(&fs_info->free_chunk_space);

        /*
         * If we have dup, raid1 or raid10 then only half of the free
         * space is actually usable.  For raid56, the space info used
         * doesn't include the parity drive, so we don't have to
         * change the math.
         */
        factor = btrfs_bg_type_to_factor(profile);
        avail = div_u64(avail, factor);

        /*
         * If we aren't flushing all things, let us overcommit up to
         * half of the space.  If we can flush, don't let us overcommit
         * too much, let it overcommit up to 1/8 of the space.
         */
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;

        if (used + bytes < space_info->total_bytes + avail)
                return 1;
        return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head;
        u64 used;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
        bool check_overcommit = false;

        spin_lock(&space_info->lock);
        head = &space_info->priority_tickets;

        /*
         * If we are over our limit then we need to check and see if we can
         * overcommit, and if we can't then we just need to free up our space
         * and not satisfy any requests.
         */
        used = btrfs_space_info_used(space_info, true);
        if (used - num_bytes >= space_info->total_bytes)
                check_overcommit = true;
again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                /*
                 * We use 0 bytes because this space is already reserved, so
                 * adding the ticket space would be a double count.
                 */
                if (check_overcommit &&
                    !can_overcommit(fs_info, space_info, 0, flush, false))
                        break;
                if (num_bytes >= ticket->bytes) {
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                flush = BTRFS_RESERVE_FLUSH_ALL;
                goto again;
        }
        btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
        trace_btrfs_space_reservation(fs_info, "space_info",
                                      space_info->flags, num_bytes, 0);
        spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head = &space_info->priority_tickets;

again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                if (num_bytes >= ticket->bytes) {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      ticket->bytes, 1);
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              ticket->bytes);
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      num_bytes, 1);
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              num_bytes);
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                goto again;
        }
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
do {                                                                    \
        struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;           \
        spin_lock(&__rsv->lock);                                        \
        btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",      \
                   __rsv->size, __rsv->reserved);                       \
        spin_unlock(&__rsv->lock);                                      \
} while (0)

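/*
 * Print the state of a space_info and the block reserves, and optionally of
 * every block group it contains, to the kernel log (used for ENOSPC
 * debugging).
 */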
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *info, u64 bytes,
                           int dump_block_groups)
{
        struct btrfs_block_group_cache *cache;
        int index = 0;

        spin_lock(&info->lock);
        btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
                   info->flags,
                   info->total_bytes - btrfs_space_info_used(info, true),
                   info->full ? "" : "not ");
        btrfs_info(fs_info,
                "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
                info->total_bytes, info->bytes_used, info->bytes_pinned,
                info->bytes_reserved, info->bytes_may_use,
                info->bytes_readonly);
        spin_unlock(&info->lock);

        DUMP_BLOCK_RSV(fs_info, global_block_rsv);
        DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
        DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

        if (!dump_block_groups)
                return;

        down_read(&info->groups_sem);
again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
                btrfs_info(fs_info,
                        "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
                        cache->key.objectid, cache->key.offset,
                        btrfs_block_group_used(&cache->item), cache->pinned,
                        cache->reserved, cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
        if (++index < BTRFS_NR_RAID_TYPES)
                goto again;
        up_read(&info->groups_sem);
}

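/*
 * Kick off writeback of dirty pages so that delalloc space can be freed.  If
 * we can't take s_umount (e.g. it is held for a remount), fall back to
 * starting delalloc on all roots directly.
 */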
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
                                         unsigned long nr_pages, int nr_items)
{
        struct super_block *sb = fs_info->sb;

        if (down_read_trylock(&sb->s_umount)) {
                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
                up_read(&sb->s_umount);
        } else {
                /*
                 * We needn't worry about the filesystem going from r/w to r/o
                 * even though we don't acquire the ->s_umount mutex, because
                 * the filesystem should guarantee that the delalloc inodes
                 * list is empty once the filesystem is read-only (all dirty
                 * pages have been written to disk).
                 */
                btrfs_start_delalloc_roots(fs_info, nr_items);
                if (!current->journal_info)
                        btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
        }
}

static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
                                        u64 to_reclaim)
{
        u64 bytes;
        u64 nr;

        bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
        nr = div64_u64(to_reclaim, bytes);
        if (!nr)
                nr = 1;
        return nr;
}

 441
 442#define EXTENT_SIZE_PER_ITEM    SZ_256K
 443
 444/*
 445 * shrink metadata reservation for delalloc
 446 */
 447static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 448                            u64 orig, bool wait_ordered)
 449{
 450        struct btrfs_space_info *space_info;
 451        struct btrfs_trans_handle *trans;
 452        u64 delalloc_bytes;
 453        u64 dio_bytes;
 454        u64 async_pages;
 455        u64 items;
 456        long time_left;
 457        unsigned long nr_pages;
 458        int loops;
 459
 460        /* Calc the number of the pages we need flush for space reservation */
 461        items = calc_reclaim_items_nr(fs_info, to_reclaim);
 462        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
 463
 464        trans = (struct btrfs_trans_handle *)current->journal_info;
 465        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 466
 467        delalloc_bytes = percpu_counter_sum_positive(
 468                                                &fs_info->delalloc_bytes);
 469        dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
 470        if (delalloc_bytes == 0 && dio_bytes == 0) {
 471                if (trans)
 472                        return;
 473                if (wait_ordered)
 474                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
 475                return;
 476        }
 477
 478        /*
 479         * If we are doing more ordered than delalloc we need to just wait on
 480         * ordered extents, otherwise we'll waste time trying to flush delalloc
 481         * that likely won't give us the space back we need.
 482         */
 483        if (dio_bytes > delalloc_bytes)
 484                wait_ordered = true;
 485
 486        loops = 0;
 487        while ((delalloc_bytes || dio_bytes) && loops < 3) {
 488                nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
 489
 490                /*
 491                 * Triggers inode writeback for up to nr_pages. This will invoke
 492                 * ->writepages callback and trigger delalloc filling
 493                 *  (btrfs_run_delalloc_range()).
 494                 */
 495                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
 496
 497                /*
 498                 * We need to wait for the compressed pages to start before
 499                 * we continue.
 500                 */
 501                async_pages = atomic_read(&fs_info->async_delalloc_pages);
 502                if (!async_pages)
 503                        goto skip_async;
 504
 505                /*
 506                 * Calculate how many compressed pages we want to be written
 507                 * before we continue. I.e if there are more async pages than we
 508                 * require wait_event will wait until nr_pages are written.
 509                 */
 510                if (async_pages <= nr_pages)
 511                        async_pages = 0;
 512                else
 513                        async_pages -= nr_pages;
 514
 515                wait_event(fs_info->async_submit_wait,
 516                           atomic_read(&fs_info->async_delalloc_pages) <=
 517                           (int)async_pages);
 518skip_async:
 519                spin_lock(&space_info->lock);
 520                if (list_empty(&space_info->tickets) &&
 521                    list_empty(&space_info->priority_tickets)) {
 522                        spin_unlock(&space_info->lock);
 523                        break;
 524                }
 525                spin_unlock(&space_info->lock);
 526
 527                loops++;
 528                if (wait_ordered && !trans) {
 529                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
 530                } else {
 531                        time_left = schedule_timeout_killable(1);
 532                        if (time_left)
 533                                break;
 534                }
 535                delalloc_bytes = percpu_counter_sum_positive(
 536                                                &fs_info->delalloc_bytes);
 537                dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
 538        }
 539}
 540
/**
 * may_commit_transaction - possibly commit the transaction if it's OK to
 * @fs_info - the filesystem
 * @space_info - the space_info we are allocating for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
                                  struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket = NULL;
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_trans_handle *trans;
        u64 bytes_needed;
        u64 reclaim_bytes = 0;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        if (trans)
                return -EAGAIN;

        spin_lock(&space_info->lock);
        if (!list_empty(&space_info->priority_tickets))
                ticket = list_first_entry(&space_info->priority_tickets,
                                          struct reserve_ticket, list);
        else if (!list_empty(&space_info->tickets))
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
        bytes_needed = (ticket) ? ticket->bytes : 0;
        spin_unlock(&space_info->lock);

        if (!bytes_needed)
                return 0;

        trans = btrfs_join_transaction(fs_info->extent_root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        /*
         * See if there is enough pinned space to make this reservation, or if
         * we have block groups that are going to be freed, allowing us to
         * possibly do a chunk allocation the next loop through.
         */
        if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
            __percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;

        /*
         * See if there is some space in the delayed insertion reservation for
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
                goto enospc;

        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
        spin_unlock(&delayed_rsv->lock);

        spin_lock(&delayed_refs_rsv->lock);
        reclaim_bytes += delayed_refs_rsv->reserved;
        spin_unlock(&delayed_refs_rsv->lock);
        if (reclaim_bytes >= bytes_needed)
                goto commit;
        bytes_needed -= reclaim_bytes;

        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                   bytes_needed,
                                   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
                goto enospc;

commit:
        return btrfs_commit_transaction(trans);
enospc:
        btrfs_end_transaction(trans);
        return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
                       struct btrfs_space_info *space_info, u64 num_bytes,
                       int state)
{
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_trans_handle *trans;
        int nr;
        int ret = 0;

        switch (state) {
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                if (state == FLUSH_DELAYED_ITEMS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
                else
                        nr = -1;

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
                shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
                                state == FLUSH_DELALLOC_WAIT);
                break;
        case FLUSH_DELAYED_REFS_NR:
        case FLUSH_DELAYED_REFS:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                if (state == FLUSH_DELAYED_REFS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes);
                else
                        nr = 0;
                btrfs_run_delayed_refs(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
        case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_chunk_alloc(trans,
                                btrfs_metadata_alloc_profile(fs_info),
                                (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
                                        CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
                break;
        case COMMIT_TRANS:
                /*
                 * If we have pending delayed iputs then we could free up a
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
                btrfs_run_delayed_iputs(fs_info);
                btrfs_wait_on_delayed_iputs(fs_info);

                ret = may_commit_transaction(fs_info, space_info);
                break;
        default:
                ret = -ENOSPC;
                break;
        }

        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
                                ret);
        return;
}

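/*
 * Estimate how many bytes of metadata space we need to reclaim: either the
 * sum of all outstanding tickets, or, if there are none, enough to bring the
 * space_info back under a 90-95% usage threshold.
 */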
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 bool system_chunk)
{
        struct reserve_ticket *ticket;
        u64 used;
        u64 expected;
        u64 to_reclaim = 0;

        list_for_each_entry(ticket, &space_info->tickets, list)
                to_reclaim += ticket->bytes;
        list_for_each_entry(ticket, &space_info->priority_tickets, list)
                to_reclaim += ticket->bytes;
        if (to_reclaim)
                return to_reclaim;

        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
        if (can_overcommit(fs_info, space_info, to_reclaim,
                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                return 0;

        used = btrfs_space_info_used(space_info, true);

        if (can_overcommit(fs_info, space_info, SZ_1M,
                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                expected = div_factor_fine(space_info->total_bytes, 95);
        else
                expected = div_factor_fine(space_info->total_bytes, 90);

        if (used > expected)
                to_reclaim = used - expected;
        else
                to_reclaim = 0;
        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
                                     space_info->bytes_reserved);
        return to_reclaim;
}

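/*
 * Decide whether background (preemptive) reclaim is worth starting: only when
 * usage is close to the limit but the space_info isn't simply full, and the
 * filesystem isn't being unmounted or remounted.
 */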
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
                                        struct btrfs_space_info *space_info,
                                        u64 used, bool system_chunk)
{
        u64 thresh = div_factor_fine(space_info->total_bytes, 98);

        /* If we're just plain full then async reclaim just slows us down. */
        if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
                return 0;

        if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                              system_chunk))
                return 0;

        return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

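/*
 * Fail all pending tickets with -ENOSPC and wake their waiters.  Returns true
 * if any ticket had already been partially filled, in which case the caller
 * restarts the flush state machine instead of giving up.
 */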
static bool wake_all_tickets(struct list_head *head)
{
        struct reserve_ticket *ticket;

        while (!list_empty(head)) {
                ticket = list_first_entry(head, struct reserve_ticket, list);
                list_del_init(&ticket->list);
                ticket->error = -ENOSPC;
                wake_up(&ticket->wait);
                if (ticket->bytes != ticket->orig_bytes)
                        return true;
        }
        return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 to_reclaim;
        int flush_state;
        int commit_cycles = 0;
        u64 last_tickets_id;

        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                space_info->flush = 0;
                spin_unlock(&space_info->lock);
                return;
        }
        last_tickets_id = space_info->tickets_id;
        spin_unlock(&space_info->lock);

        flush_state = FLUSH_DELAYED_ITEMS_NR;
        do {
                flush_space(fs_info, space_info, to_reclaim, flush_state);
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }
                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
                                                              space_info,
                                                              false);
                if (last_tickets_id == space_info->tickets_id) {
                        flush_state++;
                } else {
                        last_tickets_id = space_info->tickets_id;
                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                        if (commit_cycles)
                                commit_cycles--;
                }

                /*
                 * We don't want to force a chunk allocation until we've tried
                 * pretty hard to reclaim space.  Think of the case where we
                 * freed up a bunch of space and so have a lot of pinned space
                 * to reclaim.  We would rather use that than possibly create an
                 * underutilized metadata chunk.  So if this is our first run
                 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
                 * commit the transaction.  If nothing has changed the next go
                 * around then we can force a chunk allocation.
                 */
                if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
                        flush_state++;

                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                if (wake_all_tickets(&space_info->tickets)) {
                                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                                        commit_cycles--;
                                } else {
                                        space_info->flush = 0;
                                }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
                }
                spin_unlock(&space_info->lock);
        } while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        ALLOC_CHUNK,
};

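/*
 * Flushing for priority tickets (callers that cannot wait on the async
 * flusher): run through a short list of flush states once, stopping as soon
 * as this ticket is satisfied.
 */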
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                            struct btrfs_space_info *space_info,
                                            struct reserve_ticket *ticket)
{
        u64 to_reclaim;
        int flush_state;

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                spin_unlock(&space_info->lock);
                return;
        }
        spin_unlock(&space_info->lock);

        flush_state = 0;
        do {
                flush_space(fs_info, space_info, to_reclaim,
                            priority_flush_states[flush_state]);
                flush_state++;
                spin_lock(&space_info->lock);
                if (ticket->bytes == 0) {
                        spin_unlock(&space_info->lock);
                        return;
                }
                spin_unlock(&space_info->lock);
        } while (flush_state < ARRAY_SIZE(priority_flush_states));
}

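/*
 * Sleep until the ticket is satisfied or fails, then return any partially
 * granted space to the space_info so other waiters can use it.
 */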
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                               struct btrfs_space_info *space_info,
                               struct reserve_ticket *ticket)
{
        DEFINE_WAIT(wait);
        u64 reclaim_bytes = 0;
        int ret = 0;

        spin_lock(&space_info->lock);
        while (ticket->bytes > 0 && ticket->error == 0) {
                ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
                if (ret) {
                        ret = -EINTR;
                        break;
                }
                spin_unlock(&space_info->lock);

                schedule();

                finish_wait(&ticket->wait, &wait);
                spin_lock(&space_info->lock);
        }
        if (!ret)
                ret = ticket->error;
        if (!list_empty(&ticket->list))
                list_del_init(&ticket->list);
        if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
                reclaim_bytes = ticket->orig_bytes - ticket->bytes;
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from a space_info
 * @fs_info - the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether this is a system chunk reservation
 *
 * This will reserve orig_bytes number of bytes from the given space info.
 * If there is not enough space it will make an attempt to flush out space to
 * make room.  It will do this by flushing delalloc if possible or committing
 * the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 orig_bytes,
                                    enum btrfs_reserve_flush_enum flush,
                                    bool system_chunk)
{
        struct reserve_ticket ticket;
        u64 used;
        u64 reclaim_bytes = 0;
        int ret = 0;

        ASSERT(orig_bytes);
        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

        spin_lock(&space_info->lock);
        ret = -ENOSPC;
        used = btrfs_space_info_used(space_info, true);

        /*
         * Carry on if we have enough space (short-circuit) OR call
         * can_overcommit() to ensure we can overcommit to continue.
         */
        if ((used + orig_bytes <= space_info->total_bytes) ||
            can_overcommit(fs_info, space_info, orig_bytes, flush,
                           system_chunk)) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes, 1);
                ret = 0;
        }

        /*
         * If we couldn't make a reservation then setup our reservation ticket
         * and kick the async worker if it's not already running.
         *
         * If we are a priority flusher then we just need to add our ticket to
         * the list and we will do our own flushing further down.
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                ticket.orig_bytes = orig_bytes;
                ticket.bytes = orig_bytes;
                ticket.error = 0;
                init_waitqueue_head(&ticket.wait);
                if (flush == BTRFS_RESERVE_FLUSH_ALL) {
                        list_add_tail(&ticket.list, &space_info->tickets);
                        if (!space_info->flush) {
                                space_info->flush = 1;
                                trace_btrfs_trigger_flush(fs_info,
                                                          space_info->flags,
                                                          orig_bytes, flush,
                                                          "enospc");
                                queue_work(system_unbound_wq,
                                           &fs_info->async_reclaim_work);
                        }
                } else {
                        list_add_tail(&ticket.list,
                                      &space_info->priority_tickets);
                }
        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
                used += orig_bytes;
                /*
                 * We will do the space reservation dance during log replay,
                 * which means we won't have fs_info->fs_root set, so don't do
                 * the async reclaim as we will panic.
                 */
                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
                    need_do_async_reclaim(fs_info, space_info,
                                          used, system_chunk) &&
                    !work_busy(&fs_info->async_reclaim_work)) {
                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
                                                  orig_bytes, flush, "preempt");
                        queue_work(system_unbound_wq,
                                   &fs_info->async_reclaim_work);
                }
        }
        spin_unlock(&space_info->lock);
        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                return ret;

        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                return wait_reserve_ticket(fs_info, space_info, &ticket);

        ret = 0;
        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
        spin_lock(&space_info->lock);
        if (ticket.bytes) {
                if (ticket.bytes < orig_bytes)
                        reclaim_bytes = orig_bytes - ticket.bytes;
                list_del_init(&ticket.list);
                ret = -ENOSPC;
        }
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        ASSERT(list_empty(&ticket.list));
        return ret;
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
                                 struct btrfs_block_rsv *block_rsv,
                                 u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        int ret;
        bool system_chunk = (root == fs_info->chunk_root);

        ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                       orig_bytes, flush, system_chunk);
        if (ret == -ENOSPC &&
            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
                if (block_rsv != global_rsv &&
                    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
        if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);

                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_dump_space_info(fs_info, block_rsv->space_info,
                                              orig_bytes, 0);
        }
        return ret;
}