linux/fs/btrfs/free-space-cache.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2008 Red Hat.  All rights reserved.
   4 */
   5
   6#include <linux/pagemap.h>
   7#include <linux/sched.h>
   8#include <linux/sched/signal.h>
   9#include <linux/slab.h>
  10#include <linux/math64.h>
  11#include <linux/ratelimit.h>
  12#include <linux/error-injection.h>
  13#include <linux/sched/mm.h>
  14#include "misc.h"
  15#include "ctree.h"
  16#include "free-space-cache.h"
  17#include "transaction.h"
  18#include "disk-io.h"
  19#include "extent_io.h"
  20#include "volumes.h"
  21#include "space-info.h"
  22#include "delalloc-space.h"
  23#include "block-group.h"
  24#include "discard.h"
  25#include "subpage.h"
  26
  27#define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
  28#define MAX_CACHE_BYTES_PER_GIG SZ_64K
  29#define FORCE_EXTENT_THRESHOLD  SZ_1M
  30
  31struct btrfs_trim_range {
  32        u64 start;
  33        u64 bytes;
  34        struct list_head list;
  35};
  36
  37static int link_free_space(struct btrfs_free_space_ctl *ctl,
  38                           struct btrfs_free_space *info);
  39static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
  40                              struct btrfs_free_space *info);
  41static int search_bitmap(struct btrfs_free_space_ctl *ctl,
  42                         struct btrfs_free_space *bitmap_info, u64 *offset,
  43                         u64 *bytes, bool for_alloc);
  44static void free_bitmap(struct btrfs_free_space_ctl *ctl,
  45                        struct btrfs_free_space *bitmap_info);
  46static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
  47                              struct btrfs_free_space *info, u64 offset,
  48                              u64 bytes);
  49
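     /*
      * Look up the free space header item for @offset in @root, read the
      * inode location stored in it and iget the cache inode.  The page
      * cache mapping is restricted so its allocations never recurse into
      * the filesystem (__GFP_FS is masked off).
      */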
  50static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
  51                                               struct btrfs_path *path,
  52                                               u64 offset)
  53{
  54        struct btrfs_fs_info *fs_info = root->fs_info;
  55        struct btrfs_key key;
  56        struct btrfs_key location;
  57        struct btrfs_disk_key disk_key;
  58        struct btrfs_free_space_header *header;
  59        struct extent_buffer *leaf;
  60        struct inode *inode = NULL;
  61        unsigned nofs_flag;
  62        int ret;
  63
  64        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
  65        key.offset = offset;
  66        key.type = 0;
  67
  68        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  69        if (ret < 0)
  70                return ERR_PTR(ret);
  71        if (ret > 0) {
  72                btrfs_release_path(path);
  73                return ERR_PTR(-ENOENT);
  74        }
  75
  76        leaf = path->nodes[0];
  77        header = btrfs_item_ptr(leaf, path->slots[0],
  78                                struct btrfs_free_space_header);
  79        btrfs_free_space_key(leaf, header, &disk_key);
  80        btrfs_disk_key_to_cpu(&location, &disk_key);
  81        btrfs_release_path(path);
  82
  83        /*
  84         * We are often under a trans handle at this point, so we need to make
  85         * sure NOFS is set to keep us from deadlocking.
  86         */
  87        nofs_flag = memalloc_nofs_save();
  88        inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
  89        btrfs_release_path(path);
  90        memalloc_nofs_restore(nofs_flag);
  91        if (IS_ERR(inode))
  92                return inode;
  93
  94        mapping_set_gfp_mask(inode->i_mapping,
  95                        mapping_gfp_constraint(inode->i_mapping,
  96                        ~(__GFP_FS | __GFP_HIGHMEM)));
  97
  98        return inode;
  99}
 100
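     /*
      * Return the free space cache inode for a block group, holding a
      * reference.  Reuse the inode pointer cached in the block group if we
      * already have one, otherwise look it up in the tree root.  Old style
      * inodes missing NODATASUM/NODATACOW get the flags added and the cache
      * marked BTRFS_DC_CLEAR so it is rewritten.
      */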
 101struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
 102                struct btrfs_path *path)
 103{
 104        struct btrfs_fs_info *fs_info = block_group->fs_info;
 105        struct inode *inode = NULL;
 106        u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 107
 108        spin_lock(&block_group->lock);
 109        if (block_group->inode)
 110                inode = igrab(block_group->inode);
 111        spin_unlock(&block_group->lock);
 112        if (inode)
 113                return inode;
 114
 115        inode = __lookup_free_space_inode(fs_info->tree_root, path,
 116                                          block_group->start);
 117        if (IS_ERR(inode))
 118                return inode;
 119
 120        spin_lock(&block_group->lock);
 121        if (!((BTRFS_I(inode)->flags & flags) == flags)) {
 122                btrfs_info(fs_info, "Old style space inode found, converting.");
 123                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
 124                        BTRFS_INODE_NODATACOW;
 125                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 126        }
 127
 128        if (!block_group->iref) {
 129                block_group->inode = igrab(inode);
 130                block_group->iref = 1;
 131        }
 132        spin_unlock(&block_group->lock);
 133
 134        return inode;
 135}
 136
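     /*
      * Create the items backing a free space cache inode: an inode item for
      * @ino (nocompress, prealloc, nodatasum, nodatacow) followed by a free
      * space header item keyed at (FREE_SPACE_OBJECTID, 0, @offset) that
      * records the inode's location.
      */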
 137static int __create_free_space_inode(struct btrfs_root *root,
 138                                     struct btrfs_trans_handle *trans,
 139                                     struct btrfs_path *path,
 140                                     u64 ino, u64 offset)
 141{
 142        struct btrfs_key key;
 143        struct btrfs_disk_key disk_key;
 144        struct btrfs_free_space_header *header;
 145        struct btrfs_inode_item *inode_item;
 146        struct extent_buffer *leaf;
 147        /* We inline CRCs for the free disk space cache */
 148        const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
 149                          BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 150        int ret;
 151
 152        ret = btrfs_insert_empty_inode(trans, root, path, ino);
 153        if (ret)
 154                return ret;
 155
 156        leaf = path->nodes[0];
 157        inode_item = btrfs_item_ptr(leaf, path->slots[0],
 158                                    struct btrfs_inode_item);
 159        btrfs_item_key(leaf, &disk_key, path->slots[0]);
 160        memzero_extent_buffer(leaf, (unsigned long)inode_item,
 161                             sizeof(*inode_item));
 162        btrfs_set_inode_generation(leaf, inode_item, trans->transid);
 163        btrfs_set_inode_size(leaf, inode_item, 0);
 164        btrfs_set_inode_nbytes(leaf, inode_item, 0);
 165        btrfs_set_inode_uid(leaf, inode_item, 0);
 166        btrfs_set_inode_gid(leaf, inode_item, 0);
 167        btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
 168        btrfs_set_inode_flags(leaf, inode_item, flags);
 169        btrfs_set_inode_nlink(leaf, inode_item, 1);
 170        btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 171        btrfs_set_inode_block_group(leaf, inode_item, offset);
 172        btrfs_mark_buffer_dirty(leaf);
 173        btrfs_release_path(path);
 174
 175        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 176        key.offset = offset;
 177        key.type = 0;
 178        ret = btrfs_insert_empty_item(trans, root, path, &key,
 179                                      sizeof(struct btrfs_free_space_header));
 180        if (ret < 0) {
 181                btrfs_release_path(path);
 182                return ret;
 183        }
 184
 185        leaf = path->nodes[0];
 186        header = btrfs_item_ptr(leaf, path->slots[0],
 187                                struct btrfs_free_space_header);
 188        memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
 189        btrfs_set_free_space_key(leaf, header, &disk_key);
 190        btrfs_mark_buffer_dirty(leaf);
 191        btrfs_release_path(path);
 192
 193        return 0;
 194}
 195
 196int create_free_space_inode(struct btrfs_trans_handle *trans,
 197                            struct btrfs_block_group *block_group,
 198                            struct btrfs_path *path)
 199{
 200        int ret;
 201        u64 ino;
 202
 203        ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
 204        if (ret < 0)
 205                return ret;
 206
 207        return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
 208                                         ino, block_group->start);
 209}
 210
 211/*
 212 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 213 * handles lookup, otherwise it takes ownership and iputs the inode.
 214 * Don't reuse an inode pointer after passing it into this function.
 215 */
 216int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
 217                                  struct inode *inode,
 218                                  struct btrfs_block_group *block_group)
 219{
 220        struct btrfs_path *path;
 221        struct btrfs_key key;
 222        int ret = 0;
 223
 224        path = btrfs_alloc_path();
 225        if (!path)
 226                return -ENOMEM;
 227
 228        if (!inode)
 229                inode = lookup_free_space_inode(block_group, path);
 230        if (IS_ERR(inode)) {
 231                if (PTR_ERR(inode) != -ENOENT)
 232                        ret = PTR_ERR(inode);
 233                goto out;
 234        }
 235        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 236        if (ret) {
 237                btrfs_add_delayed_iput(inode);
 238                goto out;
 239        }
 240        clear_nlink(inode);
  241        /* One for the block group's ref */
 242        spin_lock(&block_group->lock);
 243        if (block_group->iref) {
 244                block_group->iref = 0;
 245                block_group->inode = NULL;
 246                spin_unlock(&block_group->lock);
 247                iput(inode);
 248        } else {
 249                spin_unlock(&block_group->lock);
 250        }
 251        /* One for the lookup ref */
 252        btrfs_add_delayed_iput(inode);
 253
 254        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 255        key.type = 0;
 256        key.offset = block_group->start;
 257        ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
 258                                -1, 1);
 259        if (ret) {
 260                if (ret > 0)
 261                        ret = 0;
 262                goto out;
 263        }
 264        ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
 265out:
 266        btrfs_free_path(path);
 267        return ret;
 268}
 269
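     /*
      * Check that @rsv holds enough reservation to truncate the free space
      * cache: one metadata insertion worth of slack plus one inode update.
      * This only checks, it does not reserve anything itself.
      */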
 270int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
 271                                       struct btrfs_block_rsv *rsv)
 272{
 273        u64 needed_bytes;
 274        int ret;
 275
 276        /* 1 for slack space, 1 for updating the inode */
 277        needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
 278                btrfs_calc_metadata_size(fs_info, 1);
 279
 280        spin_lock(&rsv->lock);
 281        if (rsv->reserved < needed_bytes)
 282                ret = -ENOSPC;
 283        else
 284                ret = 0;
 285        spin_unlock(&rsv->lock);
 286        return ret;
 287}
 288
 289int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 290                                    struct btrfs_block_group *block_group,
 291                                    struct inode *inode)
 292{
 293        struct btrfs_root *root = BTRFS_I(inode)->root;
 294        int ret = 0;
 295        bool locked = false;
 296
 297        if (block_group) {
 298                struct btrfs_path *path = btrfs_alloc_path();
 299
 300                if (!path) {
 301                        ret = -ENOMEM;
 302                        goto fail;
 303                }
 304                locked = true;
 305                mutex_lock(&trans->transaction->cache_write_mutex);
 306                if (!list_empty(&block_group->io_list)) {
 307                        list_del_init(&block_group->io_list);
 308
 309                        btrfs_wait_cache_io(trans, block_group, path);
 310                        btrfs_put_block_group(block_group);
 311                }
 312
 313                /*
  314                 * Now that we've truncated the cache away, it's no longer
  315                 * set up or written.
 316                 */
 317                spin_lock(&block_group->lock);
 318                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 319                spin_unlock(&block_group->lock);
 320                btrfs_free_path(path);
 321        }
 322
 323        btrfs_i_size_write(BTRFS_I(inode), 0);
 324        truncate_pagecache(inode, 0);
 325
 326        /*
 327         * We skip the throttling logic for free space cache inodes, so we don't
 328         * need to check for -EAGAIN.
 329         */
 330        ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode),
 331                                         0, BTRFS_EXTENT_DATA_KEY, NULL);
 332        if (ret)
 333                goto fail;
 334
 335        ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
 336
 337fail:
 338        if (locked)
 339                mutex_unlock(&trans->transaction->cache_write_mutex);
 340        if (ret)
 341                btrfs_abort_transaction(trans, ret);
 342
 343        return ret;
 344}
 345
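     /* Kick off readahead for the entire free space cache file. */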
 346static void readahead_cache(struct inode *inode)
 347{
 348        struct file_ra_state ra;
 349        unsigned long last_index;
 350
 351        file_ra_state_init(&ra, inode->i_mapping);
 352        last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 353
 354        page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
 355}
 356
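     /*
      * io_ctl_init() sizes the page array from the inode size and refuses
      * to write out a cache whose per-page crc array plus generation would
      * not fit in the first page.
      */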
 357static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 358                       int write)
 359{
 360        int num_pages;
 361
 362        num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 363
 364        /* Make sure we can fit our crcs and generation into the first page */
 365        if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
 366                return -ENOSPC;
 367
 368        memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
 369
 370        io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
 371        if (!io_ctl->pages)
 372                return -ENOMEM;
 373
 374        io_ctl->num_pages = num_pages;
 375        io_ctl->fs_info = btrfs_sb(inode->i_sb);
 376        io_ctl->inode = inode;
 377
 378        return 0;
 379}
 380ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
 381
 382static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 383{
 384        kfree(io_ctl->pages);
 385        io_ctl->pages = NULL;
 386}
 387
 388static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
 389{
 390        if (io_ctl->cur) {
 391                io_ctl->cur = NULL;
 392                io_ctl->orig = NULL;
 393        }
 394}
 395
 396static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 397{
 398        ASSERT(io_ctl->index < io_ctl->num_pages);
 399        io_ctl->page = io_ctl->pages[io_ctl->index++];
 400        io_ctl->cur = page_address(io_ctl->page);
 401        io_ctl->orig = io_ctl->cur;
 402        io_ctl->size = PAGE_SIZE;
 403        if (clear)
 404                clear_page(io_ctl->cur);
 405}
 406
 407static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 408{
 409        int i;
 410
 411        io_ctl_unmap_page(io_ctl);
 412
 413        for (i = 0; i < io_ctl->num_pages; i++) {
 414                if (io_ctl->pages[i]) {
 415                        btrfs_page_clear_checked(io_ctl->fs_info,
 416                                        io_ctl->pages[i],
 417                                        page_offset(io_ctl->pages[i]),
 418                                        PAGE_SIZE);
 419                        unlock_page(io_ctl->pages[i]);
 420                        put_page(io_ctl->pages[i]);
 421                }
 422        }
 423}
 424
 425static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
 426{
 427        struct page *page;
 428        struct inode *inode = io_ctl->inode;
 429        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 430        int i;
 431
 432        for (i = 0; i < io_ctl->num_pages; i++) {
 433                int ret;
 434
 435                page = find_or_create_page(inode->i_mapping, i, mask);
 436                if (!page) {
 437                        io_ctl_drop_pages(io_ctl);
 438                        return -ENOMEM;
 439                }
 440
 441                ret = set_page_extent_mapped(page);
 442                if (ret < 0) {
 443                        unlock_page(page);
 444                        put_page(page);
 445                        io_ctl_drop_pages(io_ctl);
 446                        return ret;
 447                }
 448
 449                io_ctl->pages[i] = page;
 450                if (uptodate && !PageUptodate(page)) {
 451                        btrfs_readpage(NULL, page);
 452                        lock_page(page);
 453                        if (page->mapping != inode->i_mapping) {
 454                                btrfs_err(BTRFS_I(inode)->root->fs_info,
 455                                          "free space cache page truncated");
 456                                io_ctl_drop_pages(io_ctl);
 457                                return -EIO;
 458                        }
 459                        if (!PageUptodate(page)) {
 460                                btrfs_err(BTRFS_I(inode)->root->fs_info,
 461                                           "error reading free space cache");
 462                                io_ctl_drop_pages(io_ctl);
 463                                return -EIO;
 464                        }
 465                }
 466        }
 467
 468        for (i = 0; i < io_ctl->num_pages; i++)
 469                clear_page_dirty_for_io(io_ctl->pages[i]);
 470
 471        return 0;
 472}
 473
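     /*
      * Layout of the first cache page, written by io_ctl_set_generation()
      * and verified by io_ctl_check_generation():
      *
      *   [ crc32c of page 0 ] ... [ crc32c of page num_pages - 1 ]
      *   [ 64bit generation ] [ free space entries ... ]
      *
      * The remaining pages carry only entries and bitmaps, each covered by
      * its crc in the array above.
      */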
 474static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 475{
 476        io_ctl_map_page(io_ctl, 1);
 477
 478        /*
  479         * Skip the per-page csum area; the generation is stored as a
  480         * 64bit chunk right after it at the front of the first page.
 481         */
 482        io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
 483        io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
 484
 485        put_unaligned_le64(generation, io_ctl->cur);
 486        io_ctl->cur += sizeof(u64);
 487}
 488
 489static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 490{
 491        u64 cache_gen;
 492
 493        /*
  494         * Skip the crc area; the generation is stored as a 64bit chunk
  495         * right after it at the front of the first page.
 496         */
 497        io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
 498        io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
 499
 500        cache_gen = get_unaligned_le64(io_ctl->cur);
 501        if (cache_gen != generation) {
 502                btrfs_err_rl(io_ctl->fs_info,
 503                        "space cache generation (%llu) does not match inode (%llu)",
 504                                cache_gen, generation);
 505                io_ctl_unmap_page(io_ctl);
 506                return -EIO;
 507        }
 508        io_ctl->cur += sizeof(u64);
 509        return 0;
 510}
 511
 512static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 513{
 514        u32 *tmp;
 515        u32 crc = ~(u32)0;
 516        unsigned offset = 0;
 517
 518        if (index == 0)
 519                offset = sizeof(u32) * io_ctl->num_pages;
 520
 521        crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 522        btrfs_crc32c_final(crc, (u8 *)&crc);
 523        io_ctl_unmap_page(io_ctl);
 524        tmp = page_address(io_ctl->pages[0]);
 525        tmp += index;
 526        *tmp = crc;
 527}
 528
 529static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 530{
 531        u32 *tmp, val;
 532        u32 crc = ~(u32)0;
 533        unsigned offset = 0;
 534
 535        if (index == 0)
 536                offset = sizeof(u32) * io_ctl->num_pages;
 537
 538        tmp = page_address(io_ctl->pages[0]);
 539        tmp += index;
 540        val = *tmp;
 541
 542        io_ctl_map_page(io_ctl, 0);
 543        crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 544        btrfs_crc32c_final(crc, (u8 *)&crc);
 545        if (val != crc) {
 546                btrfs_err_rl(io_ctl->fs_info,
 547                        "csum mismatch on free space cache");
 548                io_ctl_unmap_page(io_ctl);
 549                return -EIO;
 550        }
 551
 552        return 0;
 553}
 554
 555static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
 556                            void *bitmap)
 557{
 558        struct btrfs_free_space_entry *entry;
 559
 560        if (!io_ctl->cur)
 561                return -ENOSPC;
 562
 563        entry = io_ctl->cur;
 564        put_unaligned_le64(offset, &entry->offset);
 565        put_unaligned_le64(bytes, &entry->bytes);
 566        entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
 567                BTRFS_FREE_SPACE_EXTENT;
 568        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 569        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 570
 571        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 572                return 0;
 573
 574        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 575
 576        /* No more pages to map */
 577        if (io_ctl->index >= io_ctl->num_pages)
 578                return 0;
 579
 580        /* map the next page */
 581        io_ctl_map_page(io_ctl, 1);
 582        return 0;
 583}
 584
 585static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
 586{
 587        if (!io_ctl->cur)
 588                return -ENOSPC;
 589
 590        /*
 591         * If we aren't at the start of the current page, unmap this one and
 592         * map the next one if there is any left.
 593         */
 594        if (io_ctl->cur != io_ctl->orig) {
 595                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 596                if (io_ctl->index >= io_ctl->num_pages)
 597                        return -ENOSPC;
 598                io_ctl_map_page(io_ctl, 0);
 599        }
 600
 601        copy_page(io_ctl->cur, bitmap);
 602        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 603        if (io_ctl->index < io_ctl->num_pages)
 604                io_ctl_map_page(io_ctl, 0);
 605        return 0;
 606}
 607
 608static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
 609{
 610        /*
 611         * If we're not on the boundary we know we've modified the page and we
 612         * need to crc the page.
 613         */
 614        if (io_ctl->cur != io_ctl->orig)
 615                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 616        else
 617                io_ctl_unmap_page(io_ctl);
 618
 619        while (io_ctl->index < io_ctl->num_pages) {
 620                io_ctl_map_page(io_ctl, 1);
 621                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 622        }
 623}
 624
 625static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
 626                            struct btrfs_free_space *entry, u8 *type)
 627{
 628        struct btrfs_free_space_entry *e;
 629        int ret;
 630
 631        if (!io_ctl->cur) {
 632                ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 633                if (ret)
 634                        return ret;
 635        }
 636
 637        e = io_ctl->cur;
 638        entry->offset = get_unaligned_le64(&e->offset);
 639        entry->bytes = get_unaligned_le64(&e->bytes);
 640        *type = e->type;
 641        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 642        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 643
 644        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 645                return 0;
 646
 647        io_ctl_unmap_page(io_ctl);
 648
 649        return 0;
 650}
 651
 652static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 653                              struct btrfs_free_space *entry)
 654{
 655        int ret;
 656
 657        ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 658        if (ret)
 659                return ret;
 660
 661        copy_page(entry->bitmap, io_ctl->cur);
 662        io_ctl_unmap_page(io_ctl);
 663
 664        return 0;
 665}
 666
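     /*
      * Recompute ctl->extents_thresh, the cap on extent entries before new
      * free space gets tracked in bitmaps instead.  The memory budget is
      * MAX_CACHE_BYTES_PER_GIG (64KiB) per 1GiB of block group; whatever the
      * existing bitmaps do not already use, capped at half of that budget,
      * is divided by sizeof(struct btrfs_free_space) to get the threshold.
      * For example, with 4KiB pages and a 4KiB ->unit a single bitmap covers
      * BITS_PER_BITMAP * unit = 32768 * 4KiB = 128MiB of the block group.
      */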
 667static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 668{
 669        struct btrfs_block_group *block_group = ctl->private;
 670        u64 max_bytes;
 671        u64 bitmap_bytes;
 672        u64 extent_bytes;
 673        u64 size = block_group->length;
 674        u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
 675        u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
 676
 677        max_bitmaps = max_t(u64, max_bitmaps, 1);
 678
 679        ASSERT(ctl->total_bitmaps <= max_bitmaps);
 680
 681        /*
 682         * We are trying to keep the total amount of memory used per 1GiB of
 683         * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
 684         * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
 685         * bitmaps, we may end up using more memory than this.
 686         */
 687        if (size < SZ_1G)
 688                max_bytes = MAX_CACHE_BYTES_PER_GIG;
 689        else
 690                max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
 691
 692        bitmap_bytes = ctl->total_bitmaps * ctl->unit;
 693
 694        /*
  695         * Give extent entries whatever is left after bitmaps, but never
  696         * more than half of the max bytes we can have.
 697         */
 698        extent_bytes = max_bytes - bitmap_bytes;
 699        extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
 700
 701        ctl->extents_thresh =
 702                div_u64(extent_bytes, sizeof(struct btrfs_free_space));
 703}
 704
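     /*
      * Parse an on-disk free space cache file into @ctl.  Returns 1 if a
      * cache was loaded, 0 if there was nothing usable (missing item, stale
      * or zero generation, no entries), and -1 or a negative errno on
      * corruption, in which case anything partially linked into @ctl is
      * removed again.
      */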
 705static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 706                                   struct btrfs_free_space_ctl *ctl,
 707                                   struct btrfs_path *path, u64 offset)
 708{
 709        struct btrfs_fs_info *fs_info = root->fs_info;
 710        struct btrfs_free_space_header *header;
 711        struct extent_buffer *leaf;
 712        struct btrfs_io_ctl io_ctl;
 713        struct btrfs_key key;
 714        struct btrfs_free_space *e, *n;
 715        LIST_HEAD(bitmaps);
 716        u64 num_entries;
 717        u64 num_bitmaps;
 718        u64 generation;
 719        u8 type;
 720        int ret = 0;
 721
 722        /* Nothing in the space cache, goodbye */
 723        if (!i_size_read(inode))
 724                return 0;
 725
 726        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 727        key.offset = offset;
 728        key.type = 0;
 729
 730        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 731        if (ret < 0)
 732                return 0;
 733        else if (ret > 0) {
 734                btrfs_release_path(path);
 735                return 0;
 736        }
 737
 738        ret = -1;
 739
 740        leaf = path->nodes[0];
 741        header = btrfs_item_ptr(leaf, path->slots[0],
 742                                struct btrfs_free_space_header);
 743        num_entries = btrfs_free_space_entries(leaf, header);
 744        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
 745        generation = btrfs_free_space_generation(leaf, header);
 746        btrfs_release_path(path);
 747
 748        if (!BTRFS_I(inode)->generation) {
 749                btrfs_info(fs_info,
 750                           "the free space cache file (%llu) is invalid, skip it",
 751                           offset);
 752                return 0;
 753        }
 754
 755        if (BTRFS_I(inode)->generation != generation) {
 756                btrfs_err(fs_info,
 757                          "free space inode generation (%llu) did not match free space cache generation (%llu)",
 758                          BTRFS_I(inode)->generation, generation);
 759                return 0;
 760        }
 761
 762        if (!num_entries)
 763                return 0;
 764
 765        ret = io_ctl_init(&io_ctl, inode, 0);
 766        if (ret)
 767                return ret;
 768
 769        readahead_cache(inode);
 770
 771        ret = io_ctl_prepare_pages(&io_ctl, true);
 772        if (ret)
 773                goto out;
 774
 775        ret = io_ctl_check_crc(&io_ctl, 0);
 776        if (ret)
 777                goto free_cache;
 778
 779        ret = io_ctl_check_generation(&io_ctl, generation);
 780        if (ret)
 781                goto free_cache;
 782
 783        while (num_entries) {
 784                e = kmem_cache_zalloc(btrfs_free_space_cachep,
 785                                      GFP_NOFS);
 786                if (!e) {
 787                        ret = -ENOMEM;
 788                        goto free_cache;
 789                }
 790
 791                ret = io_ctl_read_entry(&io_ctl, e, &type);
 792                if (ret) {
 793                        kmem_cache_free(btrfs_free_space_cachep, e);
 794                        goto free_cache;
 795                }
 796
 797                if (!e->bytes) {
 798                        ret = -1;
 799                        kmem_cache_free(btrfs_free_space_cachep, e);
 800                        goto free_cache;
 801                }
 802
 803                if (type == BTRFS_FREE_SPACE_EXTENT) {
 804                        spin_lock(&ctl->tree_lock);
 805                        ret = link_free_space(ctl, e);
 806                        spin_unlock(&ctl->tree_lock);
 807                        if (ret) {
 808                                btrfs_err(fs_info,
 809                                        "Duplicate entries in free space cache, dumping");
 810                                kmem_cache_free(btrfs_free_space_cachep, e);
 811                                goto free_cache;
 812                        }
 813                } else {
 814                        ASSERT(num_bitmaps);
 815                        num_bitmaps--;
 816                        e->bitmap = kmem_cache_zalloc(
 817                                        btrfs_free_space_bitmap_cachep, GFP_NOFS);
 818                        if (!e->bitmap) {
 819                                ret = -ENOMEM;
 820                                kmem_cache_free(
 821                                        btrfs_free_space_cachep, e);
 822                                goto free_cache;
 823                        }
 824                        spin_lock(&ctl->tree_lock);
 825                        ret = link_free_space(ctl, e);
 826                        ctl->total_bitmaps++;
 827                        recalculate_thresholds(ctl);
 828                        spin_unlock(&ctl->tree_lock);
 829                        if (ret) {
 830                                btrfs_err(fs_info,
 831                                        "Duplicate entries in free space cache, dumping");
 832                                kmem_cache_free(btrfs_free_space_cachep, e);
 833                                goto free_cache;
 834                        }
 835                        list_add_tail(&e->list, &bitmaps);
 836                }
 837
 838                num_entries--;
 839        }
 840
 841        io_ctl_unmap_page(&io_ctl);
 842
 843        /*
  844         * The bitmap pages are stored on disk after all the entries, so
  845         * read the bitmap contents for the queued entries only now.
 846         */
 847        list_for_each_entry_safe(e, n, &bitmaps, list) {
 848                list_del_init(&e->list);
 849                ret = io_ctl_read_bitmap(&io_ctl, e);
 850                if (ret)
 851                        goto free_cache;
 852        }
 853
 854        io_ctl_drop_pages(&io_ctl);
 855        ret = 1;
 856out:
 857        io_ctl_free(&io_ctl);
 858        return ret;
 859free_cache:
 860        io_ctl_drop_pages(&io_ctl);
 861        __btrfs_remove_free_space_cache(ctl);
 862        goto out;
 863}
 864
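     /*
      * Move every entry from the temporary @ctl into the block group's real
      * free space ctl via btrfs_add_free_space(), converting bitmap entries
      * back into plain ranges as we go and freeing the temporary entries.
      */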
 865static int copy_free_space_cache(struct btrfs_block_group *block_group,
 866                                 struct btrfs_free_space_ctl *ctl)
 867{
 868        struct btrfs_free_space *info;
 869        struct rb_node *n;
 870        int ret = 0;
 871
 872        while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
 873                info = rb_entry(n, struct btrfs_free_space, offset_index);
 874                if (!info->bitmap) {
 875                        unlink_free_space(ctl, info);
 876                        ret = btrfs_add_free_space(block_group, info->offset,
 877                                                   info->bytes);
 878                        kmem_cache_free(btrfs_free_space_cachep, info);
 879                } else {
 880                        u64 offset = info->offset;
 881                        u64 bytes = ctl->unit;
 882
 883                        while (search_bitmap(ctl, info, &offset, &bytes,
 884                                             false) == 0) {
 885                                ret = btrfs_add_free_space(block_group, offset,
 886                                                           bytes);
 887                                if (ret)
 888                                        break;
 889                                bitmap_clear_bits(ctl, info, offset, bytes);
 890                                offset = info->offset;
 891                                bytes = ctl->unit;
 892                        }
 893                        free_bitmap(ctl, info);
 894                }
 895                cond_resched();
 896        }
 897        return ret;
 898}
 899
 900int load_free_space_cache(struct btrfs_block_group *block_group)
 901{
 902        struct btrfs_fs_info *fs_info = block_group->fs_info;
 903        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 904        struct btrfs_free_space_ctl tmp_ctl = {};
 905        struct inode *inode;
 906        struct btrfs_path *path;
 907        int ret = 0;
 908        bool matched;
 909        u64 used = block_group->used;
 910
 911        /*
 912         * Because we could potentially discard our loaded free space, we want
 913         * to load everything into a temporary structure first, and then if it's
 914         * valid copy it all into the actual free space ctl.
 915         */
 916        btrfs_init_free_space_ctl(block_group, &tmp_ctl);
 917
 918        /*
 919         * If this block group has been marked to be cleared for one reason or
 920         * another then we can't trust the on disk cache, so just return.
 921         */
 922        spin_lock(&block_group->lock);
 923        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 924                spin_unlock(&block_group->lock);
 925                return 0;
 926        }
 927        spin_unlock(&block_group->lock);
 928
 929        path = btrfs_alloc_path();
 930        if (!path)
 931                return 0;
 932        path->search_commit_root = 1;
 933        path->skip_locking = 1;
 934
 935        /*
 936         * We must pass a path with search_commit_root set to btrfs_iget in
 937         * order to avoid a deadlock when allocating extents for the tree root.
 938         *
  939         * When we are COWing an extent buffer from the tree root, while looking
  940         * for a free extent, at extent-tree.c:find_free_extent(), we can find a
  941         * block group without its free space cache loaded. When we find one
  942         * we must load its space cache, which requires reading its free space
  943         * cache's inode item from the root tree. If this inode item is located
  944         * in the same leaf that we started COWing before, then we end up in a
  945         * deadlock on the extent buffer (trying to read lock it when we
  946         * previously write locked it).
  947         *
  948         * It's safe to read the inode item using the commit root because
  949         * block groups, once loaded, stay in memory forever (until they are
  950         * removed), and so do their space caches once loaded. New block groups
 951         * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
 952         * we will never try to read their inode item while the fs is mounted.
 953         */
 954        inode = lookup_free_space_inode(block_group, path);
 955        if (IS_ERR(inode)) {
 956                btrfs_free_path(path);
 957                return 0;
 958        }
 959
 960        /* We may have converted the inode and made the cache invalid. */
 961        spin_lock(&block_group->lock);
 962        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 963                spin_unlock(&block_group->lock);
 964                btrfs_free_path(path);
 965                goto out;
 966        }
 967        spin_unlock(&block_group->lock);
 968
 969        ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
 970                                      path, block_group->start);
 971        btrfs_free_path(path);
 972        if (ret <= 0)
 973                goto out;
 974
 975        matched = (tmp_ctl.free_space == (block_group->length - used -
 976                                          block_group->bytes_super));
 977
 978        if (matched) {
 979                ret = copy_free_space_cache(block_group, &tmp_ctl);
 980                /*
  981                 * copy_free_space_cache() returns 0 on success, but callers
  982                 * expect ret == 1 when the cache was loaded, so re-set it here.
 983                 */
 984                if (ret == 0)
 985                        ret = 1;
 986        } else {
 987                __btrfs_remove_free_space_cache(&tmp_ctl);
 988                btrfs_warn(fs_info,
 989                           "block group %llu has wrong amount of free space",
 990                           block_group->start);
 991                ret = -1;
 992        }
 993out:
 994        if (ret < 0) {
 995                /* This cache is bogus, make sure it gets cleared */
 996                spin_lock(&block_group->lock);
 997                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 998                spin_unlock(&block_group->lock);
 999                ret = 0;
1000
1001                btrfs_warn(fs_info,
1002                           "failed to load free space cache for block group %llu, rebuilding it now",
1003                           block_group->start);
1004        }
1005
1006        spin_lock(&ctl->tree_lock);
1007        btrfs_discard_update_discardable(block_group);
1008        spin_unlock(&ctl->tree_lock);
1009        iput(inode);
1010        return ret;
1011}
1012
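     /*
      * Write one cache entry for every extent and bitmap currently tracked,
      * including entries that live in the block group's allocation cluster
      * and ranges temporarily removed from the tree while they are being
      * trimmed.  Bitmap entries are queued on @bitmap_list so their page
      * sized contents can be written after all the entries.
      */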
1013static noinline_for_stack
1014int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
1015                              struct btrfs_free_space_ctl *ctl,
1016                              struct btrfs_block_group *block_group,
1017                              int *entries, int *bitmaps,
1018                              struct list_head *bitmap_list)
1019{
1020        int ret;
1021        struct btrfs_free_cluster *cluster = NULL;
1022        struct btrfs_free_cluster *cluster_locked = NULL;
1023        struct rb_node *node = rb_first(&ctl->free_space_offset);
1024        struct btrfs_trim_range *trim_entry;
1025
1026        /* Get the cluster for this block_group if it exists */
1027        if (block_group && !list_empty(&block_group->cluster_list)) {
1028                cluster = list_entry(block_group->cluster_list.next,
1029                                     struct btrfs_free_cluster,
1030                                     block_group_list);
1031        }
1032
1033        if (!node && cluster) {
1034                cluster_locked = cluster;
1035                spin_lock(&cluster_locked->lock);
1036                node = rb_first(&cluster->root);
1037                cluster = NULL;
1038        }
1039
1040        /* Write out the extent entries */
1041        while (node) {
1042                struct btrfs_free_space *e;
1043
1044                e = rb_entry(node, struct btrfs_free_space, offset_index);
1045                *entries += 1;
1046
1047                ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
1048                                       e->bitmap);
1049                if (ret)
1050                        goto fail;
1051
1052                if (e->bitmap) {
1053                        list_add_tail(&e->list, bitmap_list);
1054                        *bitmaps += 1;
1055                }
1056                node = rb_next(node);
1057                if (!node && cluster) {
1058                        node = rb_first(&cluster->root);
1059                        cluster_locked = cluster;
1060                        spin_lock(&cluster_locked->lock);
1061                        cluster = NULL;
1062                }
1063        }
1064        if (cluster_locked) {
1065                spin_unlock(&cluster_locked->lock);
1066                cluster_locked = NULL;
1067        }
1068
1069        /*
1070         * Make sure we don't miss any range that was removed from our rbtree
1071         * because trimming is running. Otherwise after a umount+mount (or crash
1072         * after committing the transaction) we would leak free space and get
1073         * an inconsistent free space cache report from fsck.
1074         */
1075        list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1076                ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1077                                       trim_entry->bytes, NULL);
1078                if (ret)
1079                        goto fail;
1080                *entries += 1;
1081        }
1082
1083        return 0;
1084fail:
1085        if (cluster_locked)
1086                spin_unlock(&cluster_locked->lock);
1087        return -ENOSPC;
1088}
1089
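     /*
      * Find the free space header item for this cache and update its entry
      * count, bitmap count and generation so a later load can tell whether
      * the cache file matches the inode.  On failure the delalloc bits on
      * the cache inode's io_tree are cleared and -1 is returned.
      */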
1090static noinline_for_stack int
1091update_cache_item(struct btrfs_trans_handle *trans,
1092                  struct btrfs_root *root,
1093                  struct inode *inode,
1094                  struct btrfs_path *path, u64 offset,
1095                  int entries, int bitmaps)
1096{
1097        struct btrfs_key key;
1098        struct btrfs_free_space_header *header;
1099        struct extent_buffer *leaf;
1100        int ret;
1101
1102        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1103        key.offset = offset;
1104        key.type = 0;
1105
1106        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1107        if (ret < 0) {
1108                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1109                                 EXTENT_DELALLOC, 0, 0, NULL);
1110                goto fail;
1111        }
1112        leaf = path->nodes[0];
1113        if (ret > 0) {
1114                struct btrfs_key found_key;
1115                ASSERT(path->slots[0]);
1116                path->slots[0]--;
1117                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1118                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1119                    found_key.offset != offset) {
1120                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1121                                         inode->i_size - 1, EXTENT_DELALLOC, 0,
1122                                         0, NULL);
1123                        btrfs_release_path(path);
1124                        goto fail;
1125                }
1126        }
1127
1128        BTRFS_I(inode)->generation = trans->transid;
1129        header = btrfs_item_ptr(leaf, path->slots[0],
1130                                struct btrfs_free_space_header);
1131        btrfs_set_free_space_entries(leaf, header, entries);
1132        btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1133        btrfs_set_free_space_generation(leaf, header, trans->transid);
1134        btrfs_mark_buffer_dirty(leaf);
1135        btrfs_release_path(path);
1136
1137        return 0;
1138
1139fail:
1140        return -1;
1141}
1142
1143static noinline_for_stack int write_pinned_extent_entries(
1144                            struct btrfs_trans_handle *trans,
1145                            struct btrfs_block_group *block_group,
1146                            struct btrfs_io_ctl *io_ctl,
1147                            int *entries)
1148{
1149        u64 start, extent_start, extent_end, len;
1150        struct extent_io_tree *unpin = NULL;
1151        int ret;
1152
1153        if (!block_group)
1154                return 0;
1155
1156        /*
 1157         * We want to add any pinned extents to our free space cache
 1158         * so we don't leak the space.
 1159         *
 1160         * We shouldn't have switched the pinned extents yet, so this is
 1161         * the right one.
1162         */
1163        unpin = &trans->transaction->pinned_extents;
1164
1165        start = block_group->start;
1166
1167        while (start < block_group->start + block_group->length) {
1168                ret = find_first_extent_bit(unpin, start,
1169                                            &extent_start, &extent_end,
1170                                            EXTENT_DIRTY, NULL);
1171                if (ret)
1172                        return 0;
1173
1174                /* This pinned extent is out of our range */
1175                if (extent_start >= block_group->start + block_group->length)
1176                        return 0;
1177
1178                extent_start = max(extent_start, start);
1179                extent_end = min(block_group->start + block_group->length,
1180                                 extent_end + 1);
1181                len = extent_end - extent_start;
1182
1183                *entries += 1;
1184                ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1185                if (ret)
1186                        return -ENOSPC;
1187
1188                start = extent_end;
1189        }
1190
1191        return 0;
1192}
1193
1194static noinline_for_stack int
1195write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1196{
1197        struct btrfs_free_space *entry, *next;
1198        int ret;
1199
1200        /* Write out the bitmaps */
1201        list_for_each_entry_safe(entry, next, bitmap_list, list) {
1202                ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1203                if (ret)
1204                        return -ENOSPC;
1205                list_del_init(&entry->list);
1206        }
1207
1208        return 0;
1209}
1210
1211static int flush_dirty_cache(struct inode *inode)
1212{
1213        int ret;
1214
1215        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1216        if (ret)
1217                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1218                                 EXTENT_DELALLOC, 0, 0, NULL);
1219
1220        return ret;
1221}
1222
1223static void noinline_for_stack
1224cleanup_bitmap_list(struct list_head *bitmap_list)
1225{
1226        struct btrfs_free_space *entry, *next;
1227
1228        list_for_each_entry_safe(entry, next, bitmap_list, list)
1229                list_del_init(&entry->list);
1230}
1231
1232static void noinline_for_stack
1233cleanup_write_cache_enospc(struct inode *inode,
1234                           struct btrfs_io_ctl *io_ctl,
1235                           struct extent_state **cached_state)
1236{
1237        io_ctl_drop_pages(io_ctl);
1238        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1239                             i_size_read(inode) - 1, cached_state);
1240}
1241
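     /*
      * Second half of a cache writeout: wait for the ordered IO started by
      * __btrfs_write_out_cache() and then update the free space header item.
      * On failure the page cache is invalidated and the inode generation is
      * zeroed so the half written cache is never trusted on load, and the
      * block group is marked BTRFS_DC_ERROR.
      */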
1242static int __btrfs_wait_cache_io(struct btrfs_root *root,
1243                                 struct btrfs_trans_handle *trans,
1244                                 struct btrfs_block_group *block_group,
1245                                 struct btrfs_io_ctl *io_ctl,
1246                                 struct btrfs_path *path, u64 offset)
1247{
1248        int ret;
1249        struct inode *inode = io_ctl->inode;
1250
1251        if (!inode)
1252                return 0;
1253
1254        /* Flush the dirty pages in the cache file. */
1255        ret = flush_dirty_cache(inode);
1256        if (ret)
1257                goto out;
1258
1259        /* Update the cache item to tell everyone this cache file is valid. */
1260        ret = update_cache_item(trans, root, inode, path, offset,
1261                                io_ctl->entries, io_ctl->bitmaps);
1262out:
1263        if (ret) {
1264                invalidate_inode_pages2(inode->i_mapping);
1265                BTRFS_I(inode)->generation = 0;
1266                if (block_group)
1267                        btrfs_debug(root->fs_info,
1268          "failed to write free space cache for block group %llu error %d",
1269                                  block_group->start, ret);
1270        }
1271        btrfs_update_inode(trans, root, BTRFS_I(inode));
1272
1273        if (block_group) {
1274                /* the dirty list is protected by the dirty_bgs_lock */
1275                spin_lock(&trans->transaction->dirty_bgs_lock);
1276
1277                /* the disk_cache_state is protected by the block group lock */
1278                spin_lock(&block_group->lock);
1279
1280                /*
 1281                 * Only mark this as written if we didn't get put back on
 1282                 * the dirty list while waiting for IO. Otherwise our
 1283                 * cache state won't be right, and we won't get written again.
1284                 */
1285                if (!ret && list_empty(&block_group->dirty_list))
1286                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1287                else if (ret)
1288                        block_group->disk_cache_state = BTRFS_DC_ERROR;
1289
1290                spin_unlock(&block_group->lock);
1291                spin_unlock(&trans->transaction->dirty_bgs_lock);
1292                io_ctl->inode = NULL;
1293                iput(inode);
1294        }
1295
1296        return ret;
1297
1298}
1299
1300int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1301                        struct btrfs_block_group *block_group,
1302                        struct btrfs_path *path)
1303{
1304        return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1305                                     block_group, &block_group->io_ctl,
1306                                     path, block_group->start);
1307}
1308
1309/**
 1310 * __btrfs_write_out_cache - Write out cached info to an inode
1311 *
1312 * @root:        root the inode belongs to
1313 * @inode:       freespace inode we are writing out
1314 * @ctl:         free space cache we are going to write out
1315 * @block_group: block_group for this cache if it belongs to a block_group
1316 * @io_ctl:      holds context for the io
1317 * @trans:       the trans handle
1318 *
1319 * This function writes out a free space cache struct to disk for quick recovery
1320 * on mount.  This will return 0 if it was successful in writing the cache out,
1321 * or an errno if it was not.
1322 */
1323static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1324                                   struct btrfs_free_space_ctl *ctl,
1325                                   struct btrfs_block_group *block_group,
1326                                   struct btrfs_io_ctl *io_ctl,
1327                                   struct btrfs_trans_handle *trans)
1328{
1329        struct extent_state *cached_state = NULL;
1330        LIST_HEAD(bitmap_list);
1331        int entries = 0;
1332        int bitmaps = 0;
1333        int ret;
1334        int must_iput = 0;
1335
1336        if (!i_size_read(inode))
1337                return -EIO;
1338
1339        WARN_ON(io_ctl->pages);
1340        ret = io_ctl_init(io_ctl, inode, 1);
1341        if (ret)
1342                return ret;
1343
1344        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1345                down_write(&block_group->data_rwsem);
1346                spin_lock(&block_group->lock);
1347                if (block_group->delalloc_bytes) {
1348                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1349                        spin_unlock(&block_group->lock);
1350                        up_write(&block_group->data_rwsem);
1351                        BTRFS_I(inode)->generation = 0;
1352                        ret = 0;
1353                        must_iput = 1;
1354                        goto out;
1355                }
1356                spin_unlock(&block_group->lock);
1357        }
1358
1359        /* Lock all pages first so we can lock the extent safely. */
1360        ret = io_ctl_prepare_pages(io_ctl, false);
1361        if (ret)
1362                goto out_unlock;
1363
1364        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1365                         &cached_state);
1366
1367        io_ctl_set_generation(io_ctl, trans->transid);
1368
1369        mutex_lock(&ctl->cache_writeout_mutex);
1370        /* Write out the extent entries in the free space cache */
1371        spin_lock(&ctl->tree_lock);
1372        ret = write_cache_extent_entries(io_ctl, ctl,
1373                                         block_group, &entries, &bitmaps,
1374                                         &bitmap_list);
1375        if (ret)
1376                goto out_nospc_locked;
1377
1378        /*
 1379         * Some space that is freed in the current transaction is pinned;
 1380         * it will be added to the free space cache after the transaction
 1381         * is committed, so we must not lose it.
 1382         *
 1383         * If this changes while we are working we'll get added back to
 1384         * the dirty list and redo it. No locking needed.
1385         */
1386        ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1387        if (ret)
1388                goto out_nospc_locked;
1389
1390        /*
 1391         * Finally, we write out all the bitmaps and keep cache_writeout_mutex
1392         * locked while doing it because a concurrent trim can be manipulating
1393         * or freeing the bitmap.
1394         */
1395        ret = write_bitmap_entries(io_ctl, &bitmap_list);
1396        spin_unlock(&ctl->tree_lock);
1397        mutex_unlock(&ctl->cache_writeout_mutex);
1398        if (ret)
1399                goto out_nospc;
1400
1401        /* Zero out the rest of the pages just to make sure */
1402        io_ctl_zero_remaining_pages(io_ctl);
1403
1404        /* Everything is written out, now we dirty the pages in the file. */
1405        ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1406                                io_ctl->num_pages, 0, i_size_read(inode),
1407                                &cached_state, false);
1408        if (ret)
1409                goto out_nospc;
1410
1411        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1412                up_write(&block_group->data_rwsem);
1413        /*
 1414         * Release the pages and unlock the extent; we will flush
 1415         * them out later.
1416         */
1417        io_ctl_drop_pages(io_ctl);
1418        io_ctl_free(io_ctl);
1419
1420        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1421                             i_size_read(inode) - 1, &cached_state);
1422
1423        /*
 1424         * At this point the pages are under IO and we're happy.
 1425         * The caller is responsible for waiting on them and updating
 1426         * the cache and the inode.
1427         */
1428        io_ctl->entries = entries;
1429        io_ctl->bitmaps = bitmaps;
1430
1431        ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1432        if (ret)
1433                goto out;
1434
1435        return 0;
1436
1437out_nospc_locked:
1438        cleanup_bitmap_list(&bitmap_list);
1439        spin_unlock(&ctl->tree_lock);
1440        mutex_unlock(&ctl->cache_writeout_mutex);
1441
1442out_nospc:
1443        cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1444
1445out_unlock:
1446        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1447                up_write(&block_group->data_rwsem);
1448
1449out:
1450        io_ctl->inode = NULL;
1451        io_ctl_free(io_ctl);
1452        if (ret) {
1453                invalidate_inode_pages2(inode->i_mapping);
1454                BTRFS_I(inode)->generation = 0;
1455        }
1456        btrfs_update_inode(trans, root, BTRFS_I(inode));
1457        if (must_iput)
1458                iput(inode);
1459        return ret;
1460}
1461
1462int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1463                          struct btrfs_block_group *block_group,
1464                          struct btrfs_path *path)
1465{
1466        struct btrfs_fs_info *fs_info = trans->fs_info;
1467        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1468        struct inode *inode;
1469        int ret = 0;
1470
1471        spin_lock(&block_group->lock);
1472        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1473                spin_unlock(&block_group->lock);
1474                return 0;
1475        }
1476        spin_unlock(&block_group->lock);
1477
1478        inode = lookup_free_space_inode(block_group, path);
1479        if (IS_ERR(inode))
1480                return 0;
1481
1482        ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1483                                block_group, &block_group->io_ctl, trans);
1484        if (ret) {
1485                btrfs_debug(fs_info,
1486          "failed to write free space cache for block group %llu error %d",
1487                          block_group->start, ret);
1488                spin_lock(&block_group->lock);
1489                block_group->disk_cache_state = BTRFS_DC_ERROR;
1490                spin_unlock(&block_group->lock);
1491
1492                block_group->io_ctl.inode = NULL;
1493                iput(inode);
1494        }
1495
1496        /*
1497         * If ret == 0, the caller is expected to call btrfs_wait_cache_io
1498         * to wait for IO and put the inode.
1499         */
1500
1501        return ret;
1502}
1503
1504static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1505                                          u64 offset)
1506{
1507        ASSERT(offset >= bitmap_start);
1508        offset -= bitmap_start;
1509        return (unsigned long)(div_u64(offset, unit));
1510}
1511
1512static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1513{
1514        return (unsigned long)(div_u64(bytes, unit));
1515}
1516
1517static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1518                                   u64 offset)
1519{
1520        u64 bitmap_start;
1521        u64 bytes_per_bitmap;
1522
1523        bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1524        bitmap_start = offset - ctl->start;
1525        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1526        bitmap_start *= bytes_per_bitmap;
1527        bitmap_start += ctl->start;
1528
1529        return bitmap_start;
1530}
1531
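/*
 * Illustration (editor's sketch, not kernel code): a tiny userspace model of
 * the bitmap geometry used by the helpers above, assuming a 4K sectorsize
 * (unit) and BITS_PER_BITMAP == 32768, i.e. one bitmap entry covers 128M.
 * All ex_* names and the sample values are made up for the example.
 */
#if 0	/* standalone example, not part of the build */
#include <stdio.h>
#include <stdint.h>

#define EX_BITS_PER_BITMAP	32768ULL	/* PAGE_SIZE * 8 with 4K pages */

/* Round @offset down to the start of the bitmap entry that covers it. */
static uint64_t ex_offset_to_bitmap(uint64_t ctl_start, uint32_t unit, uint64_t offset)
{
        uint64_t bytes_per_bitmap = EX_BITS_PER_BITMAP * unit;
        uint64_t rel = offset - ctl_start;

        return ctl_start + (rel / bytes_per_bitmap) * bytes_per_bitmap;
}

/* Bit index of @offset inside the bitmap entry starting at @bitmap_start. */
static unsigned long ex_offset_to_bit(uint64_t bitmap_start, uint32_t unit, uint64_t offset)
{
        return (unsigned long)((offset - bitmap_start) / unit);
}

int main(void)
{
        uint64_t start = 1024ULL * 1024 * 1024;		/* block group start */
        uint32_t unit = 4096;				/* sectorsize */
        uint64_t off = start + 300ULL * 1024 * 1024;	/* somewhere inside it */
        uint64_t bmap = ex_offset_to_bitmap(start, unit, off);

        /* One bitmap covers 128M, so +300M lands in the third bitmap (+256M). */
        printf("bitmap start +%lluM, bit %lu\n",
               (unsigned long long)((bmap - start) >> 20),
               ex_offset_to_bit(bmap, unit, off));
        return 0;
}
#endif
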
1532static int tree_insert_offset(struct rb_root *root, u64 offset,
1533                              struct rb_node *node, int bitmap)
1534{
1535        struct rb_node **p = &root->rb_node;
1536        struct rb_node *parent = NULL;
1537        struct btrfs_free_space *info;
1538
1539        while (*p) {
1540                parent = *p;
1541                info = rb_entry(parent, struct btrfs_free_space, offset_index);
1542
1543                if (offset < info->offset) {
1544                        p = &(*p)->rb_left;
1545                } else if (offset > info->offset) {
1546                        p = &(*p)->rb_right;
1547                } else {
1548                        /*
1549                         * we could have a bitmap entry and an extent entry
1550                         * share the same offset.  If this is the case, we want
1551                         * the extent entry to always be found first if we do a
1552                         * linear search through the tree, since we want to have
1553                         * the quickest allocation time, and allocating from an
1554                         * extent is faster than allocating from a bitmap.  So
1555                         * if we're inserting a bitmap and we find an entry at
1556                         * this offset, we want to go right, or after this entry
1557                         * logically.  If we are inserting an extent and we've
1558                         * found a bitmap, we want to go left, or before
1559                         * logically.
1560                         */
1561                        if (bitmap) {
1562                                if (info->bitmap) {
1563                                        WARN_ON_ONCE(1);
1564                                        return -EEXIST;
1565                                }
1566                                p = &(*p)->rb_right;
1567                        } else {
1568                                if (!info->bitmap) {
1569                                        WARN_ON_ONCE(1);
1570                                        return -EEXIST;
1571                                }
1572                                p = &(*p)->rb_left;
1573                        }
1574                }
1575        }
1576
1577        rb_link_node(node, parent, p);
1578        rb_insert_color(node, root);
1579
1580        return 0;
1581}
1582
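/*
 * Illustration (editor's sketch, not kernel code): the ordering that
 * tree_insert_offset() above implements by hand, written as a comparator.
 * The ex_* names are invented; only the offset/bitmap tie-break mirrors the
 * kernel logic.
 */
#if 0	/* standalone example, not part of the build */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ex_entry {
        uint64_t offset;
        bool bitmap;
};

/*
 * Primary key is the start offset.  When an extent entry and a bitmap entry
 * share an offset, the extent entry sorts first so that a linear walk finds
 * it before the bitmap; two entries of the same kind at one offset are a
 * bug (the kernel returns -EEXIST).
 */
static int ex_cmp(const struct ex_entry *a, const struct ex_entry *b)
{
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        if (a->bitmap == b->bitmap)
                return 0;
        return a->bitmap ? 1 : -1;
}

int main(void)
{
        struct ex_entry extent = { .offset = 4096, .bitmap = false };
        struct ex_entry bitmap = { .offset = 4096, .bitmap = true };

        printf("extent vs bitmap at the same offset: %d\n", ex_cmp(&extent, &bitmap));
        return 0;
}
#endif
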
1583/*
1584 * searches the tree for the given offset.
1585 *
1586 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1587 * want a section that has at least bytes size and comes at or after the given
1588 * offset.
1589 */
1590static struct btrfs_free_space *
1591tree_search_offset(struct btrfs_free_space_ctl *ctl,
1592                   u64 offset, int bitmap_only, int fuzzy)
1593{
1594        struct rb_node *n = ctl->free_space_offset.rb_node;
1595        struct btrfs_free_space *entry, *prev = NULL;
1596
1597        /* find entry that is closest to the 'offset' */
1598        while (1) {
1599                if (!n) {
1600                        entry = NULL;
1601                        break;
1602                }
1603
1604                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1605                prev = entry;
1606
1607                if (offset < entry->offset)
1608                        n = n->rb_left;
1609                else if (offset > entry->offset)
1610                        n = n->rb_right;
1611                else
1612                        break;
1613        }
1614
1615        if (bitmap_only) {
1616                if (!entry)
1617                        return NULL;
1618                if (entry->bitmap)
1619                        return entry;
1620
1621                /*
1622                 * bitmap entry and extent entry may share same offset,
1623                 * in that case, bitmap entry comes after extent entry.
1624                 */
1625                n = rb_next(n);
1626                if (!n)
1627                        return NULL;
1628                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1629                if (entry->offset != offset)
1630                        return NULL;
1631
1632                WARN_ON(!entry->bitmap);
1633                return entry;
1634        } else if (entry) {
1635                if (entry->bitmap) {
1636                        /*
1637                         * if previous extent entry covers the offset,
1638                         * we should return it instead of the bitmap entry
1639                         */
1640                        n = rb_prev(&entry->offset_index);
1641                        if (n) {
1642                                prev = rb_entry(n, struct btrfs_free_space,
1643                                                offset_index);
1644                                if (!prev->bitmap &&
1645                                    prev->offset + prev->bytes > offset)
1646                                        entry = prev;
1647                        }
1648                }
1649                return entry;
1650        }
1651
1652        if (!prev)
1653                return NULL;
1654
1655        /* find last entry before the 'offset' */
1656        entry = prev;
1657        if (entry->offset > offset) {
1658                n = rb_prev(&entry->offset_index);
1659                if (n) {
1660                        entry = rb_entry(n, struct btrfs_free_space,
1661                                        offset_index);
1662                        ASSERT(entry->offset <= offset);
1663                } else {
1664                        if (fuzzy)
1665                                return entry;
1666                        else
1667                                return NULL;
1668                }
1669        }
1670
1671        if (entry->bitmap) {
1672                n = rb_prev(&entry->offset_index);
1673                if (n) {
1674                        prev = rb_entry(n, struct btrfs_free_space,
1675                                        offset_index);
1676                        if (!prev->bitmap &&
1677                            prev->offset + prev->bytes > offset)
1678                                return prev;
1679                }
1680                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1681                        return entry;
1682        } else if (entry->offset + entry->bytes > offset)
1683                return entry;
1684
1685        if (!fuzzy)
1686                return NULL;
1687
1688        while (1) {
1689                if (entry->bitmap) {
1690                        if (entry->offset + BITS_PER_BITMAP *
1691                            ctl->unit > offset)
1692                                break;
1693                } else {
1694                        if (entry->offset + entry->bytes > offset)
1695                                break;
1696                }
1697
1698                n = rb_next(&entry->offset_index);
1699                if (!n)
1700                        return NULL;
1701                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1702        }
1703        return entry;
1704}
1705
1706static inline void
1707__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1708                    struct btrfs_free_space *info)
1709{
1710        rb_erase(&info->offset_index, &ctl->free_space_offset);
1711        ctl->free_extents--;
1712
1713        if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1714                ctl->discardable_extents[BTRFS_STAT_CURR]--;
1715                ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1716        }
1717}
1718
1719static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1720                              struct btrfs_free_space *info)
1721{
1722        __unlink_free_space(ctl, info);
1723        ctl->free_space -= info->bytes;
1724}
1725
1726static int link_free_space(struct btrfs_free_space_ctl *ctl,
1727                           struct btrfs_free_space *info)
1728{
1729        int ret = 0;
1730
1731        ASSERT(info->bytes || info->bitmap);
1732        ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1733                                 &info->offset_index, (info->bitmap != NULL));
1734        if (ret)
1735                return ret;
1736
1737        if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1738                ctl->discardable_extents[BTRFS_STAT_CURR]++;
1739                ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1740        }
1741
1742        ctl->free_space += info->bytes;
1743        ctl->free_extents++;
1744        return ret;
1745}
1746
1747static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1748                                       struct btrfs_free_space *info,
1749                                       u64 offset, u64 bytes)
1750{
1751        unsigned long start, count, end;
1752        int extent_delta = -1;
1753
1754        start = offset_to_bit(info->offset, ctl->unit, offset);
1755        count = bytes_to_bits(bytes, ctl->unit);
1756        end = start + count;
1757        ASSERT(end <= BITS_PER_BITMAP);
1758
1759        bitmap_clear(info->bitmap, start, count);
1760
1761        info->bytes -= bytes;
1762        if (info->max_extent_size > ctl->unit)
1763                info->max_extent_size = 0;
1764
1765        if (start && test_bit(start - 1, info->bitmap))
1766                extent_delta++;
1767
1768        if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1769                extent_delta++;
1770
1771        info->bitmap_extents += extent_delta;
1772        if (!btrfs_free_space_trimmed(info)) {
1773                ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1774                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1775        }
1776}
1777
1778static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1779                              struct btrfs_free_space *info, u64 offset,
1780                              u64 bytes)
1781{
1782        __bitmap_clear_bits(ctl, info, offset, bytes);
1783        ctl->free_space -= bytes;
1784}
1785
1786static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1787                            struct btrfs_free_space *info, u64 offset,
1788                            u64 bytes)
1789{
1790        unsigned long start, count, end;
1791        int extent_delta = 1;
1792
1793        start = offset_to_bit(info->offset, ctl->unit, offset);
1794        count = bytes_to_bits(bytes, ctl->unit);
1795        end = start + count;
1796        ASSERT(end <= BITS_PER_BITMAP);
1797
1798        bitmap_set(info->bitmap, start, count);
1799
1800        info->bytes += bytes;
1801        ctl->free_space += bytes;
1802
1803        if (start && test_bit(start - 1, info->bitmap))
1804                extent_delta--;
1805
1806        if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1807                extent_delta--;
1808
1809        info->bitmap_extents += extent_delta;
1810        if (!btrfs_free_space_trimmed(info)) {
1811                ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1812                ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1813        }
1814}
1815
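/*
 * Illustration (editor's sketch, not kernel code): why the extent_delta in
 * the two helpers above only needs to look at the bits adjacent to the
 * modified range.  This uses a toy byte-per-bit bitmap instead of the
 * kernel's bit operations; all ex_* names are invented.
 */
#if 0	/* standalone example, not part of the build */
#include <stdio.h>
#include <string.h>

#define EX_NBITS 16

static int ex_count_runs(const unsigned char *bits)
{
        int runs = 0;

        for (int i = 0; i < EX_NBITS; i++)
                if (bits[i] && (i == 0 || !bits[i - 1]))
                        runs++;
        return runs;
}

int main(void)
{
        unsigned char bits[EX_NBITS];
        int before, after, delta;

        memset(bits, 0, sizeof(bits));
        memset(bits + 2, 1, 8);		/* bits 2..9 set: one free region */
        before = ex_count_runs(bits);

        bits[5] = bits[6] = 0;		/* clear bits 5..6 out of the middle */
        after = ex_count_runs(bits);

        /*
         * Same delta the kernel derives: start at -1 for a clear and add one
         * for each neighbouring bit of the cleared range that is still set.
         */
        delta = -1 + (bits[4] ? 1 : 0) + (bits[7] ? 1 : 0);

        printf("free regions %d -> %d, computed delta %+d\n", before, after, delta);
        return 0;
}
#endif
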
1816/*
1817 * If we cannot find a suitable extent, we will use bytes to record the
1818 * size of the largest extent we did find.
1819 */
1820static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1821                         struct btrfs_free_space *bitmap_info, u64 *offset,
1822                         u64 *bytes, bool for_alloc)
1823{
1824        unsigned long found_bits = 0;
1825        unsigned long max_bits = 0;
1826        unsigned long bits, i;
1827        unsigned long next_zero;
1828        unsigned long extent_bits;
1829
1830        /*
1831         * Skip searching the bitmap if we don't have a contiguous section that
1832         * is large enough for this allocation.
1833         */
1834        if (for_alloc &&
1835            bitmap_info->max_extent_size &&
1836            bitmap_info->max_extent_size < *bytes) {
1837                *bytes = bitmap_info->max_extent_size;
1838                return -1;
1839        }
1840
1841        i = offset_to_bit(bitmap_info->offset, ctl->unit,
1842                          max_t(u64, *offset, bitmap_info->offset));
1843        bits = bytes_to_bits(*bytes, ctl->unit);
1844
1845        for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1846                if (for_alloc && bits == 1) {
1847                        found_bits = 1;
1848                        break;
1849                }
1850                next_zero = find_next_zero_bit(bitmap_info->bitmap,
1851                                               BITS_PER_BITMAP, i);
1852                extent_bits = next_zero - i;
1853                if (extent_bits >= bits) {
1854                        found_bits = extent_bits;
1855                        break;
1856                } else if (extent_bits > max_bits) {
1857                        max_bits = extent_bits;
1858                }
1859                i = next_zero;
1860        }
1861
1862        if (found_bits) {
1863                *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1864                *bytes = (u64)(found_bits) * ctl->unit;
1865                return 0;
1866        }
1867
1868        *bytes = (u64)(max_bits) * ctl->unit;
1869        bitmap_info->max_extent_size = *bytes;
1870        return -1;
1871}
1872
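/*
 * Illustration (editor's sketch, not kernel code): the core of the walk in
 * search_bitmap() above, on a toy bool-per-bit bitmap.  It hops from a set
 * bit to the next clear bit and measures the gap, much like the
 * for_each_set_bit_from()/find_next_zero_bit() pair.  The ex_* names are
 * invented.
 */
#if 0	/* standalone example, not part of the build */
#include <stdio.h>
#include <stdbool.h>

#define EX_NBITS 24

static long ex_find_run(const bool *bits, unsigned long want, unsigned long *found)
{
        unsigned long i = 0;

        while (i < EX_NBITS) {
                unsigned long next_zero;

                if (!bits[i]) {			/* skip clear bits */
                        i++;
                        continue;
                }
                next_zero = i;			/* measure the run of set bits */
                while (next_zero < EX_NBITS && bits[next_zero])
                        next_zero++;
                if (next_zero - i >= want) {
                        *found = next_zero - i;
                        return (long)i;		/* start bit of the run */
                }
                i = next_zero;
        }
        return -1;				/* no run big enough */
}

int main(void)
{
        /* runs of length 2 (bits 1-2) and 5 (bits 6-10) */
        bool bits[EX_NBITS] = { 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1 };
        unsigned long found;
        long start = ex_find_run(bits, 4, &found);

        printf("first run of >= 4 set bits starts at bit %ld, length %lu\n",
               start, found);
        return 0;
}
#endif
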
1873static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1874{
1875        if (entry->bitmap)
1876                return entry->max_extent_size;
1877        return entry->bytes;
1878}
1879
1880/* Cache the size of the max extent in bytes */
1881static struct btrfs_free_space *
1882find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1883                unsigned long align, u64 *max_extent_size)
1884{
1885        struct btrfs_free_space *entry;
1886        struct rb_node *node;
1887        u64 tmp;
1888        u64 align_off;
1889        int ret;
1890
1891        if (!ctl->free_space_offset.rb_node)
1892                goto out;
1893
1894        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1895        if (!entry)
1896                goto out;
1897
1898        for (node = &entry->offset_index; node; node = rb_next(node)) {
1899                entry = rb_entry(node, struct btrfs_free_space, offset_index);
1900                if (entry->bytes < *bytes) {
1901                        *max_extent_size = max(get_max_extent_size(entry),
1902                                               *max_extent_size);
1903                        continue;
1904                }
1905
1906                /* make sure the space returned is big enough
1907                 * to match our requested alignment
1908                 */
1909                if (*bytes >= align) {
1910                        tmp = entry->offset - ctl->start + align - 1;
1911                        tmp = div64_u64(tmp, align);
1912                        tmp = tmp * align + ctl->start;
1913                        align_off = tmp - entry->offset;
1914                } else {
1915                        align_off = 0;
1916                        tmp = entry->offset;
1917                }
1918
1919                if (entry->bytes < *bytes + align_off) {
1920                        *max_extent_size = max(get_max_extent_size(entry),
1921                                               *max_extent_size);
1922                        continue;
1923                }
1924
1925                if (entry->bitmap) {
1926                        u64 size = *bytes;
1927
1928                        ret = search_bitmap(ctl, entry, &tmp, &size, true);
1929                        if (!ret) {
1930                                *offset = tmp;
1931                                *bytes = size;
1932                                return entry;
1933                        } else {
1934                                *max_extent_size =
1935                                        max(get_max_extent_size(entry),
1936                                            *max_extent_size);
1937                        }
1938                        continue;
1939                }
1940
1941                *offset = tmp;
1942                *bytes = entry->bytes - align_off;
1943                return entry;
1944        }
1945out:
1946        return NULL;
1947}
1948
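/*
 * Illustration (editor's sketch, not kernel code): the alignment arithmetic
 * inside find_free_space() above, i.e. rounding a free extent's start up to
 * the next alignment boundary measured from the block group start.  The
 * ex_* names and sample values are invented.
 */
#if 0	/* standalone example, not part of the build */
#include <stdio.h>
#include <stdint.h>

/* Round @offset up to the next multiple of @align counted from @base. */
static uint64_t ex_align_up(uint64_t base, uint64_t offset, uint64_t align)
{
        uint64_t steps = (offset - base + align - 1) / align;

        return steps * align + base;
}

int main(void)
{
        uint64_t base = 1ULL << 30;		/* ctl->start */
        uint64_t offset = base + 5 * 4096;	/* free extent starts here */
        uint64_t align = 64 * 1024;		/* requested alignment */
        uint64_t aligned = ex_align_up(base, offset, align);

        printf("offset +%llu -> aligned +%llu, align_off %llu\n",
               (unsigned long long)(offset - base),
               (unsigned long long)(aligned - base),
               (unsigned long long)(aligned - offset));
        return 0;
}
#endif
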
1949static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1950                           struct btrfs_free_space *info, u64 offset)
1951{
1952        info->offset = offset_to_bitmap(ctl, offset);
1953        info->bytes = 0;
1954        info->bitmap_extents = 0;
1955        INIT_LIST_HEAD(&info->list);
1956        link_free_space(ctl, info);
1957        ctl->total_bitmaps++;
1958        recalculate_thresholds(ctl);
1959}
1960
1961static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1962                        struct btrfs_free_space *bitmap_info)
1963{
1964        /*
1965         * Normally when this is called, the bitmap is completely empty. However,
1966         * if we are blowing up the free space cache for one reason or another
1967         * via __btrfs_remove_free_space_cache(), then it may not be empty and
1968         * we could leave stale discard stats behind.
1969         */
1970        if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
1971                ctl->discardable_extents[BTRFS_STAT_CURR] -=
1972                        bitmap_info->bitmap_extents;
1973                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
1974
1975        }
1976        unlink_free_space(ctl, bitmap_info);
1977        kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1978        kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1979        ctl->total_bitmaps--;
1980        recalculate_thresholds(ctl);
1981}
1982
1983static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1984                              struct btrfs_free_space *bitmap_info,
1985                              u64 *offset, u64 *bytes)
1986{
1987        u64 end;
1988        u64 search_start, search_bytes;
1989        int ret;
1990
1991again:
1992        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1993
1994        /*
1995         * We need to search for bits in this bitmap.  We could only cover some
1996         * of the extent in this bitmap thanks to how we add space, so we need
1997         * to search for as much of it as we can and clear that amount, and then
1998         * go searching for the next bit.
1999         */
2000        search_start = *offset;
2001        search_bytes = ctl->unit;
2002        search_bytes = min(search_bytes, end - search_start + 1);
2003        ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2004                            false);
2005        if (ret < 0 || search_start != *offset)
2006                return -EINVAL;
2007
2008        /* We may have found more bits than what we need */
2009        search_bytes = min(search_bytes, *bytes);
2010
2011        /* Cannot clear past the end of the bitmap */
2012        search_bytes = min(search_bytes, end - search_start + 1);
2013
2014        bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
2015        *offset += search_bytes;
2016        *bytes -= search_bytes;
2017
2018        if (*bytes) {
2019                struct rb_node *next = rb_next(&bitmap_info->offset_index);
2020                if (!bitmap_info->bytes)
2021                        free_bitmap(ctl, bitmap_info);
2022
2023                /*
2024                 * no entry after this bitmap, but we still have bytes to
2025                 * remove, so something has gone wrong.
2026                 */
2027                if (!next)
2028                        return -EINVAL;
2029
2030                bitmap_info = rb_entry(next, struct btrfs_free_space,
2031                                       offset_index);
2032
2033                /*
2034                 * if the next entry isn't a bitmap we need to return to let the
2035                 * extent stuff do its work.
2036                 */
2037                if (!bitmap_info->bitmap)
2038                        return -EAGAIN;
2039
2040                /*
2041                 * Ok the next item is a bitmap, but it may not actually hold
2042                 * the information for the rest of this free space stuff, so
2043                 * look for it, and if we don't find it return so we can try
2044                 * everything over again.
2045                 */
2046                search_start = *offset;
2047                search_bytes = ctl->unit;
2048                ret = search_bitmap(ctl, bitmap_info, &search_start,
2049                                    &search_bytes, false);
2050                if (ret < 0 || search_start != *offset)
2051                        return -EAGAIN;
2052
2053                goto again;
2054        } else if (!bitmap_info->bytes)
2055                free_bitmap(ctl, bitmap_info);
2056
2057        return 0;
2058}
2059
2060static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2061                               struct btrfs_free_space *info, u64 offset,
2062                               u64 bytes, enum btrfs_trim_state trim_state)
2063{
2064        u64 bytes_to_set = 0;
2065        u64 end;
2066
2067        /*
2068         * This is a tradeoff to make bitmap trim state minimal.  We mark the
2069         * whole bitmap untrimmed if at any point we add untrimmed regions.
2070         */
2071        if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2072                if (btrfs_free_space_trimmed(info)) {
2073                        ctl->discardable_extents[BTRFS_STAT_CURR] +=
2074                                info->bitmap_extents;
2075                        ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2076                }
2077                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2078        }
2079
2080        end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2081
2082        bytes_to_set = min(end - offset, bytes);
2083
2084        bitmap_set_bits(ctl, info, offset, bytes_to_set);
2085
2086        /*
2087         * We set some bytes, we have no idea what the max extent size is
2088         * anymore.
2089         */
2090        info->max_extent_size = 0;
2091
2092        return bytes_to_set;
2093
2094}
2095
2096static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2097                      struct btrfs_free_space *info)
2098{
2099        struct btrfs_block_group *block_group = ctl->private;
2100        struct btrfs_fs_info *fs_info = block_group->fs_info;
2101        bool forced = false;
2102
2103#ifdef CONFIG_BTRFS_DEBUG
2104        if (btrfs_should_fragment_free_space(block_group))
2105                forced = true;
2106#endif
2107
2108        /* This is a way to reclaim large regions from the bitmaps. */
2109        if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2110                return false;
2111
2112        /*
2113         * If we are below the extents threshold then we can add this as an
2114         * extent, and don't have to deal with the bitmap
2115         */
2116        if (!forced && ctl->free_extents < ctl->extents_thresh) {
2117                /*
2118                 * If this block group has some small extents, we don't want to
2119                 * use up all of our free slots in the cache with them; we want
2120                 * to reserve those slots for larger extents.  However, if we
2121                 * have plenty of cache left, go ahead and add them, as there is
2122                 * no sense in adding the overhead of a bitmap if we don't have to.
2123                 */
2124                if (info->bytes <= fs_info->sectorsize * 8) {
2125                        if (ctl->free_extents * 3 <= ctl->extents_thresh)
2126                                return false;
2127                } else {
2128                        return false;
2129                }
2130        }
2131
2132        /*
2133         * The original block groups from mkfs can be really small, like 8
2134         * megabytes, so don't bother with a bitmap for those entries.  However
2135         * some block groups can be smaller than what a bitmap would cover but
2136         * are still large enough that they could overflow the 32k memory limit,
2137         * so still allow those block groups to have a bitmap
2138         * entry.
2139         */
2140        if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2141                return false;
2142
2143        return true;
2144}
2145
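/*
 * Illustration (editor's sketch, not kernel code): the use_bitmap() policy
 * above as a pure function, with the CONFIG_BTRFS_DEBUG "fragment free
 * space" forcing left out.  The ex_* names and sample values are invented.
 */
#if 0	/* standalone example, not part of the build */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define EX_FORCE_EXTENT_THRESHOLD	(1024 * 1024)
#define EX_BITS_PER_BITMAP		32768ULL

static bool ex_use_bitmap(uint64_t bytes, int free_extents, int extents_thresh,
                          uint32_t sectorsize, uint64_t bg_length)
{
        /* Large regions always stay as extent entries. */
        if (bytes >= EX_FORCE_EXTENT_THRESHOLD)
                return false;

        /* Below the extent threshold: prefer an extent entry while we can. */
        if (free_extents < extents_thresh) {
                if (bytes > (uint64_t)sectorsize * 8)
                        return false;
                if (free_extents * 3 <= extents_thresh)
                        return false;
        }

        /* Block group too small for a bitmap to ever pay off. */
        if (EX_BITS_PER_BITMAP * sectorsize / 2 > bg_length)
                return false;

        return true;
}

int main(void)
{
        /* 16K freed in a 1G block group whose cache is getting crowded. */
        printf("use bitmap: %d\n",
               ex_use_bitmap(16 * 1024, 200, 256, 4096, 1ULL << 30));
        return 0;
}
#endif
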
2146static const struct btrfs_free_space_op free_space_op = {
2147        .use_bitmap             = use_bitmap,
2148};
2149
2150static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2151                              struct btrfs_free_space *info)
2152{
2153        struct btrfs_free_space *bitmap_info;
2154        struct btrfs_block_group *block_group = NULL;
2155        int added = 0;
2156        u64 bytes, offset, bytes_added;
2157        enum btrfs_trim_state trim_state;
2158        int ret;
2159
2160        bytes = info->bytes;
2161        offset = info->offset;
2162        trim_state = info->trim_state;
2163
2164        if (!ctl->op->use_bitmap(ctl, info))
2165                return 0;
2166
2167        if (ctl->op == &free_space_op)
2168                block_group = ctl->private;
2169again:
2170        /*
2171         * Since we link bitmaps right into the cluster we need to see if we
2172         * have a cluster here, and if so and it has our bitmap we need to add
2173         * the free space to that bitmap.
2174         */
2175        if (block_group && !list_empty(&block_group->cluster_list)) {
2176                struct btrfs_free_cluster *cluster;
2177                struct rb_node *node;
2178                struct btrfs_free_space *entry;
2179
2180                cluster = list_entry(block_group->cluster_list.next,
2181                                     struct btrfs_free_cluster,
2182                                     block_group_list);
2183                spin_lock(&cluster->lock);
2184                node = rb_first(&cluster->root);
2185                if (!node) {
2186                        spin_unlock(&cluster->lock);
2187                        goto no_cluster_bitmap;
2188                }
2189
2190                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2191                if (!entry->bitmap) {
2192                        spin_unlock(&cluster->lock);
2193                        goto no_cluster_bitmap;
2194                }
2195
2196                if (entry->offset == offset_to_bitmap(ctl, offset)) {
2197                        bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2198                                                          bytes, trim_state);
2199                        bytes -= bytes_added;
2200                        offset += bytes_added;
2201                }
2202                spin_unlock(&cluster->lock);
2203                if (!bytes) {
2204                        ret = 1;
2205                        goto out;
2206                }
2207        }
2208
2209no_cluster_bitmap:
2210        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2211                                         1, 0);
2212        if (!bitmap_info) {
2213                ASSERT(added == 0);
2214                goto new_bitmap;
2215        }
2216
2217        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2218                                          trim_state);
2219        bytes -= bytes_added;
2220        offset += bytes_added;
2221        added = 0;
2222
2223        if (!bytes) {
2224                ret = 1;
2225                goto out;
2226        } else
2227                goto again;
2228
2229new_bitmap:
2230        if (info && info->bitmap) {
2231                add_new_bitmap(ctl, info, offset);
2232                added = 1;
2233                info = NULL;
2234                goto again;
2235        } else {
2236                spin_unlock(&ctl->tree_lock);
2237
2238                /* no pre-allocated info, allocate a new one */
2239                if (!info) {
2240                        info = kmem_cache_zalloc(btrfs_free_space_cachep,
2241                                                 GFP_NOFS);
2242                        if (!info) {
2243                                spin_lock(&ctl->tree_lock);
2244                                ret = -ENOMEM;
2245                                goto out;
2246                        }
2247                }
2248
2249                /* allocate the bitmap */
2250                info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2251                                                 GFP_NOFS);
2252                info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2253                spin_lock(&ctl->tree_lock);
2254                if (!info->bitmap) {
2255                        ret = -ENOMEM;
2256                        goto out;
2257                }
2258                goto again;
2259        }
2260
2261out:
2262        if (info) {
2263                if (info->bitmap)
2264                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
2265                                        info->bitmap);
2266                kmem_cache_free(btrfs_free_space_cachep, info);
2267        }
2268
2269        return ret;
2270}
2271
2272/*
2273 * Free space merging rules:
2274 *  1) Merge trimmed areas together
2275 *  2) Let untrimmed areas coalesce with trimmed areas
2276 *  3) Always pull neighboring regions from bitmaps
2277 *
2278 * The above rules are for when we merge free space based on btrfs_trim_state.
2279 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2280 * same reason: to promote larger extent regions which makes life easier for
2281 * find_free_extent().  Rule 2 enables coalescing based on the common path of
2282 * returning free space from btrfs_finish_extent_commit().  So once free space
2283 * has been trimmed, the check prevents a newly trimmed region from aggregating
2284 * with untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
2285 * and provide find_free_extent() with the largest extents possible hoping for
2286 * the reuse path.
2287 */
2288static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2289                          struct btrfs_free_space *info, bool update_stat)
2290{
2291        struct btrfs_free_space *left_info = NULL;
2292        struct btrfs_free_space *right_info;
2293        bool merged = false;
2294        u64 offset = info->offset;
2295        u64 bytes = info->bytes;
2296        const bool is_trimmed = btrfs_free_space_trimmed(info);
2297
2298        /*
2299         * first we want to see if there is free space adjacent to the range we
2300         * are adding, if there is remove that struct and add a new one to
2301         * cover the entire range
2302         */
2303        right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2304        if (right_info && rb_prev(&right_info->offset_index))
2305                left_info = rb_entry(rb_prev(&right_info->offset_index),
2306                                     struct btrfs_free_space, offset_index);
2307        else if (!right_info)
2308                left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2309
2310        /* See try_merge_free_space() comment. */
2311        if (right_info && !right_info->bitmap &&
2312            (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2313                if (update_stat)
2314                        unlink_free_space(ctl, right_info);
2315                else
2316                        __unlink_free_space(ctl, right_info);
2317                info->bytes += right_info->bytes;
2318                kmem_cache_free(btrfs_free_space_cachep, right_info);
2319                merged = true;
2320        }
2321
2322        /* See try_merge_free_space() comment. */
2323        if (left_info && !left_info->bitmap &&
2324            left_info->offset + left_info->bytes == offset &&
2325            (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2326                if (update_stat)
2327                        unlink_free_space(ctl, left_info);
2328                else
2329                        __unlink_free_space(ctl, left_info);
2330                info->offset = left_info->offset;
2331                info->bytes += left_info->bytes;
2332                kmem_cache_free(btrfs_free_space_cachep, left_info);
2333                merged = true;
2334        }
2335
2336        return merged;
2337}
2338
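/*
 * Illustration (editor's sketch, not kernel code): the trim-state part of
 * the merge rules above, reduced to adjacent [offset, offset + bytes)
 * ranges.  A trimmed region only absorbs trimmed neighbours, while an
 * untrimmed region absorbs either kind.  The ex_* names are invented and
 * the bitmap/rb-tree handling is left out.
 */
#if 0	/* standalone example, not part of the build */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ex_space {
        uint64_t offset;
        uint64_t bytes;
        bool trimmed;
};

static bool ex_can_merge(const struct ex_space *fresh, const struct ex_space *neigh)
{
        return !fresh->trimmed || neigh->trimmed;
}

/* Absorb @right into @left when they touch and the trim rule allows it. */
static bool ex_merge_right(struct ex_space *left, const struct ex_space *right)
{
        if (left->offset + left->bytes != right->offset || !ex_can_merge(left, right))
                return false;
        left->bytes += right->bytes;
        return true;
}

int main(void)
{
        struct ex_space fresh = { 0, 4096, false };	/* freshly freed, untrimmed */
        struct ex_space right = { 4096, 8192, true };	/* already trimmed neighbour */

        printf("merged: %d, bytes now %llu\n", ex_merge_right(&fresh, &right),
               (unsigned long long)fresh.bytes);
        return 0;
}
#endif
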
2339static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2340                                     struct btrfs_free_space *info,
2341                                     bool update_stat)
2342{
2343        struct btrfs_free_space *bitmap;
2344        unsigned long i;
2345        unsigned long j;
2346        const u64 end = info->offset + info->bytes;
2347        const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2348        u64 bytes;
2349
2350        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2351        if (!bitmap)
2352                return false;
2353
2354        i = offset_to_bit(bitmap->offset, ctl->unit, end);
2355        j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2356        if (j == i)
2357                return false;
2358        bytes = (j - i) * ctl->unit;
2359        info->bytes += bytes;
2360
2361        /* See try_merge_free_space() comment. */
2362        if (!btrfs_free_space_trimmed(bitmap))
2363                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2364
2365        if (update_stat)
2366                bitmap_clear_bits(ctl, bitmap, end, bytes);
2367        else
2368                __bitmap_clear_bits(ctl, bitmap, end, bytes);
2369
2370        if (!bitmap->bytes)
2371                free_bitmap(ctl, bitmap);
2372
2373        return true;
2374}
2375
2376static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2377                                       struct btrfs_free_space *info,
2378                                       bool update_stat)
2379{
2380        struct btrfs_free_space *bitmap;
2381        u64 bitmap_offset;
2382        unsigned long i;
2383        unsigned long j;
2384        unsigned long prev_j;
2385        u64 bytes;
2386
2387        bitmap_offset = offset_to_bitmap(ctl, info->offset);
2388        /* If we're on a boundary, try the previous logical bitmap. */
2389        if (bitmap_offset == info->offset) {
2390                if (info->offset == 0)
2391                        return false;
2392                bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2393        }
2394
2395        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2396        if (!bitmap)
2397                return false;
2398
2399        i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2400        j = 0;
2401        prev_j = (unsigned long)-1;
2402        for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2403                if (j > i)
2404                        break;
2405                prev_j = j;
2406        }
2407        if (prev_j == i)
2408                return false;
2409
2410        if (prev_j == (unsigned long)-1)
2411                bytes = (i + 1) * ctl->unit;
2412        else
2413                bytes = (i - prev_j) * ctl->unit;
2414
2415        info->offset -= bytes;
2416        info->bytes += bytes;
2417
2418        /* See try_merge_free_space() comment. */
2419        if (!btrfs_free_space_trimmed(bitmap))
2420                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2421
2422        if (update_stat)
2423                bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2424        else
2425                __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2426
2427        if (!bitmap->bytes)
2428                free_bitmap(ctl, bitmap);
2429
2430        return true;
2431}
2432
2433/*
2434 * We prefer always to allocate from extent entries, both for clustered and
2435 * non-clustered allocation requests. So when attempting to add a new extent
2436 * entry, try to see if there's adjacent free space in bitmap entries, and if
2437 * there is, migrate that space from the bitmaps to the extent.
2438 * Like this we get better chances of satisfying space allocation requests
2439 * because we attempt to satisfy them based on a single cache entry, and never
2440 * on 2 or more entries - even if the entries represent a contiguous free space
2441 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2442 * ends).
2443 */
2444static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2445                              struct btrfs_free_space *info,
2446                              bool update_stat)
2447{
2448        /*
2449         * Only work with disconnected entries, as we can change their offset,
2450         * and must be extent entries.
2451         */
2452        ASSERT(!info->bitmap);
2453        ASSERT(RB_EMPTY_NODE(&info->offset_index));
2454
2455        if (ctl->total_bitmaps > 0) {
2456                bool stole_end;
2457                bool stole_front = false;
2458
2459                stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2460                if (ctl->total_bitmaps > 0)
2461                        stole_front = steal_from_bitmap_to_front(ctl, info,
2462                                                                 update_stat);
2463
2464                if (stole_end || stole_front)
2465                        try_merge_free_space(ctl, info, update_stat);
2466        }
2467}
2468
2469int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2470                           struct btrfs_free_space_ctl *ctl,
2471                           u64 offset, u64 bytes,
2472                           enum btrfs_trim_state trim_state)
2473{
2474        struct btrfs_block_group *block_group = ctl->private;
2475        struct btrfs_free_space *info;
2476        int ret = 0;
2477        u64 filter_bytes = bytes;
2478
2479        ASSERT(!btrfs_is_zoned(fs_info));
2480
2481        info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2482        if (!info)
2483                return -ENOMEM;
2484
2485        info->offset = offset;
2486        info->bytes = bytes;
2487        info->trim_state = trim_state;
2488        RB_CLEAR_NODE(&info->offset_index);
2489
2490        spin_lock(&ctl->tree_lock);
2491
2492        if (try_merge_free_space(ctl, info, true))
2493                goto link;
2494
2495        /*
2496         * There was no extent directly to the left or right of this new
2497         * extent, so we know we're going to have to allocate a new extent
2498         * entry.  Before we do that, see if we need to drop this into a bitmap.
2499         */
2500        ret = insert_into_bitmap(ctl, info);
2501        if (ret < 0) {
2502                goto out;
2503        } else if (ret) {
2504                ret = 0;
2505                goto out;
2506        }
2507link:
2508        /*
2509         * Only steal free space from adjacent bitmaps if we're sure we're not
2510         * going to add the new free space to existing bitmap entries - because
2511         * that would mean unnecessary work that would be reverted. Therefore
2512         * attempt to steal space from bitmaps if we're adding an extent entry.
2513         */
2514        steal_from_bitmap(ctl, info, true);
2515
2516        filter_bytes = max(filter_bytes, info->bytes);
2517
2518        ret = link_free_space(ctl, info);
2519        if (ret)
2520                kmem_cache_free(btrfs_free_space_cachep, info);
2521out:
2522        btrfs_discard_update_discardable(block_group);
2523        spin_unlock(&ctl->tree_lock);
2524
2525        if (ret) {
2526                btrfs_crit(fs_info, "unable to add free space :%d", ret);
2527                ASSERT(ret != -EEXIST);
2528        }
2529
2530        if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2531                btrfs_discard_check_filter(block_group, filter_bytes);
2532                btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2533        }
2534
2535        return ret;
2536}
2537
2538static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2539                                        u64 bytenr, u64 size, bool used)
2540{
2541        struct btrfs_fs_info *fs_info = block_group->fs_info;
2542        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2543        u64 offset = bytenr - block_group->start;
2544        u64 to_free, to_unusable;
2545        const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
2546        bool initial = (size == block_group->length);
2547        u64 reclaimable_unusable;
2548
2549        WARN_ON(!initial && offset + size > block_group->zone_capacity);
2550
2551        spin_lock(&ctl->tree_lock);
2552        if (!used)
2553                to_free = size;
2554        else if (initial)
2555                to_free = block_group->zone_capacity;
2556        else if (offset >= block_group->alloc_offset)
2557                to_free = size;
2558        else if (offset + size <= block_group->alloc_offset)
2559                to_free = 0;
2560        else
2561                to_free = offset + size - block_group->alloc_offset;
2562        to_unusable = size - to_free;
2563
2564        ctl->free_space += to_free;
2565        /*
2566         * If the block group is read-only, we should account freed space into
2567         * bytes_readonly.
2568         */
2569        if (!block_group->ro)
2570                block_group->zone_unusable += to_unusable;
2571        spin_unlock(&ctl->tree_lock);
2572        if (!used) {
2573                spin_lock(&block_group->lock);
2574                block_group->alloc_offset -= size;
2575                spin_unlock(&block_group->lock);
2576        }
2577
2578        reclaimable_unusable = block_group->zone_unusable -
2579                               (block_group->length - block_group->zone_capacity);
2580        /* The whole region is now unusable. Mark it as unused and reclaim it. */
2581        if (block_group->zone_unusable == block_group->length) {
2582                btrfs_mark_bg_unused(block_group);
2583        } else if (bg_reclaim_threshold &&
2584                   reclaimable_unusable >=
2585                   div_factor_fine(block_group->zone_capacity,
2586                                   bg_reclaim_threshold)) {
2587                btrfs_mark_bg_to_reclaim(block_group);
2588        }
2589
2590        return 0;
2591}
2592
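/*
 * Illustration (editor's sketch, not kernel code): the to_free/to_unusable
 * split above for the common "used" case, with offsets taken relative to
 * the block group start.  The special handling of unused extents and of the
 * initial full-length add is left out; ex_* names and values are invented.
 */
#if 0	/* standalone example, not part of the build */
#include <stdio.h>
#include <stdint.h>

static void ex_split_freed(uint64_t alloc_offset, uint64_t offset, uint64_t size,
                           uint64_t *to_free, uint64_t *to_unusable)
{
        if (offset >= alloc_offset)
                *to_free = size;			 /* fully beyond the write pointer */
        else if (offset + size <= alloc_offset)
                *to_free = 0;				 /* fully behind it: all unusable */
        else
                *to_free = offset + size - alloc_offset; /* straddles the pointer */
        *to_unusable = size - *to_free;
}

int main(void)
{
        uint64_t to_free, to_unusable;

        /* Free 1M that straddles an allocation pointer sitting at 768K. */
        ex_split_freed(768 * 1024, 512 * 1024, 1024 * 1024, &to_free, &to_unusable);
        printf("to_free %llu, to_unusable %llu\n",
               (unsigned long long)to_free, (unsigned long long)to_unusable);
        return 0;
}
#endif
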
2593int btrfs_add_free_space(struct btrfs_block_group *block_group,
2594                         u64 bytenr, u64 size)
2595{
2596        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2597
2598        if (btrfs_is_zoned(block_group->fs_info))
2599                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2600                                                    true);
2601
2602        if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2603                trim_state = BTRFS_TRIM_STATE_TRIMMED;
2604
2605        return __btrfs_add_free_space(block_group->fs_info,
2606                                      block_group->free_space_ctl,
2607                                      bytenr, size, trim_state);
2608}
2609
2610int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2611                                u64 bytenr, u64 size)
2612{
2613        if (btrfs_is_zoned(block_group->fs_info))
2614                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2615                                                    false);
2616
2617        return btrfs_add_free_space(block_group, bytenr, size);
2618}
2619
2620/*
2621 * This is a subtle distinction because when adding free space back in general,
2622 * we want it to be added as untrimmed so async discard can pick it up. But when
2623 * we add it while loading a block group, we want to consider it trimmed.
2624 */
2625int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2626                                       u64 bytenr, u64 size)
2627{
2628        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2629
2630        if (btrfs_is_zoned(block_group->fs_info))
2631                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2632                                                    true);
2633
2634        if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2635            btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2636                trim_state = BTRFS_TRIM_STATE_TRIMMED;
2637
2638        return __btrfs_add_free_space(block_group->fs_info,
2639                                      block_group->free_space_ctl,
2640                                      bytenr, size, trim_state);
2641}
2642
2643int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2644                            u64 offset, u64 bytes)
2645{
2646        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2647        struct btrfs_free_space *info;
2648        int ret;
2649        bool re_search = false;
2650
2651        if (btrfs_is_zoned(block_group->fs_info)) {
2652                /*
2653                 * This can happen with conventional zones when replaying log.
2654                 * Since the allocation info of tree-log nodes is not recorded
2655                 * in the extent tree, calculate_alloc_pointer() failed to
2656                 * advance the allocation pointer past the last allocated tree-log
2657                 * node blocks.
2658                 *
2659                 * This function is called from
2660                 * btrfs_pin_extent_for_log_replay() when replaying the log.
2661                 * Advance the pointer so that we do not overwrite the tree-log nodes.
2662                 */
2663                if (block_group->start + block_group->alloc_offset <
2664                    offset + bytes) {
2665                        block_group->alloc_offset =
2666                                offset + bytes - block_group->start;
2667                }
2668                return 0;
2669        }
2670
2671        spin_lock(&ctl->tree_lock);
2672
2673again:
2674        ret = 0;
2675        if (!bytes)
2676                goto out_lock;
2677
2678        info = tree_search_offset(ctl, offset, 0, 0);
2679        if (!info) {
2680                /*
2681                 * We didn't find an extent that matched the space we wanted
2682                 * to remove; look for a bitmap instead.
2683                 */
2684                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2685                                          1, 0);
2686                if (!info) {
2687                        /*
2688                         * If we found a partial bit of our free space in a
2689                         * bitmap but then couldn't find the other part this may
2690                         * be a problem, so WARN about it.
2691                         */
2692                        WARN_ON(re_search);
2693                        goto out_lock;
2694                }
2695        }
2696
2697        re_search = false;
2698        if (!info->bitmap) {
2699                unlink_free_space(ctl, info);
2700                if (offset == info->offset) {
2701                        u64 to_free = min(bytes, info->bytes);
2702
2703                        info->bytes -= to_free;
2704                        info->offset += to_free;
2705                        if (info->bytes) {
2706                                ret = link_free_space(ctl, info);
2707                                WARN_ON(ret);
2708                        } else {
2709                                kmem_cache_free(btrfs_free_space_cachep, info);
2710                        }
2711
2712                        offset += to_free;
2713                        bytes -= to_free;
2714                        goto again;
2715                } else {
2716                        u64 old_end = info->bytes + info->offset;
2717
2718                        info->bytes = offset - info->offset;
2719                        ret = link_free_space(ctl, info);
2720                        WARN_ON(ret);
2721                        if (ret)
2722                                goto out_lock;
2723
2724                        /* Not enough bytes in this entry to satisfy us */
2725                        if (old_end < offset + bytes) {
2726                                bytes -= old_end - offset;
2727                                offset = old_end;
2728                                goto again;
2729                        } else if (old_end == offset + bytes) {
2730                                /* all done */
2731                                goto out_lock;
2732                        }
2733                        spin_unlock(&ctl->tree_lock);
2734
2735                        ret = __btrfs_add_free_space(block_group->fs_info, ctl,
2736                                                     offset + bytes,
2737                                                     old_end - (offset + bytes),
2738                                                     info->trim_state);
2739                        WARN_ON(ret);
2740                        goto out;
2741                }
2742        }
2743
2744        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2745        if (ret == -EAGAIN) {
2746                re_search = true;
2747                goto again;
2748        }
2749out_lock:
2750        btrfs_discard_update_discardable(block_group);
2751        spin_unlock(&ctl->tree_lock);
2752out:
2753        return ret;
2754}
2755
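/*
 * Illustration (editor's sketch, not kernel code): what the non-bitmap
 * branch above does when the removed range sits in the middle of a free
 * extent: the entry shrinks to the left piece and the right piece is
 * re-added as a new free range.  The ex_* names and values are invented.
 */
#if 0	/* standalone example, not part of the build */
#include <stdio.h>
#include <stdint.h>

struct ex_range {
        uint64_t offset;
        uint64_t bytes;
};

static struct ex_range ex_punch_hole(struct ex_range *info, uint64_t rm_off,
                                     uint64_t rm_bytes)
{
        uint64_t old_end = info->offset + info->bytes;
        struct ex_range tail = {
                .offset = rm_off + rm_bytes,
                .bytes = old_end - (rm_off + rm_bytes),
        };

        info->bytes = rm_off - info->offset;	/* left piece keeps the entry */
        return tail;				/* right piece gets re-added */
}

int main(void)
{
        struct ex_range info = { 0, 1024 * 1024 };	/* 1M free extent */
        struct ex_range tail = ex_punch_hole(&info, 256 * 1024, 128 * 1024);

        printf("left %llu+%llu, tail %llu+%llu\n",
               (unsigned long long)info.offset, (unsigned long long)info.bytes,
               (unsigned long long)tail.offset, (unsigned long long)tail.bytes);
        return 0;
}
#endif
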
2756void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2757                           u64 bytes)
2758{
2759        struct btrfs_fs_info *fs_info = block_group->fs_info;
2760        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2761        struct btrfs_free_space *info;
2762        struct rb_node *n;
2763        int count = 0;
2764
2765        /*
2766         * Zoned btrfs does not use the free space tree or clusters. Just print
2767         * out the free space after the allocation offset.
2768         */
2769        if (btrfs_is_zoned(fs_info)) {
2770                btrfs_info(fs_info, "free space %llu active %d",
2771                           block_group->zone_capacity - block_group->alloc_offset,
2772                           block_group->zone_is_active);
2773                return;
2774        }
2775
2776        spin_lock(&ctl->tree_lock);
2777        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2778                info = rb_entry(n, struct btrfs_free_space, offset_index);
2779                if (info->bytes >= bytes && !block_group->ro)
2780                        count++;
2781                btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2782                           info->offset, info->bytes,
2783                       (info->bitmap) ? "yes" : "no");
2784        }
2785        spin_unlock(&ctl->tree_lock);
2786        btrfs_info(fs_info, "block group has cluster?: %s",
2787               list_empty(&block_group->cluster_list) ? "no" : "yes");
2788        btrfs_info(fs_info,
2789                   "%d entries of free space at or bigger than %llu bytes", count, bytes);
2790}
2791
2792void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2793                               struct btrfs_free_space_ctl *ctl)
2794{
2795        struct btrfs_fs_info *fs_info = block_group->fs_info;
2796
2797        spin_lock_init(&ctl->tree_lock);
2798        ctl->unit = fs_info->sectorsize;
2799        ctl->start = block_group->start;
2800        ctl->private = block_group;
2801        ctl->op = &free_space_op;
2802        INIT_LIST_HEAD(&ctl->trimming_ranges);
2803        mutex_init(&ctl->cache_writeout_mutex);
2804
2805        /*
2806         * We only want to have 32k of RAM per block group for keeping
2807         * track of free space.  If we pass 1/2 of that, we want to
2808         * start converting things over to using bitmaps.
2809         */
2810        ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2811}
2812
2813/*
2814 * For a given cluster, put all of its extents back into the free
2815 * space cache.  If the block group passed doesn't match the block group
2816 * pointed to by the cluster, someone else raced in and freed the
2817 * cluster already.  In that case, we just return without changing anything.
2818 */
2819static void __btrfs_return_cluster_to_free_space(
2820                             struct btrfs_block_group *block_group,
2821                             struct btrfs_free_cluster *cluster)
2822{
2823        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2824        struct btrfs_free_space *entry;
2825        struct rb_node *node;
2826
2827        spin_lock(&cluster->lock);
2828        if (cluster->block_group != block_group) {
2829                spin_unlock(&cluster->lock);
2830                return;
2831        }
2832
2833        cluster->block_group = NULL;
2834        cluster->window_start = 0;
2835        list_del_init(&cluster->block_group_list);
2836
2837        node = rb_first(&cluster->root);
2838        while (node) {
2839                bool bitmap;
2840
2841                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2842                node = rb_next(&entry->offset_index);
2843                rb_erase(&entry->offset_index, &cluster->root);
2844                RB_CLEAR_NODE(&entry->offset_index);
2845
2846                bitmap = (entry->bitmap != NULL);
2847                if (!bitmap) {
2848                        /* Merging treats extents as if they were new */
2849                        if (!btrfs_free_space_trimmed(entry)) {
2850                                ctl->discardable_extents[BTRFS_STAT_CURR]--;
2851                                ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2852                                        entry->bytes;
2853                        }
2854
2855                        try_merge_free_space(ctl, entry, false);
2856                        steal_from_bitmap(ctl, entry, false);
2857
2858                        /* As we insert directly, update these statistics */
2859                        if (!btrfs_free_space_trimmed(entry)) {
2860                                ctl->discardable_extents[BTRFS_STAT_CURR]++;
2861                                ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2862                                        entry->bytes;
2863                        }
2864                }
2865                tree_insert_offset(&ctl->free_space_offset,
2866                                   entry->offset, &entry->offset_index, bitmap);
2867        }
2868        cluster->root = RB_ROOT;
2869        spin_unlock(&cluster->lock);
2870        btrfs_put_block_group(block_group);
2871}
2872
2873static void __btrfs_remove_free_space_cache_locked(
2874                                struct btrfs_free_space_ctl *ctl)
2875{
2876        struct btrfs_free_space *info;
2877        struct rb_node *node;
2878
2879        while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2880                info = rb_entry(node, struct btrfs_free_space, offset_index);
2881                if (!info->bitmap) {
2882                        unlink_free_space(ctl, info);
2883                        kmem_cache_free(btrfs_free_space_cachep, info);
2884                } else {
2885                        free_bitmap(ctl, info);
2886                }
2887
2888                cond_resched_lock(&ctl->tree_lock);
2889        }
2890}
2891
2892void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2893{
2894        spin_lock(&ctl->tree_lock);
2895        __btrfs_remove_free_space_cache_locked(ctl);
2896        if (ctl->private)
2897                btrfs_discard_update_discardable(ctl->private);
2898        spin_unlock(&ctl->tree_lock);
2899}
2900
2901void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2902{
2903        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2904        struct btrfs_free_cluster *cluster;
2905        struct list_head *head;
2906
2907        spin_lock(&ctl->tree_lock);
2908        while ((head = block_group->cluster_list.next) !=
2909               &block_group->cluster_list) {
2910                cluster = list_entry(head, struct btrfs_free_cluster,
2911                                     block_group_list);
2912
2913                WARN_ON(cluster->block_group != block_group);
2914                __btrfs_return_cluster_to_free_space(block_group, cluster);
2915
2916                cond_resched_lock(&ctl->tree_lock);
2917        }
2918        __btrfs_remove_free_space_cache_locked(ctl);
2919        btrfs_discard_update_discardable(block_group);
2920        spin_unlock(&ctl->tree_lock);
2922}
2923
2924/**
2925 * btrfs_is_free_space_trimmed - see if everything is trimmed
2926 * @block_group: block_group of interest
2927 *
2928 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
2929 */
2930bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
2931{
2932        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2933        struct btrfs_free_space *info;
2934        struct rb_node *node;
2935        bool ret = true;
2936
2937        spin_lock(&ctl->tree_lock);
2938        node = rb_first(&ctl->free_space_offset);
2939
2940        while (node) {
2941                info = rb_entry(node, struct btrfs_free_space, offset_index);
2942
2943                if (!btrfs_free_space_trimmed(info)) {
2944                        ret = false;
2945                        break;
2946                }
2947
2948                node = rb_next(node);
2949        }
2950
2951        spin_unlock(&ctl->tree_lock);
2952        return ret;
2953}
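
/*
 * Illustrative sketch (not part of this file): a hypothetical caller using
 * btrfs_is_free_space_trimmed() to decide whether a block group still needs
 * a final trim pass before being reclaimed.  The helper name is made up for
 * the example; the real callers live in the unused block group reclaim path.
 */
#if 0	/* example only, never built */
static bool example_needs_final_trim(struct btrfs_block_group *block_group)
{
        /* If every free space entry is already trimmed, nothing is left to do. */
        if (btrfs_is_free_space_trimmed(block_group))
                return false;

        /* Otherwise the caller would go on to trim the remaining ranges. */
        return true;
}
#endif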
2954
2955u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
2956                               u64 offset, u64 bytes, u64 empty_size,
2957                               u64 *max_extent_size)
2958{
2959        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2960        struct btrfs_discard_ctl *discard_ctl =
2961                                        &block_group->fs_info->discard_ctl;
2962        struct btrfs_free_space *entry = NULL;
2963        u64 bytes_search = bytes + empty_size;
2964        u64 ret = 0;
2965        u64 align_gap = 0;
2966        u64 align_gap_len = 0;
2967        enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2968
2969        ASSERT(!btrfs_is_zoned(block_group->fs_info));
2970
2971        spin_lock(&ctl->tree_lock);
2972        entry = find_free_space(ctl, &offset, &bytes_search,
2973                                block_group->full_stripe_len, max_extent_size);
2974        if (!entry)
2975                goto out;
2976
2977        ret = offset;
2978        if (entry->bitmap) {
2979                bitmap_clear_bits(ctl, entry, offset, bytes);
2980
2981                if (!btrfs_free_space_trimmed(entry))
2982                        atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2983
2984                if (!entry->bytes)
2985                        free_bitmap(ctl, entry);
2986        } else {
2987                unlink_free_space(ctl, entry);
2988                align_gap_len = offset - entry->offset;
2989                align_gap = entry->offset;
2990                align_gap_trim_state = entry->trim_state;
2991
2992                if (!btrfs_free_space_trimmed(entry))
2993                        atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
2994
2995                entry->offset = offset + bytes;
2996                WARN_ON(entry->bytes < bytes + align_gap_len);
2997
2998                entry->bytes -= bytes + align_gap_len;
2999                if (!entry->bytes)
3000                        kmem_cache_free(btrfs_free_space_cachep, entry);
3001                else
3002                        link_free_space(ctl, entry);
3003        }
3004out:
3005        btrfs_discard_update_discardable(block_group);
3006        spin_unlock(&ctl->tree_lock);
3007
3008        if (align_gap_len)
3009                __btrfs_add_free_space(block_group->fs_info, ctl,
3010                                       align_gap, align_gap_len,
3011                                       align_gap_trim_state);
3012        return ret;
3013}
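
/*
 * Illustrative sketch (not part of this file): how an allocator might call
 * btrfs_find_space_for_alloc() and interpret its result.  The wrapper name
 * and the ENOSPC handling are assumptions for the example; the real caller
 * is the extent allocator (find_free_extent() and friends in extent-tree.c).
 */
#if 0	/* example only, never built */
static int example_reserve(struct btrfs_block_group *block_group,
                           u64 hint_offset, u64 num_bytes, u64 *found)
{
        u64 max_extent_size = 0;
        u64 offset;

        /* 0 means "nothing big enough"; max_extent_size reports the best we saw. */
        offset = btrfs_find_space_for_alloc(block_group, hint_offset, num_bytes,
                                            0, &max_extent_size);
        if (offset == 0)
                return -ENOSPC;

        *found = offset;
        return 0;
}
#endif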
3014
3015/*
3016 * Given a cluster, put all of its extents back into the free space
3017 * cache.  If a block group is passed, this function will only free
3018 * a cluster that belongs to the passed block group.
3019 *
3020 * Otherwise, it'll get a reference on the block group pointed to by the
3021 * cluster and remove the cluster from it.
3022 */
3023void btrfs_return_cluster_to_free_space(
3024                               struct btrfs_block_group *block_group,
3025                               struct btrfs_free_cluster *cluster)
3026{
3027        struct btrfs_free_space_ctl *ctl;
3028
3029        /* first, get a safe pointer to the block group */
3030        spin_lock(&cluster->lock);
3031        if (!block_group) {
3032                block_group = cluster->block_group;
3033                if (!block_group) {
3034                        spin_unlock(&cluster->lock);
3035                        return;
3036                }
3037        } else if (cluster->block_group != block_group) {
3038                /* someone else has already freed it, don't redo their work */
3039                spin_unlock(&cluster->lock);
3040                return;
3041        }
3042        btrfs_get_block_group(block_group);
3043        spin_unlock(&cluster->lock);
3044
3045        ctl = block_group->free_space_ctl;
3046
3047        /* now return any extents the cluster had on it */
3048        spin_lock(&ctl->tree_lock);
3049        __btrfs_return_cluster_to_free_space(block_group, cluster);
3050        spin_unlock(&ctl->tree_lock);
3051
3052        btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3053
3054        /* finally drop our ref */
3055        btrfs_put_block_group(block_group);
3056}
3057
3058static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
3059                                   struct btrfs_free_cluster *cluster,
3060                                   struct btrfs_free_space *entry,
3061                                   u64 bytes, u64 min_start,
3062                                   u64 *max_extent_size)
3063{
3064        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3065        int err;
3066        u64 search_start = min_start;
3067        u64 search_bytes = bytes;
3068        u64 ret = 0;
3072
3073        err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3074        if (err) {
3075                *max_extent_size = max(get_max_extent_size(entry),
3076                                       *max_extent_size);
3077                return 0;
3078        }
3079
3080        ret = search_start;
3081        __bitmap_clear_bits(ctl, entry, ret, bytes);
3082
3083        return ret;
3084}
3085
3086/*
3087 * Given a cluster, try to allocate 'bytes' from it.  Returns 0 if it
3088 * couldn't find anything suitably large, or a logical disk offset if
3089 * things worked out.
3090 */
3091u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3092                             struct btrfs_free_cluster *cluster, u64 bytes,
3093                             u64 min_start, u64 *max_extent_size)
3094{
3095        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3096        struct btrfs_discard_ctl *discard_ctl =
3097                                        &block_group->fs_info->discard_ctl;
3098        struct btrfs_free_space *entry = NULL;
3099        struct rb_node *node;
3100        u64 ret = 0;
3101
3102        ASSERT(!btrfs_is_zoned(block_group->fs_info));
3103
3104        spin_lock(&cluster->lock);
3105        if (bytes > cluster->max_size)
3106                goto out;
3107
3108        if (cluster->block_group != block_group)
3109                goto out;
3110
3111        node = rb_first(&cluster->root);
3112        if (!node)
3113                goto out;
3114
3115        entry = rb_entry(node, struct btrfs_free_space, offset_index);
3116        while (1) {
3117                if (entry->bytes < bytes)
3118                        *max_extent_size = max(get_max_extent_size(entry),
3119                                               *max_extent_size);
3120
3121                if (entry->bytes < bytes ||
3122                    (!entry->bitmap && entry->offset < min_start)) {
3123                        node = rb_next(&entry->offset_index);
3124                        if (!node)
3125                                break;
3126                        entry = rb_entry(node, struct btrfs_free_space,
3127                                         offset_index);
3128                        continue;
3129                }
3130
3131                if (entry->bitmap) {
3132                        ret = btrfs_alloc_from_bitmap(block_group,
3133                                                      cluster, entry, bytes,
3134                                                      cluster->window_start,
3135                                                      max_extent_size);
3136                        if (ret == 0) {
3137                                node = rb_next(&entry->offset_index);
3138                                if (!node)
3139                                        break;
3140                                entry = rb_entry(node, struct btrfs_free_space,
3141                                                 offset_index);
3142                                continue;
3143                        }
3144                        cluster->window_start += bytes;
3145                } else {
3146                        ret = entry->offset;
3147
3148                        entry->offset += bytes;
3149                        entry->bytes -= bytes;
3150                }
3151
3152                break;
3153        }
3154out:
3155        spin_unlock(&cluster->lock);
3156
3157        if (!ret)
3158                return 0;
3159
3160        spin_lock(&ctl->tree_lock);
3161
3162        if (!btrfs_free_space_trimmed(entry))
3163                atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3164
3165        ctl->free_space -= bytes;
3166        if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3167                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3168
3169        spin_lock(&cluster->lock);
3170        if (entry->bytes == 0) {
3171                rb_erase(&entry->offset_index, &cluster->root);
3172                ctl->free_extents--;
3173                if (entry->bitmap) {
3174                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
3175                                        entry->bitmap);
3176                        ctl->total_bitmaps--;
3177                        recalculate_thresholds(ctl);
3178                } else if (!btrfs_free_space_trimmed(entry)) {
3179                        ctl->discardable_extents[BTRFS_STAT_CURR]--;
3180                }
3181                kmem_cache_free(btrfs_free_space_cachep, entry);
3182        }
3183
3184        spin_unlock(&cluster->lock);
3185        spin_unlock(&ctl->tree_lock);
3186
3187        return ret;
3188}
3189
3190static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3191                                struct btrfs_free_space *entry,
3192                                struct btrfs_free_cluster *cluster,
3193                                u64 offset, u64 bytes,
3194                                u64 cont1_bytes, u64 min_bytes)
3195{
3196        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3197        unsigned long next_zero;
3198        unsigned long i;
3199        unsigned long want_bits;
3200        unsigned long min_bits;
3201        unsigned long found_bits;
3202        unsigned long max_bits = 0;
3203        unsigned long start = 0;
3204        unsigned long total_found = 0;
3205        int ret;
3206
3207        i = offset_to_bit(entry->offset, ctl->unit,
3208                          max_t(u64, offset, entry->offset));
3209        want_bits = bytes_to_bits(bytes, ctl->unit);
3210        min_bits = bytes_to_bits(min_bytes, ctl->unit);
3211
3212        /*
3213         * Don't bother looking for a cluster in this bitmap if it's heavily
3214         * fragmented.
3215         */
3216        if (entry->max_extent_size &&
3217            entry->max_extent_size < cont1_bytes)
3218                return -ENOSPC;
3219again:
3220        found_bits = 0;
3221        for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3222                next_zero = find_next_zero_bit(entry->bitmap,
3223                                               BITS_PER_BITMAP, i);
3224                if (next_zero - i >= min_bits) {
3225                        found_bits = next_zero - i;
3226                        if (found_bits > max_bits)
3227                                max_bits = found_bits;
3228                        break;
3229                }
3230                if (next_zero - i > max_bits)
3231                        max_bits = next_zero - i;
3232                i = next_zero;
3233        }
3234
3235        if (!found_bits) {
3236                entry->max_extent_size = (u64)max_bits * ctl->unit;
3237                return -ENOSPC;
3238        }
3239
3240        if (!total_found) {
3241                start = i;
3242                cluster->max_size = 0;
3243        }
3244
3245        total_found += found_bits;
3246
3247        if (cluster->max_size < found_bits * ctl->unit)
3248                cluster->max_size = found_bits * ctl->unit;
3249
3250        if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3251                i = next_zero + 1;
3252                goto again;
3253        }
3254
3255        cluster->window_start = start * ctl->unit + entry->offset;
3256        rb_erase(&entry->offset_index, &ctl->free_space_offset);
3257        ret = tree_insert_offset(&cluster->root, entry->offset,
3258                                 &entry->offset_index, 1);
3259        ASSERT(!ret); /* -EEXIST; Logic error */
3260
3261        trace_btrfs_setup_cluster(block_group, cluster,
3262                                  total_found * ctl->unit, 1);
3263        return 0;
3264}
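
/*
 * Illustrative sketch (not part of this file): the bit-run scan used by
 * btrfs_bitmap_cluster() above, reduced to its core.  It walks runs of set
 * bits with for_each_set_bit_from()/find_next_zero_bit() and remembers the
 * longest one.  The helper is hypothetical and exists only for the example.
 */
#if 0	/* example only, never built */
static unsigned long example_longest_run(const unsigned long *bitmap,
                                         unsigned long nbits)
{
        unsigned long i = 0;
        unsigned long next_zero;
        unsigned long longest = 0;

        for_each_set_bit_from(i, bitmap, nbits) {
                /* i is the first set bit of a run; find where the run ends. */
                next_zero = find_next_zero_bit(bitmap, nbits, i);
                if (next_zero - i > longest)
                        longest = next_zero - i;
                /* Jump past this run; the loop then finds the next set bit. */
                i = next_zero;
        }

        return longest;
}
#endif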
3265
3266/*
3267 * This searches the block group for just extents to fill the cluster with.
3268 * Try to find a cluster with at least bytes total bytes, at least one
3269 * extent of cont1_bytes, and other extents of at least min_bytes.
3270 */
3271static noinline int
3272setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3273                        struct btrfs_free_cluster *cluster,
3274                        struct list_head *bitmaps, u64 offset, u64 bytes,
3275                        u64 cont1_bytes, u64 min_bytes)
3276{
3277        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3278        struct btrfs_free_space *first = NULL;
3279        struct btrfs_free_space *entry = NULL;
3280        struct btrfs_free_space *last;
3281        struct rb_node *node;
3282        u64 window_free;
3283        u64 max_extent;
3284        u64 total_size = 0;
3285
3286        entry = tree_search_offset(ctl, offset, 0, 1);
3287        if (!entry)
3288                return -ENOSPC;
3289
3290        /*
3291         * We don't want bitmaps, so just move along until we find a normal
3292         * extent entry.
3293         */
3294        while (entry->bitmap || entry->bytes < min_bytes) {
3295                if (entry->bitmap && list_empty(&entry->list))
3296                        list_add_tail(&entry->list, bitmaps);
3297                node = rb_next(&entry->offset_index);
3298                if (!node)
3299                        return -ENOSPC;
3300                entry = rb_entry(node, struct btrfs_free_space, offset_index);
3301        }
3302
3303        window_free = entry->bytes;
3304        max_extent = entry->bytes;
3305        first = entry;
3306        last = entry;
3307
3308        for (node = rb_next(&entry->offset_index); node;
3309             node = rb_next(&entry->offset_index)) {
3310                entry = rb_entry(node, struct btrfs_free_space, offset_index);
3311
3312                if (entry->bitmap) {
3313                        if (list_empty(&entry->list))
3314                                list_add_tail(&entry->list, bitmaps);
3315                        continue;
3316                }
3317
3318                if (entry->bytes < min_bytes)
3319                        continue;
3320
3321                last = entry;
3322                window_free += entry->bytes;
3323                if (entry->bytes > max_extent)
3324                        max_extent = entry->bytes;
3325        }
3326
3327        if (window_free < bytes || max_extent < cont1_bytes)
3328                return -ENOSPC;
3329
3330        cluster->window_start = first->offset;
3331
3332        node = &first->offset_index;
3333
3334        /*
3335         * now we've found our entries, pull them out of the free space
3336         * cache and put them into the cluster rbtree
3337         */
3338        do {
3339                int ret;
3340
3341                entry = rb_entry(node, struct btrfs_free_space, offset_index);
3342                node = rb_next(&entry->offset_index);
3343                if (entry->bitmap || entry->bytes < min_bytes)
3344                        continue;
3345
3346                rb_erase(&entry->offset_index, &ctl->free_space_offset);
3347                ret = tree_insert_offset(&cluster->root, entry->offset,
3348                                         &entry->offset_index, 0);
3349                total_size += entry->bytes;
3350                ASSERT(!ret); /* -EEXIST; Logic error */
3351        } while (node && entry != last);
3352
3353        cluster->max_size = max_extent;
3354        trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3355        return 0;
3356}
3357
3358/*
3359 * This specifically looks for bitmaps that may work in the cluster.  We
3360 * assume that we have already failed to find extents that will work.
3361 */
3362static noinline int
3363setup_cluster_bitmap(struct btrfs_block_group *block_group,
3364                     struct btrfs_free_cluster *cluster,
3365                     struct list_head *bitmaps, u64 offset, u64 bytes,
3366                     u64 cont1_bytes, u64 min_bytes)
3367{
3368        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3369        struct btrfs_free_space *entry = NULL;
3370        int ret = -ENOSPC;
3371        u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3372
3373        if (ctl->total_bitmaps == 0)
3374                return -ENOSPC;
3375
3376        /*
3377         * The bitmap that covers offset won't be in the list unless offset
3378         * is just its start offset.
3379         */
3380        if (!list_empty(bitmaps))
3381                entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3382
3383        if (!entry || entry->offset != bitmap_offset) {
3384                entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3385                if (entry && list_empty(&entry->list))
3386                        list_add(&entry->list, bitmaps);
3387        }
3388
3389        list_for_each_entry(entry, bitmaps, list) {
3390                if (entry->bytes < bytes)
3391                        continue;
3392                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3393                                           bytes, cont1_bytes, min_bytes);
3394                if (!ret)
3395                        return 0;
3396        }
3397
3398        /*
3399         * The bitmaps list has all the bitmaps that record free space
3400         * starting after offset, so no more search is required.
3401         */
3402        return -ENOSPC;
3403}
3404
3405/*
3406 * Here we try to find a cluster of blocks in a block group.  The goal
3407 * is to find at least bytes + empty_size.
3408 * We might not find them all in one contiguous area.
3409 *
3410 * Returns zero and sets up the cluster if things worked out, otherwise
3411 * it returns -ENOSPC.
3412 */
3413int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3414                             struct btrfs_free_cluster *cluster,
3415                             u64 offset, u64 bytes, u64 empty_size)
3416{
3417        struct btrfs_fs_info *fs_info = block_group->fs_info;
3418        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3419        struct btrfs_free_space *entry, *tmp;
3420        LIST_HEAD(bitmaps);
3421        u64 min_bytes;
3422        u64 cont1_bytes;
3423        int ret;
3424
3425        /*
3426         * Choose the minimum extent size we'll require for this
3427         * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3428         * For metadata, allow allocations with smaller extents.  For
3429         * data, keep it dense.
3430         */
3431        if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3432                cont1_bytes = min_bytes = bytes + empty_size;
3433        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3434                cont1_bytes = bytes;
3435                min_bytes = fs_info->sectorsize;
3436        } else {
3437                cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3438                min_bytes = fs_info->sectorsize;
3439        }
3440
3441        spin_lock(&ctl->tree_lock);
3442
3443        /*
3444         * If we know we don't have enough space to make a cluster don't even
3445         * bother doing all the work to try and find one.
3446         */
3447        if (ctl->free_space < bytes) {
3448                spin_unlock(&ctl->tree_lock);
3449                return -ENOSPC;
3450        }
3451
3452        spin_lock(&cluster->lock);
3453
3454        /* someone already found a cluster, hooray */
3455        if (cluster->block_group) {
3456                ret = 0;
3457                goto out;
3458        }
3459
3460        trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3461                                 min_bytes);
3462
3463        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3464                                      bytes + empty_size,
3465                                      cont1_bytes, min_bytes);
3466        if (ret)
3467                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3468                                           offset, bytes + empty_size,
3469                                           cont1_bytes, min_bytes);
3470
3471        /* Clear our temporary list */
3472        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3473                list_del_init(&entry->list);
3474
3475        if (!ret) {
3476                btrfs_get_block_group(block_group);
3477                list_add_tail(&cluster->block_group_list,
3478                              &block_group->cluster_list);
3479                cluster->block_group = block_group;
3480        } else {
3481                trace_btrfs_failed_cluster_setup(block_group);
3482        }
3483out:
3484        spin_unlock(&cluster->lock);
3485        spin_unlock(&ctl->tree_lock);
3486
3487        return ret;
3488}
3489
3490/*
3491 * simple code to zero out a cluster
3492 */
3493void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3494{
3495        spin_lock_init(&cluster->lock);
3496        spin_lock_init(&cluster->refill_lock);
3497        cluster->root = RB_ROOT;
3498        cluster->max_size = 0;
3499        cluster->fragmented = false;
3500        INIT_LIST_HEAD(&cluster->block_group_list);
3501        cluster->block_group = NULL;
3502}
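
/*
 * Illustrative sketch (not part of this file): the typical life cycle of a
 * free cluster using the helpers above -- initialize it, set it up against a
 * block group, carve an allocation out of it, then hand anything left back
 * to the free space cache.  The wrapper and its locking context are
 * assumptions for the example; the real users are the allocator paths.
 */
#if 0	/* example only, never built */
static u64 example_cluster_alloc(struct btrfs_block_group *block_group,
                                 struct btrfs_free_cluster *cluster,
                                 u64 search_start, u64 num_bytes)
{
        u64 max_extent_size = 0;
        u64 offset = 0;

        btrfs_init_free_cluster(cluster);

        /* Gather at least num_bytes of free space into the cluster. */
        if (btrfs_find_space_cluster(block_group, cluster, search_start,
                                     num_bytes, 0) == 0)
                /* Carve one extent out of the cluster window. */
                offset = btrfs_alloc_from_cluster(block_group, cluster,
                                                  num_bytes, search_start,
                                                  &max_extent_size);

        /* Hand anything left on the cluster back to the free space cache. */
        btrfs_return_cluster_to_free_space(block_group, cluster);

        return offset;	/* 0 means nothing was allocated */
}
#endif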
3503
3504static int do_trimming(struct btrfs_block_group *block_group,
3505                       u64 *total_trimmed, u64 start, u64 bytes,
3506                       u64 reserved_start, u64 reserved_bytes,
3507                       enum btrfs_trim_state reserved_trim_state,
3508                       struct btrfs_trim_range *trim_entry)
3509{
3510        struct btrfs_space_info *space_info = block_group->space_info;
3511        struct btrfs_fs_info *fs_info = block_group->fs_info;
3512        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3513        int ret;
3514        int update = 0;
3515        const u64 end = start + bytes;
3516        const u64 reserved_end = reserved_start + reserved_bytes;
3517        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3518        u64 trimmed = 0;
3519
3520        spin_lock(&space_info->lock);
3521        spin_lock(&block_group->lock);
3522        if (!block_group->ro) {
3523                block_group->reserved += reserved_bytes;
3524                space_info->bytes_reserved += reserved_bytes;
3525                update = 1;
3526        }
3527        spin_unlock(&block_group->lock);
3528        spin_unlock(&space_info->lock);
3529
3530        ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3531        if (!ret) {
3532                *total_trimmed += trimmed;
3533                trim_state = BTRFS_TRIM_STATE_TRIMMED;
3534        }
3535
3536        mutex_lock(&ctl->cache_writeout_mutex);
3537        if (reserved_start < start)
3538                __btrfs_add_free_space(fs_info, ctl, reserved_start,
3539                                       start - reserved_start,
3540                                       reserved_trim_state);
3541        if (end < reserved_end)
3542                __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end,
3543                                       reserved_trim_state);
3544        __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state);
3545        list_del(&trim_entry->list);
3546        mutex_unlock(&ctl->cache_writeout_mutex);
3547
3548        if (update) {
3549                spin_lock(&space_info->lock);
3550                spin_lock(&block_group->lock);
3551                if (block_group->ro)
3552                        space_info->bytes_readonly += reserved_bytes;
3553                block_group->reserved -= reserved_bytes;
3554                space_info->bytes_reserved -= reserved_bytes;
3555                spin_unlock(&block_group->lock);
3556                spin_unlock(&space_info->lock);
3557        }
3558
3559        return ret;
3560}
3561
3562/*
3563 * If @async is set, then we will trim 1 region and return.
3564 */
3565static int trim_no_bitmap(struct btrfs_block_group *block_group,
3566                          u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3567                          bool async)
3568{
3569        struct btrfs_discard_ctl *discard_ctl =
3570                                        &block_group->fs_info->discard_ctl;
3571        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3572        struct btrfs_free_space *entry;
3573        struct rb_node *node;
3574        int ret = 0;
3575        u64 extent_start;
3576        u64 extent_bytes;
3577        enum btrfs_trim_state extent_trim_state;
3578        u64 bytes;
3579        const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3580
3581        while (start < end) {
3582                struct btrfs_trim_range trim_entry;
3583
3584                mutex_lock(&ctl->cache_writeout_mutex);
3585                spin_lock(&ctl->tree_lock);
3586
3587                if (ctl->free_space < minlen)
3588                        goto out_unlock;
3589
3590                entry = tree_search_offset(ctl, start, 0, 1);
3591                if (!entry)
3592                        goto out_unlock;
3593
3594                /* Skip bitmaps and, if async, already trimmed entries */
3595                while (entry->bitmap ||
3596                       (async && btrfs_free_space_trimmed(entry))) {
3597                        node = rb_next(&entry->offset_index);
3598                        if (!node)
3599                                goto out_unlock;
3600                        entry = rb_entry(node, struct btrfs_free_space,
3601                                         offset_index);
3602                }
3603
3604                if (entry->offset >= end)
3605                        goto out_unlock;
3606
3607                extent_start = entry->offset;
3608                extent_bytes = entry->bytes;
3609                extent_trim_state = entry->trim_state;
3610                if (async) {
3611                        start = entry->offset;
3612                        bytes = entry->bytes;
3613                        if (bytes < minlen) {
3614                                spin_unlock(&ctl->tree_lock);
3615                                mutex_unlock(&ctl->cache_writeout_mutex);
3616                                goto next;
3617                        }
3618                        unlink_free_space(ctl, entry);
3619                        /*
3620                         * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3621                         * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3622                         * X when we come back around.  So trim it now.
3623                         */
3624                        if (max_discard_size &&
3625                            bytes >= (max_discard_size +
3626                                      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3627                                bytes = max_discard_size;
3628                                extent_bytes = max_discard_size;
3629                                entry->offset += max_discard_size;
3630                                entry->bytes -= max_discard_size;
3631                                link_free_space(ctl, entry);
3632                        } else {
3633                                kmem_cache_free(btrfs_free_space_cachep, entry);
3634                        }
3635                } else {
3636                        start = max(start, extent_start);
3637                        bytes = min(extent_start + extent_bytes, end) - start;
3638                        if (bytes < minlen) {
3639                                spin_unlock(&ctl->tree_lock);
3640                                mutex_unlock(&ctl->cache_writeout_mutex);
3641                                goto next;
3642                        }
3643
3644                        unlink_free_space(ctl, entry);
3645                        kmem_cache_free(btrfs_free_space_cachep, entry);
3646                }
3647
3648                spin_unlock(&ctl->tree_lock);
3649                trim_entry.start = extent_start;
3650                trim_entry.bytes = extent_bytes;
3651                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3652                mutex_unlock(&ctl->cache_writeout_mutex);
3653
3654                ret = do_trimming(block_group, total_trimmed, start, bytes,
3655                                  extent_start, extent_bytes, extent_trim_state,
3656                                  &trim_entry);
3657                if (ret) {
3658                        block_group->discard_cursor = start + bytes;
3659                        break;
3660                }
3661next:
3662                start += bytes;
3663                block_group->discard_cursor = start;
3664                if (async && *total_trimmed)
3665                        break;
3666
3667                if (fatal_signal_pending(current)) {
3668                        ret = -ERESTARTSYS;
3669                        break;
3670                }
3671
3672                cond_resched();
3673        }
3674
3675        return ret;
3676
3677out_unlock:
3678        block_group->discard_cursor = btrfs_block_group_end(block_group);
3679        spin_unlock(&ctl->tree_lock);
3680        mutex_unlock(&ctl->cache_writeout_mutex);
3681
3682        return ret;
3683}
3684
3685/*
3686 * If we break out of trimming a bitmap prematurely, we should reset the
3687 * trimming bit.  In a rather contrived case, it's possible to race here so
3688 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3689 *
3690 * start = start of bitmap
3691 * end = near end of bitmap
3692 *
3693 * Thread 1:                    Thread 2:
3694 * trim_bitmaps(start)
3695 *                              trim_bitmaps(end)
3696 *                              end_trimming_bitmap()
3697 * reset_trimming_bitmap()
3698 */
3699static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3700{
3701        struct btrfs_free_space *entry;
3702
3703        spin_lock(&ctl->tree_lock);
3704        entry = tree_search_offset(ctl, offset, 1, 0);
3705        if (entry) {
3706                if (btrfs_free_space_trimmed(entry)) {
3707                        ctl->discardable_extents[BTRFS_STAT_CURR] +=
3708                                entry->bitmap_extents;
3709                        ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3710                }
3711                entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3712        }
3713
3714        spin_unlock(&ctl->tree_lock);
3715}
3716
3717static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3718                                struct btrfs_free_space *entry)
3719{
3720        if (btrfs_free_space_trimming_bitmap(entry)) {
3721                entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3722                ctl->discardable_extents[BTRFS_STAT_CURR] -=
3723                        entry->bitmap_extents;
3724                ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3725        }
3726}
3727
3728/*
3729 * If @async is set, then we will trim 1 region and return.
3730 */
3731static int trim_bitmaps(struct btrfs_block_group *block_group,
3732                        u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3733                        u64 maxlen, bool async)
3734{
3735        struct btrfs_discard_ctl *discard_ctl =
3736                                        &block_group->fs_info->discard_ctl;
3737        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3738        struct btrfs_free_space *entry;
3739        int ret = 0;
3740        int ret2;
3741        u64 bytes;
3742        u64 offset = offset_to_bitmap(ctl, start);
3743        const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3744
3745        while (offset < end) {
3746                bool next_bitmap = false;
3747                struct btrfs_trim_range trim_entry;
3748
3749                mutex_lock(&ctl->cache_writeout_mutex);
3750                spin_lock(&ctl->tree_lock);
3751
3752                if (ctl->free_space < minlen) {
3753                        block_group->discard_cursor =
3754                                btrfs_block_group_end(block_group);
3755                        spin_unlock(&ctl->tree_lock);
3756                        mutex_unlock(&ctl->cache_writeout_mutex);
3757                        break;
3758                }
3759
3760                entry = tree_search_offset(ctl, offset, 1, 0);
3761                /*
3762                 * Bitmaps are marked trimmed lossily now to prevent constant
3763                 * discarding of the same bitmap (the reason why we are bound
3764                 * by the filters).  So, retrim the block group bitmaps when we
3765                 * are preparing to punt to the unused_bgs list.  This uses
3766                 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED,
3767                 * which is the only discard index that sets minlen to 0.
3768                 */
3769                if (!entry || (async && minlen && start == offset &&
3770                               btrfs_free_space_trimmed(entry))) {
3771                        spin_unlock(&ctl->tree_lock);
3772                        mutex_unlock(&ctl->cache_writeout_mutex);
3773                        next_bitmap = true;
3774                        goto next;
3775                }
3776
3777                /*
3778                 * Async discard bitmap trimming begins by setting the start
3779                 * to key.objectid, and offset_to_bitmap() aligns to the
3780                 * start of the bitmap.  This lets us know we are fully
3781                 * scanning the bitmap rather than only some portion of it.
3782                 */
3783                if (start == offset)
3784                        entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3785
3786                bytes = minlen;
3787                ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3788                if (ret2 || start >= end) {
3789                        /*
3790                         * We lossily consider a bitmap trimmed if we only skip
3791                         * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3792                         */
3793                        if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3794                                end_trimming_bitmap(ctl, entry);
3795                        else
3796                                entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3797                        spin_unlock(&ctl->tree_lock);
3798                        mutex_unlock(&ctl->cache_writeout_mutex);
3799                        next_bitmap = true;
3800                        goto next;
3801                }
3802
3803                /*
3804                 * We already trimmed a region, but are using the locking above
3805                 * to reset the trim_state.
3806                 */
3807                if (async && *total_trimmed) {
3808                        spin_unlock(&ctl->tree_lock);
3809                        mutex_unlock(&ctl->cache_writeout_mutex);
3810                        goto out;
3811                }
3812
3813                bytes = min(bytes, end - start);
3814                if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3815                        spin_unlock(&ctl->tree_lock);
3816                        mutex_unlock(&ctl->cache_writeout_mutex);
3817                        goto next;
3818                }
3819
3820                /*
3821                 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3822                 * If X < @minlen, we won't trim X when we come back around.
3823                 * So trim it now.  We differ here from trimming extents as we
3824                 * don't keep individual state per bit.
3825                 */
3826                if (async &&
3827                    max_discard_size &&
3828                    bytes > (max_discard_size + minlen))
3829                        bytes = max_discard_size;
3830
3831                bitmap_clear_bits(ctl, entry, start, bytes);
3832                if (entry->bytes == 0)
3833                        free_bitmap(ctl, entry);
3834
3835                spin_unlock(&ctl->tree_lock);
3836                trim_entry.start = start;
3837                trim_entry.bytes = bytes;
3838                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3839                mutex_unlock(&ctl->cache_writeout_mutex);
3840
3841                ret = do_trimming(block_group, total_trimmed, start, bytes,
3842                                  start, bytes, BTRFS_TRIM_STATE_UNTRIMMED, &trim_entry);
3843                if (ret) {
3844                        reset_trimming_bitmap(ctl, offset);
3845                        block_group->discard_cursor =
3846                                btrfs_block_group_end(block_group);
3847                        break;
3848                }
3849next:
3850                if (next_bitmap) {
3851                        offset += BITS_PER_BITMAP * ctl->unit;
3852                        start = offset;
3853                } else {
3854                        start += bytes;
3855                }
3856                block_group->discard_cursor = start;
3857
3858                if (fatal_signal_pending(current)) {
3859                        if (start != offset)
3860                                reset_trimming_bitmap(ctl, offset);
3861                        ret = -ERESTARTSYS;
3862                        break;
3863                }
3864
3865                cond_resched();
3866        }
3867
3868        if (offset >= end)
3869                block_group->discard_cursor = end;
3870
3871out:
3872        return ret;
3873}
3874
3875int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3876                           u64 *trimmed, u64 start, u64 end, u64 minlen)
3877{
3878        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3879        int ret;
3880        u64 rem = 0;
3881
3882        ASSERT(!btrfs_is_zoned(block_group->fs_info));
3883
3884        *trimmed = 0;
3885
3886        spin_lock(&block_group->lock);
3887        if (block_group->removed) {
3888                spin_unlock(&block_group->lock);
3889                return 0;
3890        }
3891        btrfs_freeze_block_group(block_group);
3892        spin_unlock(&block_group->lock);
3893
3894        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
3895        if (ret)
3896                goto out;
3897
3898        ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
3899        div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
3900        /* If we ended in the middle of a bitmap, reset the trimming flag */
3901        if (rem)
3902                reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
3903out:
3904        btrfs_unfreeze_block_group(block_group);
3905        return ret;
3906}
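
/*
 * Illustrative sketch (not part of this file): how an fstrim-style caller
 * might trim one block group with btrfs_trim_block_group().  The wrapper
 * name is made up; the real loop over block groups is btrfs_trim_fs() in
 * extent-tree.c.
 */
#if 0	/* example only, never built */
static int example_trim_one_group(struct btrfs_block_group *block_group,
                                  u64 minlen, u64 *total_trimmed)
{
        u64 trimmed = 0;
        int ret;

        /* Trim the whole logical range covered by this block group. */
        ret = btrfs_trim_block_group(block_group, &trimmed,
                                     block_group->start,
                                     btrfs_block_group_end(block_group),
                                     minlen);
        *total_trimmed += trimmed;
        return ret;
}
#endif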
3907
3908int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
3909                                   u64 *trimmed, u64 start, u64 end, u64 minlen,
3910                                   bool async)
3911{
3912        int ret;
3913
3914        *trimmed = 0;
3915
3916        spin_lock(&block_group->lock);
3917        if (block_group->removed) {
3918                spin_unlock(&block_group->lock);
3919                return 0;
3920        }
3921        btrfs_freeze_block_group(block_group);
3922        spin_unlock(&block_group->lock);
3923
3924        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
3925        btrfs_unfreeze_block_group(block_group);
3926
3927        return ret;
3928}
3929
3930int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
3931                                   u64 *trimmed, u64 start, u64 end, u64 minlen,
3932                                   u64 maxlen, bool async)
3933{
3934        int ret;
3935
3936        *trimmed = 0;
3937
3938        spin_lock(&block_group->lock);
3939        if (block_group->removed) {
3940                spin_unlock(&block_group->lock);
3941                return 0;
3942        }
3943        btrfs_freeze_block_group(block_group);
3944        spin_unlock(&block_group->lock);
3945
3946        ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
3947                           async);
3948
3949        btrfs_unfreeze_block_group(block_group);
3950
3951        return ret;
3952}
3953
3954bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
3955{
3956        return btrfs_super_cache_generation(fs_info->super_copy);
3957}
3958
3959static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
3960                                       struct btrfs_trans_handle *trans)
3961{
3962        struct btrfs_block_group *block_group;
3963        struct rb_node *node;
3964        int ret = 0;
3965
3966        btrfs_info(fs_info, "cleaning free space cache v1");
3967
3968        node = rb_first(&fs_info->block_group_cache_tree);
3969        while (node) {
3970                block_group = rb_entry(node, struct btrfs_block_group, cache_node);
3971                ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
3972                if (ret)
3973                        goto out;
3974                node = rb_next(node);
3975        }
3976out:
3977        return ret;
3978}
3979
3980int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
3981{
3982        struct btrfs_trans_handle *trans;
3983        int ret;
3984
3985        /*
3986         * update_super_roots will appropriately set or unset
3987         * super_copy->cache_generation based on SPACE_CACHE and
3988         * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
3989         * transaction commit whether we are enabling space cache v1 and don't
3990         * have any other work to do, or are disabling it and removing free
3991         * space inodes.
3992         */
3993        trans = btrfs_start_transaction(fs_info->tree_root, 0);
3994        if (IS_ERR(trans))
3995                return PTR_ERR(trans);
3996
3997        if (!active) {
3998                set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
3999                ret = cleanup_free_space_cache_v1(fs_info, trans);
4000                if (ret) {
4001                        btrfs_abort_transaction(trans, ret);
4002                        btrfs_end_transaction(trans);
4003                        goto out;
4004                }
4005        }
4006
4007        ret = btrfs_commit_transaction(trans);
4008out:
4009        clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4010
4011        return ret;
4012}
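
/*
 * Illustrative sketch (not part of this file): keeping the on-disk v1 cache
 * state in sync with the desired mount state.  The helper below is
 * hypothetical; roughly speaking, the real call sites compare the wanted
 * SPACE_CACHE state against btrfs_free_space_cache_v1_active() during
 * mount/remount and then call btrfs_set_free_space_cache_v1_active().
 */
#if 0	/* example only, never built */
static int example_sync_cache_v1_state(struct btrfs_fs_info *fs_info,
                                       bool want_v1)
{
        /* Nothing to do if the superblock already reflects the wanted state. */
        if (want_v1 == btrfs_free_space_cache_v1_active(fs_info))
                return 0;

        /* Enable or tear down the v1 cache via a transaction commit. */
        return btrfs_set_free_space_cache_v1_active(fs_info, want_v1);
}
#endif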
4013
4014#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4015/*
4016 * Use this if you need to make a bitmap or extent entry specifically.  It
4017 * doesn't do any of the merging that add_free_space does; this acts a lot like
4018 * how the free space cache loading stuff works, so you can get really weird
4019 * configurations.
4020 */
4021int test_add_free_space_entry(struct btrfs_block_group *cache,
4022                              u64 offset, u64 bytes, bool bitmap)
4023{
4024        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4025        struct btrfs_free_space *info = NULL, *bitmap_info;
4026        void *map = NULL;
4027        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4028        u64 bytes_added;
4029        int ret;
4030
4031again:
4032        if (!info) {
4033                info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4034                if (!info)
4035                        return -ENOMEM;
4036        }
4037
4038        if (!bitmap) {
4039                spin_lock(&ctl->tree_lock);
4040                info->offset = offset;
4041                info->bytes = bytes;
4042                info->max_extent_size = 0;
4043                ret = link_free_space(ctl, info);
4044                spin_unlock(&ctl->tree_lock);
4045                if (ret)
4046                        kmem_cache_free(btrfs_free_space_cachep, info);
4047                return ret;
4048        }
4049
4050        if (!map) {
4051                map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4052                if (!map) {
4053                        kmem_cache_free(btrfs_free_space_cachep, info);
4054                        return -ENOMEM;
4055                }
4056        }
4057
4058        spin_lock(&ctl->tree_lock);
4059        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4060                                         1, 0);
4061        if (!bitmap_info) {
4062                info->bitmap = map;
4063                map = NULL;
4064                add_new_bitmap(ctl, info, offset);
4065                bitmap_info = info;
4066                info = NULL;
4067        }
4068
4069        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4070                                          trim_state);
4071
4072        bytes -= bytes_added;
4073        offset += bytes_added;
4074        spin_unlock(&ctl->tree_lock);
4075
4076        if (bytes)
4077                goto again;
4078
4079        if (info)
4080                kmem_cache_free(btrfs_free_space_cachep, info);
4081        if (map)
4082                kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4083        return 0;
4084}
4085
4086/*
4087 * Checks to see if the given range is in the free space cache.  This is really
4088 * just used to check for the absence of space, so if there is free space in the
4089 * range at all we will return 1.
4090 */
4091int test_check_exists(struct btrfs_block_group *cache,
4092                      u64 offset, u64 bytes)
4093{
4094        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4095        struct btrfs_free_space *info;
4096        int ret = 0;
4097
4098        spin_lock(&ctl->tree_lock);
4099        info = tree_search_offset(ctl, offset, 0, 0);
4100        if (!info) {
4101                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4102                                          1, 0);
4103                if (!info)
4104                        goto out;
4105        }
4106
4107have_info:
4108        if (info->bitmap) {
4109                u64 bit_off, bit_bytes;
4110                struct rb_node *n;
4111                struct btrfs_free_space *tmp;
4112
4113                bit_off = offset;
4114                bit_bytes = ctl->unit;
4115                ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4116                if (!ret) {
4117                        if (bit_off == offset) {
4118                                ret = 1;
4119                                goto out;
4120                        } else if (bit_off > offset &&
4121                                   offset + bytes > bit_off) {
4122                                ret = 1;
4123                                goto out;
4124                        }
4125                }
4126
4127                n = rb_prev(&info->offset_index);
4128                while (n) {
4129                        tmp = rb_entry(n, struct btrfs_free_space,
4130                                       offset_index);
4131                        if (tmp->offset + tmp->bytes < offset)
4132                                break;
4133                        if (offset + bytes < tmp->offset) {
4134                                n = rb_prev(&tmp->offset_index);
4135                                continue;
4136                        }
4137                        info = tmp;
4138                        goto have_info;
4139                }
4140
4141                n = rb_next(&info->offset_index);
4142                while (n) {
4143                        tmp = rb_entry(n, struct btrfs_free_space,
4144                                       offset_index);
4145                        if (offset + bytes < tmp->offset)
4146                                break;
4147                        if (tmp->offset + tmp->bytes < offset) {
4148                                n = rb_next(&tmp->offset_index);
4149                                continue;
4150                        }
4151                        info = tmp;
4152                        goto have_info;
4153                }
4154
4155                ret = 0;
4156                goto out;
4157        }
4158
4159        if (info->offset == offset) {
4160                ret = 1;
4161                goto out;
4162        }
4163
4164        if (offset > info->offset && offset < info->offset + info->bytes)
4165                ret = 1;
4166out:
4167        spin_unlock(&ctl->tree_lock);
4168        return ret;
4169}
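
/*
 * Illustrative sketch (not part of this file): how the sanity tests in
 * tests/free-space-tests.c typically combine the two helpers above -- seed a
 * specific entry, then assert its presence.  The offsets and sizes are
 * arbitrary values chosen only for the example.
 */
#if 0	/* example only, never built */
static int example_bitmap_roundtrip(struct btrfs_block_group *cache)
{
        int ret;

        /* Seed a 4M bitmap entry starting at 128M, bypassing merging. */
        ret = test_add_free_space_entry(cache, SZ_128M, SZ_4M, true);
        if (ret)
                return ret;

        /* The range must now show up as (at least partially) free. */
        if (!test_check_exists(cache, SZ_128M, SZ_4M))
                return -EINVAL;

        return 0;
}
#endif
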
4170#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
4171