linux/fs/btrfs/free-space-cache.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2008 Red Hat.  All rights reserved.
   4 */
   5
   6#include <linux/pagemap.h>
   7#include <linux/sched.h>
   8#include <linux/sched/signal.h>
   9#include <linux/slab.h>
  10#include <linux/math64.h>
  11#include <linux/ratelimit.h>
  12#include <linux/error-injection.h>
  13#include <linux/sched/mm.h>
  14#include "misc.h"
  15#include "ctree.h"
  16#include "free-space-cache.h"
  17#include "transaction.h"
  18#include "disk-io.h"
  19#include "extent_io.h"
  20#include "volumes.h"
  21#include "space-info.h"
  22#include "delalloc-space.h"
  23#include "block-group.h"
  24#include "discard.h"
  25#include "subpage.h"
  26#include "inode-item.h"
  27
  28#define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
  29#define MAX_CACHE_BYTES_PER_GIG SZ_64K
  30#define FORCE_EXTENT_THRESHOLD  SZ_1M
  31
  32struct btrfs_trim_range {
  33        u64 start;
  34        u64 bytes;
  35        struct list_head list;
  36};
  37
  38static int link_free_space(struct btrfs_free_space_ctl *ctl,
  39                           struct btrfs_free_space *info);
  40static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
  41                              struct btrfs_free_space *info, bool update_stat);
  42static int search_bitmap(struct btrfs_free_space_ctl *ctl,
  43                         struct btrfs_free_space *bitmap_info, u64 *offset,
  44                         u64 *bytes, bool for_alloc);
  45static void free_bitmap(struct btrfs_free_space_ctl *ctl,
  46                        struct btrfs_free_space *bitmap_info);
  47static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
  48                              struct btrfs_free_space *info, u64 offset,
  49                              u64 bytes, bool update_stats);
  50
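/*
 * Look up the free space header item for @offset in @root and iget() the
 * cache inode it points to, clamping the mapping's gfp mask so page cache
 * allocations for it cannot recurse back into the filesystem.
 */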
  51static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
  52                                               struct btrfs_path *path,
  53                                               u64 offset)
  54{
  55        struct btrfs_fs_info *fs_info = root->fs_info;
  56        struct btrfs_key key;
  57        struct btrfs_key location;
  58        struct btrfs_disk_key disk_key;
  59        struct btrfs_free_space_header *header;
  60        struct extent_buffer *leaf;
  61        struct inode *inode = NULL;
  62        unsigned nofs_flag;
  63        int ret;
  64
  65        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
  66        key.offset = offset;
  67        key.type = 0;
  68
  69        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  70        if (ret < 0)
  71                return ERR_PTR(ret);
  72        if (ret > 0) {
  73                btrfs_release_path(path);
  74                return ERR_PTR(-ENOENT);
  75        }
  76
  77        leaf = path->nodes[0];
  78        header = btrfs_item_ptr(leaf, path->slots[0],
  79                                struct btrfs_free_space_header);
  80        btrfs_free_space_key(leaf, header, &disk_key);
  81        btrfs_disk_key_to_cpu(&location, &disk_key);
  82        btrfs_release_path(path);
  83
  84        /*
  85         * We are often under a trans handle at this point, so we need to make
  86         * sure NOFS is set to keep us from deadlocking.
  87         */
  88        nofs_flag = memalloc_nofs_save();
  89        inode = btrfs_iget_path(fs_info->sb, location.objectid, root, path);
  90        btrfs_release_path(path);
  91        memalloc_nofs_restore(nofs_flag);
  92        if (IS_ERR(inode))
  93                return inode;
  94
  95        mapping_set_gfp_mask(inode->i_mapping,
  96                        mapping_gfp_constraint(inode->i_mapping,
  97                        ~(__GFP_FS | __GFP_HIGHMEM)));
  98
  99        return inode;
 100}
 101
 102struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
 103                struct btrfs_path *path)
 104{
 105        struct btrfs_fs_info *fs_info = block_group->fs_info;
 106        struct inode *inode = NULL;
 107        u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 108
 109        spin_lock(&block_group->lock);
 110        if (block_group->inode)
 111                inode = igrab(block_group->inode);
 112        spin_unlock(&block_group->lock);
 113        if (inode)
 114                return inode;
 115
 116        inode = __lookup_free_space_inode(fs_info->tree_root, path,
 117                                          block_group->start);
 118        if (IS_ERR(inode))
 119                return inode;
 120
 121        spin_lock(&block_group->lock);
 122        if (!((BTRFS_I(inode)->flags & flags) == flags)) {
 123                btrfs_info(fs_info, "Old style space inode found, converting.");
 124                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
 125                        BTRFS_INODE_NODATACOW;
 126                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 127        }
 128
 129        if (!block_group->iref) {
 130                block_group->inode = igrab(inode);
 131                block_group->iref = 1;
 132        }
 133        spin_unlock(&block_group->lock);
 134
 135        return inode;
 136}
 137
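/*
 * Create the two items that back a free space cache: the inode item for
 * @ino (NOCOMPRESS/PREALLOC/NODATASUM/NODATACOW) and the free space header
 * item keyed by @offset, both in @root.
 */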
 138static int __create_free_space_inode(struct btrfs_root *root,
 139                                     struct btrfs_trans_handle *trans,
 140                                     struct btrfs_path *path,
 141                                     u64 ino, u64 offset)
 142{
 143        struct btrfs_key key;
 144        struct btrfs_disk_key disk_key;
 145        struct btrfs_free_space_header *header;
 146        struct btrfs_inode_item *inode_item;
 147        struct extent_buffer *leaf;
 148        /* We inline CRCs for the free disk space cache */
 149        const u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC |
 150                          BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 151        int ret;
 152
 153        ret = btrfs_insert_empty_inode(trans, root, path, ino);
 154        if (ret)
 155                return ret;
 156
 157        leaf = path->nodes[0];
 158        inode_item = btrfs_item_ptr(leaf, path->slots[0],
 159                                    struct btrfs_inode_item);
 160        btrfs_item_key(leaf, &disk_key, path->slots[0]);
 161        memzero_extent_buffer(leaf, (unsigned long)inode_item,
 162                             sizeof(*inode_item));
 163        btrfs_set_inode_generation(leaf, inode_item, trans->transid);
 164        btrfs_set_inode_size(leaf, inode_item, 0);
 165        btrfs_set_inode_nbytes(leaf, inode_item, 0);
 166        btrfs_set_inode_uid(leaf, inode_item, 0);
 167        btrfs_set_inode_gid(leaf, inode_item, 0);
 168        btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
 169        btrfs_set_inode_flags(leaf, inode_item, flags);
 170        btrfs_set_inode_nlink(leaf, inode_item, 1);
 171        btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 172        btrfs_set_inode_block_group(leaf, inode_item, offset);
 173        btrfs_mark_buffer_dirty(leaf);
 174        btrfs_release_path(path);
 175
 176        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 177        key.offset = offset;
 178        key.type = 0;
 179        ret = btrfs_insert_empty_item(trans, root, path, &key,
 180                                      sizeof(struct btrfs_free_space_header));
 181        if (ret < 0) {
 182                btrfs_release_path(path);
 183                return ret;
 184        }
 185
 186        leaf = path->nodes[0];
 187        header = btrfs_item_ptr(leaf, path->slots[0],
 188                                struct btrfs_free_space_header);
 189        memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
 190        btrfs_set_free_space_key(leaf, header, &disk_key);
 191        btrfs_mark_buffer_dirty(leaf);
 192        btrfs_release_path(path);
 193
 194        return 0;
 195}
 196
 197int create_free_space_inode(struct btrfs_trans_handle *trans,
 198                            struct btrfs_block_group *block_group,
 199                            struct btrfs_path *path)
 200{
 201        int ret;
 202        u64 ino;
 203
 204        ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino);
 205        if (ret < 0)
 206                return ret;
 207
 208        return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
 209                                         ino, block_group->start);
 210}
 211
 212/*
 213 * inode is an optional sink: if it is NULL, btrfs_remove_free_space_inode
 214 * handles lookup, otherwise it takes ownership and iputs the inode.
 215 * Don't reuse an inode pointer after passing it into this function.
 216 */
 217int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
 218                                  struct inode *inode,
 219                                  struct btrfs_block_group *block_group)
 220{
 221        struct btrfs_path *path;
 222        struct btrfs_key key;
 223        int ret = 0;
 224
 225        path = btrfs_alloc_path();
 226        if (!path)
 227                return -ENOMEM;
 228
 229        if (!inode)
 230                inode = lookup_free_space_inode(block_group, path);
 231        if (IS_ERR(inode)) {
 232                if (PTR_ERR(inode) != -ENOENT)
 233                        ret = PTR_ERR(inode);
 234                goto out;
 235        }
 236        ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 237        if (ret) {
 238                btrfs_add_delayed_iput(inode);
 239                goto out;
 240        }
 241        clear_nlink(inode);
 242        /* One for the block group's ref */
 243        spin_lock(&block_group->lock);
 244        if (block_group->iref) {
 245                block_group->iref = 0;
 246                block_group->inode = NULL;
 247                spin_unlock(&block_group->lock);
 248                iput(inode);
 249        } else {
 250                spin_unlock(&block_group->lock);
 251        }
 252        /* One for the lookup ref */
 253        btrfs_add_delayed_iput(inode);
 254
 255        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 256        key.type = 0;
 257        key.offset = block_group->start;
 258        ret = btrfs_search_slot(trans, trans->fs_info->tree_root, &key, path,
 259                                -1, 1);
 260        if (ret) {
 261                if (ret > 0)
 262                        ret = 0;
 263                goto out;
 264        }
 265        ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
 266out:
 267        btrfs_free_path(path);
 268        return ret;
 269}
 270
 271int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
 272                                       struct btrfs_block_rsv *rsv)
 273{
 274        u64 needed_bytes;
 275        int ret;
 276
 277        /* 1 for slack space, 1 for updating the inode */
 278        needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
 279                btrfs_calc_metadata_size(fs_info, 1);
 280
 281        spin_lock(&rsv->lock);
 282        if (rsv->reserved < needed_bytes)
 283                ret = -ENOSPC;
 284        else
 285                ret = 0;
 286        spin_unlock(&rsv->lock);
 287        return ret;
 288}
 289
 290int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 291                                    struct btrfs_block_group *block_group,
 292                                    struct inode *vfs_inode)
 293{
 294        struct btrfs_truncate_control control = {
 295                .inode = BTRFS_I(vfs_inode),
 296                .new_size = 0,
 297                .ino = btrfs_ino(BTRFS_I(vfs_inode)),
 298                .min_type = BTRFS_EXTENT_DATA_KEY,
 299                .clear_extent_range = true,
 300        };
 301        struct btrfs_inode *inode = BTRFS_I(vfs_inode);
 302        struct btrfs_root *root = inode->root;
 303        struct extent_state *cached_state = NULL;
 304        int ret = 0;
 305        bool locked = false;
 306
 307        if (block_group) {
 308                struct btrfs_path *path = btrfs_alloc_path();
 309
 310                if (!path) {
 311                        ret = -ENOMEM;
 312                        goto fail;
 313                }
 314                locked = true;
 315                mutex_lock(&trans->transaction->cache_write_mutex);
 316                if (!list_empty(&block_group->io_list)) {
 317                        list_del_init(&block_group->io_list);
 318
 319                        btrfs_wait_cache_io(trans, block_group, path);
 320                        btrfs_put_block_group(block_group);
 321                }
 322
 323                /*
 324                 * Now that we've truncated the cache away, it's no longer
 325                 * set up or written.
 326                 */
 327                spin_lock(&block_group->lock);
 328                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 329                spin_unlock(&block_group->lock);
 330                btrfs_free_path(path);
 331        }
 332
 333        btrfs_i_size_write(inode, 0);
 334        truncate_pagecache(vfs_inode, 0);
 335
 336        lock_extent_bits(&inode->io_tree, 0, (u64)-1, &cached_state);
 337        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 338
 339        /*
 340         * We skip the throttling logic for free space cache inodes, so we don't
 341         * need to check for -EAGAIN.
 342         */
 343        ret = btrfs_truncate_inode_items(trans, root, &control);
 344
 345        inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
 346        btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
 347
 348        unlock_extent_cached(&inode->io_tree, 0, (u64)-1, &cached_state);
 349        if (ret)
 350                goto fail;
 351
 352        ret = btrfs_update_inode(trans, root, inode);
 353
 354fail:
 355        if (locked)
 356                mutex_unlock(&trans->transaction->cache_write_mutex);
 357        if (ret)
 358                btrfs_abort_transaction(trans, ret);
 359
 360        return ret;
 361}
 362
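/* Kick off readahead on the whole cache file before we start parsing it. */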
 363static void readahead_cache(struct inode *inode)
 364{
 365        struct file_ra_state ra;
 366        unsigned long last_index;
 367
 368        file_ra_state_init(&ra, inode->i_mapping);
 369        last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 370
 371        page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
 372}
 373
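/*
 * Set up an io_ctl for the cache inode: size the page array from i_size
 * and, for writes, make sure the per-page crcs plus the generation stamp
 * fit into the first page.
 */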
 374static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 375                       int write)
 376{
 377        int num_pages;
 378
 379        num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 380
 381        /* Make sure we can fit our crcs and generation into the first page */
 382        if (write && (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
 383                return -ENOSPC;
 384
 385        memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
 386
 387        io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
 388        if (!io_ctl->pages)
 389                return -ENOMEM;
 390
 391        io_ctl->num_pages = num_pages;
 392        io_ctl->fs_info = btrfs_sb(inode->i_sb);
 393        io_ctl->inode = inode;
 394
 395        return 0;
 396}
 397ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
 398
 399static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 400{
 401        kfree(io_ctl->pages);
 402        io_ctl->pages = NULL;
 403}
 404
 405static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
 406{
 407        if (io_ctl->cur) {
 408                io_ctl->cur = NULL;
 409                io_ctl->orig = NULL;
 410        }
 411}
 412
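/* Point io_ctl->cur at the next page in the array, optionally zeroing it. */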
 413static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 414{
 415        ASSERT(io_ctl->index < io_ctl->num_pages);
 416        io_ctl->page = io_ctl->pages[io_ctl->index++];
 417        io_ctl->cur = page_address(io_ctl->page);
 418        io_ctl->orig = io_ctl->cur;
 419        io_ctl->size = PAGE_SIZE;
 420        if (clear)
 421                clear_page(io_ctl->cur);
 422}
 423
 424static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 425{
 426        int i;
 427
 428        io_ctl_unmap_page(io_ctl);
 429
 430        for (i = 0; i < io_ctl->num_pages; i++) {
 431                if (io_ctl->pages[i]) {
 432                        btrfs_page_clear_checked(io_ctl->fs_info,
 433                                        io_ctl->pages[i],
 434                                        page_offset(io_ctl->pages[i]),
 435                                        PAGE_SIZE);
 436                        unlock_page(io_ctl->pages[i]);
 437                        put_page(io_ctl->pages[i]);
 438                }
 439        }
 440}
 441
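/*
 * Find or create every page of the cache file and attach extent mappings.
 * If @uptodate is set, also read the existing contents and fail if any
 * page cannot be brought uptodate.
 */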
 442static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
 443{
 444        struct page *page;
 445        struct inode *inode = io_ctl->inode;
 446        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 447        int i;
 448
 449        for (i = 0; i < io_ctl->num_pages; i++) {
 450                int ret;
 451
 452                page = find_or_create_page(inode->i_mapping, i, mask);
 453                if (!page) {
 454                        io_ctl_drop_pages(io_ctl);
 455                        return -ENOMEM;
 456                }
 457
 458                ret = set_page_extent_mapped(page);
 459                if (ret < 0) {
 460                        unlock_page(page);
 461                        put_page(page);
 462                        io_ctl_drop_pages(io_ctl);
 463                        return ret;
 464                }
 465
 466                io_ctl->pages[i] = page;
 467                if (uptodate && !PageUptodate(page)) {
 468                        btrfs_readpage(NULL, page);
 469                        lock_page(page);
 470                        if (page->mapping != inode->i_mapping) {
 471                                btrfs_err(BTRFS_I(inode)->root->fs_info,
 472                                          "free space cache page truncated");
 473                                io_ctl_drop_pages(io_ctl);
 474                                return -EIO;
 475                        }
 476                        if (!PageUptodate(page)) {
 477                                btrfs_err(BTRFS_I(inode)->root->fs_info,
 478                                           "error reading free space cache");
 479                                io_ctl_drop_pages(io_ctl);
 480                                return -EIO;
 481                        }
 482                }
 483        }
 484
 485        for (i = 0; i < io_ctl->num_pages; i++)
 486                clear_page_dirty_for_io(io_ctl->pages[i]);
 487
 488        return 0;
 489}
 490
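/*
 * Map the first page and stamp the cache generation just past the crc
 * area, so that the loader can later reject a stale cache file.
 */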
 491static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 492{
 493        io_ctl_map_page(io_ctl, 1);
 494
 495        /*
 496         * Skip the csum areas.  If we don't check crcs then we just have a
 497         * 64bit chunk at the front of the first page.
 498         */
 499        io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
 500        io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
 501
 502        put_unaligned_le64(generation, io_ctl->cur);
 503        io_ctl->cur += sizeof(u64);
 504}
 505
 506static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 507{
 508        u64 cache_gen;
 509
 510        /*
 511         * Skip the crc area.  If we don't check crcs then we just have a 64bit
 512         * chunk at the front of the first page.
 513         */
 514        io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
 515        io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
 516
 517        cache_gen = get_unaligned_le64(io_ctl->cur);
 518        if (cache_gen != generation) {
 519                btrfs_err_rl(io_ctl->fs_info,
 520                        "space cache generation (%llu) does not match inode (%llu)",
 521                                cache_gen, generation);
 522                io_ctl_unmap_page(io_ctl);
 523                return -EIO;
 524        }
 525        io_ctl->cur += sizeof(u64);
 526        return 0;
 527}
 528
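/*
 * Checksum the page we just finished (skipping the crc area itself on the
 * first page) and store the result in crc slot @index on page 0.
 */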
 529static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 530{
 531        u32 *tmp;
 532        u32 crc = ~(u32)0;
 533        unsigned offset = 0;
 534
 535        if (index == 0)
 536                offset = sizeof(u32) * io_ctl->num_pages;
 537
 538        crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 539        btrfs_crc32c_final(crc, (u8 *)&crc);
 540        io_ctl_unmap_page(io_ctl);
 541        tmp = page_address(io_ctl->pages[0]);
 542        tmp += index;
 543        *tmp = crc;
 544}
 545
 546static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 547{
 548        u32 *tmp, val;
 549        u32 crc = ~(u32)0;
 550        unsigned offset = 0;
 551
 552        if (index == 0)
 553                offset = sizeof(u32) * io_ctl->num_pages;
 554
 555        tmp = page_address(io_ctl->pages[0]);
 556        tmp += index;
 557        val = *tmp;
 558
 559        io_ctl_map_page(io_ctl, 0);
 560        crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 561        btrfs_crc32c_final(crc, (u8 *)&crc);
 562        if (val != crc) {
 563                btrfs_err_rl(io_ctl->fs_info,
 564                        "csum mismatch on free space cache");
 565                io_ctl_unmap_page(io_ctl);
 566                return -EIO;
 567        }
 568
 569        return 0;
 570}
 571
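/*
 * Append one on-disk free space entry (extent or bitmap marker) at the
 * current position, crc'ing this page and mapping the next one when it
 * fills up.
 */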
 572static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
 573                            void *bitmap)
 574{
 575        struct btrfs_free_space_entry *entry;
 576
 577        if (!io_ctl->cur)
 578                return -ENOSPC;
 579
 580        entry = io_ctl->cur;
 581        put_unaligned_le64(offset, &entry->offset);
 582        put_unaligned_le64(bytes, &entry->bytes);
 583        entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
 584                BTRFS_FREE_SPACE_EXTENT;
 585        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 586        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 587
 588        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 589                return 0;
 590
 591        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 592
 593        /* No more pages to map */
 594        if (io_ctl->index >= io_ctl->num_pages)
 595                return 0;
 596
 597        /* map the next page */
 598        io_ctl_map_page(io_ctl, 1);
 599        return 0;
 600}
 601
 602static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
 603{
 604        if (!io_ctl->cur)
 605                return -ENOSPC;
 606
 607        /*
 608         * If we aren't at the start of the current page, unmap this one and
 609         * map the next one if there is any left.
 610         */
 611        if (io_ctl->cur != io_ctl->orig) {
 612                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 613                if (io_ctl->index >= io_ctl->num_pages)
 614                        return -ENOSPC;
 615                io_ctl_map_page(io_ctl, 0);
 616        }
 617
 618        copy_page(io_ctl->cur, bitmap);
 619        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 620        if (io_ctl->index < io_ctl->num_pages)
 621                io_ctl_map_page(io_ctl, 0);
 622        return 0;
 623}
 624
 625static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
 626{
 627        /*
 628         * If we're not on the boundary we know we've modified the page and we
 629         * need to crc the page.
 630         */
 631        if (io_ctl->cur != io_ctl->orig)
 632                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 633        else
 634                io_ctl_unmap_page(io_ctl);
 635
 636        while (io_ctl->index < io_ctl->num_pages) {
 637                io_ctl_map_page(io_ctl, 1);
 638                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 639        }
 640}
 641
 642static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
 643                            struct btrfs_free_space *entry, u8 *type)
 644{
 645        struct btrfs_free_space_entry *e;
 646        int ret;
 647
 648        if (!io_ctl->cur) {
 649                ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 650                if (ret)
 651                        return ret;
 652        }
 653
 654        e = io_ctl->cur;
 655        entry->offset = get_unaligned_le64(&e->offset);
 656        entry->bytes = get_unaligned_le64(&e->bytes);
 657        *type = e->type;
 658        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 659        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 660
 661        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 662                return 0;
 663
 664        io_ctl_unmap_page(io_ctl);
 665
 666        return 0;
 667}
 668
 669static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 670                              struct btrfs_free_space *entry)
 671{
 672        int ret;
 673
 674        ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 675        if (ret)
 676                return ret;
 677
 678        copy_page(entry->bitmap, io_ctl->cur);
 679        io_ctl_unmap_page(io_ctl);
 680
 681        return 0;
 682}
 683
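/*
 * Recompute ctl->extents_thresh, the number of extent entries we allow
 * before new free space is tracked in bitmaps instead, based on the block
 * group size and the memory already consumed by the existing bitmaps.
 */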
 684static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 685{
 686        struct btrfs_block_group *block_group = ctl->block_group;
 687        u64 max_bytes;
 688        u64 bitmap_bytes;
 689        u64 extent_bytes;
 690        u64 size = block_group->length;
 691        u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
 692        u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
 693
 694        max_bitmaps = max_t(u64, max_bitmaps, 1);
 695
 696        ASSERT(ctl->total_bitmaps <= max_bitmaps);
 697
 698        /*
 699         * We are trying to keep the total amount of memory used per 1GiB of
 700         * space to be MAX_CACHE_BYTES_PER_GIG.  However, with a reclamation
 701         * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of
 702         * bitmaps, we may end up using more memory than this.
 703         */
 704        if (size < SZ_1G)
 705                max_bytes = MAX_CACHE_BYTES_PER_GIG;
 706        else
 707                max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
 708
 709        bitmap_bytes = ctl->total_bitmaps * ctl->unit;
 710
 711        /*
 712         * We want the extent entry threshold to be whatever is left after
 713         * the bitmaps, capped at 1/2 the max bytes we can have.
 714         */
 715        extent_bytes = max_bytes - bitmap_bytes;
 716        extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
 717
 718        ctl->extents_thresh =
 719                div_u64(extent_bytes, sizeof(struct btrfs_free_space));
 720}
 721
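/*
 * Parse the on-disk cache file for @offset into @ctl.  Returns 1 if the
 * cache was loaded, 0 if there was nothing usable on disk, and a negative
 * value on failure (in which case anything partially loaded is dropped).
 */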
 722static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 723                                   struct btrfs_free_space_ctl *ctl,
 724                                   struct btrfs_path *path, u64 offset)
 725{
 726        struct btrfs_fs_info *fs_info = root->fs_info;
 727        struct btrfs_free_space_header *header;
 728        struct extent_buffer *leaf;
 729        struct btrfs_io_ctl io_ctl;
 730        struct btrfs_key key;
 731        struct btrfs_free_space *e, *n;
 732        LIST_HEAD(bitmaps);
 733        u64 num_entries;
 734        u64 num_bitmaps;
 735        u64 generation;
 736        u8 type;
 737        int ret = 0;
 738
 739        /* Nothing in the space cache, goodbye */
 740        if (!i_size_read(inode))
 741                return 0;
 742
 743        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 744        key.offset = offset;
 745        key.type = 0;
 746
 747        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 748        if (ret < 0)
 749                return 0;
 750        else if (ret > 0) {
 751                btrfs_release_path(path);
 752                return 0;
 753        }
 754
 755        ret = -1;
 756
 757        leaf = path->nodes[0];
 758        header = btrfs_item_ptr(leaf, path->slots[0],
 759                                struct btrfs_free_space_header);
 760        num_entries = btrfs_free_space_entries(leaf, header);
 761        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
 762        generation = btrfs_free_space_generation(leaf, header);
 763        btrfs_release_path(path);
 764
 765        if (!BTRFS_I(inode)->generation) {
 766                btrfs_info(fs_info,
 767                           "the free space cache file (%llu) is invalid, skip it",
 768                           offset);
 769                return 0;
 770        }
 771
 772        if (BTRFS_I(inode)->generation != generation) {
 773                btrfs_err(fs_info,
 774                          "free space inode generation (%llu) did not match free space cache generation (%llu)",
 775                          BTRFS_I(inode)->generation, generation);
 776                return 0;
 777        }
 778
 779        if (!num_entries)
 780                return 0;
 781
 782        ret = io_ctl_init(&io_ctl, inode, 0);
 783        if (ret)
 784                return ret;
 785
 786        readahead_cache(inode);
 787
 788        ret = io_ctl_prepare_pages(&io_ctl, true);
 789        if (ret)
 790                goto out;
 791
 792        ret = io_ctl_check_crc(&io_ctl, 0);
 793        if (ret)
 794                goto free_cache;
 795
 796        ret = io_ctl_check_generation(&io_ctl, generation);
 797        if (ret)
 798                goto free_cache;
 799
 800        while (num_entries) {
 801                e = kmem_cache_zalloc(btrfs_free_space_cachep,
 802                                      GFP_NOFS);
 803                if (!e) {
 804                        ret = -ENOMEM;
 805                        goto free_cache;
 806                }
 807
 808                ret = io_ctl_read_entry(&io_ctl, e, &type);
 809                if (ret) {
 810                        kmem_cache_free(btrfs_free_space_cachep, e);
 811                        goto free_cache;
 812                }
 813
 814                if (!e->bytes) {
 815                        ret = -1;
 816                        kmem_cache_free(btrfs_free_space_cachep, e);
 817                        goto free_cache;
 818                }
 819
 820                if (type == BTRFS_FREE_SPACE_EXTENT) {
 821                        spin_lock(&ctl->tree_lock);
 822                        ret = link_free_space(ctl, e);
 823                        spin_unlock(&ctl->tree_lock);
 824                        if (ret) {
 825                                btrfs_err(fs_info,
 826                                        "Duplicate entries in free space cache, dumping");
 827                                kmem_cache_free(btrfs_free_space_cachep, e);
 828                                goto free_cache;
 829                        }
 830                } else {
 831                        ASSERT(num_bitmaps);
 832                        num_bitmaps--;
 833                        e->bitmap = kmem_cache_zalloc(
 834                                        btrfs_free_space_bitmap_cachep, GFP_NOFS);
 835                        if (!e->bitmap) {
 836                                ret = -ENOMEM;
 837                                kmem_cache_free(
 838                                        btrfs_free_space_cachep, e);
 839                                goto free_cache;
 840                        }
 841                        spin_lock(&ctl->tree_lock);
 842                        ret = link_free_space(ctl, e);
 843                        ctl->total_bitmaps++;
 844                        recalculate_thresholds(ctl);
 845                        spin_unlock(&ctl->tree_lock);
 846                        if (ret) {
 847                                btrfs_err(fs_info,
 848                                        "Duplicate entries in free space cache, dumping");
 849                                kmem_cache_free(btrfs_free_space_cachep, e);
 850                                goto free_cache;
 851                        }
 852                        list_add_tail(&e->list, &bitmaps);
 853                }
 854
 855                num_entries--;
 856        }
 857
 858        io_ctl_unmap_page(&io_ctl);
 859
 860        /*
 861         * The bitmap pages are stored after all of the entries in the cache
 862         * file, so we read the queued-up bitmap contents only now, at the end.
 863         */
 864        list_for_each_entry_safe(e, n, &bitmaps, list) {
 865                list_del_init(&e->list);
 866                ret = io_ctl_read_bitmap(&io_ctl, e);
 867                if (ret)
 868                        goto free_cache;
 869        }
 870
 871        io_ctl_drop_pages(&io_ctl);
 872        ret = 1;
 873out:
 874        io_ctl_free(&io_ctl);
 875        return ret;
 876free_cache:
 877        io_ctl_drop_pages(&io_ctl);
 878        __btrfs_remove_free_space_cache(ctl);
 879        goto out;
 880}
 881
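/*
 * Move every entry from the temporary load-time ctl into the block group's
 * real free space ctl, expanding bitmaps back into their set ranges.
 */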
 882static int copy_free_space_cache(struct btrfs_block_group *block_group,
 883                                 struct btrfs_free_space_ctl *ctl)
 884{
 885        struct btrfs_free_space *info;
 886        struct rb_node *n;
 887        int ret = 0;
 888
 889        while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) {
 890                info = rb_entry(n, struct btrfs_free_space, offset_index);
 891                if (!info->bitmap) {
 892                        unlink_free_space(ctl, info, true);
 893                        ret = btrfs_add_free_space(block_group, info->offset,
 894                                                   info->bytes);
 895                        kmem_cache_free(btrfs_free_space_cachep, info);
 896                } else {
 897                        u64 offset = info->offset;
 898                        u64 bytes = ctl->unit;
 899
 900                        while (search_bitmap(ctl, info, &offset, &bytes,
 901                                             false) == 0) {
 902                                ret = btrfs_add_free_space(block_group, offset,
 903                                                           bytes);
 904                                if (ret)
 905                                        break;
 906                                bitmap_clear_bits(ctl, info, offset, bytes, true);
 907                                offset = info->offset;
 908                                bytes = ctl->unit;
 909                        }
 910                        free_bitmap(ctl, info);
 911                }
 912                cond_resched();
 913        }
 914        return ret;
 915}
 916
 917int load_free_space_cache(struct btrfs_block_group *block_group)
 918{
 919        struct btrfs_fs_info *fs_info = block_group->fs_info;
 920        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 921        struct btrfs_free_space_ctl tmp_ctl = {};
 922        struct inode *inode;
 923        struct btrfs_path *path;
 924        int ret = 0;
 925        bool matched;
 926        u64 used = block_group->used;
 927
 928        /*
 929         * Because we could potentially discard our loaded free space, we want
 930         * to load everything into a temporary structure first, and then if it's
 931         * valid copy it all into the actual free space ctl.
 932         */
 933        btrfs_init_free_space_ctl(block_group, &tmp_ctl);
 934
 935        /*
 936         * If this block group has been marked to be cleared for one reason or
 937         * another then we can't trust the on disk cache, so just return.
 938         */
 939        spin_lock(&block_group->lock);
 940        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 941                spin_unlock(&block_group->lock);
 942                return 0;
 943        }
 944        spin_unlock(&block_group->lock);
 945
 946        path = btrfs_alloc_path();
 947        if (!path)
 948                return 0;
 949        path->search_commit_root = 1;
 950        path->skip_locking = 1;
 951
 952        /*
 953         * We must pass a path with search_commit_root set to btrfs_iget in
 954         * order to avoid a deadlock when allocating extents for the tree root.
 955         *
 956         * When we are COWing an extent buffer from the tree root, when looking
 957         * for a free extent, at extent-tree.c:find_free_extent(), we can find
 958         * a block group without its free space cache loaded. When we find one
 959         * we must load its space cache which requires reading its free space
 960         * cache's inode item from the root tree. If this inode item is located
 961         * in the same leaf that we started COWing before, then we end up in
 962         * a deadlock on the extent buffer (trying to read lock it when we
 963         * previously write locked it).
 964         *
 965         * It's safe to read the inode item using the commit root because
 966         * block groups, once loaded, stay in memory forever (until they are
 967         * removed) as well as their space caches once loaded. New block groups
 968         * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
 969         * we will never try to read their inode item while the fs is mounted.
 970         */
 971        inode = lookup_free_space_inode(block_group, path);
 972        if (IS_ERR(inode)) {
 973                btrfs_free_path(path);
 974                return 0;
 975        }
 976
 977        /* We may have converted the inode and made the cache invalid. */
 978        spin_lock(&block_group->lock);
 979        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 980                spin_unlock(&block_group->lock);
 981                btrfs_free_path(path);
 982                goto out;
 983        }
 984        spin_unlock(&block_group->lock);
 985
 986        ret = __load_free_space_cache(fs_info->tree_root, inode, &tmp_ctl,
 987                                      path, block_group->start);
 988        btrfs_free_path(path);
 989        if (ret <= 0)
 990                goto out;
 991
 992        matched = (tmp_ctl.free_space == (block_group->length - used -
 993                                          block_group->bytes_super));
 994
 995        if (matched) {
 996                ret = copy_free_space_cache(block_group, &tmp_ctl);
 997                /*
 998                 * ret == 1 means we successfully loaded the free space cache,
 999                 * so we need to re-set it here.
1000                 */
1001                if (ret == 0)
1002                        ret = 1;
1003        } else {
1004                __btrfs_remove_free_space_cache(&tmp_ctl);
1005                btrfs_warn(fs_info,
1006                           "block group %llu has wrong amount of free space",
1007                           block_group->start);
1008                ret = -1;
1009        }
1010out:
1011        if (ret < 0) {
1012                /* This cache is bogus, make sure it gets cleared */
1013                spin_lock(&block_group->lock);
1014                block_group->disk_cache_state = BTRFS_DC_CLEAR;
1015                spin_unlock(&block_group->lock);
1016                ret = 0;
1017
1018                btrfs_warn(fs_info,
1019                           "failed to load free space cache for block group %llu, rebuilding it now",
1020                           block_group->start);
1021        }
1022
1023        spin_lock(&ctl->tree_lock);
1024        btrfs_discard_update_discardable(block_group);
1025        spin_unlock(&ctl->tree_lock);
1026        iput(inode);
1027        return ret;
1028}
1029
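/*
 * Write an entry for every extent and bitmap in the free space ctl, for any
 * cluster attached to the block group and for ranges currently being
 * trimmed, queueing the bitmaps themselves on @bitmap_list for later.
 */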
1030static noinline_for_stack
1031int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
1032                              struct btrfs_free_space_ctl *ctl,
1033                              struct btrfs_block_group *block_group,
1034                              int *entries, int *bitmaps,
1035                              struct list_head *bitmap_list)
1036{
1037        int ret;
1038        struct btrfs_free_cluster *cluster = NULL;
1039        struct btrfs_free_cluster *cluster_locked = NULL;
1040        struct rb_node *node = rb_first(&ctl->free_space_offset);
1041        struct btrfs_trim_range *trim_entry;
1042
1043        /* Get the cluster for this block_group if it exists */
1044        if (block_group && !list_empty(&block_group->cluster_list)) {
1045                cluster = list_entry(block_group->cluster_list.next,
1046                                     struct btrfs_free_cluster,
1047                                     block_group_list);
1048        }
1049
1050        if (!node && cluster) {
1051                cluster_locked = cluster;
1052                spin_lock(&cluster_locked->lock);
1053                node = rb_first(&cluster->root);
1054                cluster = NULL;
1055        }
1056
1057        /* Write out the extent entries */
1058        while (node) {
1059                struct btrfs_free_space *e;
1060
1061                e = rb_entry(node, struct btrfs_free_space, offset_index);
1062                *entries += 1;
1063
1064                ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
1065                                       e->bitmap);
1066                if (ret)
1067                        goto fail;
1068
1069                if (e->bitmap) {
1070                        list_add_tail(&e->list, bitmap_list);
1071                        *bitmaps += 1;
1072                }
1073                node = rb_next(node);
1074                if (!node && cluster) {
1075                        node = rb_first(&cluster->root);
1076                        cluster_locked = cluster;
1077                        spin_lock(&cluster_locked->lock);
1078                        cluster = NULL;
1079                }
1080        }
1081        if (cluster_locked) {
1082                spin_unlock(&cluster_locked->lock);
1083                cluster_locked = NULL;
1084        }
1085
1086        /*
1087         * Make sure we don't miss any range that was removed from our rbtree
1088         * because trimming is running. Otherwise after a umount+mount (or crash
1089         * after committing the transaction) we would leak free space and get
1090         * an inconsistent free space cache report from fsck.
1091         */
1092        list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
1093                ret = io_ctl_add_entry(io_ctl, trim_entry->start,
1094                                       trim_entry->bytes, NULL);
1095                if (ret)
1096                        goto fail;
1097                *entries += 1;
1098        }
1099
1100        return 0;
1101fail:
1102        if (cluster_locked)
1103                spin_unlock(&cluster_locked->lock);
1104        return -ENOSPC;
1105}
1106
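/*
 * Update the free space header item with the final entry/bitmap counts and
 * the transaction's generation.  On failure the delalloc bits on the cache
 * file are cleared and -1 is returned.
 */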
1107static noinline_for_stack int
1108update_cache_item(struct btrfs_trans_handle *trans,
1109                  struct btrfs_root *root,
1110                  struct inode *inode,
1111                  struct btrfs_path *path, u64 offset,
1112                  int entries, int bitmaps)
1113{
1114        struct btrfs_key key;
1115        struct btrfs_free_space_header *header;
1116        struct extent_buffer *leaf;
1117        int ret;
1118
1119        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1120        key.offset = offset;
1121        key.type = 0;
1122
1123        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1124        if (ret < 0) {
1125                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1126                                 EXTENT_DELALLOC, 0, 0, NULL);
1127                goto fail;
1128        }
1129        leaf = path->nodes[0];
1130        if (ret > 0) {
1131                struct btrfs_key found_key;
1132                ASSERT(path->slots[0]);
1133                path->slots[0]--;
1134                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1135                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1136                    found_key.offset != offset) {
1137                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1138                                         inode->i_size - 1, EXTENT_DELALLOC, 0,
1139                                         0, NULL);
1140                        btrfs_release_path(path);
1141                        goto fail;
1142                }
1143        }
1144
1145        BTRFS_I(inode)->generation = trans->transid;
1146        header = btrfs_item_ptr(leaf, path->slots[0],
1147                                struct btrfs_free_space_header);
1148        btrfs_set_free_space_entries(leaf, header, entries);
1149        btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1150        btrfs_set_free_space_generation(leaf, header, trans->transid);
1151        btrfs_mark_buffer_dirty(leaf);
1152        btrfs_release_path(path);
1153
1154        return 0;
1155
1156fail:
1157        return -1;
1158}
1159
1160static noinline_for_stack int write_pinned_extent_entries(
1161                            struct btrfs_trans_handle *trans,
1162                            struct btrfs_block_group *block_group,
1163                            struct btrfs_io_ctl *io_ctl,
1164                            int *entries)
1165{
1166        u64 start, extent_start, extent_end, len;
1167        struct extent_io_tree *unpin = NULL;
1168        int ret;
1169
1170        if (!block_group)
1171                return 0;
1172
1173        /*
1174         * We want to add any pinned extents to our free space cache
1175         * so we don't leak the space.
1176         *
1177         * We shouldn't have switched the pinned extents yet, so this is the
1178         * right one.
1179         */
1180        unpin = &trans->transaction->pinned_extents;
1181
1182        start = block_group->start;
1183
1184        while (start < block_group->start + block_group->length) {
1185                ret = find_first_extent_bit(unpin, start,
1186                                            &extent_start, &extent_end,
1187                                            EXTENT_DIRTY, NULL);
1188                if (ret)
1189                        return 0;
1190
1191                /* This pinned extent is out of our range */
1192                if (extent_start >= block_group->start + block_group->length)
1193                        return 0;
1194
1195                extent_start = max(extent_start, start);
1196                extent_end = min(block_group->start + block_group->length,
1197                                 extent_end + 1);
1198                len = extent_end - extent_start;
1199
1200                *entries += 1;
1201                ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1202                if (ret)
1203                        return -ENOSPC;
1204
1205                start = extent_end;
1206        }
1207
1208        return 0;
1209}
1210
1211static noinline_for_stack int
1212write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1213{
1214        struct btrfs_free_space *entry, *next;
1215        int ret;
1216
1217        /* Write out the bitmaps */
1218        list_for_each_entry_safe(entry, next, bitmap_list, list) {
1219                ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1220                if (ret)
1221                        return -ENOSPC;
1222                list_del_init(&entry->list);
1223        }
1224
1225        return 0;
1226}
1227
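/*
 * Wait for ordered IO on the cache file to finish; on failure clear the
 * delalloc bits on the cache file.
 */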
1228static int flush_dirty_cache(struct inode *inode)
1229{
1230        int ret;
1231
1232        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1233        if (ret)
1234                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1235                                 EXTENT_DELALLOC, 0, 0, NULL);
1236
1237        return ret;
1238}
1239
1240static void noinline_for_stack
1241cleanup_bitmap_list(struct list_head *bitmap_list)
1242{
1243        struct btrfs_free_space *entry, *next;
1244
1245        list_for_each_entry_safe(entry, next, bitmap_list, list)
1246                list_del_init(&entry->list);
1247}
1248
1249static void noinline_for_stack
1250cleanup_write_cache_enospc(struct inode *inode,
1251                           struct btrfs_io_ctl *io_ctl,
1252                           struct extent_state **cached_state)
1253{
1254        io_ctl_drop_pages(io_ctl);
1255        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1256                             i_size_read(inode) - 1, cached_state);
1257}
1258
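/*
 * Second half of the cache writeout: wait for the pages submitted by
 * __btrfs_write_out_cache, update the cache item and the inode, and move
 * the block group to BTRFS_DC_WRITTEN (or BTRFS_DC_ERROR on failure).
 */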
1259static int __btrfs_wait_cache_io(struct btrfs_root *root,
1260                                 struct btrfs_trans_handle *trans,
1261                                 struct btrfs_block_group *block_group,
1262                                 struct btrfs_io_ctl *io_ctl,
1263                                 struct btrfs_path *path, u64 offset)
1264{
1265        int ret;
1266        struct inode *inode = io_ctl->inode;
1267
1268        if (!inode)
1269                return 0;
1270
1271        /* Flush the dirty pages in the cache file. */
1272        ret = flush_dirty_cache(inode);
1273        if (ret)
1274                goto out;
1275
1276        /* Update the cache item to tell everyone this cache file is valid. */
1277        ret = update_cache_item(trans, root, inode, path, offset,
1278                                io_ctl->entries, io_ctl->bitmaps);
1279out:
1280        if (ret) {
1281                invalidate_inode_pages2(inode->i_mapping);
1282                BTRFS_I(inode)->generation = 0;
1283                if (block_group)
1284                        btrfs_debug(root->fs_info,
1285          "failed to write free space cache for block group %llu error %d",
1286                                  block_group->start, ret);
1287        }
1288        btrfs_update_inode(trans, root, BTRFS_I(inode));
1289
1290        if (block_group) {
1291                /* the dirty list is protected by the dirty_bgs_lock */
1292                spin_lock(&trans->transaction->dirty_bgs_lock);
1293
1294                /* the disk_cache_state is protected by the block group lock */
1295                spin_lock(&block_group->lock);
1296
1297                /*
1298                 * Only mark this as written if we didn't get put back on
1299                 * the dirty list while waiting for IO. Otherwise our cache
1300                 * state won't be right, and we won't get written again.
1301                 */
1302                if (!ret && list_empty(&block_group->dirty_list))
1303                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1304                else if (ret)
1305                        block_group->disk_cache_state = BTRFS_DC_ERROR;
1306
1307                spin_unlock(&block_group->lock);
1308                spin_unlock(&trans->transaction->dirty_bgs_lock);
1309                io_ctl->inode = NULL;
1310                iput(inode);
1311        }
1312
1313        return ret;
1314
1315}
1316
1317int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1318                        struct btrfs_block_group *block_group,
1319                        struct btrfs_path *path)
1320{
1321        return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1322                                     block_group, &block_group->io_ctl,
1323                                     path, block_group->start);
1324}
1325
1326/**
1327 * Write out cached info to an inode
1328 *
1329 * @root:        root the inode belongs to
1330 * @inode:       freespace inode we are writing out
1331 * @ctl:         free space cache we are going to write out
1332 * @block_group: block_group for this cache if it belongs to a block_group
1333 * @io_ctl:      holds context for the io
1334 * @trans:       the trans handle
1335 *
1336 * This function writes out a free space cache struct to disk for quick recovery
1337 * on mount.  This will return 0 if it was successful in writing the cache out,
1338 * or an errno if it was not.
1339 */
1340static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1341                                   struct btrfs_free_space_ctl *ctl,
1342                                   struct btrfs_block_group *block_group,
1343                                   struct btrfs_io_ctl *io_ctl,
1344                                   struct btrfs_trans_handle *trans)
1345{
1346        struct extent_state *cached_state = NULL;
1347        LIST_HEAD(bitmap_list);
1348        int entries = 0;
1349        int bitmaps = 0;
1350        int ret;
1351        int must_iput = 0;
1352
1353        if (!i_size_read(inode))
1354                return -EIO;
1355
1356        WARN_ON(io_ctl->pages);
1357        ret = io_ctl_init(io_ctl, inode, 1);
1358        if (ret)
1359                return ret;
1360
1361        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1362                down_write(&block_group->data_rwsem);
1363                spin_lock(&block_group->lock);
1364                if (block_group->delalloc_bytes) {
1365                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1366                        spin_unlock(&block_group->lock);
1367                        up_write(&block_group->data_rwsem);
1368                        BTRFS_I(inode)->generation = 0;
1369                        ret = 0;
1370                        must_iput = 1;
1371                        goto out;
1372                }
1373                spin_unlock(&block_group->lock);
1374        }
1375
1376        /* Lock all pages first so we can lock the extent safely. */
1377        ret = io_ctl_prepare_pages(io_ctl, false);
1378        if (ret)
1379                goto out_unlock;
1380
1381        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1382                         &cached_state);
1383
1384        io_ctl_set_generation(io_ctl, trans->transid);
1385
1386        mutex_lock(&ctl->cache_writeout_mutex);
1387        /* Write out the extent entries in the free space cache */
1388        spin_lock(&ctl->tree_lock);
1389        ret = write_cache_extent_entries(io_ctl, ctl,
1390                                         block_group, &entries, &bitmaps,
1391                                         &bitmap_list);
1392        if (ret)
1393                goto out_nospc_locked;
1394
1395        /*
1396         * Some space that is freed in the current transaction is pinned;
1397         * it will be added to the free space cache after the transaction is
1398         * committed, so we must not lose it.
1399         *
1400         * If this changes while we are working we'll get added back to
1401         * the dirty list and redo it.  No locking is needed.
1402         */
1403        ret = write_pinned_extent_entries(trans, block_group, io_ctl, &entries);
1404        if (ret)
1405                goto out_nospc_locked;
1406
1407        /*
1408         * At last, we write out all the bitmaps and keep cache_writeout_mutex
1409         * locked while doing it because a concurrent trim can be manipulating
1410         * or freeing the bitmap.
1411         */
1412        ret = write_bitmap_entries(io_ctl, &bitmap_list);
1413        spin_unlock(&ctl->tree_lock);
1414        mutex_unlock(&ctl->cache_writeout_mutex);
1415        if (ret)
1416                goto out_nospc;
1417
1418        /* Zero out the rest of the pages just to make sure */
1419        io_ctl_zero_remaining_pages(io_ctl);
1420
1421        /* Everything is written out, now we dirty the pages in the file. */
1422        ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
1423                                io_ctl->num_pages, 0, i_size_read(inode),
1424                                &cached_state, false);
1425        if (ret)
1426                goto out_nospc;
1427
1428        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1429                up_write(&block_group->data_rwsem);
1430        /*
1431         * Release the pages and unlock the extent; we will flush
1432         * them out later.
1433         */
1434        io_ctl_drop_pages(io_ctl);
1435        io_ctl_free(io_ctl);
1436
1437        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1438                             i_size_read(inode) - 1, &cached_state);
1439
1440        /*
1441         * At this point the pages are under IO and we're happy.
1442         * The caller is responsible for waiting on them and updating
1443         * the cache and the inode.
1444         */
1445        io_ctl->entries = entries;
1446        io_ctl->bitmaps = bitmaps;
1447
1448        ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1449        if (ret)
1450                goto out;
1451
1452        return 0;
1453
1454out_nospc_locked:
1455        cleanup_bitmap_list(&bitmap_list);
1456        spin_unlock(&ctl->tree_lock);
1457        mutex_unlock(&ctl->cache_writeout_mutex);
1458
1459out_nospc:
1460        cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1461
1462out_unlock:
1463        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1464                up_write(&block_group->data_rwsem);
1465
1466out:
1467        io_ctl->inode = NULL;
1468        io_ctl_free(io_ctl);
1469        if (ret) {
1470                invalidate_inode_pages2(inode->i_mapping);
1471                BTRFS_I(inode)->generation = 0;
1472        }
1473        btrfs_update_inode(trans, root, BTRFS_I(inode));
1474        if (must_iput)
1475                iput(inode);
1476        return ret;
1477}
1478
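    /*
     * Write out the free space cache for @block_group.  On success the cache
     * pages are left under writeback and the caller is expected to call
     * btrfs_wait_cache_io() to wait for the IO and put the inode; on failure
     * the block group is marked BTRFS_DC_ERROR and the io_ctl inode reference
     * is dropped here.
     */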
1479int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1480                          struct btrfs_block_group *block_group,
1481                          struct btrfs_path *path)
1482{
1483        struct btrfs_fs_info *fs_info = trans->fs_info;
1484        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1485        struct inode *inode;
1486        int ret = 0;
1487
1488        spin_lock(&block_group->lock);
1489        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1490                spin_unlock(&block_group->lock);
1491                return 0;
1492        }
1493        spin_unlock(&block_group->lock);
1494
1495        inode = lookup_free_space_inode(block_group, path);
1496        if (IS_ERR(inode))
1497                return 0;
1498
1499        ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1500                                block_group, &block_group->io_ctl, trans);
1501        if (ret) {
1502                btrfs_debug(fs_info,
1503          "failed to write free space cache for block group %llu error %d",
1504                          block_group->start, ret);
1505                spin_lock(&block_group->lock);
1506                block_group->disk_cache_state = BTRFS_DC_ERROR;
1507                spin_unlock(&block_group->lock);
1508
1509                block_group->io_ctl.inode = NULL;
1510                iput(inode);
1511        }
1512
1513        /*
1514         * If ret == 0 the caller is expected to call btrfs_wait_cache_io()
1515         * to wait for the IO and put the inode.
1516         */
1517
1518        return ret;
1519}
1520
1521static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1522                                          u64 offset)
1523{
1524        ASSERT(offset >= bitmap_start);
1525        offset -= bitmap_start;
1526        return (unsigned long)(div_u64(offset, unit));
1527}
1528
1529static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1530{
1531        return (unsigned long)(div_u64(bytes, unit));
1532}
1533
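    /*
     * Worked example for the helpers above and below (assuming a 4K page size
     * and ctl->unit == 4K, so BITS_PER_BITMAP == 32768 and one bitmap covers
     * 128M): with ctl->start == 1G, an offset of 1G + 200M maps via
     * offset_to_bitmap() to the bitmap starting at 1G + 128M, and
     * offset_to_bit() of 1G + 200M within that bitmap is
     * (200M - 128M) / 4K == 18432.
     */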
1534static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1535                                   u64 offset)
1536{
1537        u64 bitmap_start;
1538        u64 bytes_per_bitmap;
1539
1540        bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1541        bitmap_start = offset - ctl->start;
1542        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1543        bitmap_start *= bytes_per_bitmap;
1544        bitmap_start += ctl->start;
1545
1546        return bitmap_start;
1547}
1548
1549static int tree_insert_offset(struct rb_root *root, u64 offset,
1550                              struct rb_node *node, int bitmap)
1551{
1552        struct rb_node **p = &root->rb_node;
1553        struct rb_node *parent = NULL;
1554        struct btrfs_free_space *info;
1555
1556        while (*p) {
1557                parent = *p;
1558                info = rb_entry(parent, struct btrfs_free_space, offset_index);
1559
1560                if (offset < info->offset) {
1561                        p = &(*p)->rb_left;
1562                } else if (offset > info->offset) {
1563                        p = &(*p)->rb_right;
1564                } else {
1565                        /*
1566                         * we could have a bitmap entry and an extent entry
1567                         * share the same offset.  If this is the case, we want
1568                         * the extent entry to always be found first if we do a
1569                         * linear search through the tree, since we want to have
1570                         * the quickest allocation time, and allocating from an
1571                         * extent is faster than allocating from a bitmap.  So
1572                         * if we're inserting a bitmap and we find an entry at
1573                         * this offset, we want to go right, or after this entry
1574                         * logically.  If we are inserting an extent and we've
1575                         * found a bitmap, we want to go left, or before
1576                         * logically.
1577                         */
1578                        if (bitmap) {
1579                                if (info->bitmap) {
1580                                        WARN_ON_ONCE(1);
1581                                        return -EEXIST;
1582                                }
1583                                p = &(*p)->rb_right;
1584                        } else {
1585                                if (!info->bitmap) {
1586                                        WARN_ON_ONCE(1);
1587                                        return -EEXIST;
1588                                }
1589                                p = &(*p)->rb_left;
1590                        }
1591                }
1592        }
1593
1594        rb_link_node(node, parent, p);
1595        rb_insert_color(node, root);
1596
1597        return 0;
1598}
1599
1600/*
1601 * This is a little subtle.  We *only* have ->max_extent_size set if we actually
1602 * searched through the bitmap and figured out the largest ->max_extent_size,
1603 * otherwise it's 0.  In the case that it's 0 we don't want to tell the
1604 * allocator the wrong thing, we want to use the actual real max_extent_size
1605 * we've found already if it's larger, or we want to use ->bytes.
1606 *
1607 * This matters because find_free_space() will skip entries whose ->bytes is
1608 * less than the required bytes.  So if we didn't search down this bitmap, we
1609 * may pick some previous entry that has a smaller ->max_extent_size than we
1610 * have.  For example, assume we have two entries, one that has
1611 * ->max_extent_size set to 4K and ->bytes set to 1M.  A second entry hasn't set
1612 * ->max_extent_size yet, has ->bytes set to 8K and it's contiguous.  We will
1613 * call into find_free_space(), and return with max_extent_size == 4K, because
1614 * that first bitmap entry had ->max_extent_size set, but the second one did
1615 * not.  If instead we returned 8K we'd come in searching for 8K, and find the
1616 * 8K contiguous range.
1617 *
1618 * Consider the other case: we have two 8K chunks in that second entry and still
1619 * don't have ->max_extent_size set.  We'll return 16K, and the next time the
1620 * allocator comes in it'll fully search our second bitmap, and this time it'll
1621 * get an up to date value of 8K as the maximum chunk size.  Then we'll get the
1622 * right allocation the next loop through.
1623 */
1624static inline u64 get_max_extent_size(const struct btrfs_free_space *entry)
1625{
1626        if (entry->bitmap && entry->max_extent_size)
1627                return entry->max_extent_size;
1628        return entry->bytes;
1629}
1630
1631/*
1632 * We want the largest entry to be leftmost, so this is inverted from what you'd
1633 * normally expect.
1634 */
1635static bool entry_less(struct rb_node *node, const struct rb_node *parent)
1636{
1637        const struct btrfs_free_space *entry, *exist;
1638
1639        entry = rb_entry(node, struct btrfs_free_space, bytes_index);
1640        exist = rb_entry(parent, struct btrfs_free_space, bytes_index);
1641        return get_max_extent_size(exist) < get_max_extent_size(entry);
1642}
1643
1644/*
1645 * Search the in-memory free space tree for an entry at the given offset.
1646 *
1647 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1648 * want a section that has at least 'bytes' in size and comes at or after the
1649 * given offset.
1650 */
1651static struct btrfs_free_space *
1652tree_search_offset(struct btrfs_free_space_ctl *ctl,
1653                   u64 offset, int bitmap_only, int fuzzy)
1654{
1655        struct rb_node *n = ctl->free_space_offset.rb_node;
1656        struct btrfs_free_space *entry = NULL, *prev = NULL;
1657
1658        /* find entry that is closest to the 'offset' */
1659        while (n) {
1660                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1661                prev = entry;
1662
1663                if (offset < entry->offset)
1664                        n = n->rb_left;
1665                else if (offset > entry->offset)
1666                        n = n->rb_right;
1667                else
1668                        break;
1669
1670                entry = NULL;
1671        }
1672
1673        if (bitmap_only) {
1674                if (!entry)
1675                        return NULL;
1676                if (entry->bitmap)
1677                        return entry;
1678
1679                /*
1680                 * A bitmap entry and an extent entry may share the same offset;
1681                 * in that case, the bitmap entry comes after the extent entry.
1682                 */
1683                n = rb_next(n);
1684                if (!n)
1685                        return NULL;
1686                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1687                if (entry->offset != offset)
1688                        return NULL;
1689
1690                WARN_ON(!entry->bitmap);
1691                return entry;
1692        } else if (entry) {
1693                if (entry->bitmap) {
1694                        /*
1695                         * if previous extent entry covers the offset,
1696                         * we should return it instead of the bitmap entry
1697                         */
1698                        n = rb_prev(&entry->offset_index);
1699                        if (n) {
1700                                prev = rb_entry(n, struct btrfs_free_space,
1701                                                offset_index);
1702                                if (!prev->bitmap &&
1703                                    prev->offset + prev->bytes > offset)
1704                                        entry = prev;
1705                        }
1706                }
1707                return entry;
1708        }
1709
1710        if (!prev)
1711                return NULL;
1712
1713        /* find last entry before the 'offset' */
1714        entry = prev;
1715        if (entry->offset > offset) {
1716                n = rb_prev(&entry->offset_index);
1717                if (n) {
1718                        entry = rb_entry(n, struct btrfs_free_space,
1719                                        offset_index);
1720                        ASSERT(entry->offset <= offset);
1721                } else {
1722                        if (fuzzy)
1723                                return entry;
1724                        else
1725                                return NULL;
1726                }
1727        }
1728
1729        if (entry->bitmap) {
1730                n = rb_prev(&entry->offset_index);
1731                if (n) {
1732                        prev = rb_entry(n, struct btrfs_free_space,
1733                                        offset_index);
1734                        if (!prev->bitmap &&
1735                            prev->offset + prev->bytes > offset)
1736                                return prev;
1737                }
1738                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1739                        return entry;
1740        } else if (entry->offset + entry->bytes > offset)
1741                return entry;
1742
1743        if (!fuzzy)
1744                return NULL;
1745
1746        while (1) {
1747                n = rb_next(&entry->offset_index);
1748                if (!n)
1749                        return NULL;
1750                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1751                if (entry->bitmap) {
1752                        if (entry->offset + BITS_PER_BITMAP *
1753                            ctl->unit > offset)
1754                                break;
1755                } else {
1756                        if (entry->offset + entry->bytes > offset)
1757                                break;
1758                }
1759        }
1760        return entry;
1761}
1762
1763static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1764                                     struct btrfs_free_space *info,
1765                                     bool update_stat)
1766{
1767        rb_erase(&info->offset_index, &ctl->free_space_offset);
1768        rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1769        ctl->free_extents--;
1770
1771        if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1772                ctl->discardable_extents[BTRFS_STAT_CURR]--;
1773                ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes;
1774        }
1775
1776        if (update_stat)
1777                ctl->free_space -= info->bytes;
1778}
1779
1780static int link_free_space(struct btrfs_free_space_ctl *ctl,
1781                           struct btrfs_free_space *info)
1782{
1783        int ret = 0;
1784
1785        ASSERT(info->bytes || info->bitmap);
1786        ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1787                                 &info->offset_index, (info->bitmap != NULL));
1788        if (ret)
1789                return ret;
1790
1791        rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1792
1793        if (!info->bitmap && !btrfs_free_space_trimmed(info)) {
1794                ctl->discardable_extents[BTRFS_STAT_CURR]++;
1795                ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
1796        }
1797
1798        ctl->free_space += info->bytes;
1799        ctl->free_extents++;
1800        return ret;
1801}
1802
1803static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl,
1804                                struct btrfs_free_space *info)
1805{
1806        ASSERT(info->bitmap);
1807
1808        /*
1809         * If our entry is empty it's because we're on a cluster and we don't
1810         * want to re-link it into our ctl bytes index.
1811         */
1812        if (RB_EMPTY_NODE(&info->bytes_index))
1813                return;
1814
1815        rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes);
1816        rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less);
1817}
1818
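    /*
     * Note on the extent_delta accounting in bitmap_clear_bits() below:
     * clearing a range removes one free region by default (extent_delta starts
     * at -1).  If the bit just before the cleared range is still set, a region
     * survives on the left (+1), and likewise on the right (+1).  So clearing
     * the middle of a set run splits it in two (net +1), clearing at an edge
     * leaves the region count unchanged (net 0), and clearing a whole isolated
     * run removes it (net -1).
     */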
1819static inline void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1820                                     struct btrfs_free_space *info,
1821                                     u64 offset, u64 bytes, bool update_stat)
1822{
1823        unsigned long start, count, end;
1824        int extent_delta = -1;
1825
1826        start = offset_to_bit(info->offset, ctl->unit, offset);
1827        count = bytes_to_bits(bytes, ctl->unit);
1828        end = start + count;
1829        ASSERT(end <= BITS_PER_BITMAP);
1830
1831        bitmap_clear(info->bitmap, start, count);
1832
1833        info->bytes -= bytes;
1834        if (info->max_extent_size > ctl->unit)
1835                info->max_extent_size = 0;
1836
1837        relink_bitmap_entry(ctl, info);
1838
1839        if (start && test_bit(start - 1, info->bitmap))
1840                extent_delta++;
1841
1842        if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1843                extent_delta++;
1844
1845        info->bitmap_extents += extent_delta;
1846        if (!btrfs_free_space_trimmed(info)) {
1847                ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1848                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
1849        }
1850
1851        if (update_stat)
1852                ctl->free_space -= bytes;
1853}
1854
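    /*
     * bitmap_set_bits() mirrors the accounting above: setting a range adds one
     * free region by default (extent_delta starts at +1) and subtracts one for
     * each already-set neighbouring bit it merges with, so bridging two
     * existing runs is a net -1.
     */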
1855static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1856                            struct btrfs_free_space *info, u64 offset,
1857                            u64 bytes)
1858{
1859        unsigned long start, count, end;
1860        int extent_delta = 1;
1861
1862        start = offset_to_bit(info->offset, ctl->unit, offset);
1863        count = bytes_to_bits(bytes, ctl->unit);
1864        end = start + count;
1865        ASSERT(end <= BITS_PER_BITMAP);
1866
1867        bitmap_set(info->bitmap, start, count);
1868
1869        /*
1870         * We set some bytes, so we have no idea what the max extent size is
1871         * anymore.
1872         */
1873        info->max_extent_size = 0;
1874        info->bytes += bytes;
1875        ctl->free_space += bytes;
1876
1877        relink_bitmap_entry(ctl, info);
1878
1879        if (start && test_bit(start - 1, info->bitmap))
1880                extent_delta--;
1881
1882        if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap))
1883                extent_delta--;
1884
1885        info->bitmap_extents += extent_delta;
1886        if (!btrfs_free_space_trimmed(info)) {
1887                ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta;
1888                ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes;
1889        }
1890}
1891
1892/*
1893 * If we cannot find a suitable extent, we will use *bytes to record the size
1894 * of the largest extent found.
1895 */
1896static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1897                         struct btrfs_free_space *bitmap_info, u64 *offset,
1898                         u64 *bytes, bool for_alloc)
1899{
1900        unsigned long found_bits = 0;
1901        unsigned long max_bits = 0;
1902        unsigned long bits, i;
1903        unsigned long next_zero;
1904        unsigned long extent_bits;
1905
1906        /*
1907         * Skip searching the bitmap if we don't have a contiguous section that
1908         * is large enough for this allocation.
1909         */
1910        if (for_alloc &&
1911            bitmap_info->max_extent_size &&
1912            bitmap_info->max_extent_size < *bytes) {
1913                *bytes = bitmap_info->max_extent_size;
1914                return -1;
1915        }
1916
1917        i = offset_to_bit(bitmap_info->offset, ctl->unit,
1918                          max_t(u64, *offset, bitmap_info->offset));
1919        bits = bytes_to_bits(*bytes, ctl->unit);
1920
1921        for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1922                if (for_alloc && bits == 1) {
1923                        found_bits = 1;
1924                        break;
1925                }
1926                next_zero = find_next_zero_bit(bitmap_info->bitmap,
1927                                               BITS_PER_BITMAP, i);
1928                extent_bits = next_zero - i;
1929                if (extent_bits >= bits) {
1930                        found_bits = extent_bits;
1931                        break;
1932                } else if (extent_bits > max_bits) {
1933                        max_bits = extent_bits;
1934                }
1935                i = next_zero;
1936        }
1937
1938        if (found_bits) {
1939                *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1940                *bytes = (u64)(found_bits) * ctl->unit;
1941                return 0;
1942        }
1943
1944        *bytes = (u64)(max_bits) * ctl->unit;
1945        bitmap_info->max_extent_size = *bytes;
1946        relink_bitmap_entry(ctl, bitmap_info);
1947        return -1;
1948}
1949
1950/* Cache the size of the largest extent found, in bytes, in *max_extent_size. */
1951static struct btrfs_free_space *
1952find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1953                unsigned long align, u64 *max_extent_size, bool use_bytes_index)
1954{
1955        struct btrfs_free_space *entry;
1956        struct rb_node *node;
1957        u64 tmp;
1958        u64 align_off;
1959        int ret;
1960
1961        if (!ctl->free_space_offset.rb_node)
1962                goto out;
1963again:
1964        if (use_bytes_index) {
1965                node = rb_first_cached(&ctl->free_space_bytes);
1966        } else {
1967                entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset),
1968                                           0, 1);
1969                if (!entry)
1970                        goto out;
1971                node = &entry->offset_index;
1972        }
1973
1974        for (; node; node = rb_next(node)) {
1975                if (use_bytes_index)
1976                        entry = rb_entry(node, struct btrfs_free_space,
1977                                         bytes_index);
1978                else
1979                        entry = rb_entry(node, struct btrfs_free_space,
1980                                         offset_index);
1981
1982                /*
1983                 * If we are using the bytes index then all subsequent entries
1984                 * in this tree are going to be < bytes, so simply set the max
1985                 * extent size and exit the loop.
1986                 *
1987                 * If we're using the offset index then we need to keep going
1988                 * through the rest of the tree.
1989                 */
1990                if (entry->bytes < *bytes) {
1991                        *max_extent_size = max(get_max_extent_size(entry),
1992                                               *max_extent_size);
1993                        if (use_bytes_index)
1994                                break;
1995                        continue;
1996                }
1997
1998                /* make sure the space returned is big enough
1999                 * to match our requested alignment
2000                 */
2001                if (*bytes >= align) {
2002                        tmp = entry->offset - ctl->start + align - 1;
2003                        tmp = div64_u64(tmp, align);
2004                        tmp = tmp * align + ctl->start;
2005                        align_off = tmp - entry->offset;
2006                } else {
2007                        align_off = 0;
2008                        tmp = entry->offset;
2009                }
2010
2011                /*
2012                 * We don't break here if we're using the bytes index because we
2013                 * may have another entry that has the correct alignment that is
2014                 * the right size, so we don't want to miss that possibility.
2015                 * At worst this adds another loop through the logic, but if we
2016                 * broke here we could prematurely ENOSPC.
2017                 */
2018                if (entry->bytes < *bytes + align_off) {
2019                        *max_extent_size = max(get_max_extent_size(entry),
2020                                               *max_extent_size);
2021                        continue;
2022                }
2023
2024                if (entry->bitmap) {
2025                        struct rb_node *old_next = rb_next(node);
2026                        u64 size = *bytes;
2027
2028                        ret = search_bitmap(ctl, entry, &tmp, &size, true);
2029                        if (!ret) {
2030                                *offset = tmp;
2031                                *bytes = size;
2032                                return entry;
2033                        } else {
2034                                *max_extent_size =
2035                                        max(get_max_extent_size(entry),
2036                                            *max_extent_size);
2037                        }
2038
2039                        /*
2040                         * The bitmap may have gotten re-arranged in the space
2041                         * index here because the max_extent_size may have been
2042                         * updated.  Start from the beginning again if this
2043                         * happened.
2044                         */
2045                        if (use_bytes_index && old_next != rb_next(node))
2046                                goto again;
2047                        continue;
2048                }
2049
2050                *offset = tmp;
2051                *bytes = entry->bytes - align_off;
2052                return entry;
2053        }
2054out:
2055        return NULL;
2056}
2057
2058static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
2059                           struct btrfs_free_space *info, u64 offset)
2060{
2061        info->offset = offset_to_bitmap(ctl, offset);
2062        info->bytes = 0;
2063        info->bitmap_extents = 0;
2064        INIT_LIST_HEAD(&info->list);
2065        link_free_space(ctl, info);
2066        ctl->total_bitmaps++;
2067        recalculate_thresholds(ctl);
2068}
2069
2070static void free_bitmap(struct btrfs_free_space_ctl *ctl,
2071                        struct btrfs_free_space *bitmap_info)
2072{
2073        /*
2074         * Normally when this is called, the bitmap is completely empty. However,
2075         * if we are blowing up the free space cache for one reason or another
2076         * via __btrfs_remove_free_space_cache(), then it may not be empty and
2077         * we may leave stats on the table.
2078         */
2079        if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) {
2080                ctl->discardable_extents[BTRFS_STAT_CURR] -=
2081                        bitmap_info->bitmap_extents;
2082                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes;
2083
2084        }
2085        unlink_free_space(ctl, bitmap_info, true);
2086        kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
2087        kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
2088        ctl->total_bitmaps--;
2089        recalculate_thresholds(ctl);
2090}
2091
2092static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
2093                              struct btrfs_free_space *bitmap_info,
2094                              u64 *offset, u64 *bytes)
2095{
2096        u64 end;
2097        u64 search_start, search_bytes;
2098        int ret;
2099
2100again:
2101        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
2102
2103        /*
2104         * We need to search for bits in this bitmap.  We could only cover some
2105         * of the extent in this bitmap thanks to how we add space, so we need
2106         * to search for as much as it as we can and clear that amount, and then
2107         * to search for as much of it as we can and clear that amount, and then
2108         */
2109        search_start = *offset;
2110        search_bytes = ctl->unit;
2111        search_bytes = min(search_bytes, end - search_start + 1);
2112        ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
2113                            false);
2114        if (ret < 0 || search_start != *offset)
2115                return -EINVAL;
2116
2117        /* We may have found more bits than what we need */
2118        search_bytes = min(search_bytes, *bytes);
2119
2120        /* Cannot clear past the end of the bitmap */
2121        search_bytes = min(search_bytes, end - search_start + 1);
2122
2123        bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes, true);
2124        *offset += search_bytes;
2125        *bytes -= search_bytes;
2126
2127        if (*bytes) {
2128                struct rb_node *next = rb_next(&bitmap_info->offset_index);
2129                if (!bitmap_info->bytes)
2130                        free_bitmap(ctl, bitmap_info);
2131
2132                /*
2133                 * no entry after this bitmap, but we still have bytes to
2134                 * remove, so something has gone wrong.
2135                 */
2136                if (!next)
2137                        return -EINVAL;
2138
2139                bitmap_info = rb_entry(next, struct btrfs_free_space,
2140                                       offset_index);
2141
2142                /*
2143                 * if the next entry isn't a bitmap we need to return to let the
2144                 * extent stuff do its work.
2145                 */
2146                if (!bitmap_info->bitmap)
2147                        return -EAGAIN;
2148
2149                /*
2150                 * Ok the next item is a bitmap, but it may not actually hold
2151                 * the information for the rest of this free space stuff, so
2152                 * look for it, and if we don't find it return so we can try
2153                 * everything over again.
2154                 */
2155                search_start = *offset;
2156                search_bytes = ctl->unit;
2157                ret = search_bitmap(ctl, bitmap_info, &search_start,
2158                                    &search_bytes, false);
2159                if (ret < 0 || search_start != *offset)
2160                        return -EAGAIN;
2161
2162                goto again;
2163        } else if (!bitmap_info->bytes)
2164                free_bitmap(ctl, bitmap_info);
2165
2166        return 0;
2167}
2168
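    /*
     * Set as much of [offset, offset + bytes) in @info as fits before the end
     * of this bitmap and return the number of bytes actually set; the caller
     * loops with any remainder.  Adding an untrimmed range downgrades the
     * whole bitmap to BTRFS_TRIM_STATE_UNTRIMMED.
     */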
2169static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
2170                               struct btrfs_free_space *info, u64 offset,
2171                               u64 bytes, enum btrfs_trim_state trim_state)
2172{
2173        u64 bytes_to_set = 0;
2174        u64 end;
2175
2176        /*
2177         * This is a tradeoff to make bitmap trim state minimal.  We mark the
2178         * whole bitmap untrimmed if at any point we add untrimmed regions.
2179         */
2180        if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) {
2181                if (btrfs_free_space_trimmed(info)) {
2182                        ctl->discardable_extents[BTRFS_STAT_CURR] +=
2183                                info->bitmap_extents;
2184                        ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes;
2185                }
2186                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2187        }
2188
2189        end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
2190
2191        bytes_to_set = min(end - offset, bytes);
2192
2193        bitmap_set_bits(ctl, info, offset, bytes_to_set);
2194
2195        return bytes_to_set;
2196
2197}
2198
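    /*
     * Decide whether a new free space region should be tracked in a bitmap
     * rather than as an extent entry.  Roughly: regions of at least
     * FORCE_EXTENT_THRESHOLD stay extents; while free_extents is below
     * extents_thresh larger regions stay extents, and small regions (at most
     * 8 sectors, e.g. 32K with a 4K sectorsize) stay extents only while we
     * are within the first third of the threshold; block groups much smaller
     * than what one bitmap covers never use bitmaps at all.
     */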
2199static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
2200                      struct btrfs_free_space *info)
2201{
2202        struct btrfs_block_group *block_group = ctl->block_group;
2203        struct btrfs_fs_info *fs_info = block_group->fs_info;
2204        bool forced = false;
2205
2206#ifdef CONFIG_BTRFS_DEBUG
2207        if (btrfs_should_fragment_free_space(block_group))
2208                forced = true;
2209#endif
2210
2211        /* This is a way to reclaim large regions from the bitmaps. */
2212        if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD)
2213                return false;
2214
2215        /*
2216         * If we are below the extents threshold then we can add this as an
2217         * extent, and don't have to deal with the bitmap
2218         */
2219        if (!forced && ctl->free_extents < ctl->extents_thresh) {
2220                /*
2221                 * If this block group has some small extents we don't want to
2222                 * use up all of our free slots in the cache with them, we want
2223                 * to reserve them for larger extents; however, if we have plenty
2224                 * of cache left then go ahead and add them, no sense in adding
2225                 * the overhead of a bitmap if we don't have to.
2226                 */
2227                if (info->bytes <= fs_info->sectorsize * 8) {
2228                        if (ctl->free_extents * 3 <= ctl->extents_thresh)
2229                                return false;
2230                } else {
2231                        return false;
2232                }
2233        }
2234
2235        /*
2236         * The original block groups from mkfs can be really small, like 8
2237         * megabytes, so don't bother with a bitmap for those entries.  However
2238         * some block groups can be smaller than what a bitmap would cover but
2239         * are still large enough that they could overflow the 32k memory limit,
2240         * so allow those block groups to still be allowed to have a bitmap
2241         * so those block groups are still allowed to have a bitmap
2242         * entry.
2243        if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->length)
2244                return false;
2245
2246        return true;
2247}
2248
2249static const struct btrfs_free_space_op free_space_op = {
2250        .use_bitmap             = use_bitmap,
2251};
2252
2253static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2254                              struct btrfs_free_space *info)
2255{
2256        struct btrfs_free_space *bitmap_info;
2257        struct btrfs_block_group *block_group = NULL;
2258        int added = 0;
2259        u64 bytes, offset, bytes_added;
2260        enum btrfs_trim_state trim_state;
2261        int ret;
2262
2263        bytes = info->bytes;
2264        offset = info->offset;
2265        trim_state = info->trim_state;
2266
2267        if (!ctl->op->use_bitmap(ctl, info))
2268                return 0;
2269
2270        if (ctl->op == &free_space_op)
2271                block_group = ctl->block_group;
2272again:
2273        /*
2274         * Since we link bitmaps right into the cluster we need to see if we
2275         * have a cluster here, and if so and it has our bitmap we need to add
2276         * the free space to that bitmap.
2277         */
2278        if (block_group && !list_empty(&block_group->cluster_list)) {
2279                struct btrfs_free_cluster *cluster;
2280                struct rb_node *node;
2281                struct btrfs_free_space *entry;
2282
2283                cluster = list_entry(block_group->cluster_list.next,
2284                                     struct btrfs_free_cluster,
2285                                     block_group_list);
2286                spin_lock(&cluster->lock);
2287                node = rb_first(&cluster->root);
2288                if (!node) {
2289                        spin_unlock(&cluster->lock);
2290                        goto no_cluster_bitmap;
2291                }
2292
2293                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2294                if (!entry->bitmap) {
2295                        spin_unlock(&cluster->lock);
2296                        goto no_cluster_bitmap;
2297                }
2298
2299                if (entry->offset == offset_to_bitmap(ctl, offset)) {
2300                        bytes_added = add_bytes_to_bitmap(ctl, entry, offset,
2301                                                          bytes, trim_state);
2302                        bytes -= bytes_added;
2303                        offset += bytes_added;
2304                }
2305                spin_unlock(&cluster->lock);
2306                if (!bytes) {
2307                        ret = 1;
2308                        goto out;
2309                }
2310        }
2311
2312no_cluster_bitmap:
2313        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2314                                         1, 0);
2315        if (!bitmap_info) {
2316                ASSERT(added == 0);
2317                goto new_bitmap;
2318        }
2319
2320        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
2321                                          trim_state);
2322        bytes -= bytes_added;
2323        offset += bytes_added;
2324        added = 0;
2325
2326        if (!bytes) {
2327                ret = 1;
2328                goto out;
2329        } else
2330                goto again;
2331
2332new_bitmap:
2333        if (info && info->bitmap) {
2334                add_new_bitmap(ctl, info, offset);
2335                added = 1;
2336                info = NULL;
2337                goto again;
2338        } else {
2339                spin_unlock(&ctl->tree_lock);
2340
2341                /* no pre-allocated info, allocate a new one */
2342                if (!info) {
2343                        info = kmem_cache_zalloc(btrfs_free_space_cachep,
2344                                                 GFP_NOFS);
2345                        if (!info) {
2346                                spin_lock(&ctl->tree_lock);
2347                                ret = -ENOMEM;
2348                                goto out;
2349                        }
2350                }
2351
2352                /* allocate the bitmap */
2353                info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2354                                                 GFP_NOFS);
2355                info->trim_state = BTRFS_TRIM_STATE_TRIMMED;
2356                spin_lock(&ctl->tree_lock);
2357                if (!info->bitmap) {
2358                        ret = -ENOMEM;
2359                        goto out;
2360                }
2361                goto again;
2362        }
2363
2364out:
2365        if (info) {
2366                if (info->bitmap)
2367                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
2368                                        info->bitmap);
2369                kmem_cache_free(btrfs_free_space_cachep, info);
2370        }
2371
2372        return ret;
2373}
2374
2375/*
2376 * Free space merging rules:
2377 *  1) Merge trimmed areas together
2378 *  2) Let untrimmed areas coalesce with trimmed areas
2379 *  3) Always pull neighboring regions from bitmaps
2380 *
2381 * The above rules are for when we merge free space based on btrfs_trim_state.
2382 * Rules 2 and 3 are subtle because they are suboptimal, but are done for the
2383 * same reason: to promote larger extent regions which makes life easier for
2384 * find_free_extent().  Rule 2 enables coalescing based on the common path of
2385 * returning free space from btrfs_finish_extent_commit().  So when free
2386 * space is trimmed, it will prevent aggregating trimmed new region and
2387 * untrimmed regions in the rb_tree.  Rule 3 is purely to obtain larger extents
2388 * and provide find_free_extent() with the largest extents possible hoping for
2389 * the reuse path.
2390 */
2391static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2392                          struct btrfs_free_space *info, bool update_stat)
2393{
2394        struct btrfs_free_space *left_info = NULL;
2395        struct btrfs_free_space *right_info;
2396        bool merged = false;
2397        u64 offset = info->offset;
2398        u64 bytes = info->bytes;
2399        const bool is_trimmed = btrfs_free_space_trimmed(info);
2400
2401        /*
2402         * First we want to see if there is free space adjacent to the range we
2403         * are adding; if there is, remove that struct and add a new one to
2404         * cover the entire range.
2405         */
2406        right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2407        if (right_info && rb_prev(&right_info->offset_index))
2408                left_info = rb_entry(rb_prev(&right_info->offset_index),
2409                                     struct btrfs_free_space, offset_index);
2410        else if (!right_info)
2411                left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2412
2413        /* See try_merge_free_space() comment. */
2414        if (right_info && !right_info->bitmap &&
2415            (!is_trimmed || btrfs_free_space_trimmed(right_info))) {
2416                unlink_free_space(ctl, right_info, update_stat);
2417                info->bytes += right_info->bytes;
2418                kmem_cache_free(btrfs_free_space_cachep, right_info);
2419                merged = true;
2420        }
2421
2422        /* See try_merge_free_space() comment. */
2423        if (left_info && !left_info->bitmap &&
2424            left_info->offset + left_info->bytes == offset &&
2425            (!is_trimmed || btrfs_free_space_trimmed(left_info))) {
2426                unlink_free_space(ctl, left_info, update_stat);
2427                info->offset = left_info->offset;
2428                info->bytes += left_info->bytes;
2429                kmem_cache_free(btrfs_free_space_cachep, left_info);
2430                merged = true;
2431        }
2432
2433        return merged;
2434}
2435
2436static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2437                                     struct btrfs_free_space *info,
2438                                     bool update_stat)
2439{
2440        struct btrfs_free_space *bitmap;
2441        unsigned long i;
2442        unsigned long j;
2443        const u64 end = info->offset + info->bytes;
2444        const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2445        u64 bytes;
2446
2447        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2448        if (!bitmap)
2449                return false;
2450
2451        i = offset_to_bit(bitmap->offset, ctl->unit, end);
2452        j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2453        if (j == i)
2454                return false;
2455        bytes = (j - i) * ctl->unit;
2456        info->bytes += bytes;
2457
2458        /* See try_merge_free_space() comment. */
2459        if (!btrfs_free_space_trimmed(bitmap))
2460                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2461
2462        bitmap_clear_bits(ctl, bitmap, end, bytes, update_stat);
2463
2464        if (!bitmap->bytes)
2465                free_bitmap(ctl, bitmap);
2466
2467        return true;
2468}
2469
2470static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2471                                       struct btrfs_free_space *info,
2472                                       bool update_stat)
2473{
2474        struct btrfs_free_space *bitmap;
2475        u64 bitmap_offset;
2476        unsigned long i;
2477        unsigned long j;
2478        unsigned long prev_j;
2479        u64 bytes;
2480
2481        bitmap_offset = offset_to_bitmap(ctl, info->offset);
2482        /* If we're on a boundary, try the previous logical bitmap. */
2483        if (bitmap_offset == info->offset) {
2484                if (info->offset == 0)
2485                        return false;
2486                bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2487        }
2488
2489        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2490        if (!bitmap)
2491                return false;
2492
2493        i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2494        j = 0;
2495        prev_j = (unsigned long)-1;
2496        for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2497                if (j > i)
2498                        break;
2499                prev_j = j;
2500        }
2501        if (prev_j == i)
2502                return false;
2503
2504        if (prev_j == (unsigned long)-1)
2505                bytes = (i + 1) * ctl->unit;
2506        else
2507                bytes = (i - prev_j) * ctl->unit;
2508
2509        info->offset -= bytes;
2510        info->bytes += bytes;
2511
2512        /* See try_merge_free_space() comment. */
2513        if (!btrfs_free_space_trimmed(bitmap))
2514                info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2515
2516        bitmap_clear_bits(ctl, bitmap, info->offset, bytes, update_stat);
2517
2518        if (!bitmap->bytes)
2519                free_bitmap(ctl, bitmap);
2520
2521        return true;
2522}
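    /*
     * Example for the two helpers above: for a new extent entry covering
     * [X, X + 64K), steal_from_bitmap_to_end() looks at the bitmap covering
     * X + 64K and, if e.g. [X + 64K, X + 96K) is set there, clears those bits
     * and grows the entry to [X, X + 96K); steal_from_bitmap_to_front() does
     * the same for set bits immediately preceding X.  (X and the sizes are
     * made-up numbers for illustration.)
     */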
2523
2524/*
2525 * We prefer always to allocate from extent entries, both for clustered and
2526 * non-clustered allocation requests. So when attempting to add a new extent
2527 * entry, try to see if there's adjacent free space in bitmap entries, and if
2528 * there is, migrate that space from the bitmaps to the extent.
2529 * Like this we get better chances of satisfying space allocation requests
2530 * because we attempt to satisfy them based on a single cache entry, and never
2531 * on 2 or more entries - even if the entries represent a contiguous free space
2532 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2533 * ends).
2534 */
2535static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2536                              struct btrfs_free_space *info,
2537                              bool update_stat)
2538{
2539        /*
2540         * Only work with disconnected entries, as we can change their offset,
2541         * and they must be extent entries.
2542         */
2543        ASSERT(!info->bitmap);
2544        ASSERT(RB_EMPTY_NODE(&info->offset_index));
2545
2546        if (ctl->total_bitmaps > 0) {
2547                bool stole_end;
2548                bool stole_front = false;
2549
2550                stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2551                if (ctl->total_bitmaps > 0)
2552                        stole_front = steal_from_bitmap_to_front(ctl, info,
2553                                                                 update_stat);
2554
2555                if (stole_end || stole_front)
2556                        try_merge_free_space(ctl, info, update_stat);
2557        }
2558}
2559
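    /*
     * Add [offset, offset + bytes) back into the in-memory free space cache of
     * @block_group: first try to merge it with adjacent extent entries, then
     * see if it should be absorbed into a bitmap, and otherwise steal any
     * adjacent bitmap space and link it as a new extent entry.
     */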
2560int __btrfs_add_free_space(struct btrfs_block_group *block_group,
2561                           u64 offset, u64 bytes,
2562                           enum btrfs_trim_state trim_state)
2563{
2564        struct btrfs_fs_info *fs_info = block_group->fs_info;
2565        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2566        struct btrfs_free_space *info;
2567        int ret = 0;
2568        u64 filter_bytes = bytes;
2569
2570        ASSERT(!btrfs_is_zoned(fs_info));
2571
2572        info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2573        if (!info)
2574                return -ENOMEM;
2575
2576        info->offset = offset;
2577        info->bytes = bytes;
2578        info->trim_state = trim_state;
2579        RB_CLEAR_NODE(&info->offset_index);
2580        RB_CLEAR_NODE(&info->bytes_index);
2581
2582        spin_lock(&ctl->tree_lock);
2583
2584        if (try_merge_free_space(ctl, info, true))
2585                goto link;
2586
2587        /*
2588         * There was no extent directly to the left or right of this new
2589         * extent, so we know we're going to have to allocate a new extent;
2590         * before we do that, see if we need to drop this into a bitmap instead.
2591         */
2592        ret = insert_into_bitmap(ctl, info);
2593        if (ret < 0) {
2594                goto out;
2595        } else if (ret) {
2596                ret = 0;
2597                goto out;
2598        }
2599link:
2600        /*
2601         * Only steal free space from adjacent bitmaps if we're sure we're not
2602         * going to add the new free space to existing bitmap entries - because
2603         * that would mean unnecessary work that would be reverted. Therefore
2604         * attempt to steal space from bitmaps if we're adding an extent entry.
2605         */
2606        steal_from_bitmap(ctl, info, true);
2607
2608        filter_bytes = max(filter_bytes, info->bytes);
2609
2610        ret = link_free_space(ctl, info);
2611        if (ret)
2612                kmem_cache_free(btrfs_free_space_cachep, info);
2613out:
2614        btrfs_discard_update_discardable(block_group);
2615        spin_unlock(&ctl->tree_lock);
2616
2617        if (ret) {
2618                btrfs_crit(fs_info, "unable to add free space: %d", ret);
2619                ASSERT(ret != -EEXIST);
2620        }
2621
2622        if (trim_state != BTRFS_TRIM_STATE_TRIMMED) {
2623                btrfs_discard_check_filter(block_group, filter_bytes);
2624                btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
2625        }
2626
2627        return ret;
2628}
2629
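    /*
     * Zoned accounting for returning [bytenr, bytenr + size): space that was
     * never allocated (used == false) is all counted as free, while freed
     * space below the block group's alloc_offset cannot be rewritten in place
     * and is accounted as zone_unusable.  Block groups that become fully
     * unusable, or whose reclaimable unusable space crosses
     * bg_reclaim_threshold, are marked unused or queued for reclaim.
     */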
2630static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
2631                                        u64 bytenr, u64 size, bool used)
2632{
2633        struct btrfs_fs_info *fs_info = block_group->fs_info;
2634        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2635        u64 offset = bytenr - block_group->start;
2636        u64 to_free, to_unusable;
2637        const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
2638        bool initial = (size == block_group->length);
2639        u64 reclaimable_unusable;
2640
2641        WARN_ON(!initial && offset + size > block_group->zone_capacity);
2642
2643        spin_lock(&ctl->tree_lock);
2644        if (!used)
2645                to_free = size;
2646        else if (initial)
2647                to_free = block_group->zone_capacity;
2648        else if (offset >= block_group->alloc_offset)
2649                to_free = size;
2650        else if (offset + size <= block_group->alloc_offset)
2651                to_free = 0;
2652        else
2653                to_free = offset + size - block_group->alloc_offset;
2654        to_unusable = size - to_free;
2655
2656        ctl->free_space += to_free;
2657        /*
2658         * If the block group is read-only, we should account freed space into
2659         * bytes_readonly.
2660         */
2661        if (!block_group->ro)
2662                block_group->zone_unusable += to_unusable;
2663        spin_unlock(&ctl->tree_lock);
2664        if (!used) {
2665                spin_lock(&block_group->lock);
2666                block_group->alloc_offset -= size;
2667                spin_unlock(&block_group->lock);
2668        }
2669
2670        reclaimable_unusable = block_group->zone_unusable -
2671                               (block_group->length - block_group->zone_capacity);
2672        /* All of the region is now unusable. Mark it as unused and reclaim. */
2673        if (block_group->zone_unusable == block_group->length) {
2674                btrfs_mark_bg_unused(block_group);
2675        } else if (bg_reclaim_threshold &&
2676                   reclaimable_unusable >=
2677                   div_factor_fine(block_group->zone_capacity,
2678                                   bg_reclaim_threshold)) {
2679                btrfs_mark_bg_to_reclaim(block_group);
2680        }
2681
2682        return 0;
2683}
2684
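    /*
     * The btrfs_add_free_space*() variants below differ in the trim state they
     * pass down (and in the 'used' flag for zoned block groups): with
     * DISCARD_SYNC the space is added as already trimmed, and
     * btrfs_add_free_space_async_trimmed() additionally treats DISCARD_ASYNC
     * mounts the same way so that space added while loading a block group is
     * considered trimmed (see the comment below).
     */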
2685int btrfs_add_free_space(struct btrfs_block_group *block_group,
2686                         u64 bytenr, u64 size)
2687{
2688        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2689
2690        if (btrfs_is_zoned(block_group->fs_info))
2691                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2692                                                    true);
2693
2694        if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC))
2695                trim_state = BTRFS_TRIM_STATE_TRIMMED;
2696
2697        return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2698}
2699
2700int btrfs_add_free_space_unused(struct btrfs_block_group *block_group,
2701                                u64 bytenr, u64 size)
2702{
2703        if (btrfs_is_zoned(block_group->fs_info))
2704                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2705                                                    false);
2706
2707        return btrfs_add_free_space(block_group, bytenr, size);
2708}
2709
2710/*
2711 * This is a subtle distinction because when adding free space back in general,
2712 * we want it to be added as untrimmed for async discard. But in the case where
2713 * we add it on loading of a block group, we want to consider it trimmed.
2714 */
2715int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
2716                                       u64 bytenr, u64 size)
2717{
2718        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
2719
2720        if (btrfs_is_zoned(block_group->fs_info))
2721                return __btrfs_add_free_space_zoned(block_group, bytenr, size,
2722                                                    true);
2723
2724        if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) ||
2725            btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
2726                trim_state = BTRFS_TRIM_STATE_TRIMMED;
2727
2728        return __btrfs_add_free_space(block_group, bytenr, size, trim_state);
2729}
2730
2731int btrfs_remove_free_space(struct btrfs_block_group *block_group,
2732                            u64 offset, u64 bytes)
2733{
2734        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2735        struct btrfs_free_space *info;
2736        int ret;
2737        bool re_search = false;
2738
2739        if (btrfs_is_zoned(block_group->fs_info)) {
2740                /*
2741                 * This can happen with conventional zones when replaying the log.
2742                 * Since the allocation info of tree-log nodes are not recorded
2743                 * to the extent-tree, calculate_alloc_pointer() failed to
2744                 * advance the allocation pointer after last allocated tree log
2745                 * node blocks.
2746                 *
2747                 * This function is called from
2748                 * btrfs_pin_extent_for_log_replay() when replaying the log.
2749                 * Advance the pointer not to overwrite the tree-log nodes.
2750                 */
2751                if (block_group->start + block_group->alloc_offset <
2752                    offset + bytes) {
2753                        block_group->alloc_offset =
2754                                offset + bytes - block_group->start;
2755                }
2756                return 0;
2757        }
2758
2759        spin_lock(&ctl->tree_lock);
2760
2761again:
2762        ret = 0;
2763        if (!bytes)
2764                goto out_lock;
2765
2766        info = tree_search_offset(ctl, offset, 0, 0);
2767        if (!info) {
2768                /*
2769                 * oops didn't find an extent that matched the space we wanted
2770                 * to remove, look for a bitmap instead
2771                 */
2772                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2773                                          1, 0);
2774                if (!info) {
2775                        /*
2776                         * If we found a partial bit of our free space in a
2777                         * bitmap but then couldn't find the other part this may
2778                         * be a problem, so WARN about it.
2779                         */
2780                        WARN_ON(re_search);
2781                        goto out_lock;
2782                }
2783        }
2784
2785        re_search = false;
2786        if (!info->bitmap) {
2787                unlink_free_space(ctl, info, true);
2788                if (offset == info->offset) {
2789                        u64 to_free = min(bytes, info->bytes);
2790
2791                        info->bytes -= to_free;
2792                        info->offset += to_free;
2793                        if (info->bytes) {
2794                                ret = link_free_space(ctl, info);
2795                                WARN_ON(ret);
2796                        } else {
2797                                kmem_cache_free(btrfs_free_space_cachep, info);
2798                        }
2799
2800                        offset += to_free;
2801                        bytes -= to_free;
2802                        goto again;
2803                } else {
2804                        u64 old_end = info->bytes + info->offset;
2805
2806                        info->bytes = offset - info->offset;
2807                        ret = link_free_space(ctl, info);
2808                        WARN_ON(ret);
2809                        if (ret)
2810                                goto out_lock;
2811
2812                        /* Not enough bytes in this entry to satisfy us */
2813                        if (old_end < offset + bytes) {
2814                                bytes -= old_end - offset;
2815                                offset = old_end;
2816                                goto again;
2817                        } else if (old_end == offset + bytes) {
2818                                /* all done */
2819                                goto out_lock;
2820                        }
2821                        spin_unlock(&ctl->tree_lock);
2822
2823                        ret = __btrfs_add_free_space(block_group,
2824                                                     offset + bytes,
2825                                                     old_end - (offset + bytes),
2826                                                     info->trim_state);
2827                        WARN_ON(ret);
2828                        goto out;
2829                }
2830        }
2831
2832        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2833        if (ret == -EAGAIN) {
2834                re_search = true;
2835                goto again;
2836        }
2837out_lock:
2838        btrfs_discard_update_discardable(block_group);
2839        spin_unlock(&ctl->tree_lock);
2840out:
2841        return ret;
2842}
2843
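    /*
     * Debug helper: dump every free space entry of @block_group and report
     * how many of them are at least @bytes in size.
     */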
2844void btrfs_dump_free_space(struct btrfs_block_group *block_group,
2845                           u64 bytes)
2846{
2847        struct btrfs_fs_info *fs_info = block_group->fs_info;
2848        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2849        struct btrfs_free_space *info;
2850        struct rb_node *n;
2851        int count = 0;
2852
2853        /*
2854         * Zoned btrfs does not use the free space tree or clusters. Just print
2855         * out the free space after the allocation offset.
2856         */
2857        if (btrfs_is_zoned(fs_info)) {
2858                btrfs_info(fs_info, "free space %llu active %d",
2859                           block_group->zone_capacity - block_group->alloc_offset,
2860                           block_group->zone_is_active);
2861                return;
2862        }
2863
2864        spin_lock(&ctl->tree_lock);
2865        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2866                info = rb_entry(n, struct btrfs_free_space, offset_index);
2867                if (info->bytes >= bytes && !block_group->ro)
2868                        count++;
2869                btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2870                           info->offset, info->bytes,
2871                           (info->bitmap) ? "yes" : "no");
2872        }
2873        spin_unlock(&ctl->tree_lock);
2874        btrfs_info(fs_info, "block group has cluster?: %s",
2875                   list_empty(&block_group->cluster_list) ? "no" : "yes");
2876        btrfs_info(fs_info,
2877                   "%d free space entries at or bigger than %llu bytes", count, bytes);
2878}
2879
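    /*
     * Initialize the in-memory free space ctl of a block group: locks, the
     * rbtrees, the trimming list and the extent vs bitmap threshold.
     */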
2880void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group,
2881                               struct btrfs_free_space_ctl *ctl)
2882{
2883        struct btrfs_fs_info *fs_info = block_group->fs_info;
2884
2885        spin_lock_init(&ctl->tree_lock);
2886        ctl->unit = fs_info->sectorsize;
2887        ctl->start = block_group->start;
2888        ctl->block_group = block_group;
2889        ctl->op = &free_space_op;
2890        ctl->free_space_bytes = RB_ROOT_CACHED;
2891        INIT_LIST_HEAD(&ctl->trimming_ranges);
2892        mutex_init(&ctl->cache_writeout_mutex);
2893
2894        /*
2895         * We only want to have 32k of RAM per block group for keeping
2896         * track of free space, and if we pass 1/2 of that we want to
2897         * start converting things over to using bitmaps.
2898         */
2899        ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2900}
2901
2902/*
2903 * for a given cluster, put all of its extents back into the free
2904 * space cache.  If the block group passed doesn't match the block group
2905 * pointed to by the cluster, someone else raced in and freed the
2906 * cluster already.  In that case, we just return without changing anything
2907 */
2908static void __btrfs_return_cluster_to_free_space(
2909                             struct btrfs_block_group *block_group,
2910                             struct btrfs_free_cluster *cluster)
2911{
2912        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2913        struct btrfs_free_space *entry;
2914        struct rb_node *node;
2915
2916        spin_lock(&cluster->lock);
2917        if (cluster->block_group != block_group) {
2918                spin_unlock(&cluster->lock);
2919                return;
2920        }
2921
2922        cluster->block_group = NULL;
2923        cluster->window_start = 0;
2924        list_del_init(&cluster->block_group_list);
2925
2926        node = rb_first(&cluster->root);
2927        while (node) {
2928                bool bitmap;
2929
2930                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2931                node = rb_next(&entry->offset_index);
2932                rb_erase(&entry->offset_index, &cluster->root);
2933                RB_CLEAR_NODE(&entry->offset_index);
2934
2935                bitmap = (entry->bitmap != NULL);
2936                if (!bitmap) {
2937                        /* Merging treats extents as if they were new */
2938                        if (!btrfs_free_space_trimmed(entry)) {
2939                                ctl->discardable_extents[BTRFS_STAT_CURR]--;
2940                                ctl->discardable_bytes[BTRFS_STAT_CURR] -=
2941                                        entry->bytes;
2942                        }
2943
2944                        try_merge_free_space(ctl, entry, false);
2945                        steal_from_bitmap(ctl, entry, false);
2946
2947                        /* As we insert directly, update these statistics */
2948                        if (!btrfs_free_space_trimmed(entry)) {
2949                                ctl->discardable_extents[BTRFS_STAT_CURR]++;
2950                                ctl->discardable_bytes[BTRFS_STAT_CURR] +=
2951                                        entry->bytes;
2952                        }
2953                }
2954                tree_insert_offset(&ctl->free_space_offset,
2955                                   entry->offset, &entry->offset_index, bitmap);
2956                rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes,
2957                              entry_less);
2958        }
2959        cluster->root = RB_ROOT;
2960        spin_unlock(&cluster->lock);
2961        btrfs_put_block_group(block_group);
2962}
2963
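    /* Release every cached free space entry; caller must hold ctl->tree_lock. */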
2964static void __btrfs_remove_free_space_cache_locked(
2965                                struct btrfs_free_space_ctl *ctl)
2966{
2967        struct btrfs_free_space *info;
2968        struct rb_node *node;
2969
2970        while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2971                info = rb_entry(node, struct btrfs_free_space, offset_index);
2972                if (!info->bitmap) {
2973                        unlink_free_space(ctl, info, true);
2974                        kmem_cache_free(btrfs_free_space_cachep, info);
2975                } else {
2976                        free_bitmap(ctl, info);
2977                }
2978
2979                cond_resched_lock(&ctl->tree_lock);
2980        }
2981}
2982
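    /* Take the tree lock and drop all free space entries of the given ctl. */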
2983void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2984{
2985        spin_lock(&ctl->tree_lock);
2986        __btrfs_remove_free_space_cache_locked(ctl);
2987        if (ctl->block_group)
2988                btrfs_discard_update_discardable(ctl->block_group);
2989        spin_unlock(&ctl->tree_lock);
2990}
2991
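    /*
     * Return any clusters attached to @block_group back to its free space
     * cache and then drop all of the cached free space entries.
     */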
2992void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
2993{
2994        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2995        struct btrfs_free_cluster *cluster;
2996        struct list_head *head;
2997
2998        spin_lock(&ctl->tree_lock);
2999        while ((head = block_group->cluster_list.next) !=
3000               &block_group->cluster_list) {
3001                cluster = list_entry(head, struct btrfs_free_cluster,
3002                                     block_group_list);
3003
3004                WARN_ON(cluster->block_group != block_group);
3005                __btrfs_return_cluster_to_free_space(block_group, cluster);
3006
3007                cond_resched_lock(&ctl->tree_lock);
3008        }
3009        __btrfs_remove_free_space_cache_locked(ctl);
3010        btrfs_discard_update_discardable(block_group);
3011        spin_unlock(&ctl->tree_lock);
3013}
3014
3015/**
3016 * btrfs_is_free_space_trimmed - see if everything is trimmed
3017 * @block_group: block_group of interest
3018 *
3019 * Walk @block_group's free space rb_tree to determine if everything is trimmed.
3020 */
3021bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
3022{
3023        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3024        struct btrfs_free_space *info;
3025        struct rb_node *node;
3026        bool ret = true;
3027
3028        spin_lock(&ctl->tree_lock);
3029        node = rb_first(&ctl->free_space_offset);
3030
3031        while (node) {
3032                info = rb_entry(node, struct btrfs_free_space, offset_index);
3033
3034                if (!btrfs_free_space_trimmed(info)) {
3035                        ret = false;
3036                        break;
3037                }
3038
3039                node = rb_next(node);
3040        }
3041
3042        spin_unlock(&ctl->tree_lock);
3043        return ret;
3044}
3045
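    /*
     * Find and remove a free extent of at least @bytes + @empty_size starting
     * at or after @offset.  Returns the start of the allocated space or 0 if
     * nothing suitable was found; an alignment gap carved off the front of an
     * extent entry is re-added to the free space cache.
     */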
3046u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
3047                               u64 offset, u64 bytes, u64 empty_size,
3048                               u64 *max_extent_size)
3049{
3050        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3051        struct btrfs_discard_ctl *discard_ctl =
3052                                        &block_group->fs_info->discard_ctl;
3053        struct btrfs_free_space *entry = NULL;
3054        u64 bytes_search = bytes + empty_size;
3055        u64 ret = 0;
3056        u64 align_gap = 0;
3057        u64 align_gap_len = 0;
3058        enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3059        bool use_bytes_index = (offset == block_group->start);
3060
3061        ASSERT(!btrfs_is_zoned(block_group->fs_info));
3062
3063        spin_lock(&ctl->tree_lock);
3064        entry = find_free_space(ctl, &offset, &bytes_search,
3065                                block_group->full_stripe_len, max_extent_size,
3066                                use_bytes_index);
3067        if (!entry)
3068                goto out;
3069
3070        ret = offset;
3071        if (entry->bitmap) {
3072                bitmap_clear_bits(ctl, entry, offset, bytes, true);
3073
3074                if (!btrfs_free_space_trimmed(entry))
3075                        atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3076
3077                if (!entry->bytes)
3078                        free_bitmap(ctl, entry);
3079        } else {
3080                unlink_free_space(ctl, entry, true);
3081                align_gap_len = offset - entry->offset;
3082                align_gap = entry->offset;
3083                align_gap_trim_state = entry->trim_state;
3084
3085                if (!btrfs_free_space_trimmed(entry))
3086                        atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3087
3088                entry->offset = offset + bytes;
3089                WARN_ON(entry->bytes < bytes + align_gap_len);
3090
3091                entry->bytes -= bytes + align_gap_len;
3092                if (!entry->bytes)
3093                        kmem_cache_free(btrfs_free_space_cachep, entry);
3094                else
3095                        link_free_space(ctl, entry);
3096        }
3097out:
3098        btrfs_discard_update_discardable(block_group);
3099        spin_unlock(&ctl->tree_lock);
3100
3101        if (align_gap_len)
3102                __btrfs_add_free_space(block_group, align_gap, align_gap_len,
3103                                       align_gap_trim_state);
3104        return ret;
3105}
3106
3107/*
3108 * given a cluster, put all of its extents back into the free space
3109 * cache.  If a block group is passed, this function will only free
3110 * a cluster that belongs to the passed block group.
3111 *
3112 * Otherwise, it'll get a reference on the block group pointed to by the
3113 * cluster and remove the cluster from it.
3114 */
3115void btrfs_return_cluster_to_free_space(
3116                               struct btrfs_block_group *block_group,
3117                               struct btrfs_free_cluster *cluster)
3118{
3119        struct btrfs_free_space_ctl *ctl;
3120
3121        /* first, get a safe pointer to the block group */
3122        spin_lock(&cluster->lock);
3123        if (!block_group) {
3124                block_group = cluster->block_group;
3125                if (!block_group) {
3126                        spin_unlock(&cluster->lock);
3127                        return;
3128                }
3129        } else if (cluster->block_group != block_group) {
3130                /* someone else has already freed it, don't redo their work */
3131                spin_unlock(&cluster->lock);
3132                return;
3133        }
3134        btrfs_get_block_group(block_group);
3135        spin_unlock(&cluster->lock);
3136
3137        ctl = block_group->free_space_ctl;
3138
3139        /* now return any extents the cluster had on it */
3140        spin_lock(&ctl->tree_lock);
3141        __btrfs_return_cluster_to_free_space(block_group, cluster);
3142        spin_unlock(&ctl->tree_lock);
3143
3144        btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group);
3145
3146        /* finally drop our ref */
3147        btrfs_put_block_group(block_group);
3148}
3149
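    /*
     * Carve @bytes out of a bitmap entry that belongs to a cluster, searching
     * from @min_start.  Returns the start offset, or 0 and an updated
     * *max_extent_size if the bitmap has no suitable range.
     */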
3150static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
3151                                   struct btrfs_free_cluster *cluster,
3152                                   struct btrfs_free_space *entry,
3153                                   u64 bytes, u64 min_start,
3154                                   u64 *max_extent_size)
3155{
3156        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3157        int err;
3158        u64 search_start = cluster->window_start;
3159        u64 search_bytes = bytes;
3160        u64 ret = 0;
3161
3162        search_start = min_start;
3163        search_bytes = bytes;
3164
3165        err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
3166        if (err) {
3167                *max_extent_size = max(get_max_extent_size(entry),
3168                                       *max_extent_size);
3169                return 0;
3170        }
3171
3172        ret = search_start;
3173        bitmap_clear_bits(ctl, entry, ret, bytes, false);
3174
3175        return ret;
3176}
3177
3178/*
3179 * given a cluster, try to allocate 'bytes' from it, returns 0
3180 * if it couldn't find anything suitably large, or a logical disk offset
3181 * if things worked out
3182 */
3183u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
3184                             struct btrfs_free_cluster *cluster, u64 bytes,
3185                             u64 min_start, u64 *max_extent_size)
3186{
3187        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3188        struct btrfs_discard_ctl *discard_ctl =
3189                                        &block_group->fs_info->discard_ctl;
3190        struct btrfs_free_space *entry = NULL;
3191        struct rb_node *node;
3192        u64 ret = 0;
3193
3194        ASSERT(!btrfs_is_zoned(block_group->fs_info));
3195
3196        spin_lock(&cluster->lock);
3197        if (bytes > cluster->max_size)
3198                goto out;
3199
3200        if (cluster->block_group != block_group)
3201                goto out;
3202
3203        node = rb_first(&cluster->root);
3204        if (!node)
3205                goto out;
3206
3207        entry = rb_entry(node, struct btrfs_free_space, offset_index);
3208        while (1) {
3209                if (entry->bytes < bytes)
3210                        *max_extent_size = max(get_max_extent_size(entry),
3211                                               *max_extent_size);
3212
3213                if (entry->bytes < bytes ||
3214                    (!entry->bitmap && entry->offset < min_start)) {
3215                        node = rb_next(&entry->offset_index);
3216                        if (!node)
3217                                break;
3218                        entry = rb_entry(node, struct btrfs_free_space,
3219                                         offset_index);
3220                        continue;
3221                }
3222
3223                if (entry->bitmap) {
3224                        ret = btrfs_alloc_from_bitmap(block_group,
3225                                                      cluster, entry, bytes,
3226                                                      cluster->window_start,
3227                                                      max_extent_size);
3228                        if (ret == 0) {
3229                                node = rb_next(&entry->offset_index);
3230                                if (!node)
3231                                        break;
3232                                entry = rb_entry(node, struct btrfs_free_space,
3233                                                 offset_index);
3234                                continue;
3235                        }
3236                        cluster->window_start += bytes;
3237                } else {
3238                        ret = entry->offset;
3239
3240                        entry->offset += bytes;
3241                        entry->bytes -= bytes;
3242                }
3243
3244                break;
3245        }
3246out:
3247        spin_unlock(&cluster->lock);
3248
3249        if (!ret)
3250                return 0;
3251
3252        spin_lock(&ctl->tree_lock);
3253
3254        if (!btrfs_free_space_trimmed(entry))
3255                atomic64_add(bytes, &discard_ctl->discard_bytes_saved);
3256
3257        ctl->free_space -= bytes;
3258        if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
3259                ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
3260
3261        spin_lock(&cluster->lock);
3262        if (entry->bytes == 0) {
3263                rb_erase(&entry->offset_index, &cluster->root);
3264                ctl->free_extents--;
3265                if (entry->bitmap) {
3266                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
3267                                        entry->bitmap);
3268                        ctl->total_bitmaps--;
3269                        recalculate_thresholds(ctl);
3270                } else if (!btrfs_free_space_trimmed(entry)) {
3271                        ctl->discardable_extents[BTRFS_STAT_CURR]--;
3272                }
3273                kmem_cache_free(btrfs_free_space_cachep, entry);
3274        }
3275
3276        spin_unlock(&cluster->lock);
3277        spin_unlock(&ctl->tree_lock);
3278
3279        return ret;
3280}
3281
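    /*
     * Try to build the cluster from a single bitmap entry: accumulate runs of
     * at least @min_bytes until @bytes are found with one run of at least
     * @cont1_bytes, then move the bitmap out of the free space cache's rbtrees
     * into the cluster's rbtree.  Returns 0 on success or -ENOSPC.
     */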
3282static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group,
3283                                struct btrfs_free_space *entry,
3284                                struct btrfs_free_cluster *cluster,
3285                                u64 offset, u64 bytes,
3286                                u64 cont1_bytes, u64 min_bytes)
3287{
3288        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3289        unsigned long next_zero;
3290        unsigned long i;
3291        unsigned long want_bits;
3292        unsigned long min_bits;
3293        unsigned long found_bits;
3294        unsigned long max_bits = 0;
3295        unsigned long start = 0;
3296        unsigned long total_found = 0;
3297        int ret;
3298
3299        i = offset_to_bit(entry->offset, ctl->unit,
3300                          max_t(u64, offset, entry->offset));
3301        want_bits = bytes_to_bits(bytes, ctl->unit);
3302        min_bits = bytes_to_bits(min_bytes, ctl->unit);
3303
3304        /*
3305         * Don't bother looking for a cluster in this bitmap if it's heavily
3306         * fragmented.
3307         */
3308        if (entry->max_extent_size &&
3309            entry->max_extent_size < cont1_bytes)
3310                return -ENOSPC;
3311again:
3312        found_bits = 0;
3313        for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
3314                next_zero = find_next_zero_bit(entry->bitmap,
3315                                               BITS_PER_BITMAP, i);
3316                if (next_zero - i >= min_bits) {
3317                        found_bits = next_zero - i;
3318                        if (found_bits > max_bits)
3319                                max_bits = found_bits;
3320                        break;
3321                }
3322                if (next_zero - i > max_bits)
3323                        max_bits = next_zero - i;
3324                i = next_zero;
3325        }
3326
3327        if (!found_bits) {
3328                entry->max_extent_size = (u64)max_bits * ctl->unit;
3329                return -ENOSPC;
3330        }
3331
3332        if (!total_found) {
3333                start = i;
3334                cluster->max_size = 0;
3335        }
3336
3337        total_found += found_bits;
3338
3339        if (cluster->max_size < found_bits * ctl->unit)
3340                cluster->max_size = found_bits * ctl->unit;
3341
3342        if (total_found < want_bits || cluster->max_size < cont1_bytes) {
3343                i = next_zero + 1;
3344                goto again;
3345        }
3346
3347        cluster->window_start = start * ctl->unit + entry->offset;
3348        rb_erase(&entry->offset_index, &ctl->free_space_offset);
3349        rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3350
3351        /*
3352         * We need to know if we're currently on the normal space index when we
3353         * manipulate the bitmap so that we know we need to remove and re-insert
3354         * it into the bytes_index tree.  Clear the bytes_index node here so the
3355         * bitmap manipulation helpers know not to mess with the bytes_index
3356         * until this bitmap entry is added back into the normal cache.
3357         */
3358        RB_CLEAR_NODE(&entry->bytes_index);
3359
3360        ret = tree_insert_offset(&cluster->root, entry->offset,
3361                                 &entry->offset_index, 1);
3362        ASSERT(!ret); /* -EEXIST; Logic error */
3363
3364        trace_btrfs_setup_cluster(block_group, cluster,
3365                                  total_found * ctl->unit, 1);
3366        return 0;
3367}
3368
3369/*
3370 * This searches the block group for just extents to fill the cluster with.
3371 * Try to find a cluster with at least bytes total bytes, at least one
3372 * extent of cont1_bytes, and other clusters of at least min_bytes.
3373 */
3374static noinline int
3375setup_cluster_no_bitmap(struct btrfs_block_group *block_group,
3376                        struct btrfs_free_cluster *cluster,
3377                        struct list_head *bitmaps, u64 offset, u64 bytes,
3378                        u64 cont1_bytes, u64 min_bytes)
3379{
3380        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3381        struct btrfs_free_space *first = NULL;
3382        struct btrfs_free_space *entry = NULL;
3383        struct btrfs_free_space *last;
3384        struct rb_node *node;
3385        u64 window_free;
3386        u64 max_extent;
3387        u64 total_size = 0;
3388
3389        entry = tree_search_offset(ctl, offset, 0, 1);
3390        if (!entry)
3391                return -ENOSPC;
3392
3393        /*
3394         * We don't want bitmaps, so just move along until we find a normal
3395         * extent entry.
3396         */
3397        while (entry->bitmap || entry->bytes < min_bytes) {
3398                if (entry->bitmap && list_empty(&entry->list))
3399                        list_add_tail(&entry->list, bitmaps);
3400                node = rb_next(&entry->offset_index);
3401                if (!node)
3402                        return -ENOSPC;
3403                entry = rb_entry(node, struct btrfs_free_space, offset_index);
3404        }
3405
3406        window_free = entry->bytes;
3407        max_extent = entry->bytes;
3408        first = entry;
3409        last = entry;
3410
3411        for (node = rb_next(&entry->offset_index); node;
3412             node = rb_next(&entry->offset_index)) {
3413                entry = rb_entry(node, struct btrfs_free_space, offset_index);
3414
3415                if (entry->bitmap) {
3416                        if (list_empty(&entry->list))
3417                                list_add_tail(&entry->list, bitmaps);
3418                        continue;
3419                }
3420
3421                if (entry->bytes < min_bytes)
3422                        continue;
3423
3424                last = entry;
3425                window_free += entry->bytes;
3426                if (entry->bytes > max_extent)
3427                        max_extent = entry->bytes;
3428        }
3429
3430        if (window_free < bytes || max_extent < cont1_bytes)
3431                return -ENOSPC;
3432
3433        cluster->window_start = first->offset;
3434
3435        node = &first->offset_index;
3436
3437        /*
3438         * now we've found our entries, pull them out of the free space
3439         * cache and put them into the cluster rbtree
3440         */
3441        do {
3442                int ret;
3443
3444                entry = rb_entry(node, struct btrfs_free_space, offset_index);
3445                node = rb_next(&entry->offset_index);
3446                if (entry->bitmap || entry->bytes < min_bytes)
3447                        continue;
3448
3449                rb_erase(&entry->offset_index, &ctl->free_space_offset);
3450                rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes);
3451                ret = tree_insert_offset(&cluster->root, entry->offset,
3452                                         &entry->offset_index, 0);
3453                total_size += entry->bytes;
3454                ASSERT(!ret); /* -EEXIST; Logic error */
3455        } while (node && entry != last);
3456
3457        cluster->max_size = max_extent;
3458        trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
3459        return 0;
3460}
3461
3462/*
3463 * This specifically looks for bitmaps that may work in the cluster, we assume
3464 * that we have already failed to find extents that will work.
3465 */
3466static noinline int
3467setup_cluster_bitmap(struct btrfs_block_group *block_group,
3468                     struct btrfs_free_cluster *cluster,
3469                     struct list_head *bitmaps, u64 offset, u64 bytes,
3470                     u64 cont1_bytes, u64 min_bytes)
3471{
3472        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3473        struct btrfs_free_space *entry = NULL;
3474        int ret = -ENOSPC;
3475        u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3476
3477        if (ctl->total_bitmaps == 0)
3478                return -ENOSPC;
3479
3480        /*
3481         * The bitmap that covers offset won't be in the list unless offset
3482         * is just its start offset.
3483         */
3484        if (!list_empty(bitmaps))
3485                entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3486
3487        if (!entry || entry->offset != bitmap_offset) {
3488                entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3489                if (entry && list_empty(&entry->list))
3490                        list_add(&entry->list, bitmaps);
3491        }
3492
3493        list_for_each_entry(entry, bitmaps, list) {
3494                if (entry->bytes < bytes)
3495                        continue;
3496                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3497                                           bytes, cont1_bytes, min_bytes);
3498                if (!ret)
3499                        return 0;
3500        }
3501
3502        /*
3503         * The bitmaps list has all the bitmaps that record free space
3504         * starting after offset, so no more search is required.
3505         */
3506        return -ENOSPC;
3507}
3508
3509/*
3510 * here we try to find a cluster of blocks in a block group.  The goal
3511 * is to find at least bytes+empty_size.
3512 * We might not find them all in one contiguous area.
3513 *
3514 * Returns zero and sets up the cluster if things worked out, otherwise
3515 * it returns -ENOSPC.
3516 */
3517int btrfs_find_space_cluster(struct btrfs_block_group *block_group,
3518                             struct btrfs_free_cluster *cluster,
3519                             u64 offset, u64 bytes, u64 empty_size)
3520{
3521        struct btrfs_fs_info *fs_info = block_group->fs_info;
3522        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3523        struct btrfs_free_space *entry, *tmp;
3524        LIST_HEAD(bitmaps);
3525        u64 min_bytes;
3526        u64 cont1_bytes;
3527        int ret;
3528
3529        /*
3530         * Choose the minimum extent size we'll require for this
3531         * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3532         * For metadata, allow allocations with smaller extents.  For
3533         * data, keep it dense.
3534         */
3535        if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3536                cont1_bytes = min_bytes = bytes + empty_size;
3537        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3538                cont1_bytes = bytes;
3539                min_bytes = fs_info->sectorsize;
3540        } else {
3541                cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3542                min_bytes = fs_info->sectorsize;
3543        }
3544
3545        spin_lock(&ctl->tree_lock);
3546
3547        /*
3548         * If we know we don't have enough space to make a cluster don't even
3549         * bother doing all the work to try and find one.
3550         */
3551        if (ctl->free_space < bytes) {
3552                spin_unlock(&ctl->tree_lock);
3553                return -ENOSPC;
3554        }
3555
3556        spin_lock(&cluster->lock);
3557
3558        /* someone already found a cluster, hooray */
3559        if (cluster->block_group) {
3560                ret = 0;
3561                goto out;
3562        }
3563
3564        trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3565                                 min_bytes);
3566
3567        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3568                                      bytes + empty_size,
3569                                      cont1_bytes, min_bytes);
3570        if (ret)
3571                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3572                                           offset, bytes + empty_size,
3573                                           cont1_bytes, min_bytes);
3574
3575        /* Clear our temporary list */
3576        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3577                list_del_init(&entry->list);
3578
3579        if (!ret) {
3580                btrfs_get_block_group(block_group);
3581                list_add_tail(&cluster->block_group_list,
3582                              &block_group->cluster_list);
3583                cluster->block_group = block_group;
3584        } else {
3585                trace_btrfs_failed_cluster_setup(block_group);
3586        }
3587out:
3588        spin_unlock(&cluster->lock);
3589        spin_unlock(&ctl->tree_lock);
3590
3591        return ret;
3592}
3593
3594/*
3595 * simple code to zero out a cluster
3596 */
3597void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3598{
3599        spin_lock_init(&cluster->lock);
3600        spin_lock_init(&cluster->refill_lock);
3601        cluster->root = RB_ROOT;
3602        cluster->max_size = 0;
3603        cluster->fragmented = false;
3604        INIT_LIST_HEAD(&cluster->block_group_list);
3605        cluster->block_group = NULL;
3606}
3607
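    /*
     * Discard [start, start + bytes) while the reserved range is held out of
     * the allocator, then give the trimmed space (and any untrimmed leftovers
     * of the reservation) back to the free space cache with the right trim
     * state.
     */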
3608static int do_trimming(struct btrfs_block_group *block_group,
3609                       u64 *total_trimmed, u64 start, u64 bytes,
3610                       u64 reserved_start, u64 reserved_bytes,
3611                       enum btrfs_trim_state reserved_trim_state,
3612                       struct btrfs_trim_range *trim_entry)
3613{
3614        struct btrfs_space_info *space_info = block_group->space_info;
3615        struct btrfs_fs_info *fs_info = block_group->fs_info;
3616        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3617        int ret;
3618        int update = 0;
3619        const u64 end = start + bytes;
3620        const u64 reserved_end = reserved_start + reserved_bytes;
3621        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3622        u64 trimmed = 0;
3623
3624        spin_lock(&space_info->lock);
3625        spin_lock(&block_group->lock);
3626        if (!block_group->ro) {
3627                block_group->reserved += reserved_bytes;
3628                space_info->bytes_reserved += reserved_bytes;
3629                update = 1;
3630        }
3631        spin_unlock(&block_group->lock);
3632        spin_unlock(&space_info->lock);
3633
3634        ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3635        if (!ret) {
3636                *total_trimmed += trimmed;
3637                trim_state = BTRFS_TRIM_STATE_TRIMMED;
3638        }
3639
3640        mutex_lock(&ctl->cache_writeout_mutex);
3641        if (reserved_start < start)
3642                __btrfs_add_free_space(block_group, reserved_start,
3643                                       start - reserved_start,
3644                                       reserved_trim_state);
3645        if (start + bytes < reserved_start + reserved_bytes)
3646                __btrfs_add_free_space(block_group, end, reserved_end - end,
3647                                       reserved_trim_state);
3648        __btrfs_add_free_space(block_group, start, bytes, trim_state);
3649        list_del(&trim_entry->list);
3650        mutex_unlock(&ctl->cache_writeout_mutex);
3651
3652        if (update) {
3653                spin_lock(&space_info->lock);
3654                spin_lock(&block_group->lock);
3655                if (block_group->ro)
3656                        space_info->bytes_readonly += reserved_bytes;
3657                block_group->reserved -= reserved_bytes;
3658                space_info->bytes_reserved -= reserved_bytes;
3659                spin_unlock(&block_group->lock);
3660                spin_unlock(&space_info->lock);
3661        }
3662
3663        return ret;
3664}
3665
3666/*
3667 * If @async is set, then we will trim 1 region and return.
3668 */
3669static int trim_no_bitmap(struct btrfs_block_group *block_group,
3670                          u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3671                          bool async)
3672{
3673        struct btrfs_discard_ctl *discard_ctl =
3674                                        &block_group->fs_info->discard_ctl;
3675        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3676        struct btrfs_free_space *entry;
3677        struct rb_node *node;
3678        int ret = 0;
3679        u64 extent_start;
3680        u64 extent_bytes;
3681        enum btrfs_trim_state extent_trim_state;
3682        u64 bytes;
3683        const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3684
3685        while (start < end) {
3686                struct btrfs_trim_range trim_entry;
3687
3688                mutex_lock(&ctl->cache_writeout_mutex);
3689                spin_lock(&ctl->tree_lock);
3690
3691                if (ctl->free_space < minlen)
3692                        goto out_unlock;
3693
3694                entry = tree_search_offset(ctl, start, 0, 1);
3695                if (!entry)
3696                        goto out_unlock;
3697
3698                /* Skip bitmaps and if async, already trimmed entries */
3699                while (entry->bitmap ||
3700                       (async && btrfs_free_space_trimmed(entry))) {
3701                        node = rb_next(&entry->offset_index);
3702                        if (!node)
3703                                goto out_unlock;
3704                        entry = rb_entry(node, struct btrfs_free_space,
3705                                         offset_index);
3706                }
3707
3708                if (entry->offset >= end)
3709                        goto out_unlock;
3710
3711                extent_start = entry->offset;
3712                extent_bytes = entry->bytes;
3713                extent_trim_state = entry->trim_state;
3714                if (async) {
3715                        start = entry->offset;
3716                        bytes = entry->bytes;
3717                        if (bytes < minlen) {
3718                                spin_unlock(&ctl->tree_lock);
3719                                mutex_unlock(&ctl->cache_writeout_mutex);
3720                                goto next;
3721                        }
3722                        unlink_free_space(ctl, entry, true);
3723                        /*
3724                         * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3725                         * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim
3726                         * X when we come back around.  So trim it now.
3727                         */
3728                        if (max_discard_size &&
3729                            bytes >= (max_discard_size +
3730                                      BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
3731                                bytes = max_discard_size;
3732                                extent_bytes = max_discard_size;
3733                                entry->offset += max_discard_size;
3734                                entry->bytes -= max_discard_size;
3735                                link_free_space(ctl, entry);
3736                        } else {
3737                                kmem_cache_free(btrfs_free_space_cachep, entry);
3738                        }
3739                } else {
3740                        start = max(start, extent_start);
3741                        bytes = min(extent_start + extent_bytes, end) - start;
3742                        if (bytes < minlen) {
3743                                spin_unlock(&ctl->tree_lock);
3744                                mutex_unlock(&ctl->cache_writeout_mutex);
3745                                goto next;
3746                        }
3747
3748                        unlink_free_space(ctl, entry, true);
3749                        kmem_cache_free(btrfs_free_space_cachep, entry);
3750                }
3751
3752                spin_unlock(&ctl->tree_lock);
3753                trim_entry.start = extent_start;
3754                trim_entry.bytes = extent_bytes;
3755                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3756                mutex_unlock(&ctl->cache_writeout_mutex);
3757
3758                ret = do_trimming(block_group, total_trimmed, start, bytes,
3759                                  extent_start, extent_bytes, extent_trim_state,
3760                                  &trim_entry);
3761                if (ret) {
3762                        block_group->discard_cursor = start + bytes;
3763                        break;
3764                }
3765next:
3766                start += bytes;
3767                block_group->discard_cursor = start;
3768                if (async && *total_trimmed)
3769                        break;
3770
3771                if (fatal_signal_pending(current)) {
3772                        ret = -ERESTARTSYS;
3773                        break;
3774                }
3775
3776                cond_resched();
3777        }
3778
3779        return ret;
3780
3781out_unlock:
3782        block_group->discard_cursor = btrfs_block_group_end(block_group);
3783        spin_unlock(&ctl->tree_lock);
3784        mutex_unlock(&ctl->cache_writeout_mutex);
3785
3786        return ret;
3787}
3788
3789/*
3790 * If we break out of trimming a bitmap prematurely, we should reset the
3791 * trimming bit.  In a rather contrived case, it's possible to race here so
3792 * reset the state to BTRFS_TRIM_STATE_UNTRIMMED.
3793 *
3794 * start = start of bitmap
3795 * end = near end of bitmap
3796 *
3797 * Thread 1:                    Thread 2:
3798 * trim_bitmaps(start)
3799 *                              trim_bitmaps(end)
3800 *                              end_trimming_bitmap()
3801 * reset_trimming_bitmap()
3802 */
3803static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset)
3804{
3805        struct btrfs_free_space *entry;
3806
3807        spin_lock(&ctl->tree_lock);
3808        entry = tree_search_offset(ctl, offset, 1, 0);
3809        if (entry) {
3810                if (btrfs_free_space_trimmed(entry)) {
3811                        ctl->discardable_extents[BTRFS_STAT_CURR] +=
3812                                entry->bitmap_extents;
3813                        ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes;
3814                }
3815                entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3816        }
3817
3818        spin_unlock(&ctl->tree_lock);
3819}
3820
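    /*
     * Finish trimming a bitmap: if it was being trimmed, mark it trimmed and
     * take it out of the discardable counters.
     */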
3821static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl,
3822                                struct btrfs_free_space *entry)
3823{
3824        if (btrfs_free_space_trimming_bitmap(entry)) {
3825                entry->trim_state = BTRFS_TRIM_STATE_TRIMMED;
3826                ctl->discardable_extents[BTRFS_STAT_CURR] -=
3827                        entry->bitmap_extents;
3828                ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes;
3829        }
3830}
3831
3832/*
3833 * If @async is set, then we will trim 1 region and return.
3834 */
3835static int trim_bitmaps(struct btrfs_block_group *block_group,
3836                        u64 *total_trimmed, u64 start, u64 end, u64 minlen,
3837                        u64 maxlen, bool async)
3838{
3839        struct btrfs_discard_ctl *discard_ctl =
3840                                        &block_group->fs_info->discard_ctl;
3841        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3842        struct btrfs_free_space *entry;
3843        int ret = 0;
3844        int ret2;
3845        u64 bytes;
3846        u64 offset = offset_to_bitmap(ctl, start);
3847        const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
3848
3849        while (offset < end) {
3850                bool next_bitmap = false;
3851                struct btrfs_trim_range trim_entry;
3852
3853                mutex_lock(&ctl->cache_writeout_mutex);
3854                spin_lock(&ctl->tree_lock);
3855
3856                if (ctl->free_space < minlen) {
3857                        block_group->discard_cursor =
3858                                btrfs_block_group_end(block_group);
3859                        spin_unlock(&ctl->tree_lock);
3860                        mutex_unlock(&ctl->cache_writeout_mutex);
3861                        break;
3862                }
3863
3864                entry = tree_search_offset(ctl, offset, 1, 0);
3865                /*
3866                 * Bitmaps are marked trimmed lossily now to prevent constant
3867                 * discarding of the same bitmap (the reason why we are bound
3868                 * by the filters).  So, retrim the block group bitmaps when we
3869                 * are preparing to punt to the unused_bgs list.  This uses
3870                 * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED
3871                 * which is the only discard index which sets minlen to 0.
3872                 */
3873                if (!entry || (async && minlen && start == offset &&
3874                               btrfs_free_space_trimmed(entry))) {
3875                        spin_unlock(&ctl->tree_lock);
3876                        mutex_unlock(&ctl->cache_writeout_mutex);
3877                        next_bitmap = true;
3878                        goto next;
3879                }
3880
3881                /*
3882                 * Async discard bitmap trimming begins by setting the start
3883                 * to be key.objectid, and offset_to_bitmap() aligns to the
3884                 * start of the bitmap.  This lets us know we are fully
3885                 * scanning the bitmap rather than only some portion of it.
3886                 */
3887                if (start == offset)
3888                        entry->trim_state = BTRFS_TRIM_STATE_TRIMMING;
3889
3890                bytes = minlen;
3891                ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3892                if (ret2 || start >= end) {
3893                        /*
3894                         * We lossily consider a bitmap trimmed if we only skip
3895                         * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER.
3896                         */
3897                        if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER)
3898                                end_trimming_bitmap(ctl, entry);
3899                        else
3900                                entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
3901                        spin_unlock(&ctl->tree_lock);
3902                        mutex_unlock(&ctl->cache_writeout_mutex);
3903                        next_bitmap = true;
3904                        goto next;
3905                }
3906
3907                /*
3908                 * We already trimmed a region, but are using the locking above
3909                 * to reset the trim_state.
3910                 */
3911                if (async && *total_trimmed) {
3912                        spin_unlock(&ctl->tree_lock);
3913                        mutex_unlock(&ctl->cache_writeout_mutex);
3914                        goto out;
3915                }
3916
3917                bytes = min(bytes, end - start);
3918                if (bytes < minlen || (async && maxlen && bytes > maxlen)) {
3919                        spin_unlock(&ctl->tree_lock);
3920                        mutex_unlock(&ctl->cache_writeout_mutex);
3921                        goto next;
3922                }
3923
3924                /*
3925                 * Let bytes = BTRFS_MAX_DISCARD_SIZE + X.
3926                 * If X < @minlen, we won't trim X when we come back around.
3927                 * So trim it now.  We differ here from trimming extents as we
3928                 * don't keep individual state per bit.
3929                 */
3930                if (async &&
3931                    max_discard_size &&
3932                    bytes > (max_discard_size + minlen))
3933                        bytes = max_discard_size;
3934
3935                bitmap_clear_bits(ctl, entry, start, bytes, true);
3936                if (entry->bytes == 0)
3937                        free_bitmap(ctl, entry);
3938
3939                spin_unlock(&ctl->tree_lock);
3940                trim_entry.start = start;
3941                trim_entry.bytes = bytes;
3942                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3943                mutex_unlock(&ctl->cache_writeout_mutex);
3944
3945                ret = do_trimming(block_group, total_trimmed, start, bytes,
3946                                  start, bytes, 0, &trim_entry);
3947                if (ret) {
3948                        reset_trimming_bitmap(ctl, offset);
3949                        block_group->discard_cursor =
3950                                btrfs_block_group_end(block_group);
3951                        break;
3952                }
3953next:
3954                if (next_bitmap) {
3955                        offset += BITS_PER_BITMAP * ctl->unit;
3956                        start = offset;
3957                } else {
3958                        start += bytes;
3959                }
3960                block_group->discard_cursor = start;
3961
3962                if (fatal_signal_pending(current)) {
3963                        if (start != offset)
3964                                reset_trimming_bitmap(ctl, offset);
3965                        ret = -ERESTARTSYS;
3966                        break;
3967                }
3968
3969                cond_resched();
3970        }
3971
3972        if (offset >= end)
3973                block_group->discard_cursor = end;
3974
3975out:
3976        return ret;
3977}
3978
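    /*
     * Synchronously trim [start, end) of a block group: first the extent
     * entries, then the bitmaps.  The block group is frozen around the
     * operation so it cannot be removed underneath us.
     */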
3979int btrfs_trim_block_group(struct btrfs_block_group *block_group,
3980                           u64 *trimmed, u64 start, u64 end, u64 minlen)
3981{
3982        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3983        int ret;
3984        u64 rem = 0;
3985
3986        ASSERT(!btrfs_is_zoned(block_group->fs_info));
3987
3988        *trimmed = 0;
3989
3990        spin_lock(&block_group->lock);
3991        if (block_group->removed) {
3992                spin_unlock(&block_group->lock);
3993                return 0;
3994        }
3995        btrfs_freeze_block_group(block_group);
3996        spin_unlock(&block_group->lock);
3997
3998        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false);
3999        if (ret)
4000                goto out;
4001
4002        ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false);
4003        div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem);
4004        /* If we ended in the middle of a bitmap, reset the trimming flag */
4005        if (rem)
4006                reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end));
4007out:
4008        btrfs_unfreeze_block_group(block_group);
4009        return ret;
4010}
4011
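    /*
     * Trim only the plain extent entries of a block group; with @async set
     * (the async discard path) at most one region is trimmed per call.
     */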
4012int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
4013                                   u64 *trimmed, u64 start, u64 end, u64 minlen,
4014                                   bool async)
4015{
4016        int ret;
4017
4018        *trimmed = 0;
4019
4020        spin_lock(&block_group->lock);
4021        if (block_group->removed) {
4022                spin_unlock(&block_group->lock);
4023                return 0;
4024        }
4025        btrfs_freeze_block_group(block_group);
4026        spin_unlock(&block_group->lock);
4027
4028        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
4029        btrfs_unfreeze_block_group(block_group);
4030
4031        return ret;
4032}
4033
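    /*
     * Trim only the bitmap entries of a block group, honoring the @minlen and
     * @maxlen size filters; with @async set at most one region is trimmed per
     * call.
     */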
4034int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
4035                                   u64 *trimmed, u64 start, u64 end, u64 minlen,
4036                                   u64 maxlen, bool async)
4037{
4038        int ret;
4039
4040        *trimmed = 0;
4041
4042        spin_lock(&block_group->lock);
4043        if (block_group->removed) {
4044                spin_unlock(&block_group->lock);
4045                return 0;
4046        }
4047        btrfs_freeze_block_group(block_group);
4048        spin_unlock(&block_group->lock);
4049
4050        ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen,
4051                           async);
4052
4053        btrfs_unfreeze_block_group(block_group);
4054
4055        return ret;
4056}
4057
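    /* The v1 space cache is active when the superblock has a cache generation. */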
4058bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info)
4059{
4060        return btrfs_super_cache_generation(fs_info->super_copy);
4061}
4062
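    /*
     * Remove the v1 free space inode of every block group, done when the
     * filesystem is switched away from space_cache=v1.
     */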
4063static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
4064                                       struct btrfs_trans_handle *trans)
4065{
4066        struct btrfs_block_group *block_group;
4067        struct rb_node *node;
4068        int ret = 0;
4069
4070        btrfs_info(fs_info, "cleaning free space cache v1");
4071
4072        node = rb_first(&fs_info->block_group_cache_tree);
4073        while (node) {
4074                block_group = rb_entry(node, struct btrfs_block_group, cache_node);
4075                ret = btrfs_remove_free_space_inode(trans, NULL, block_group);
4076                if (ret)
4077                        goto out;
4078                node = rb_next(node);
4079        }
4080out:
4081        return ret;
4082}
4083
4084int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)
4085{
4086        struct btrfs_trans_handle *trans;
4087        int ret;
4088
4089        /*
4090         * update_super_roots will appropriately set or unset
4091         * super_copy->cache_generation based on SPACE_CACHE and
4092         * BTRFS_FS_CLEANUP_SPACE_CACHE_V1. For this reason, we need a
4093         * transaction commit whether we are enabling space cache v1 and don't
4094         * have any other work to do, or are disabling it and removing free
4095         * space inodes.
4096         */
4097        trans = btrfs_start_transaction(fs_info->tree_root, 0);
4098        if (IS_ERR(trans))
4099                return PTR_ERR(trans);
4100
4101        if (!active) {
4102                set_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4103                ret = cleanup_free_space_cache_v1(fs_info, trans);
4104                if (ret) {
4105                        btrfs_abort_transaction(trans, ret);
4106                        btrfs_end_transaction(trans);
4107                        goto out;
4108                }
4109        }
4110
4111        ret = btrfs_commit_transaction(trans);
4112out:
4113        clear_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags);
4114
4115        return ret;
4116}
4117
4118#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4119/*
4120 * Use this if you need to make a bitmap or extent entry specifically.  It
4121 * doesn't do any of the merging that add_free_space does, and it acts a lot
4122 * like how the free space cache loading works, so you can get really weird
4123 * configurations.
4124 */
4125int test_add_free_space_entry(struct btrfs_block_group *cache,
4126                              u64 offset, u64 bytes, bool bitmap)
4127{
4128        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4129        struct btrfs_free_space *info = NULL, *bitmap_info;
4130        void *map = NULL;
4131        enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED;
4132        u64 bytes_added;
4133        int ret;
4134
4135again:
4136        if (!info) {
4137                info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
4138                if (!info)
4139                        return -ENOMEM;
4140        }
4141
4142        if (!bitmap) {
4143                spin_lock(&ctl->tree_lock);
4144                info->offset = offset;
4145                info->bytes = bytes;
4146                info->max_extent_size = 0;
4147                ret = link_free_space(ctl, info);
4148                spin_unlock(&ctl->tree_lock);
4149                if (ret)
4150                        kmem_cache_free(btrfs_free_space_cachep, info);
4151                return ret;
4152        }
4153
4154        if (!map) {
4155                map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
4156                if (!map) {
4157                        kmem_cache_free(btrfs_free_space_cachep, info);
4158                        return -ENOMEM;
4159                }
4160        }
4161
4162        spin_lock(&ctl->tree_lock);
4163        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4164                                         1, 0);
4165        if (!bitmap_info) {
4166                info->bitmap = map;
4167                map = NULL;
4168                add_new_bitmap(ctl, info, offset);
4169                bitmap_info = info;
4170                info = NULL;
4171        }
4172
4173        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
4174                                          trim_state);
4175
4176        bytes -= bytes_added;
4177        offset += bytes_added;
4178        spin_unlock(&ctl->tree_lock);
4179
4180        if (bytes)
4181                goto again;
4182
4183        if (info)
4184                kmem_cache_free(btrfs_free_space_cachep, info);
4185        if (map)
4186                kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
4187        return 0;
4188}
4189
4190/*
4191 * Checks to see if the given range is in the free space cache.  This is really
4192 * just used to check the absence of space, so if there is free space in the
4193 * range at all we will return 1.
4194 */
4195int test_check_exists(struct btrfs_block_group *cache,
4196                      u64 offset, u64 bytes)
4197{
4198        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
4199        struct btrfs_free_space *info;
4200        int ret = 0;
4201
4202        spin_lock(&ctl->tree_lock);
4203        info = tree_search_offset(ctl, offset, 0, 0);
4204        if (!info) {
4205                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
4206                                          1, 0);
4207                if (!info)
4208                        goto out;
4209        }
4210
4211have_info:
4212        if (info->bitmap) {
4213                u64 bit_off, bit_bytes;
4214                struct rb_node *n;
4215                struct btrfs_free_space *tmp;
4216
4217                bit_off = offset;
4218                bit_bytes = ctl->unit;
4219                ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
4220                if (!ret) {
4221                        if (bit_off == offset) {
4222                                ret = 1;
4223                                goto out;
4224                        } else if (bit_off > offset &&
4225                                   offset + bytes > bit_off) {
4226                                ret = 1;
4227                                goto out;
4228                        }
4229                }
4230
4231                n = rb_prev(&info->offset_index);
4232                while (n) {
4233                        tmp = rb_entry(n, struct btrfs_free_space,
4234                                       offset_index);
4235                        if (tmp->offset + tmp->bytes < offset)
4236                                break;
4237                        if (offset + bytes < tmp->offset) {
4238                                n = rb_prev(&tmp->offset_index);
4239                                continue;
4240                        }
4241                        info = tmp;
4242                        goto have_info;
4243                }
4244
4245                n = rb_next(&info->offset_index);
4246                while (n) {
4247                        tmp = rb_entry(n, struct btrfs_free_space,
4248                                       offset_index);
4249                        if (offset + bytes < tmp->offset)
4250                                break;
4251                        if (tmp->offset + tmp->bytes < offset) {
4252                                n = rb_next(&tmp->offset_index);
4253                                continue;
4254                        }
4255                        info = tmp;
4256                        goto have_info;
4257                }
4258
4259                ret = 0;
4260                goto out;
4261        }
4262
4263        if (info->offset == offset) {
4264                ret = 1;
4265                goto out;
4266        }
4267
4268        if (offset > info->offset && offset < info->offset + info->bytes)
4269                ret = 1;
4270out:
4271        spin_unlock(&ctl->tree_lock);
4272        return ret;
4273}
4274#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
4275