linux/fs/btrfs/free-space-cache.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2008 Red Hat.  All rights reserved.
   4 */
   5
   6#include <linux/pagemap.h>
   7#include <linux/sched.h>
   8#include <linux/sched/signal.h>
   9#include <linux/slab.h>
  10#include <linux/math64.h>
  11#include <linux/ratelimit.h>
  12#include <linux/error-injection.h>
  13#include <linux/sched/mm.h>
  14#include "ctree.h"
  15#include "free-space-cache.h"
  16#include "transaction.h"
  17#include "disk-io.h"
  18#include "extent_io.h"
  19#include "inode-map.h"
  20#include "volumes.h"
  21#include "space-info.h"
  22#include "delalloc-space.h"
  23#include "block-group.h"
  24
  25#define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
  26#define MAX_CACHE_BYTES_PER_GIG SZ_32K
  27
  28struct btrfs_trim_range {
  29        u64 start;
  30        u64 bytes;
  31        struct list_head list;
  32};
  33
  34static int link_free_space(struct btrfs_free_space_ctl *ctl,
  35                           struct btrfs_free_space *info);
  36static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
  37                              struct btrfs_free_space *info);
  38static int btrfs_wait_cache_io_root(struct btrfs_root *root,
  39                             struct btrfs_trans_handle *trans,
  40                             struct btrfs_io_ctl *io_ctl,
  41                             struct btrfs_path *path);
  42
  43static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
  44                                               struct btrfs_path *path,
  45                                               u64 offset)
  46{
  47        struct btrfs_fs_info *fs_info = root->fs_info;
  48        struct btrfs_key key;
  49        struct btrfs_key location;
  50        struct btrfs_disk_key disk_key;
  51        struct btrfs_free_space_header *header;
  52        struct extent_buffer *leaf;
  53        struct inode *inode = NULL;
  54        unsigned nofs_flag;
  55        int ret;
  56
  57        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
  58        key.offset = offset;
  59        key.type = 0;
  60
  61        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  62        if (ret < 0)
  63                return ERR_PTR(ret);
  64        if (ret > 0) {
  65                btrfs_release_path(path);
  66                return ERR_PTR(-ENOENT);
  67        }
  68
  69        leaf = path->nodes[0];
  70        header = btrfs_item_ptr(leaf, path->slots[0],
  71                                struct btrfs_free_space_header);
  72        btrfs_free_space_key(leaf, header, &disk_key);
  73        btrfs_disk_key_to_cpu(&location, &disk_key);
  74        btrfs_release_path(path);
  75
  76        /*
  77         * We are often under a trans handle at this point, so we need to make
  78         * sure NOFS is set to keep us from deadlocking.
  79         */
  80        nofs_flag = memalloc_nofs_save();
  81        inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
  82        btrfs_release_path(path);
  83        memalloc_nofs_restore(nofs_flag);
  84        if (IS_ERR(inode))
  85                return inode;
  86
  87        mapping_set_gfp_mask(inode->i_mapping,
  88                        mapping_gfp_constraint(inode->i_mapping,
  89                        ~(__GFP_FS | __GFP_HIGHMEM)));
  90
  91        return inode;
  92}
  93
  94struct inode *lookup_free_space_inode(
  95                struct btrfs_block_group_cache *block_group,
  96                struct btrfs_path *path)
  97{
  98        struct btrfs_fs_info *fs_info = block_group->fs_info;
  99        struct inode *inode = NULL;
 100        u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 101
 102        spin_lock(&block_group->lock);
 103        if (block_group->inode)
 104                inode = igrab(block_group->inode);
 105        spin_unlock(&block_group->lock);
 106        if (inode)
 107                return inode;
 108
 109        inode = __lookup_free_space_inode(fs_info->tree_root, path,
 110                                          block_group->key.objectid);
 111        if (IS_ERR(inode))
 112                return inode;
 113
 114        spin_lock(&block_group->lock);
 115        if (!((BTRFS_I(inode)->flags & flags) == flags)) {
 116                btrfs_info(fs_info, "Old style space inode found, converting.");
 117                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
 118                        BTRFS_INODE_NODATACOW;
 119                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 120        }
 121
 122        if (!block_group->iref) {
 123                block_group->inode = igrab(inode);
 124                block_group->iref = 1;
 125        }
 126        spin_unlock(&block_group->lock);
 127
 128        return inode;
 129}
 130
 131static int __create_free_space_inode(struct btrfs_root *root,
 132                                     struct btrfs_trans_handle *trans,
 133                                     struct btrfs_path *path,
 134                                     u64 ino, u64 offset)
 135{
 136        struct btrfs_key key;
 137        struct btrfs_disk_key disk_key;
 138        struct btrfs_free_space_header *header;
 139        struct btrfs_inode_item *inode_item;
 140        struct extent_buffer *leaf;
 141        u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
 142        int ret;
 143
 144        ret = btrfs_insert_empty_inode(trans, root, path, ino);
 145        if (ret)
 146                return ret;
 147
 148        /* We inline crc's for the free disk space cache */
 149        if (ino != BTRFS_FREE_INO_OBJECTID)
 150                flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 151
 152        leaf = path->nodes[0];
 153        inode_item = btrfs_item_ptr(leaf, path->slots[0],
 154                                    struct btrfs_inode_item);
 155        btrfs_item_key(leaf, &disk_key, path->slots[0]);
 156        memzero_extent_buffer(leaf, (unsigned long)inode_item,
 157                             sizeof(*inode_item));
 158        btrfs_set_inode_generation(leaf, inode_item, trans->transid);
 159        btrfs_set_inode_size(leaf, inode_item, 0);
 160        btrfs_set_inode_nbytes(leaf, inode_item, 0);
 161        btrfs_set_inode_uid(leaf, inode_item, 0);
 162        btrfs_set_inode_gid(leaf, inode_item, 0);
 163        btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
 164        btrfs_set_inode_flags(leaf, inode_item, flags);
 165        btrfs_set_inode_nlink(leaf, inode_item, 1);
 166        btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 167        btrfs_set_inode_block_group(leaf, inode_item, offset);
 168        btrfs_mark_buffer_dirty(leaf);
 169        btrfs_release_path(path);
 170
 171        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 172        key.offset = offset;
 173        key.type = 0;
 174        ret = btrfs_insert_empty_item(trans, root, path, &key,
 175                                      sizeof(struct btrfs_free_space_header));
 176        if (ret < 0) {
 177                btrfs_release_path(path);
 178                return ret;
 179        }
 180
 181        leaf = path->nodes[0];
 182        header = btrfs_item_ptr(leaf, path->slots[0],
 183                                struct btrfs_free_space_header);
 184        memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
 185        btrfs_set_free_space_key(leaf, header, &disk_key);
 186        btrfs_mark_buffer_dirty(leaf);
 187        btrfs_release_path(path);
 188
 189        return 0;
 190}
 191
 192int create_free_space_inode(struct btrfs_trans_handle *trans,
 193                            struct btrfs_block_group_cache *block_group,
 194                            struct btrfs_path *path)
 195{
 196        int ret;
 197        u64 ino;
 198
 199        ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino);
 200        if (ret < 0)
 201                return ret;
 202
 203        return __create_free_space_inode(trans->fs_info->tree_root, trans, path,
 204                                         ino, block_group->key.objectid);
 205}
 206
 207int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
 208                                       struct btrfs_block_rsv *rsv)
 209{
 210        u64 needed_bytes;
 211        int ret;
 212
 213        /* 1 for slack space, 1 for updating the inode */
 214        needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) +
 215                btrfs_calc_metadata_size(fs_info, 1);
 216
 217        spin_lock(&rsv->lock);
 218        if (rsv->reserved < needed_bytes)
 219                ret = -ENOSPC;
 220        else
 221                ret = 0;
 222        spin_unlock(&rsv->lock);
 223        return ret;
 224}
 225
 226int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
 227                                    struct btrfs_block_group_cache *block_group,
 228                                    struct inode *inode)
 229{
 230        struct btrfs_root *root = BTRFS_I(inode)->root;
 231        int ret = 0;
 232        bool locked = false;
 233
 234        if (block_group) {
 235                struct btrfs_path *path = btrfs_alloc_path();
 236
 237                if (!path) {
 238                        ret = -ENOMEM;
 239                        goto fail;
 240                }
 241                locked = true;
 242                mutex_lock(&trans->transaction->cache_write_mutex);
 243                if (!list_empty(&block_group->io_list)) {
 244                        list_del_init(&block_group->io_list);
 245
 246                        btrfs_wait_cache_io(trans, block_group, path);
 247                        btrfs_put_block_group(block_group);
 248                }
 249
 250                /*
  251         * now that we've truncated the cache away, it's no longer
  252         * set up or written
 253                 */
 254                spin_lock(&block_group->lock);
 255                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 256                spin_unlock(&block_group->lock);
 257                btrfs_free_path(path);
 258        }
 259
 260        btrfs_i_size_write(BTRFS_I(inode), 0);
 261        truncate_pagecache(inode, 0);
 262
 263        /*
 264         * We skip the throttling logic for free space cache inodes, so we don't
 265         * need to check for -EAGAIN.
 266         */
 267        ret = btrfs_truncate_inode_items(trans, root, inode,
 268                                         0, BTRFS_EXTENT_DATA_KEY);
 269        if (ret)
 270                goto fail;
 271
 272        ret = btrfs_update_inode(trans, root, inode);
 273
 274fail:
 275        if (locked)
 276                mutex_unlock(&trans->transaction->cache_write_mutex);
 277        if (ret)
 278                btrfs_abort_transaction(trans, ret);
 279
 280        return ret;
 281}
 282
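     /* Warm the page cache for the whole cache file before we parse it. */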
 283static void readahead_cache(struct inode *inode)
 284{
 285        struct file_ra_state *ra;
 286        unsigned long last_index;
 287
 288        ra = kzalloc(sizeof(*ra), GFP_NOFS);
 289        if (!ra)
 290                return;
 291
 292        file_ra_state_init(ra, inode->i_mapping);
 293        last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 294
 295        page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
 296
 297        kfree(ra);
 298}
 299
 300static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 301                       int write)
 302{
 303        int num_pages;
 304        int check_crcs = 0;
 305
 306        num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 307
 308        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
 309                check_crcs = 1;
 310
 311        /* Make sure we can fit our crcs and generation into the first page */
 312        if (write && check_crcs &&
 313            (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
 314                return -ENOSPC;
 315
 316        memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
 317
 318        io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
 319        if (!io_ctl->pages)
 320                return -ENOMEM;
 321
 322        io_ctl->num_pages = num_pages;
 323        io_ctl->fs_info = btrfs_sb(inode->i_sb);
 324        io_ctl->check_crcs = check_crcs;
 325        io_ctl->inode = inode;
 326
 327        return 0;
 328}
 329ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);
 330
 331static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 332{
 333        kfree(io_ctl->pages);
 334        io_ctl->pages = NULL;
 335}
 336
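     /*
      * The cache inode's mapping is restricted to !__GFP_HIGHMEM (see
      * __lookup_free_space_inode), so page_address() is always valid and
      * "mapping" a page below just means pointing cur/orig at it.
      */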
 337static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
 338{
 339        if (io_ctl->cur) {
 340                io_ctl->cur = NULL;
 341                io_ctl->orig = NULL;
 342        }
 343}
 344
 345static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 346{
 347        ASSERT(io_ctl->index < io_ctl->num_pages);
 348        io_ctl->page = io_ctl->pages[io_ctl->index++];
 349        io_ctl->cur = page_address(io_ctl->page);
 350        io_ctl->orig = io_ctl->cur;
 351        io_ctl->size = PAGE_SIZE;
 352        if (clear)
 353                clear_page(io_ctl->cur);
 354}
 355
 356static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 357{
 358        int i;
 359
 360        io_ctl_unmap_page(io_ctl);
 361
 362        for (i = 0; i < io_ctl->num_pages; i++) {
 363                if (io_ctl->pages[i]) {
 364                        ClearPageChecked(io_ctl->pages[i]);
 365                        unlock_page(io_ctl->pages[i]);
 366                        put_page(io_ctl->pages[i]);
 367                }
 368        }
 369}
 370
 371static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 372                                int uptodate)
 373{
 374        struct page *page;
 375        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 376        int i;
 377
 378        for (i = 0; i < io_ctl->num_pages; i++) {
 379                page = find_or_create_page(inode->i_mapping, i, mask);
 380                if (!page) {
 381                        io_ctl_drop_pages(io_ctl);
 382                        return -ENOMEM;
 383                }
 384                io_ctl->pages[i] = page;
 385                if (uptodate && !PageUptodate(page)) {
 386                        btrfs_readpage(NULL, page);
 387                        lock_page(page);
 388                        if (!PageUptodate(page)) {
 389                                btrfs_err(BTRFS_I(inode)->root->fs_info,
 390                                           "error reading free space cache");
 391                                io_ctl_drop_pages(io_ctl);
 392                                return -EIO;
 393                        }
 394                }
 395        }
 396
 397        for (i = 0; i < io_ctl->num_pages; i++) {
 398                clear_page_dirty_for_io(io_ctl->pages[i]);
 399                set_page_extent_mapped(io_ctl->pages[i]);
 400        }
 401
 402        return 0;
 403}
 404
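     /*
      * Layout of the first cache page: one crc32c per page (only when
      * check_crcs is set), followed by a u64 generation stamp.
      * io_ctl_set_generation() and io_ctl_check_generation() skip the crc
      * area to reach the generation field.
      */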
 405static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 406{
 407        __le64 *val;
 408
 409        io_ctl_map_page(io_ctl, 1);
 410
 411        /*
 412         * Skip the csum areas.  If we don't check crcs then we just have a
 413         * 64bit chunk at the front of the first page.
 414         */
 415        if (io_ctl->check_crcs) {
 416                io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
 417                io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
 418        } else {
 419                io_ctl->cur += sizeof(u64);
 420                io_ctl->size -= sizeof(u64) * 2;
 421        }
 422
 423        val = io_ctl->cur;
 424        *val = cpu_to_le64(generation);
 425        io_ctl->cur += sizeof(u64);
 426}
 427
 428static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
 429{
 430        __le64 *gen;
 431
 432        /*
 433         * Skip the crc area.  If we don't check crcs then we just have a 64bit
 434         * chunk at the front of the first page.
 435         */
 436        if (io_ctl->check_crcs) {
 437                io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
 438                io_ctl->size -= sizeof(u64) +
 439                        (sizeof(u32) * io_ctl->num_pages);
 440        } else {
 441                io_ctl->cur += sizeof(u64);
 442                io_ctl->size -= sizeof(u64) * 2;
 443        }
 444
 445        gen = io_ctl->cur;
 446        if (le64_to_cpu(*gen) != generation) {
 447                btrfs_err_rl(io_ctl->fs_info,
 448                        "space cache generation (%llu) does not match inode (%llu)",
 449                                *gen, generation);
 450                io_ctl_unmap_page(io_ctl);
 451                return -EIO;
 452        }
 453        io_ctl->cur += sizeof(u64);
 454        return 0;
 455}
 456
 457static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 458{
 459        u32 *tmp;
 460        u32 crc = ~(u32)0;
 461        unsigned offset = 0;
 462
 463        if (!io_ctl->check_crcs) {
 464                io_ctl_unmap_page(io_ctl);
 465                return;
 466        }
 467
 468        if (index == 0)
 469                offset = sizeof(u32) * io_ctl->num_pages;
 470
 471        crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 472        btrfs_crc32c_final(crc, (u8 *)&crc);
 473        io_ctl_unmap_page(io_ctl);
 474        tmp = page_address(io_ctl->pages[0]);
 475        tmp += index;
 476        *tmp = crc;
 477}
 478
 479static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 480{
 481        u32 *tmp, val;
 482        u32 crc = ~(u32)0;
 483        unsigned offset = 0;
 484
 485        if (!io_ctl->check_crcs) {
 486                io_ctl_map_page(io_ctl, 0);
 487                return 0;
 488        }
 489
 490        if (index == 0)
 491                offset = sizeof(u32) * io_ctl->num_pages;
 492
 493        tmp = page_address(io_ctl->pages[0]);
 494        tmp += index;
 495        val = *tmp;
 496
 497        io_ctl_map_page(io_ctl, 0);
 498        crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset);
 499        btrfs_crc32c_final(crc, (u8 *)&crc);
 500        if (val != crc) {
 501                btrfs_err_rl(io_ctl->fs_info,
 502                        "csum mismatch on free space cache");
 503                io_ctl_unmap_page(io_ctl);
 504                return -EIO;
 505        }
 506
 507        return 0;
 508}
 509
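     /*
      * Pack one on-disk entry (offset, bytes, type).  For bitmaps only the
      * header is written here; the payload is copied by io_ctl_add_bitmap().
      * When the current page can't fit another entry, crc it and map the next.
      */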
 510static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
 511                            void *bitmap)
 512{
 513        struct btrfs_free_space_entry *entry;
 514
 515        if (!io_ctl->cur)
 516                return -ENOSPC;
 517
 518        entry = io_ctl->cur;
 519        entry->offset = cpu_to_le64(offset);
 520        entry->bytes = cpu_to_le64(bytes);
 521        entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
 522                BTRFS_FREE_SPACE_EXTENT;
 523        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 524        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 525
 526        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 527                return 0;
 528
 529        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 530
 531        /* No more pages to map */
 532        if (io_ctl->index >= io_ctl->num_pages)
 533                return 0;
 534
 535        /* map the next page */
 536        io_ctl_map_page(io_ctl, 1);
 537        return 0;
 538}
 539
 540static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
 541{
 542        if (!io_ctl->cur)
 543                return -ENOSPC;
 544
 545        /*
 546         * If we aren't at the start of the current page, unmap this one and
 547         * map the next one if there is any left.
 548         */
 549        if (io_ctl->cur != io_ctl->orig) {
 550                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 551                if (io_ctl->index >= io_ctl->num_pages)
 552                        return -ENOSPC;
 553                io_ctl_map_page(io_ctl, 0);
 554        }
 555
 556        copy_page(io_ctl->cur, bitmap);
 557        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 558        if (io_ctl->index < io_ctl->num_pages)
 559                io_ctl_map_page(io_ctl, 0);
 560        return 0;
 561}
 562
 563static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
 564{
 565        /*
 566         * If we're not on the boundary we know we've modified the page and we
 567         * need to crc the page.
 568         */
 569        if (io_ctl->cur != io_ctl->orig)
 570                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 571        else
 572                io_ctl_unmap_page(io_ctl);
 573
 574        while (io_ctl->index < io_ctl->num_pages) {
 575                io_ctl_map_page(io_ctl, 1);
 576                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 577        }
 578}
 579
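     /*
      * Read the next on-disk entry, verifying each page's crc the first
      * time we step onto it.
      */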
 580static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
 581                            struct btrfs_free_space *entry, u8 *type)
 582{
 583        struct btrfs_free_space_entry *e;
 584        int ret;
 585
 586        if (!io_ctl->cur) {
 587                ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 588                if (ret)
 589                        return ret;
 590        }
 591
 592        e = io_ctl->cur;
 593        entry->offset = le64_to_cpu(e->offset);
 594        entry->bytes = le64_to_cpu(e->bytes);
 595        *type = e->type;
 596        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 597        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 598
 599        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 600                return 0;
 601
 602        io_ctl_unmap_page(io_ctl);
 603
 604        return 0;
 605}
 606
 607static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 608                              struct btrfs_free_space *entry)
 609{
 610        int ret;
 611
 612        ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 613        if (ret)
 614                return ret;
 615
 616        copy_page(entry->bitmap, io_ctl->cur);
 617        io_ctl_unmap_page(io_ctl);
 618
 619        return 0;
 620}
 621
 622/*
 623 * Since we attach pinned extents after the fact we can have contiguous sections
 624 * of free space that are split up in entries.  This poses a problem with the
 625 * tree logging stuff since it could have allocated across what appears to be 2
 626 * entries since we would have merged the entries when adding the pinned extents
 627 * back to the free space cache.  So run through the space cache that we just
 628 * loaded and merge contiguous entries.  This will make the log replay stuff not
 629 * blow up and it will make for nicer allocator behavior.
 630 */
 631static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
 632{
 633        struct btrfs_free_space *e, *prev = NULL;
 634        struct rb_node *n;
 635
 636again:
 637        spin_lock(&ctl->tree_lock);
 638        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
 639                e = rb_entry(n, struct btrfs_free_space, offset_index);
 640                if (!prev)
 641                        goto next;
 642                if (e->bitmap || prev->bitmap)
 643                        goto next;
 644                if (prev->offset + prev->bytes == e->offset) {
 645                        unlink_free_space(ctl, prev);
 646                        unlink_free_space(ctl, e);
 647                        prev->bytes += e->bytes;
 648                        kmem_cache_free(btrfs_free_space_cachep, e);
 649                        link_free_space(ctl, prev);
 650                        prev = NULL;
 651                        spin_unlock(&ctl->tree_lock);
 652                        goto again;
 653                }
 654next:
 655                prev = e;
 656        }
 657        spin_unlock(&ctl->tree_lock);
 658}
 659
 660static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 661                                   struct btrfs_free_space_ctl *ctl,
 662                                   struct btrfs_path *path, u64 offset)
 663{
 664        struct btrfs_fs_info *fs_info = root->fs_info;
 665        struct btrfs_free_space_header *header;
 666        struct extent_buffer *leaf;
 667        struct btrfs_io_ctl io_ctl;
 668        struct btrfs_key key;
 669        struct btrfs_free_space *e, *n;
 670        LIST_HEAD(bitmaps);
 671        u64 num_entries;
 672        u64 num_bitmaps;
 673        u64 generation;
 674        u8 type;
 675        int ret = 0;
 676
 677        /* Nothing in the space cache, goodbye */
 678        if (!i_size_read(inode))
 679                return 0;
 680
 681        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 682        key.offset = offset;
 683        key.type = 0;
 684
 685        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 686        if (ret < 0)
 687                return 0;
 688        else if (ret > 0) {
 689                btrfs_release_path(path);
 690                return 0;
 691        }
 692
 693        ret = -1;
 694
 695        leaf = path->nodes[0];
 696        header = btrfs_item_ptr(leaf, path->slots[0],
 697                                struct btrfs_free_space_header);
 698        num_entries = btrfs_free_space_entries(leaf, header);
 699        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
 700        generation = btrfs_free_space_generation(leaf, header);
 701        btrfs_release_path(path);
 702
 703        if (!BTRFS_I(inode)->generation) {
 704                btrfs_info(fs_info,
 705                           "the free space cache file (%llu) is invalid, skip it",
 706                           offset);
 707                return 0;
 708        }
 709
 710        if (BTRFS_I(inode)->generation != generation) {
 711                btrfs_err(fs_info,
 712                          "free space inode generation (%llu) did not match free space cache generation (%llu)",
 713                          BTRFS_I(inode)->generation, generation);
 714                return 0;
 715        }
 716
 717        if (!num_entries)
 718                return 0;
 719
 720        ret = io_ctl_init(&io_ctl, inode, 0);
 721        if (ret)
 722                return ret;
 723
 724        readahead_cache(inode);
 725
 726        ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
 727        if (ret)
 728                goto out;
 729
 730        ret = io_ctl_check_crc(&io_ctl, 0);
 731        if (ret)
 732                goto free_cache;
 733
 734        ret = io_ctl_check_generation(&io_ctl, generation);
 735        if (ret)
 736                goto free_cache;
 737
 738        while (num_entries) {
 739                e = kmem_cache_zalloc(btrfs_free_space_cachep,
 740                                      GFP_NOFS);
 741                if (!e)
 742                        goto free_cache;
 743
 744                ret = io_ctl_read_entry(&io_ctl, e, &type);
 745                if (ret) {
 746                        kmem_cache_free(btrfs_free_space_cachep, e);
 747                        goto free_cache;
 748                }
 749
 750                if (!e->bytes) {
 751                        kmem_cache_free(btrfs_free_space_cachep, e);
 752                        goto free_cache;
 753                }
 754
 755                if (type == BTRFS_FREE_SPACE_EXTENT) {
 756                        spin_lock(&ctl->tree_lock);
 757                        ret = link_free_space(ctl, e);
 758                        spin_unlock(&ctl->tree_lock);
 759                        if (ret) {
 760                                btrfs_err(fs_info,
 761                                        "Duplicate entries in free space cache, dumping");
 762                                kmem_cache_free(btrfs_free_space_cachep, e);
 763                                goto free_cache;
 764                        }
 765                } else {
 766                        ASSERT(num_bitmaps);
 767                        num_bitmaps--;
 768                        e->bitmap = kmem_cache_zalloc(
 769                                        btrfs_free_space_bitmap_cachep, GFP_NOFS);
 770                        if (!e->bitmap) {
 771                                kmem_cache_free(
 772                                        btrfs_free_space_cachep, e);
 773                                goto free_cache;
 774                        }
 775                        spin_lock(&ctl->tree_lock);
 776                        ret = link_free_space(ctl, e);
 777                        ctl->total_bitmaps++;
 778                        ctl->op->recalc_thresholds(ctl);
 779                        spin_unlock(&ctl->tree_lock);
 780                        if (ret) {
 781                                btrfs_err(fs_info,
 782                                        "Duplicate entries in free space cache, dumping");
 783                                kmem_cache_free(btrfs_free_space_cachep, e);
 784                                goto free_cache;
 785                        }
 786                        list_add_tail(&e->list, &bitmaps);
 787                }
 788
 789                num_entries--;
 790        }
 791
 792        io_ctl_unmap_page(&io_ctl);
 793
 794        /*
  795         * The bitmap payloads are stored after all the entries, in the
  796         * same order the bitmap entries appeared above, so read them back in that order.
 797         */
 798        list_for_each_entry_safe(e, n, &bitmaps, list) {
 799                list_del_init(&e->list);
 800                ret = io_ctl_read_bitmap(&io_ctl, e);
 801                if (ret)
 802                        goto free_cache;
 803        }
 804
 805        io_ctl_drop_pages(&io_ctl);
 806        merge_space_tree(ctl);
 807        ret = 1;
 808out:
 809        io_ctl_free(&io_ctl);
 810        return ret;
 811free_cache:
 812        io_ctl_drop_pages(&io_ctl);
 813        __btrfs_remove_free_space_cache(ctl);
 814        goto out;
 815}
 816
 817int load_free_space_cache(struct btrfs_block_group_cache *block_group)
 818{
 819        struct btrfs_fs_info *fs_info = block_group->fs_info;
 820        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 821        struct inode *inode;
 822        struct btrfs_path *path;
 823        int ret = 0;
 824        bool matched;
 825        u64 used = btrfs_block_group_used(&block_group->item);
 826
 827        /*
 828         * If this block group has been marked to be cleared for one reason or
 829         * another then we can't trust the on disk cache, so just return.
 830         */
 831        spin_lock(&block_group->lock);
 832        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 833                spin_unlock(&block_group->lock);
 834                return 0;
 835        }
 836        spin_unlock(&block_group->lock);
 837
 838        path = btrfs_alloc_path();
 839        if (!path)
 840                return 0;
 841        path->search_commit_root = 1;
 842        path->skip_locking = 1;
 843
 844        /*
 845         * We must pass a path with search_commit_root set to btrfs_iget in
 846         * order to avoid a deadlock when allocating extents for the tree root.
 847         *
 848         * When we are COWing an extent buffer from the tree root, when looking
 849         * for a free extent, at extent-tree.c:find_free_extent(), we can find
  850         * a block group without its free space cache loaded. When we find one
 851         * we must load its space cache which requires reading its free space
 852         * cache's inode item from the root tree. If this inode item is located
 853         * in the same leaf that we started COWing before, then we end up in
 854         * deadlock on the extent buffer (trying to read lock it when we
 855         * previously write locked it).
 856         *
 857         * It's safe to read the inode item using the commit root because
 858         * block groups, once loaded, stay in memory forever (until they are
 859         * removed) as well as their space caches once loaded. New block groups
 860         * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
 861         * we will never try to read their inode item while the fs is mounted.
 862         */
 863        inode = lookup_free_space_inode(block_group, path);
 864        if (IS_ERR(inode)) {
 865                btrfs_free_path(path);
 866                return 0;
 867        }
 868
 869        /* We may have converted the inode and made the cache invalid. */
 870        spin_lock(&block_group->lock);
 871        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 872                spin_unlock(&block_group->lock);
 873                btrfs_free_path(path);
 874                goto out;
 875        }
 876        spin_unlock(&block_group->lock);
 877
 878        ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
 879                                      path, block_group->key.objectid);
 880        btrfs_free_path(path);
 881        if (ret <= 0)
 882                goto out;
 883
 884        spin_lock(&ctl->tree_lock);
 885        matched = (ctl->free_space == (block_group->key.offset - used -
 886                                       block_group->bytes_super));
 887        spin_unlock(&ctl->tree_lock);
 888
 889        if (!matched) {
 890                __btrfs_remove_free_space_cache(ctl);
 891                btrfs_warn(fs_info,
 892                           "block group %llu has wrong amount of free space",
 893                           block_group->key.objectid);
 894                ret = -1;
 895        }
 896out:
 897        if (ret < 0) {
 898                /* This cache is bogus, make sure it gets cleared */
 899                spin_lock(&block_group->lock);
 900                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 901                spin_unlock(&block_group->lock);
 902                ret = 0;
 903
 904                btrfs_warn(fs_info,
 905                           "failed to load free space cache for block group %llu, rebuilding it now",
 906                           block_group->key.objectid);
 907        }
 908
 909        iput(inode);
 910        return ret;
 911}
 912
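     /*
      * Write an entry for every extent and bitmap in the free space rbtree,
      * including entries that live in a cluster and ranges currently being
      * trimmed.  Bitmap payloads are only queued on bitmap_list here; they
      * are written out later by write_bitmap_entries().
      */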
 913static noinline_for_stack
 914int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
 915                              struct btrfs_free_space_ctl *ctl,
 916                              struct btrfs_block_group_cache *block_group,
 917                              int *entries, int *bitmaps,
 918                              struct list_head *bitmap_list)
 919{
 920        int ret;
 921        struct btrfs_free_cluster *cluster = NULL;
 922        struct btrfs_free_cluster *cluster_locked = NULL;
 923        struct rb_node *node = rb_first(&ctl->free_space_offset);
 924        struct btrfs_trim_range *trim_entry;
 925
 926        /* Get the cluster for this block_group if it exists */
 927        if (block_group && !list_empty(&block_group->cluster_list)) {
 928                cluster = list_entry(block_group->cluster_list.next,
 929                                     struct btrfs_free_cluster,
 930                                     block_group_list);
 931        }
 932
 933        if (!node && cluster) {
 934                cluster_locked = cluster;
 935                spin_lock(&cluster_locked->lock);
 936                node = rb_first(&cluster->root);
 937                cluster = NULL;
 938        }
 939
 940        /* Write out the extent entries */
 941        while (node) {
 942                struct btrfs_free_space *e;
 943
 944                e = rb_entry(node, struct btrfs_free_space, offset_index);
 945                *entries += 1;
 946
 947                ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
 948                                       e->bitmap);
 949                if (ret)
 950                        goto fail;
 951
 952                if (e->bitmap) {
 953                        list_add_tail(&e->list, bitmap_list);
 954                        *bitmaps += 1;
 955                }
 956                node = rb_next(node);
 957                if (!node && cluster) {
 958                        node = rb_first(&cluster->root);
 959                        cluster_locked = cluster;
 960                        spin_lock(&cluster_locked->lock);
 961                        cluster = NULL;
 962                }
 963        }
 964        if (cluster_locked) {
 965                spin_unlock(&cluster_locked->lock);
 966                cluster_locked = NULL;
 967        }
 968
 969        /*
 970         * Make sure we don't miss any range that was removed from our rbtree
 971         * because trimming is running. Otherwise after a umount+mount (or crash
 972         * after committing the transaction) we would leak free space and get
 973         * an inconsistent free space cache report from fsck.
 974         */
 975        list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
 976                ret = io_ctl_add_entry(io_ctl, trim_entry->start,
 977                                       trim_entry->bytes, NULL);
 978                if (ret)
 979                        goto fail;
 980                *entries += 1;
 981        }
 982
 983        return 0;
 984fail:
 985        if (cluster_locked)
 986                spin_unlock(&cluster_locked->lock);
 987        return -ENOSPC;
 988}
 989
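     /*
      * Update the free space header item with the final entry and bitmap
      * counts and stamp it with the current transaction id.  Returns -1 on
      * any failure so the caller treats the cache file as unusable.
      */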
 990static noinline_for_stack int
 991update_cache_item(struct btrfs_trans_handle *trans,
 992                  struct btrfs_root *root,
 993                  struct inode *inode,
 994                  struct btrfs_path *path, u64 offset,
 995                  int entries, int bitmaps)
 996{
 997        struct btrfs_key key;
 998        struct btrfs_free_space_header *header;
 999        struct extent_buffer *leaf;
1000        int ret;
1001
1002        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
1003        key.offset = offset;
1004        key.type = 0;
1005
1006        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1007        if (ret < 0) {
1008                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1009                                 EXTENT_DELALLOC, 0, 0, NULL);
1010                goto fail;
1011        }
1012        leaf = path->nodes[0];
1013        if (ret > 0) {
1014                struct btrfs_key found_key;
1015                ASSERT(path->slots[0]);
1016                path->slots[0]--;
1017                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1018                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
1019                    found_key.offset != offset) {
1020                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
1021                                         inode->i_size - 1, EXTENT_DELALLOC, 0,
1022                                         0, NULL);
1023                        btrfs_release_path(path);
1024                        goto fail;
1025                }
1026        }
1027
1028        BTRFS_I(inode)->generation = trans->transid;
1029        header = btrfs_item_ptr(leaf, path->slots[0],
1030                                struct btrfs_free_space_header);
1031        btrfs_set_free_space_entries(leaf, header, entries);
1032        btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
1033        btrfs_set_free_space_generation(leaf, header, trans->transid);
1034        btrfs_mark_buffer_dirty(leaf);
1035        btrfs_release_path(path);
1036
1037        return 0;
1038
1039fail:
1040        return -1;
1041}
1042
1043static noinline_for_stack int write_pinned_extent_entries(
1044                            struct btrfs_block_group_cache *block_group,
1045                            struct btrfs_io_ctl *io_ctl,
1046                            int *entries)
1047{
1048        u64 start, extent_start, extent_end, len;
1049        struct extent_io_tree *unpin = NULL;
1050        int ret;
1051
1052        if (!block_group)
1053                return 0;
1054
1055        /*
1056         * We want to add any pinned extents to our free space cache
1057         * so we don't leak the space
1058         *
1059         * We shouldn't have switched the pinned extents yet so this is the
1060         * right one
1061         */
1062        unpin = block_group->fs_info->pinned_extents;
1063
1064        start = block_group->key.objectid;
1065
1066        while (start < block_group->key.objectid + block_group->key.offset) {
1067                ret = find_first_extent_bit(unpin, start,
1068                                            &extent_start, &extent_end,
1069                                            EXTENT_DIRTY, NULL);
1070                if (ret)
1071                        return 0;
1072
1073                /* This pinned extent is out of our range */
1074                if (extent_start >= block_group->key.objectid +
1075                    block_group->key.offset)
1076                        return 0;
1077
1078                extent_start = max(extent_start, start);
1079                extent_end = min(block_group->key.objectid +
1080                                 block_group->key.offset, extent_end + 1);
1081                len = extent_end - extent_start;
1082
1083                *entries += 1;
1084                ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1085                if (ret)
1086                        return -ENOSPC;
1087
1088                start = extent_end;
1089        }
1090
1091        return 0;
1092}
1093
1094static noinline_for_stack int
1095write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1096{
1097        struct btrfs_free_space *entry, *next;
1098        int ret;
1099
1100        /* Write out the bitmaps */
1101        list_for_each_entry_safe(entry, next, bitmap_list, list) {
1102                ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1103                if (ret)
1104                        return -ENOSPC;
1105                list_del_init(&entry->list);
1106        }
1107
1108        return 0;
1109}
1110
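     /*
      * Wait for the ordered extents of the cache inode to complete.  On
      * error, clear the EXTENT_DELALLOC bits on the whole range, matching
      * what update_cache_item() does on failure.
      */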
1111static int flush_dirty_cache(struct inode *inode)
1112{
1113        int ret;
1114
1115        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1116        if (ret)
1117                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1118                                 EXTENT_DELALLOC, 0, 0, NULL);
1119
1120        return ret;
1121}
1122
1123static void noinline_for_stack
1124cleanup_bitmap_list(struct list_head *bitmap_list)
1125{
1126        struct btrfs_free_space *entry, *next;
1127
1128        list_for_each_entry_safe(entry, next, bitmap_list, list)
1129                list_del_init(&entry->list);
1130}
1131
1132static void noinline_for_stack
1133cleanup_write_cache_enospc(struct inode *inode,
1134                           struct btrfs_io_ctl *io_ctl,
1135                           struct extent_state **cached_state)
1136{
1137        io_ctl_drop_pages(io_ctl);
1138        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1139                             i_size_read(inode) - 1, cached_state);
1140}
1141
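     /*
      * Second half of the cache write-out: wait for the pages submitted by
      * __btrfs_write_out_cache() to hit disk, then update the free space
      * header item so the cache is treated as valid.  On failure the page
      * cache is invalidated and the inode generation is reset to 0, so the
      * cache is ignored on the next load attempt.
      */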
1142static int __btrfs_wait_cache_io(struct btrfs_root *root,
1143                                 struct btrfs_trans_handle *trans,
1144                                 struct btrfs_block_group_cache *block_group,
1145                                 struct btrfs_io_ctl *io_ctl,
1146                                 struct btrfs_path *path, u64 offset)
1147{
1148        int ret;
1149        struct inode *inode = io_ctl->inode;
1150
1151        if (!inode)
1152                return 0;
1153
1154        /* Flush the dirty pages in the cache file. */
1155        ret = flush_dirty_cache(inode);
1156        if (ret)
1157                goto out;
1158
1159        /* Update the cache item to tell everyone this cache file is valid. */
1160        ret = update_cache_item(trans, root, inode, path, offset,
1161                                io_ctl->entries, io_ctl->bitmaps);
1162out:
1163        io_ctl_free(io_ctl);
1164        if (ret) {
1165                invalidate_inode_pages2(inode->i_mapping);
1166                BTRFS_I(inode)->generation = 0;
1167                if (block_group) {
1168#ifdef DEBUG
1169                        btrfs_err(root->fs_info,
1170                                  "failed to write free space cache for block group %llu",
1171                                  block_group->key.objectid);
1172#endif
1173                }
1174        }
1175        btrfs_update_inode(trans, root, inode);
1176
1177        if (block_group) {
1178                /* the dirty list is protected by the dirty_bgs_lock */
1179                spin_lock(&trans->transaction->dirty_bgs_lock);
1180
1181                /* the disk_cache_state is protected by the block group lock */
1182                spin_lock(&block_group->lock);
1183
1184                /*
1185                 * only mark this as written if we didn't get put back on
1186                 * the dirty list while waiting for IO.   Otherwise our
1187                 * cache state won't be right, and we won't get written again
1188                 */
1189                if (!ret && list_empty(&block_group->dirty_list))
1190                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1191                else if (ret)
1192                        block_group->disk_cache_state = BTRFS_DC_ERROR;
1193
1194                spin_unlock(&block_group->lock);
1195                spin_unlock(&trans->transaction->dirty_bgs_lock);
1196                io_ctl->inode = NULL;
1197                iput(inode);
1198        }
1199
1200        return ret;
1201
1202}
1203
1204static int btrfs_wait_cache_io_root(struct btrfs_root *root,
1205                                    struct btrfs_trans_handle *trans,
1206                                    struct btrfs_io_ctl *io_ctl,
1207                                    struct btrfs_path *path)
1208{
1209        return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
1210}
1211
1212int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
1213                        struct btrfs_block_group_cache *block_group,
1214                        struct btrfs_path *path)
1215{
1216        return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
1217                                     block_group, &block_group->io_ctl,
1218                                     path, block_group->key.objectid);
1219}
1220
1221/**
1222 * __btrfs_write_out_cache - write out cached info to an inode
 1223 * @root - the root the inode belongs to
      * @inode - the free space cache inode we are writing out to
 1224 * @ctl - the free space cache we are going to write out
 1225 * @block_group - the block_group for this cache if it belongs to a block_group
      * @io_ctl - the io_ctl struct used to track the pages and state of this write
 1226 * @trans - the trans handle
1227 *
1228 * This function writes out a free space cache struct to disk for quick recovery
1229 * on mount.  This will return 0 if it was successful in writing the cache out,
1230 * or an errno if it was not.
1231 */
1232static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1233                                   struct btrfs_free_space_ctl *ctl,
1234                                   struct btrfs_block_group_cache *block_group,
1235                                   struct btrfs_io_ctl *io_ctl,
1236                                   struct btrfs_trans_handle *trans)
1237{
1238        struct extent_state *cached_state = NULL;
1239        LIST_HEAD(bitmap_list);
1240        int entries = 0;
1241        int bitmaps = 0;
1242        int ret;
1243        int must_iput = 0;
1244
1245        if (!i_size_read(inode))
1246                return -EIO;
1247
1248        WARN_ON(io_ctl->pages);
1249        ret = io_ctl_init(io_ctl, inode, 1);
1250        if (ret)
1251                return ret;
1252
1253        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1254                down_write(&block_group->data_rwsem);
1255                spin_lock(&block_group->lock);
1256                if (block_group->delalloc_bytes) {
1257                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1258                        spin_unlock(&block_group->lock);
1259                        up_write(&block_group->data_rwsem);
1260                        BTRFS_I(inode)->generation = 0;
1261                        ret = 0;
1262                        must_iput = 1;
1263                        goto out;
1264                }
1265                spin_unlock(&block_group->lock);
1266        }
1267
1268        /* Lock all pages first so we can lock the extent safely. */
1269        ret = io_ctl_prepare_pages(io_ctl, inode, 0);
1270        if (ret)
1271                goto out_unlock;
1272
1273        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1274                         &cached_state);
1275
1276        io_ctl_set_generation(io_ctl, trans->transid);
1277
1278        mutex_lock(&ctl->cache_writeout_mutex);
1279        /* Write out the extent entries in the free space cache */
1280        spin_lock(&ctl->tree_lock);
1281        ret = write_cache_extent_entries(io_ctl, ctl,
1282                                         block_group, &entries, &bitmaps,
1283                                         &bitmap_list);
1284        if (ret)
1285                goto out_nospc_locked;
1286
1287        /*
 1288         * Some space that is freed in the current transaction is pinned;
 1289         * it will be added back to the free space cache after the transaction
 1290         * is committed, so we must not lose it here.
1291         *
1292         * If this changes while we are working we'll get added back to
1293         * the dirty list and redo it.  No locking needed
1294         */
1295        ret = write_pinned_extent_entries(block_group, io_ctl, &entries);
1296        if (ret)
1297                goto out_nospc_locked;
1298
1299        /*
1300         * At last, we write out all the bitmaps and keep cache_writeout_mutex
1301         * locked while doing it because a concurrent trim can be manipulating
1302         * or freeing the bitmap.
1303         */
1304        ret = write_bitmap_entries(io_ctl, &bitmap_list);
1305        spin_unlock(&ctl->tree_lock);
1306        mutex_unlock(&ctl->cache_writeout_mutex);
1307        if (ret)
1308                goto out_nospc;
1309
1310        /* Zero out the rest of the pages just to make sure */
1311        io_ctl_zero_remaining_pages(io_ctl);
1312
1313        /* Everything is written out, now we dirty the pages in the file. */
1314        ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
1315                                i_size_read(inode), &cached_state);
1316        if (ret)
1317                goto out_nospc;
1318
1319        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1320                up_write(&block_group->data_rwsem);
1321        /*
1322         * Release the pages and unlock the extent, we will flush
1323         * them out later
1324         */
1325        io_ctl_drop_pages(io_ctl);
1326
1327        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1328                             i_size_read(inode) - 1, &cached_state);
1329
1330        /*
 1331         * At this point the pages are under IO and we're happy.
 1332         * The caller is responsible for waiting on them and updating
 1333         * the cache and the inode.
1334         */
1335        io_ctl->entries = entries;
1336        io_ctl->bitmaps = bitmaps;
1337
1338        ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1339        if (ret)
1340                goto out;
1341
1342        return 0;
1343
1344out:
1345        io_ctl->inode = NULL;
1346        io_ctl_free(io_ctl);
1347        if (ret) {
1348                invalidate_inode_pages2(inode->i_mapping);
1349                BTRFS_I(inode)->generation = 0;
1350        }
1351        btrfs_update_inode(trans, root, inode);
1352        if (must_iput)
1353                iput(inode);
1354        return ret;
1355
1356out_nospc_locked:
1357        cleanup_bitmap_list(&bitmap_list);
1358        spin_unlock(&ctl->tree_lock);
1359        mutex_unlock(&ctl->cache_writeout_mutex);
1360
1361out_nospc:
1362        cleanup_write_cache_enospc(inode, io_ctl, &cached_state);
1363
1364out_unlock:
1365        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1366                up_write(&block_group->data_rwsem);
1367
1368        goto out;
1369}
1370
1371int btrfs_write_out_cache(struct btrfs_trans_handle *trans,
1372                          struct btrfs_block_group_cache *block_group,
1373                          struct btrfs_path *path)
1374{
1375        struct btrfs_fs_info *fs_info = trans->fs_info;
1376        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1377        struct inode *inode;
1378        int ret = 0;
1379
1380        spin_lock(&block_group->lock);
1381        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1382                spin_unlock(&block_group->lock);
1383                return 0;
1384        }
1385        spin_unlock(&block_group->lock);
1386
1387        inode = lookup_free_space_inode(block_group, path);
1388        if (IS_ERR(inode))
1389                return 0;
1390
1391        ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
1392                                block_group, &block_group->io_ctl, trans);
1393        if (ret) {
1394#ifdef DEBUG
1395                btrfs_err(fs_info,
1396                          "failed to write free space cache for block group %llu",
1397                          block_group->key.objectid);
1398#endif
1399                spin_lock(&block_group->lock);
1400                block_group->disk_cache_state = BTRFS_DC_ERROR;
1401                spin_unlock(&block_group->lock);
1402
1403                block_group->io_ctl.inode = NULL;
1404                iput(inode);
1405        }
1406
1407        /*
1408         * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1409         * to wait for IO and put the inode
1410         */
1411
1412        return ret;
1413}
1414
1415static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1416                                          u64 offset)
1417{
1418        ASSERT(offset >= bitmap_start);
1419        offset -= bitmap_start;
1420        return (unsigned long)(div_u64(offset, unit));
1421}
1422
1423static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1424{
1425        return (unsigned long)(div_u64(bytes, unit));
1426}
1427
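     /*
      * Return the start offset of the bitmap window (BITS_PER_BITMAP *
      * ctl->unit bytes, relative to ctl->start) that covers @offset.
      */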
1428static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1429                                   u64 offset)
1430{
1431        u64 bitmap_start;
1432        u64 bytes_per_bitmap;
1433
1434        bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1435        bitmap_start = offset - ctl->start;
1436        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1437        bitmap_start *= bytes_per_bitmap;
1438        bitmap_start += ctl->start;
1439
1440        return bitmap_start;
1441}
1442
1443static int tree_insert_offset(struct rb_root *root, u64 offset,
1444                              struct rb_node *node, int bitmap)
1445{
1446        struct rb_node **p = &root->rb_node;
1447        struct rb_node *parent = NULL;
1448        struct btrfs_free_space *info;
1449
1450        while (*p) {
1451                parent = *p;
1452                info = rb_entry(parent, struct btrfs_free_space, offset_index);
1453
1454                if (offset < info->offset) {
1455                        p = &(*p)->rb_left;
1456                } else if (offset > info->offset) {
1457                        p = &(*p)->rb_right;
1458                } else {
1459                        /*
1460                         * we could have a bitmap entry and an extent entry
1461                         * share the same offset.  If this is the case, we want
1462                         * the extent entry to always be found first if we do a
1463                         * linear search through the tree, since we want to have
1464                         * the quickest allocation time, and allocating from an
1465                         * extent is faster than allocating from a bitmap.  So
1466                         * if we're inserting a bitmap and we find an entry at
1467                         * this offset, we want to go right, or after this entry
1468                         * logically.  If we are inserting an extent and we've
1469                         * found a bitmap, we want to go left, or before
1470                         * logically.
1471                         */
1472                        if (bitmap) {
1473                                if (info->bitmap) {
1474                                        WARN_ON_ONCE(1);
1475                                        return -EEXIST;
1476                                }
1477                                p = &(*p)->rb_right;
1478                        } else {
1479                                if (!info->bitmap) {
1480                                        WARN_ON_ONCE(1);
1481                                        return -EEXIST;
1482                                }
1483                                p = &(*p)->rb_left;
1484                        }
1485                }
1486        }
1487
1488        rb_link_node(node, parent, p);
1489        rb_insert_color(node, root);
1490
1491        return 0;
1492}
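/*
 * As an illustrative consequence of the rule above: if an extent entry and a
 * bitmap entry both start at the same offset, the extent entry ends up as the
 * left (previous) node and the bitmap entry as the right (next) node, so an
 * in-order walk of the tree sees the extent entry first, which is the cheaper
 * of the two to allocate from.
 */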
1493
1494/*
1495 * searches the tree for the given offset.
1496 *
1497 * fuzzy - If this is set, then we are trying to make an allocation, and we
1498 * just want a section that is at least 'bytes' in size and comes at or after
1499 * the given offset.
1500 */
1501static struct btrfs_free_space *
1502tree_search_offset(struct btrfs_free_space_ctl *ctl,
1503                   u64 offset, int bitmap_only, int fuzzy)
1504{
1505        struct rb_node *n = ctl->free_space_offset.rb_node;
1506        struct btrfs_free_space *entry, *prev = NULL;
1507
1508        /* find entry that is closest to the 'offset' */
1509        while (1) {
1510                if (!n) {
1511                        entry = NULL;
1512                        break;
1513                }
1514
1515                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1516                prev = entry;
1517
1518                if (offset < entry->offset)
1519                        n = n->rb_left;
1520                else if (offset > entry->offset)
1521                        n = n->rb_right;
1522                else
1523                        break;
1524        }
1525
1526        if (bitmap_only) {
1527                if (!entry)
1528                        return NULL;
1529                if (entry->bitmap)
1530                        return entry;
1531
1532                /*
1533                 * A bitmap entry and an extent entry may share the same offset;
1534                 * in that case, the bitmap entry comes after the extent entry.
1535                 */
1536                n = rb_next(n);
1537                if (!n)
1538                        return NULL;
1539                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1540                if (entry->offset != offset)
1541                        return NULL;
1542
1543                WARN_ON(!entry->bitmap);
1544                return entry;
1545        } else if (entry) {
1546                if (entry->bitmap) {
1547                        /*
1548                         * if previous extent entry covers the offset,
1549                         * we should return it instead of the bitmap entry
1550                         */
1551                        n = rb_prev(&entry->offset_index);
1552                        if (n) {
1553                                prev = rb_entry(n, struct btrfs_free_space,
1554                                                offset_index);
1555                                if (!prev->bitmap &&
1556                                    prev->offset + prev->bytes > offset)
1557                                        entry = prev;
1558                        }
1559                }
1560                return entry;
1561        }
1562
1563        if (!prev)
1564                return NULL;
1565
1566        /* find last entry before the 'offset' */
1567        entry = prev;
1568        if (entry->offset > offset) {
1569                n = rb_prev(&entry->offset_index);
1570                if (n) {
1571                        entry = rb_entry(n, struct btrfs_free_space,
1572                                        offset_index);
1573                        ASSERT(entry->offset <= offset);
1574                } else {
1575                        if (fuzzy)
1576                                return entry;
1577                        else
1578                                return NULL;
1579                }
1580        }
1581
1582        if (entry->bitmap) {
1583                n = rb_prev(&entry->offset_index);
1584                if (n) {
1585                        prev = rb_entry(n, struct btrfs_free_space,
1586                                        offset_index);
1587                        if (!prev->bitmap &&
1588                            prev->offset + prev->bytes > offset)
1589                                return prev;
1590                }
1591                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1592                        return entry;
1593        } else if (entry->offset + entry->bytes > offset)
1594                return entry;
1595
1596        if (!fuzzy)
1597                return NULL;
1598
1599        while (1) {
1600                if (entry->bitmap) {
1601                        if (entry->offset + BITS_PER_BITMAP *
1602                            ctl->unit > offset)
1603                                break;
1604                } else {
1605                        if (entry->offset + entry->bytes > offset)
1606                                break;
1607                }
1608
1609                n = rb_next(&entry->offset_index);
1610                if (!n)
1611                        return NULL;
1612                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1613        }
1614        return entry;
1615}
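/*
 * A sketch of the lookup modes above, assuming the tree holds a single extent
 * entry covering [4MiB, 4MiB + 64KiB) (illustrative numbers):
 *
 *   tree_search_offset(ctl, 4MiB + 16KiB, 0, 0) returns the entry, since a
 *       non-fuzzy extent lookup matches any entry covering the offset;
 *   tree_search_offset(ctl, 5MiB, 0, 0) returns NULL, nothing covers 5MiB;
 *   tree_search_offset(ctl, 3MiB, 0, 1) returns the entry, since a fuzzy
 *       lookup may return the first entry at or after the offset.
 */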
1616
1617static inline void
1618__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1619                    struct btrfs_free_space *info)
1620{
1621        rb_erase(&info->offset_index, &ctl->free_space_offset);
1622        ctl->free_extents--;
1623}
1624
1625static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1626                              struct btrfs_free_space *info)
1627{
1628        __unlink_free_space(ctl, info);
1629        ctl->free_space -= info->bytes;
1630}
1631
1632static int link_free_space(struct btrfs_free_space_ctl *ctl,
1633                           struct btrfs_free_space *info)
1634{
1635        int ret = 0;
1636
1637        ASSERT(info->bytes || info->bitmap);
1638        ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1639                                 &info->offset_index, (info->bitmap != NULL));
1640        if (ret)
1641                return ret;
1642
1643        ctl->free_space += info->bytes;
1644        ctl->free_extents++;
1645        return ret;
1646}
1647
1648static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1649{
1650        struct btrfs_block_group_cache *block_group = ctl->private;
1651        u64 max_bytes;
1652        u64 bitmap_bytes;
1653        u64 extent_bytes;
1654        u64 size = block_group->key.offset;
1655        u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1656        u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1657
1658        max_bitmaps = max_t(u64, max_bitmaps, 1);
1659
1660        ASSERT(ctl->total_bitmaps <= max_bitmaps);
1661
1662        /*
1663         * The goal is to keep the total amount of memory used per 1GiB of space
1664         * at or below 32KiB, so we need to adjust how much memory we allow to be
1665         * used by extent-based free space tracking.
1666         */
1667        if (size < SZ_1G)
1668                max_bytes = MAX_CACHE_BYTES_PER_GIG;
1669        else
1670                max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1671
1672        /*
1673         * we want to account for 1 more bitmap than what we have so we can make
1674         * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1675         * we add more bitmaps.
1676         */
1677        bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
1678
1679        if (bitmap_bytes >= max_bytes) {
1680                ctl->extents_thresh = 0;
1681                return;
1682        }
1683
1684        /*
1685         * We want the extent entry threshold to use whatever space is left after
1686         * bitmaps, capped at 1/2 of the max bytes we can have.
1687         */
1688        extent_bytes = max_bytes - bitmap_bytes;
1689        extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1690
1691        ctl->extents_thresh =
1692                div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1693}
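/*
 * Worked numbers for the thresholds above, assuming an illustrative 4KiB
 * sectorsize and a 1GiB block group that currently holds one bitmap:
 *
 *   max_bytes      = MAX_CACHE_BYTES_PER_GIG      = 32KiB
 *   bitmap_bytes   = (total_bitmaps + 1) * unit   = 2 * 4096 = 8KiB
 *   extent_bytes   = min(32KiB - 8KiB, 32KiB / 2) = 16KiB
 *   extents_thresh = 16KiB / sizeof(struct btrfs_free_space)
 *
 * i.e. the threshold is however many extent entries fit in roughly 16KiB of
 * memory; beyond that, new free space starts going into bitmaps.
 */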
1694
1695static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1696                                       struct btrfs_free_space *info,
1697                                       u64 offset, u64 bytes)
1698{
1699        unsigned long start, count;
1700
1701        start = offset_to_bit(info->offset, ctl->unit, offset);
1702        count = bytes_to_bits(bytes, ctl->unit);
1703        ASSERT(start + count <= BITS_PER_BITMAP);
1704
1705        bitmap_clear(info->bitmap, start, count);
1706
1707        info->bytes -= bytes;
1708        if (info->max_extent_size > ctl->unit)
1709                info->max_extent_size = 0;
1710}
1711
1712static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1713                              struct btrfs_free_space *info, u64 offset,
1714                              u64 bytes)
1715{
1716        __bitmap_clear_bits(ctl, info, offset, bytes);
1717        ctl->free_space -= bytes;
1718}
1719
1720static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1721                            struct btrfs_free_space *info, u64 offset,
1722                            u64 bytes)
1723{
1724        unsigned long start, count;
1725
1726        start = offset_to_bit(info->offset, ctl->unit, offset);
1727        count = bytes_to_bits(bytes, ctl->unit);
1728        ASSERT(start + count <= BITS_PER_BITMAP);
1729
1730        bitmap_set(info->bitmap, start, count);
1731
1732        info->bytes += bytes;
1733        ctl->free_space += bytes;
1734}
1735
1736/*
1737 * If we cannot find a suitable extent, *bytes is used to record the size of
1738 * the largest free extent found in the bitmap.
1739 */
1740static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1741                         struct btrfs_free_space *bitmap_info, u64 *offset,
1742                         u64 *bytes, bool for_alloc)
1743{
1744        unsigned long found_bits = 0;
1745        unsigned long max_bits = 0;
1746        unsigned long bits, i;
1747        unsigned long next_zero;
1748        unsigned long extent_bits;
1749
1750        /*
1751         * Skip searching the bitmap if we don't have a contiguous section that
1752         * is large enough for this allocation.
1753         */
1754        if (for_alloc &&
1755            bitmap_info->max_extent_size &&
1756            bitmap_info->max_extent_size < *bytes) {
1757                *bytes = bitmap_info->max_extent_size;
1758                return -1;
1759        }
1760
1761        i = offset_to_bit(bitmap_info->offset, ctl->unit,
1762                          max_t(u64, *offset, bitmap_info->offset));
1763        bits = bytes_to_bits(*bytes, ctl->unit);
1764
1765        for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1766                if (for_alloc && bits == 1) {
1767                        found_bits = 1;
1768                        break;
1769                }
1770                next_zero = find_next_zero_bit(bitmap_info->bitmap,
1771                                               BITS_PER_BITMAP, i);
1772                extent_bits = next_zero - i;
1773                if (extent_bits >= bits) {
1774                        found_bits = extent_bits;
1775                        break;
1776                } else if (extent_bits > max_bits) {
1777                        max_bits = extent_bits;
1778                }
1779                i = next_zero;
1780        }
1781
1782        if (found_bits) {
1783                *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1784                *bytes = (u64)(found_bits) * ctl->unit;
1785                return 0;
1786        }
1787
1788        *bytes = (u64)(max_bits) * ctl->unit;
1789        bitmap_info->max_extent_size = *bytes;
1790        return -1;
1791}
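/*
 * A small worked example of search_bitmap(), with an illustrative 4KiB unit:
 * suppose bits 0-15 and 32-95 are set and the caller asks for 256KiB
 * (64 bits) starting at the bitmap's offset.  The first run is only 16 bits,
 * so max_bits becomes 16; the second run is 64 bits, so the search succeeds
 * with *offset = bitmap offset + 32 * 4KiB and *bytes = 64 * 4KiB = 256KiB.
 * Had no run been large enough, *bytes would be set to the largest run found
 * and cached in max_extent_size.
 */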
1792
1793static inline u64 get_max_extent_size(struct btrfs_free_space *entry)
1794{
1795        if (entry->bitmap)
1796                return entry->max_extent_size;
1797        return entry->bytes;
1798}
1799
1800/* Cache the size of the max extent in bytes */
1801static struct btrfs_free_space *
1802find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1803                unsigned long align, u64 *max_extent_size)
1804{
1805        struct btrfs_free_space *entry;
1806        struct rb_node *node;
1807        u64 tmp;
1808        u64 align_off;
1809        int ret;
1810
1811        if (!ctl->free_space_offset.rb_node)
1812                goto out;
1813
1814        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1815        if (!entry)
1816                goto out;
1817
1818        for (node = &entry->offset_index; node; node = rb_next(node)) {
1819                entry = rb_entry(node, struct btrfs_free_space, offset_index);
1820                if (entry->bytes < *bytes) {
1821                        *max_extent_size = max(get_max_extent_size(entry),
1822                                               *max_extent_size);
1823                        continue;
1824                }
1825
1826                /* make sure the space returned is big enough
1827                 * to match our requested alignment
1828                 */
1829                if (*bytes >= align) {
1830                        tmp = entry->offset - ctl->start + align - 1;
1831                        tmp = div64_u64(tmp, align);
1832                        tmp = tmp * align + ctl->start;
1833                        align_off = tmp - entry->offset;
1834                } else {
1835                        align_off = 0;
1836                        tmp = entry->offset;
1837                }
1838
1839                if (entry->bytes < *bytes + align_off) {
1840                        *max_extent_size = max(get_max_extent_size(entry),
1841                                               *max_extent_size);
1842                        continue;
1843                }
1844
1845                if (entry->bitmap) {
1846                        u64 size = *bytes;
1847
1848                        ret = search_bitmap(ctl, entry, &tmp, &size, true);
1849                        if (!ret) {
1850                                *offset = tmp;
1851                                *bytes = size;
1852                                return entry;
1853                        } else {
1854                                *max_extent_size =
1855                                        max(get_max_extent_size(entry),
1856                                            *max_extent_size);
1857                        }
1858                        continue;
1859                }
1860
1861                *offset = tmp;
1862                *bytes = entry->bytes - align_off;
1863                return entry;
1864        }
1865out:
1866        return NULL;
1867}
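/*
 * The alignment rounding above, with illustrative numbers: with ctl->start
 * at 0, align = 64KiB and an extent entry at offset 100KiB (and a request of
 * at least 'align' bytes, otherwise no rounding is done),
 *
 *   tmp       = roundup(100KiB, 64KiB) = 128KiB
 *   align_off = 128KiB - 100KiB        = 28KiB
 *
 * so the entry only qualifies if it holds at least *bytes + 28KiB, and the
 * candidate offset becomes the aligned 128KiB.
 */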
1868
1869static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1870                           struct btrfs_free_space *info, u64 offset)
1871{
1872        info->offset = offset_to_bitmap(ctl, offset);
1873        info->bytes = 0;
1874        INIT_LIST_HEAD(&info->list);
1875        link_free_space(ctl, info);
1876        ctl->total_bitmaps++;
1877
1878        ctl->op->recalc_thresholds(ctl);
1879}
1880
1881static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1882                        struct btrfs_free_space *bitmap_info)
1883{
1884        unlink_free_space(ctl, bitmap_info);
1885        kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap);
1886        kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1887        ctl->total_bitmaps--;
1888        ctl->op->recalc_thresholds(ctl);
1889}
1890
1891static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1892                              struct btrfs_free_space *bitmap_info,
1893                              u64 *offset, u64 *bytes)
1894{
1895        u64 end;
1896        u64 search_start, search_bytes;
1897        int ret;
1898
1899again:
1900        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1901
1902        /*
1903         * We need to search for bits in this bitmap.  We could only cover some
1904         * of the extent in this bitmap thanks to how we add space, so we need
1905         * to search for as much of it as we can and clear that amount, and then
1906         * go searching for the next bit.
1907         */
1908        search_start = *offset;
1909        search_bytes = ctl->unit;
1910        search_bytes = min(search_bytes, end - search_start + 1);
1911        ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1912                            false);
1913        if (ret < 0 || search_start != *offset)
1914                return -EINVAL;
1915
1916        /* We may have found more bits than what we need */
1917        search_bytes = min(search_bytes, *bytes);
1918
1919        /* Cannot clear past the end of the bitmap */
1920        search_bytes = min(search_bytes, end - search_start + 1);
1921
1922        bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1923        *offset += search_bytes;
1924        *bytes -= search_bytes;
1925
1926        if (*bytes) {
1927                struct rb_node *next = rb_next(&bitmap_info->offset_index);
1928                if (!bitmap_info->bytes)
1929                        free_bitmap(ctl, bitmap_info);
1930
1931                /*
1932                 * no entry after this bitmap, but we still have bytes to
1933                 * remove, so something has gone wrong.
1934                 */
1935                if (!next)
1936                        return -EINVAL;
1937
1938                bitmap_info = rb_entry(next, struct btrfs_free_space,
1939                                       offset_index);
1940
1941                /*
1942                 * if the next entry isn't a bitmap we need to return to let the
1943                 * extent stuff do its work.
1944                 */
1945                if (!bitmap_info->bitmap)
1946                        return -EAGAIN;
1947
1948                /*
1949                 * Ok the next item is a bitmap, but it may not actually hold
1950                 * the information for the rest of this free space stuff, so
1951                 * look for it, and if we don't find it return so we can try
1952                 * everything over again.
1953                 */
1954                search_start = *offset;
1955                search_bytes = ctl->unit;
1956                ret = search_bitmap(ctl, bitmap_info, &search_start,
1957                                    &search_bytes, false);
1958                if (ret < 0 || search_start != *offset)
1959                        return -EAGAIN;
1960
1961                goto again;
1962        } else if (!bitmap_info->bytes)
1963                free_bitmap(ctl, bitmap_info);
1964
1965        return 0;
1966}
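/*
 * A sketch of how the loop above crosses bitmap boundaries (illustrative
 * numbers, 4KiB unit, block group starting at offset 0 so each bitmap covers
 * 128MiB): removing [120MiB, 140MiB) first clears the tail of the bitmap
 * covering [0, 128MiB), leaving *offset = 128MiB and *bytes = 12MiB, then
 * moves on to the next entry.  If that entry is another bitmap the loop
 * repeats until *bytes reaches zero; if it is an extent entry, -EAGAIN is
 * returned so the caller's extent path can finish the removal.
 */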
1967
1968static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1969                               struct btrfs_free_space *info, u64 offset,
1970                               u64 bytes)
1971{
1972        u64 bytes_to_set = 0;
1973        u64 end;
1974
1975        end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1976
1977        bytes_to_set = min(end - offset, bytes);
1978
1979        bitmap_set_bits(ctl, info, offset, bytes_to_set);
1980
1981        /*
1982         * We set some bytes, we have no idea what the max extent size is
1983         * anymore.
1984         */
1985        info->max_extent_size = 0;
1986
1987        return bytes_to_set;
1988
1989}
1990
1991static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1992                      struct btrfs_free_space *info)
1993{
1994        struct btrfs_block_group_cache *block_group = ctl->private;
1995        struct btrfs_fs_info *fs_info = block_group->fs_info;
1996        bool forced = false;
1997
1998#ifdef CONFIG_BTRFS_DEBUG
1999        if (btrfs_should_fragment_free_space(block_group))
2000                forced = true;
2001#endif
2002
2003        /*
2004         * If we are below the extents threshold then we can add this as an
2005         * extent, and don't have to deal with the bitmap
2006         */
2007        if (!forced && ctl->free_extents < ctl->extents_thresh) {
2008                /*
2009                 * If this block group has some small extents we don't want to
2010                 * use up all of our free slots in the cache with them; we want
2011                 * to reserve them for larger extents.  However, if we have
2012                 * plenty of cache left then go ahead and add them, no sense in
2013                 * adding the overhead of a bitmap if we don't have to.
2014                 */
2015                if (info->bytes <= fs_info->sectorsize * 4) {
2016                        if (ctl->free_extents * 2 <= ctl->extents_thresh)
2017                                return false;
2018                } else {
2019                        return false;
2020                }
2021        }
2022
2023        /*
2024         * The original block groups from mkfs can be really small, like 8
2025         * megabytes, so don't bother with a bitmap for those entries.  However
2026         * some block groups can be smaller than what a bitmap would cover but
2027         * are still large enough that they could overflow the 32k memory limit,
2028         * so those block groups are still allowed to have a bitmap
2029         * entry.
2030         */
2031        if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
2032                return false;
2033
2034        return true;
2035}
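/*
 * With an illustrative 4KiB sectorsize the rules above work out to: extents
 * of at most 16KiB (4 sectors) are only kept as extent entries while no more
 * than half of the extent slots are in use, larger extents are kept as
 * extent entries whenever we are below the threshold, and since one bitmap
 * covers 128MiB, block groups smaller than 64MiB never get bitmap entries at
 * all.
 */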
2036
2037static const struct btrfs_free_space_op free_space_op = {
2038        .recalc_thresholds      = recalculate_thresholds,
2039        .use_bitmap             = use_bitmap,
2040};
2041
2042static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2043                              struct btrfs_free_space *info)
2044{
2045        struct btrfs_free_space *bitmap_info;
2046        struct btrfs_block_group_cache *block_group = NULL;
2047        int added = 0;
2048        u64 bytes, offset, bytes_added;
2049        int ret;
2050
2051        bytes = info->bytes;
2052        offset = info->offset;
2053
2054        if (!ctl->op->use_bitmap(ctl, info))
2055                return 0;
2056
2057        if (ctl->op == &free_space_op)
2058                block_group = ctl->private;
2059again:
2060        /*
2061         * Since we link bitmaps right into the cluster we need to see if we
2062         * have a cluster here, and if so and it has our bitmap we need to add
2063         * the free space to that bitmap.
2064         */
2065        if (block_group && !list_empty(&block_group->cluster_list)) {
2066                struct btrfs_free_cluster *cluster;
2067                struct rb_node *node;
2068                struct btrfs_free_space *entry;
2069
2070                cluster = list_entry(block_group->cluster_list.next,
2071                                     struct btrfs_free_cluster,
2072                                     block_group_list);
2073                spin_lock(&cluster->lock);
2074                node = rb_first(&cluster->root);
2075                if (!node) {
2076                        spin_unlock(&cluster->lock);
2077                        goto no_cluster_bitmap;
2078                }
2079
2080                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2081                if (!entry->bitmap) {
2082                        spin_unlock(&cluster->lock);
2083                        goto no_cluster_bitmap;
2084                }
2085
2086                if (entry->offset == offset_to_bitmap(ctl, offset)) {
2087                        bytes_added = add_bytes_to_bitmap(ctl, entry,
2088                                                          offset, bytes);
2089                        bytes -= bytes_added;
2090                        offset += bytes_added;
2091                }
2092                spin_unlock(&cluster->lock);
2093                if (!bytes) {
2094                        ret = 1;
2095                        goto out;
2096                }
2097        }
2098
2099no_cluster_bitmap:
2100        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2101                                         1, 0);
2102        if (!bitmap_info) {
2103                ASSERT(added == 0);
2104                goto new_bitmap;
2105        }
2106
2107        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
2108        bytes -= bytes_added;
2109        offset += bytes_added;
2110        added = 0;
2111
2112        if (!bytes) {
2113                ret = 1;
2114                goto out;
2115        } else
2116                goto again;
2117
2118new_bitmap:
2119        if (info && info->bitmap) {
2120                add_new_bitmap(ctl, info, offset);
2121                added = 1;
2122                info = NULL;
2123                goto again;
2124        } else {
2125                spin_unlock(&ctl->tree_lock);
2126
2127                /* no pre-allocated info, allocate a new one */
2128                if (!info) {
2129                        info = kmem_cache_zalloc(btrfs_free_space_cachep,
2130                                                 GFP_NOFS);
2131                        if (!info) {
2132                                spin_lock(&ctl->tree_lock);
2133                                ret = -ENOMEM;
2134                                goto out;
2135                        }
2136                }
2137
2138                /* allocate the bitmap */
2139                info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep,
2140                                                 GFP_NOFS);
2141                spin_lock(&ctl->tree_lock);
2142                if (!info->bitmap) {
2143                        ret = -ENOMEM;
2144                        goto out;
2145                }
2146                goto again;
2147        }
2148
2149out:
2150        if (info) {
2151                if (info->bitmap)
2152                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
2153                                        info->bitmap);
2154                kmem_cache_free(btrfs_free_space_cachep, info);
2155        }
2156
2157        return ret;
2158}
2159
2160static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2161                          struct btrfs_free_space *info, bool update_stat)
2162{
2163        struct btrfs_free_space *left_info;
2164        struct btrfs_free_space *right_info;
2165        bool merged = false;
2166        u64 offset = info->offset;
2167        u64 bytes = info->bytes;
2168
2169        /*
2170         * first we want to see if there is free space adjacent to the range we
2171         * are adding, if there is remove that struct and add a new one to
2172         * cover the entire range
2173         */
2174        right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2175        if (right_info && rb_prev(&right_info->offset_index))
2176                left_info = rb_entry(rb_prev(&right_info->offset_index),
2177                                     struct btrfs_free_space, offset_index);
2178        else
2179                left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2180
2181        if (right_info && !right_info->bitmap) {
2182                if (update_stat)
2183                        unlink_free_space(ctl, right_info);
2184                else
2185                        __unlink_free_space(ctl, right_info);
2186                info->bytes += right_info->bytes;
2187                kmem_cache_free(btrfs_free_space_cachep, right_info);
2188                merged = true;
2189        }
2190
2191        if (left_info && !left_info->bitmap &&
2192            left_info->offset + left_info->bytes == offset) {
2193                if (update_stat)
2194                        unlink_free_space(ctl, left_info);
2195                else
2196                        __unlink_free_space(ctl, left_info);
2197                info->offset = left_info->offset;
2198                info->bytes += left_info->bytes;
2199                kmem_cache_free(btrfs_free_space_cachep, left_info);
2200                merged = true;
2201        }
2202
2203        return merged;
2204}
2205
2206static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2207                                     struct btrfs_free_space *info,
2208                                     bool update_stat)
2209{
2210        struct btrfs_free_space *bitmap;
2211        unsigned long i;
2212        unsigned long j;
2213        const u64 end = info->offset + info->bytes;
2214        const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2215        u64 bytes;
2216
2217        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2218        if (!bitmap)
2219                return false;
2220
2221        i = offset_to_bit(bitmap->offset, ctl->unit, end);
2222        j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2223        if (j == i)
2224                return false;
2225        bytes = (j - i) * ctl->unit;
2226        info->bytes += bytes;
2227
2228        if (update_stat)
2229                bitmap_clear_bits(ctl, bitmap, end, bytes);
2230        else
2231                __bitmap_clear_bits(ctl, bitmap, end, bytes);
2232
2233        if (!bitmap->bytes)
2234                free_bitmap(ctl, bitmap);
2235
2236        return true;
2237}
2238
2239static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2240                                       struct btrfs_free_space *info,
2241                                       bool update_stat)
2242{
2243        struct btrfs_free_space *bitmap;
2244        u64 bitmap_offset;
2245        unsigned long i;
2246        unsigned long j;
2247        unsigned long prev_j;
2248        u64 bytes;
2249
2250        bitmap_offset = offset_to_bitmap(ctl, info->offset);
2251        /* If we're on a boundary, try the previous logical bitmap. */
2252        if (bitmap_offset == info->offset) {
2253                if (info->offset == 0)
2254                        return false;
2255                bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2256        }
2257
2258        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2259        if (!bitmap)
2260                return false;
2261
2262        i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2263        j = 0;
2264        prev_j = (unsigned long)-1;
2265        for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2266                if (j > i)
2267                        break;
2268                prev_j = j;
2269        }
2270        if (prev_j == i)
2271                return false;
2272
2273        if (prev_j == (unsigned long)-1)
2274                bytes = (i + 1) * ctl->unit;
2275        else
2276                bytes = (i - prev_j) * ctl->unit;
2277
2278        info->offset -= bytes;
2279        info->bytes += bytes;
2280
2281        if (update_stat)
2282                bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2283        else
2284                __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2285
2286        if (!bitmap->bytes)
2287                free_bitmap(ctl, bitmap);
2288
2289        return true;
2290}
2291
2292/*
2293 * We prefer always to allocate from extent entries, both for clustered and
2294 * non-clustered allocation requests. So when attempting to add a new extent
2295 * entry, try to see if there's adjacent free space in bitmap entries, and if
2296 * there is, migrate that space from the bitmaps to the extent.
2297 * This way we get better chances of satisfying space allocation requests
2298 * because we attempt to satisfy them based on a single cache entry, and never
2299 * on 2 or more entries - even if the entries represent a contiguous free space
2300 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2301 * ends).
2302 */
2303static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2304                              struct btrfs_free_space *info,
2305                              bool update_stat)
2306{
2307        /*
2308         * Only work with disconnected entries, as we can change their offset,
2309         * and they must be extent entries.
2310         */
2311        ASSERT(!info->bitmap);
2312        ASSERT(RB_EMPTY_NODE(&info->offset_index));
2313
2314        if (ctl->total_bitmaps > 0) {
2315                bool stole_end;
2316                bool stole_front = false;
2317
2318                stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2319                if (ctl->total_bitmaps > 0)
2320                        stole_front = steal_from_bitmap_to_front(ctl, info,
2321                                                                 update_stat);
2322
2323                if (stole_end || stole_front)
2324                        try_merge_free_space(ctl, info, update_stat);
2325        }
2326}
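/*
 * For example (a sketch with illustrative offsets): when adding the extent
 * [X, X + 64KiB) while an adjacent bitmap has bits set for
 * [X + 64KiB, X + 96KiB), steal_from_bitmap_to_end() clears those bits and
 * grows the entry to [X, X + 96KiB); set bits immediately before X are
 * pulled in the same way by steal_from_bitmap_to_front().  The result is one
 * extent entry instead of an extent entry plus a partly used bitmap.
 */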
2327
2328int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2329                           struct btrfs_free_space_ctl *ctl,
2330                           u64 offset, u64 bytes)
2331{
2332        struct btrfs_free_space *info;
2333        int ret = 0;
2334
2335        info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2336        if (!info)
2337                return -ENOMEM;
2338
2339        info->offset = offset;
2340        info->bytes = bytes;
2341        RB_CLEAR_NODE(&info->offset_index);
2342
2343        spin_lock(&ctl->tree_lock);
2344
2345        if (try_merge_free_space(ctl, info, true))
2346                goto link;
2347
2348        /*
2349         * If there was no extent directly to the left or right of this new
2350         * extent, then we know we're going to have to allocate a new entry,
2351         * so before we do that see if we need to drop this into a bitmap.
2352         */
2353        ret = insert_into_bitmap(ctl, info);
2354        if (ret < 0) {
2355                goto out;
2356        } else if (ret) {
2357                ret = 0;
2358                goto out;
2359        }
2360link:
2361        /*
2362         * Only steal free space from adjacent bitmaps if we're sure we're not
2363         * going to add the new free space to existing bitmap entries - because
2364         * that would mean unnecessary work that would be reverted. Therefore
2365         * attempt to steal space from bitmaps if we're adding an extent entry.
2366         */
2367        steal_from_bitmap(ctl, info, true);
2368
2369        ret = link_free_space(ctl, info);
2370        if (ret)
2371                kmem_cache_free(btrfs_free_space_cachep, info);
2372out:
2373        spin_unlock(&ctl->tree_lock);
2374
2375        if (ret) {
2376                btrfs_crit(fs_info, "unable to add free space: %d", ret);
2377                ASSERT(ret != -EEXIST);
2378        }
2379
2380        return ret;
2381}
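/*
 * Putting the steps above together (a sketch, not an additional code path):
 * freeing space directly adjacent to an existing free extent simply merges
 * into that entry via try_merge_free_space(); an isolated small extent in a
 * block group that is past its extent threshold ends up in a bitmap via
 * insert_into_bitmap(); otherwise the new entry may first absorb adjacent
 * bitmap space through steal_from_bitmap() and is then inserted by
 * link_free_space().
 */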
2382
2383int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2384                         u64 bytenr, u64 size)
2385{
2386        return __btrfs_add_free_space(block_group->fs_info,
2387                                      block_group->free_space_ctl,
2388                                      bytenr, size);
2389}
2390
2391int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2392                            u64 offset, u64 bytes)
2393{
2394        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2395        struct btrfs_free_space *info;
2396        int ret;
2397        bool re_search = false;
2398
2399        spin_lock(&ctl->tree_lock);
2400
2401again:
2402        ret = 0;
2403        if (!bytes)
2404                goto out_lock;
2405
2406        info = tree_search_offset(ctl, offset, 0, 0);
2407        if (!info) {
2408                /*
2409                 * oops didn't find an extent that matched the space we wanted
2410                 * to remove, look for a bitmap instead
2411                 */
2412                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2413                                          1, 0);
2414                if (!info) {
2415                        /*
2416                         * If we found a partial bit of our free space in a
2417                         * bitmap but then couldn't find the other part this may
2418                         * be a problem, so WARN about it.
2419                         */
2420                        WARN_ON(re_search);
2421                        goto out_lock;
2422                }
2423        }
2424
2425        re_search = false;
2426        if (!info->bitmap) {
2427                unlink_free_space(ctl, info);
2428                if (offset == info->offset) {
2429                        u64 to_free = min(bytes, info->bytes);
2430
2431                        info->bytes -= to_free;
2432                        info->offset += to_free;
2433                        if (info->bytes) {
2434                                ret = link_free_space(ctl, info);
2435                                WARN_ON(ret);
2436                        } else {
2437                                kmem_cache_free(btrfs_free_space_cachep, info);
2438                        }
2439
2440                        offset += to_free;
2441                        bytes -= to_free;
2442                        goto again;
2443                } else {
2444                        u64 old_end = info->bytes + info->offset;
2445
2446                        info->bytes = offset - info->offset;
2447                        ret = link_free_space(ctl, info);
2448                        WARN_ON(ret);
2449                        if (ret)
2450                                goto out_lock;
2451
2452                        /* Not enough bytes in this entry to satisfy us */
2453                        if (old_end < offset + bytes) {
2454                                bytes -= old_end - offset;
2455                                offset = old_end;
2456                                goto again;
2457                        } else if (old_end == offset + bytes) {
2458                                /* all done */
2459                                goto out_lock;
2460                        }
2461                        spin_unlock(&ctl->tree_lock);
2462
2463                        ret = btrfs_add_free_space(block_group, offset + bytes,
2464                                                   old_end - (offset + bytes));
2465                        WARN_ON(ret);
2466                        goto out;
2467                }
2468        }
2469
2470        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2471        if (ret == -EAGAIN) {
2472                re_search = true;
2473                goto again;
2474        }
2475out_lock:
2476        spin_unlock(&ctl->tree_lock);
2477out:
2478        return ret;
2479}
2480
2481void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2482                           u64 bytes)
2483{
2484        struct btrfs_fs_info *fs_info = block_group->fs_info;
2485        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2486        struct btrfs_free_space *info;
2487        struct rb_node *n;
2488        int count = 0;
2489
2490        spin_lock(&ctl->tree_lock);
2491        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2492                info = rb_entry(n, struct btrfs_free_space, offset_index);
2493                if (info->bytes >= bytes && !block_group->ro)
2494                        count++;
2495                btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2496                           info->offset, info->bytes,
2497                       (info->bitmap) ? "yes" : "no");
2498        }
2499        spin_unlock(&ctl->tree_lock);
2500        btrfs_info(fs_info, "block group has cluster?: %s",
2501               list_empty(&block_group->cluster_list) ? "no" : "yes");
2502        btrfs_info(fs_info,
2503                   "%d blocks of free space at or bigger than %llu bytes", count, bytes);
2504}
2505
2506void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2507{
2508        struct btrfs_fs_info *fs_info = block_group->fs_info;
2509        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2510
2511        spin_lock_init(&ctl->tree_lock);
2512        ctl->unit = fs_info->sectorsize;
2513        ctl->start = block_group->key.objectid;
2514        ctl->private = block_group;
2515        ctl->op = &free_space_op;
2516        INIT_LIST_HEAD(&ctl->trimming_ranges);
2517        mutex_init(&ctl->cache_writeout_mutex);
2518
2519        /*
2520         * we only want to have 32k of ram per block group for keeping
2521         * track of free space, and if we pass 1/2 of that we want to
2522         * start converting things over to using bitmaps
2523         */
2524        ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2525}
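/*
 * With the 32KiB budget this initial threshold is
 * (32KiB / 2) / sizeof(struct btrfs_free_space), i.e. roughly a couple
 * hundred extent entries on a 64-bit kernel; recalculate_thresholds()
 * re-derives it as bitmaps are added or removed.
 */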
2526
2527/*
2528 * for a given cluster, put all of its extents back into the free
2529 * space cache.  If the block group passed doesn't match the block group
2530 * pointed to by the cluster, someone else raced in and freed the
2531 * cluster already.  In that case, we just return without changing anything
2532 */
2533static int
2534__btrfs_return_cluster_to_free_space(
2535                             struct btrfs_block_group_cache *block_group,
2536                             struct btrfs_free_cluster *cluster)
2537{
2538        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2539        struct btrfs_free_space *entry;
2540        struct rb_node *node;
2541
2542        spin_lock(&cluster->lock);
2543        if (cluster->block_group != block_group)
2544                goto out;
2545
2546        cluster->block_group = NULL;
2547        cluster->window_start = 0;
2548        list_del_init(&cluster->block_group_list);
2549
2550        node = rb_first(&cluster->root);
2551        while (node) {
2552                bool bitmap;
2553
2554                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2555                node = rb_next(&entry->offset_index);
2556                rb_erase(&entry->offset_index, &cluster->root);
2557                RB_CLEAR_NODE(&entry->offset_index);
2558
2559                bitmap = (entry->bitmap != NULL);
2560                if (!bitmap) {
2561                        try_merge_free_space(ctl, entry, false);
2562                        steal_from_bitmap(ctl, entry, false);
2563                }
2564                tree_insert_offset(&ctl->free_space_offset,
2565                                   entry->offset, &entry->offset_index, bitmap);
2566        }
2567        cluster->root = RB_ROOT;
2568
2569out:
2570        spin_unlock(&cluster->lock);
2571        btrfs_put_block_group(block_group);
2572        return 0;
2573}
2574
2575static void __btrfs_remove_free_space_cache_locked(
2576                                struct btrfs_free_space_ctl *ctl)
2577{
2578        struct btrfs_free_space *info;
2579        struct rb_node *node;
2580
2581        while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2582                info = rb_entry(node, struct btrfs_free_space, offset_index);
2583                if (!info->bitmap) {
2584                        unlink_free_space(ctl, info);
2585                        kmem_cache_free(btrfs_free_space_cachep, info);
2586                } else {
2587                        free_bitmap(ctl, info);
2588                }
2589
2590                cond_resched_lock(&ctl->tree_lock);
2591        }
2592}
2593
2594void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2595{
2596        spin_lock(&ctl->tree_lock);
2597        __btrfs_remove_free_space_cache_locked(ctl);
2598        spin_unlock(&ctl->tree_lock);
2599}
2600
2601void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2602{
2603        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2604        struct btrfs_free_cluster *cluster;
2605        struct list_head *head;
2606
2607        spin_lock(&ctl->tree_lock);
2608        while ((head = block_group->cluster_list.next) !=
2609               &block_group->cluster_list) {
2610                cluster = list_entry(head, struct btrfs_free_cluster,
2611                                     block_group_list);
2612
2613                WARN_ON(cluster->block_group != block_group);
2614                __btrfs_return_cluster_to_free_space(block_group, cluster);
2615
2616                cond_resched_lock(&ctl->tree_lock);
2617        }
2618        __btrfs_remove_free_space_cache_locked(ctl);
2619        spin_unlock(&ctl->tree_lock);
2620
2621}
2622
2623u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2624                               u64 offset, u64 bytes, u64 empty_size,
2625                               u64 *max_extent_size)
2626{
2627        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2628        struct btrfs_free_space *entry = NULL;
2629        u64 bytes_search = bytes + empty_size;
2630        u64 ret = 0;
2631        u64 align_gap = 0;
2632        u64 align_gap_len = 0;
2633
2634        spin_lock(&ctl->tree_lock);
2635        entry = find_free_space(ctl, &offset, &bytes_search,
2636                                block_group->full_stripe_len, max_extent_size);
2637        if (!entry)
2638                goto out;
2639
2640        ret = offset;
2641        if (entry->bitmap) {
2642                bitmap_clear_bits(ctl, entry, offset, bytes);
2643                if (!entry->bytes)
2644                        free_bitmap(ctl, entry);
2645        } else {
2646                unlink_free_space(ctl, entry);
2647                align_gap_len = offset - entry->offset;
2648                align_gap = entry->offset;
2649
2650                entry->offset = offset + bytes;
2651                WARN_ON(entry->bytes < bytes + align_gap_len);
2652
2653                entry->bytes -= bytes + align_gap_len;
2654                if (!entry->bytes)
2655                        kmem_cache_free(btrfs_free_space_cachep, entry);
2656                else
2657                        link_free_space(ctl, entry);
2658        }
2659out:
2660        spin_unlock(&ctl->tree_lock);
2661
2662        if (align_gap_len)
2663                __btrfs_add_free_space(block_group->fs_info, ctl,
2664                                       align_gap, align_gap_len);
2665        return ret;
2666}
2667
2668/*
2669 * given a cluster, put all of its extents back into the free space
2670 * cache.  If a block group is passed, this function will only free
2671 * a cluster that belongs to the passed block group.
2672 *
2673 * Otherwise, it'll get a reference on the block group pointed to by the
2674 * cluster and remove the cluster from it.
2675 */
2676int btrfs_return_cluster_to_free_space(
2677                               struct btrfs_block_group_cache *block_group,
2678                               struct btrfs_free_cluster *cluster)
2679{
2680        struct btrfs_free_space_ctl *ctl;
2681        int ret;
2682
2683        /* first, get a safe pointer to the block group */
2684        spin_lock(&cluster->lock);
2685        if (!block_group) {
2686                block_group = cluster->block_group;
2687                if (!block_group) {
2688                        spin_unlock(&cluster->lock);
2689                        return 0;
2690                }
2691        } else if (cluster->block_group != block_group) {
2692                /* someone else has already freed it, don't redo their work */
2693                spin_unlock(&cluster->lock);
2694                return 0;
2695        }
2696        atomic_inc(&block_group->count);
2697        spin_unlock(&cluster->lock);
2698
2699        ctl = block_group->free_space_ctl;
2700
2701        /* now return any extents the cluster had on it */
2702        spin_lock(&ctl->tree_lock);
2703        ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2704        spin_unlock(&ctl->tree_lock);
2705
2706        /* finally drop our ref */
2707        btrfs_put_block_group(block_group);
2708        return ret;
2709}
2710
2711static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2712                                   struct btrfs_free_cluster *cluster,
2713                                   struct btrfs_free_space *entry,
2714                                   u64 bytes, u64 min_start,
2715                                   u64 *max_extent_size)
2716{
2717        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2718        int err;
2719        u64 search_start = cluster->window_start;
2720        u64 search_bytes = bytes;
2721        u64 ret = 0;
2722
2723        search_start = min_start;
2724        search_bytes = bytes;
2725
2726        err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2727        if (err) {
2728                *max_extent_size = max(get_max_extent_size(entry),
2729                                       *max_extent_size);
2730                return 0;
2731        }
2732
2733        ret = search_start;
2734        __bitmap_clear_bits(ctl, entry, ret, bytes);
2735
2736        return ret;
2737}
2738
2739/*
2740 * given a cluster, try to allocate 'bytes' from it, returns 0
2741 * if it couldn't find anything suitably large, or a logical disk offset
2742 * if things worked out
2743 */
2744u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2745                             struct btrfs_free_cluster *cluster, u64 bytes,
2746                             u64 min_start, u64 *max_extent_size)
2747{
2748        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2749        struct btrfs_free_space *entry = NULL;
2750        struct rb_node *node;
2751        u64 ret = 0;
2752
2753        spin_lock(&cluster->lock);
2754        if (bytes > cluster->max_size)
2755                goto out;
2756
2757        if (cluster->block_group != block_group)
2758                goto out;
2759
2760        node = rb_first(&cluster->root);
2761        if (!node)
2762                goto out;
2763
2764        entry = rb_entry(node, struct btrfs_free_space, offset_index);
2765        while (1) {
2766                if (entry->bytes < bytes)
2767                        *max_extent_size = max(get_max_extent_size(entry),
2768                                               *max_extent_size);
2769
2770                if (entry->bytes < bytes ||
2771                    (!entry->bitmap && entry->offset < min_start)) {
2772                        node = rb_next(&entry->offset_index);
2773                        if (!node)
2774                                break;
2775                        entry = rb_entry(node, struct btrfs_free_space,
2776                                         offset_index);
2777                        continue;
2778                }
2779
2780                if (entry->bitmap) {
2781                        ret = btrfs_alloc_from_bitmap(block_group,
2782                                                      cluster, entry, bytes,
2783                                                      cluster->window_start,
2784                                                      max_extent_size);
2785                        if (ret == 0) {
2786                                node = rb_next(&entry->offset_index);
2787                                if (!node)
2788                                        break;
2789                                entry = rb_entry(node, struct btrfs_free_space,
2790                                                 offset_index);
2791                                continue;
2792                        }
2793                        cluster->window_start += bytes;
2794                } else {
2795                        ret = entry->offset;
2796
2797                        entry->offset += bytes;
2798                        entry->bytes -= bytes;
2799                }
2800
2801                if (entry->bytes == 0)
2802                        rb_erase(&entry->offset_index, &cluster->root);
2803                break;
2804        }
2805out:
2806        spin_unlock(&cluster->lock);
2807
2808        if (!ret)
2809                return 0;
2810
2811        spin_lock(&ctl->tree_lock);
2812
2813        ctl->free_space -= bytes;
2814        if (entry->bytes == 0) {
2815                ctl->free_extents--;
2816                if (entry->bitmap) {
2817                        kmem_cache_free(btrfs_free_space_bitmap_cachep,
2818                                        entry->bitmap);
2819                        ctl->total_bitmaps--;
2820                        ctl->op->recalc_thresholds(ctl);
2821                }
2822                kmem_cache_free(btrfs_free_space_cachep, entry);
2823        }
2824
2825        spin_unlock(&ctl->tree_lock);
2826
2827        return ret;
2828}
2829
2830static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2831                                struct btrfs_free_space *entry,
2832                                struct btrfs_free_cluster *cluster,
2833                                u64 offset, u64 bytes,
2834                                u64 cont1_bytes, u64 min_bytes)
2835{
2836        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2837        unsigned long next_zero;
2838        unsigned long i;
2839        unsigned long want_bits;
2840        unsigned long min_bits;
2841        unsigned long found_bits;
2842        unsigned long max_bits = 0;
2843        unsigned long start = 0;
2844        unsigned long total_found = 0;
2845        int ret;
2846
2847        i = offset_to_bit(entry->offset, ctl->unit,
2848                          max_t(u64, offset, entry->offset));
2849        want_bits = bytes_to_bits(bytes, ctl->unit);
2850        min_bits = bytes_to_bits(min_bytes, ctl->unit);
2851
2852        /*
2853         * Don't bother looking for a cluster in this bitmap if it's heavily
2854         * fragmented.
2855         */
2856        if (entry->max_extent_size &&
2857            entry->max_extent_size < cont1_bytes)
2858                return -ENOSPC;
2859again:
2860        found_bits = 0;
2861        for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2862                next_zero = find_next_zero_bit(entry->bitmap,
2863                                               BITS_PER_BITMAP, i);
2864                if (next_zero - i >= min_bits) {
2865                        found_bits = next_zero - i;
2866                        if (found_bits > max_bits)
2867                                max_bits = found_bits;
2868                        break;
2869                }
2870                if (next_zero - i > max_bits)
2871                        max_bits = next_zero - i;
2872                i = next_zero;
2873        }
2874
2875        if (!found_bits) {
2876                entry->max_extent_size = (u64)max_bits * ctl->unit;
2877                return -ENOSPC;
2878        }
2879
2880        if (!total_found) {
2881                start = i;
2882                cluster->max_size = 0;
2883        }
2884
2885        total_found += found_bits;
2886
2887        if (cluster->max_size < found_bits * ctl->unit)
2888                cluster->max_size = found_bits * ctl->unit;
2889
2890        if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2891                i = next_zero + 1;
2892                goto again;
2893        }
2894
2895        cluster->window_start = start * ctl->unit + entry->offset;
2896        rb_erase(&entry->offset_index, &ctl->free_space_offset);
2897        ret = tree_insert_offset(&cluster->root, entry->offset,
2898                                 &entry->offset_index, 1);
2899        ASSERT(!ret); /* -EEXIST; Logic error */
2900
2901        trace_btrfs_setup_cluster(block_group, cluster,
2902                                  total_found * ctl->unit, 1);
2903        return 0;
2904}
2905
2906/*
2907 * This searches the block group for just extents to fill the cluster with.
2908 * Try to find a cluster with at least bytes total bytes, at least one
2909 * extent of cont1_bytes, and other extents of at least min_bytes.
2910 */
2911static noinline int
2912setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2913                        struct btrfs_free_cluster *cluster,
2914                        struct list_head *bitmaps, u64 offset, u64 bytes,
2915                        u64 cont1_bytes, u64 min_bytes)
2916{
2917        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2918        struct btrfs_free_space *first = NULL;
2919        struct btrfs_free_space *entry = NULL;
2920        struct btrfs_free_space *last;
2921        struct rb_node *node;
2922        u64 window_free;
2923        u64 max_extent;
2924        u64 total_size = 0;
2925
2926        entry = tree_search_offset(ctl, offset, 0, 1);
2927        if (!entry)
2928                return -ENOSPC;
2929
2930        /*
2931         * We don't want bitmaps, so just move along until we find a normal
2932         * extent entry.
2933         */
2934        while (entry->bitmap || entry->bytes < min_bytes) {
2935                if (entry->bitmap && list_empty(&entry->list))
2936                        list_add_tail(&entry->list, bitmaps);
2937                node = rb_next(&entry->offset_index);
2938                if (!node)
2939                        return -ENOSPC;
2940                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2941        }
2942
2943        window_free = entry->bytes;
2944        max_extent = entry->bytes;
2945        first = entry;
2946        last = entry;
2947
2948        for (node = rb_next(&entry->offset_index); node;
2949             node = rb_next(&entry->offset_index)) {
2950                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2951
2952                if (entry->bitmap) {
2953                        if (list_empty(&entry->list))
2954                                list_add_tail(&entry->list, bitmaps);
2955                        continue;
2956                }
2957
2958                if (entry->bytes < min_bytes)
2959                        continue;
2960
2961                last = entry;
2962                window_free += entry->bytes;
2963                if (entry->bytes > max_extent)
2964                        max_extent = entry->bytes;
2965        }
2966
2967        if (window_free < bytes || max_extent < cont1_bytes)
2968                return -ENOSPC;
2969
2970        cluster->window_start = first->offset;
2971
2972        node = &first->offset_index;
2973
2974        /*
2975         * now we've found our entries, pull them out of the free space
2976         * cache and put them into the cluster rbtree
2977         */
2978        do {
2979                int ret;
2980
2981                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2982                node = rb_next(&entry->offset_index);
2983                if (entry->bitmap || entry->bytes < min_bytes)
2984                        continue;
2985
2986                rb_erase(&entry->offset_index, &ctl->free_space_offset);
2987                ret = tree_insert_offset(&cluster->root, entry->offset,
2988                                         &entry->offset_index, 0);
2989                total_size += entry->bytes;
2990                ASSERT(!ret); /* -EEXIST; Logic error */
2991        } while (node && entry != last);
2992
2993        cluster->max_size = max_extent;
2994        trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2995        return 0;
2996}
2997
2998/*
2999 * This specifically looks for bitmaps that may work in the cluster; we assume
3000 * that we have already failed to find extents that will work.
3001 */
3002static noinline int
3003setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
3004                     struct btrfs_free_cluster *cluster,
3005                     struct list_head *bitmaps, u64 offset, u64 bytes,
3006                     u64 cont1_bytes, u64 min_bytes)
3007{
3008        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3009        struct btrfs_free_space *entry = NULL;
3010        int ret = -ENOSPC;
3011        u64 bitmap_offset = offset_to_bitmap(ctl, offset);
3012
3013        if (ctl->total_bitmaps == 0)
3014                return -ENOSPC;
3015
3016        /*
3017         * The bitmap that covers offset won't be in the list unless offset
3018         * is just its start offset.
3019         */
3020        if (!list_empty(bitmaps))
3021                entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
3022
3023        if (!entry || entry->offset != bitmap_offset) {
3024                entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
3025                if (entry && list_empty(&entry->list))
3026                        list_add(&entry->list, bitmaps);
3027        }
3028
3029        list_for_each_entry(entry, bitmaps, list) {
3030                if (entry->bytes < bytes)
3031                        continue;
3032                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
3033                                           bytes, cont1_bytes, min_bytes);
3034                if (!ret)
3035                        return 0;
3036        }
3037
3038        /*
3039         * The bitmaps list has all the bitmaps that record free space
3040         * starting after offset, so no more search is required.
3041         */
3042        return -ENOSPC;
3043}
3044
3045/*
3046 * Here we try to find a cluster of blocks in a block group.  The goal
3047 * is to find at least bytes + empty_size.
3048 * We might not find them all in one contiguous area.
3049 *
3050 * Returns zero and sets up the cluster if things worked out, otherwise
3051 * it returns -ENOSPC.
3052 */
3053int btrfs_find_space_cluster(struct btrfs_block_group_cache *block_group,
3054                             struct btrfs_free_cluster *cluster,
3055                             u64 offset, u64 bytes, u64 empty_size)
3056{
3057        struct btrfs_fs_info *fs_info = block_group->fs_info;
3058        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3059        struct btrfs_free_space *entry, *tmp;
3060        LIST_HEAD(bitmaps);
3061        u64 min_bytes;
3062        u64 cont1_bytes;
3063        int ret;
3064
3065        /*
3066         * Choose the minimum extent size we'll require for this
3067         * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3068         * For metadata, allow allocations with smaller extents.  For
3069         * data, keep it dense.
3070         */
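        /*
         * For example, a 1M data allocation with empty_size of 256K on a 4K
         * sectorsize fs gets cont1_bytes = max(1M, 1280K >> 2) = 1M and
         * min_bytes = 4K, while with ssd_spread both become the full 1280K.
         */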
3071        if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3072                cont1_bytes = min_bytes = bytes + empty_size;
3073        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3074                cont1_bytes = bytes;
3075                min_bytes = fs_info->sectorsize;
3076        } else {
3077                cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3078                min_bytes = fs_info->sectorsize;
3079        }
3080
3081        spin_lock(&ctl->tree_lock);
3082
3083        /*
3084         * If we know we don't have enough space to make a cluster don't even
3085         * bother doing all the work to try and find one.
3086         */
3087        if (ctl->free_space < bytes) {
3088                spin_unlock(&ctl->tree_lock);
3089                return -ENOSPC;
3090        }
3091
3092        spin_lock(&cluster->lock);
3093
3094        /* someone already found a cluster, hooray */
3095        if (cluster->block_group) {
3096                ret = 0;
3097                goto out;
3098        }
3099
3100        trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3101                                 min_bytes);
3102
3103        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3104                                      bytes + empty_size,
3105                                      cont1_bytes, min_bytes);
3106        if (ret)
3107                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3108                                           offset, bytes + empty_size,
3109                                           cont1_bytes, min_bytes);
3110
3111        /* Clear our temporary list */
3112        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3113                list_del_init(&entry->list);
3114
3115        if (!ret) {
3116                atomic_inc(&block_group->count);
3117                list_add_tail(&cluster->block_group_list,
3118                              &block_group->cluster_list);
3119                cluster->block_group = block_group;
3120        } else {
3121                trace_btrfs_failed_cluster_setup(block_group);
3122        }
3123out:
3124        spin_unlock(&cluster->lock);
3125        spin_unlock(&ctl->tree_lock);
3126
3127        return ret;
3128}
3129
3130/*
3131 * Simple code to initialize a cluster to an empty state.
3132 */
3133void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3134{
3135        spin_lock_init(&cluster->lock);
3136        spin_lock_init(&cluster->refill_lock);
3137        cluster->root = RB_ROOT;
3138        cluster->max_size = 0;
3139        cluster->fragmented = false;
3140        INIT_LIST_HEAD(&cluster->block_group_list);
3141        cluster->block_group = NULL;
3142}
3143
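/*
 * Discard the range [start, start + bytes).  The caller has already removed
 * [reserved_start, reserved_start + reserved_bytes) from the free space
 * cache, so account it as reserved while the discard is in flight to keep it
 * from being allocated, then give it back to the free space cache and drop
 * @trim_entry from the trimming_ranges list.
 */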
3144static int do_trimming(struct btrfs_block_group_cache *block_group,
3145                       u64 *total_trimmed, u64 start, u64 bytes,
3146                       u64 reserved_start, u64 reserved_bytes,
3147                       struct btrfs_trim_range *trim_entry)
3148{
3149        struct btrfs_space_info *space_info = block_group->space_info;
3150        struct btrfs_fs_info *fs_info = block_group->fs_info;
3151        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3152        int ret;
3153        int update = 0;
3154        u64 trimmed = 0;
3155
3156        spin_lock(&space_info->lock);
3157        spin_lock(&block_group->lock);
3158        if (!block_group->ro) {
3159                block_group->reserved += reserved_bytes;
3160                space_info->bytes_reserved += reserved_bytes;
3161                update = 1;
3162        }
3163        spin_unlock(&block_group->lock);
3164        spin_unlock(&space_info->lock);
3165
3166        ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3167        if (!ret)
3168                *total_trimmed += trimmed;
3169
3170        mutex_lock(&ctl->cache_writeout_mutex);
3171        btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
3172        list_del(&trim_entry->list);
3173        mutex_unlock(&ctl->cache_writeout_mutex);
3174
3175        if (update) {
3176                spin_lock(&space_info->lock);
3177                spin_lock(&block_group->lock);
3178                if (block_group->ro)
3179                        space_info->bytes_readonly += reserved_bytes;
3180                block_group->reserved -= reserved_bytes;
3181                space_info->bytes_reserved -= reserved_bytes;
3182                spin_unlock(&block_group->lock);
3183                spin_unlock(&space_info->lock);
3184        }
3185
3186        return ret;
3187}
3188
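/*
 * Discard free space recorded in extent (non-bitmap) entries that fall
 * inside [start, end).  Each entry is pulled out of the free space cache and
 * put on the trimming_ranges list, so a concurrent cache writeout still
 * records it, before being handed to do_trimming(), which re-adds the space
 * once the discard completes.
 */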
3189static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3190                          u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3191{
3192        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3193        struct btrfs_free_space *entry;
3194        struct rb_node *node;
3195        int ret = 0;
3196        u64 extent_start;
3197        u64 extent_bytes;
3198        u64 bytes;
3199
3200        while (start < end) {
3201                struct btrfs_trim_range trim_entry;
3202
3203                mutex_lock(&ctl->cache_writeout_mutex);
3204                spin_lock(&ctl->tree_lock);
3205
3206                if (ctl->free_space < minlen) {
3207                        spin_unlock(&ctl->tree_lock);
3208                        mutex_unlock(&ctl->cache_writeout_mutex);
3209                        break;
3210                }
3211
3212                entry = tree_search_offset(ctl, start, 0, 1);
3213                if (!entry) {
3214                        spin_unlock(&ctl->tree_lock);
3215                        mutex_unlock(&ctl->cache_writeout_mutex);
3216                        break;
3217                }
3218
3219                /* skip bitmaps */
3220                while (entry->bitmap) {
3221                        node = rb_next(&entry->offset_index);
3222                        if (!node) {
3223                                spin_unlock(&ctl->tree_lock);
3224                                mutex_unlock(&ctl->cache_writeout_mutex);
3225                                goto out;
3226                        }
3227                        entry = rb_entry(node, struct btrfs_free_space,
3228                                         offset_index);
3229                }
3230
3231                if (entry->offset >= end) {
3232                        spin_unlock(&ctl->tree_lock);
3233                        mutex_unlock(&ctl->cache_writeout_mutex);
3234                        break;
3235                }
3236
3237                extent_start = entry->offset;
3238                extent_bytes = entry->bytes;
3239                start = max(start, extent_start);
3240                bytes = min(extent_start + extent_bytes, end) - start;
3241                if (bytes < minlen) {
3242                        spin_unlock(&ctl->tree_lock);
3243                        mutex_unlock(&ctl->cache_writeout_mutex);
3244                        goto next;
3245                }
3246
3247                unlink_free_space(ctl, entry);
3248                kmem_cache_free(btrfs_free_space_cachep, entry);
3249
3250                spin_unlock(&ctl->tree_lock);
3251                trim_entry.start = extent_start;
3252                trim_entry.bytes = extent_bytes;
3253                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3254                mutex_unlock(&ctl->cache_writeout_mutex);
3255
3256                ret = do_trimming(block_group, total_trimmed, start, bytes,
3257                                  extent_start, extent_bytes, &trim_entry);
3258                if (ret)
3259                        break;
3260next:
3261                start += bytes;
3262
3263                if (fatal_signal_pending(current)) {
3264                        ret = -ERESTARTSYS;
3265                        break;
3266                }
3267
3268                cond_resched();
3269        }
3270out:
3271        return ret;
3272}
3273
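/*
 * Discard free space recorded in bitmap entries inside [start, end), one
 * bitmap at a time: search_bitmap() finds the next free run of at least
 * @minlen bytes, its bits are cleared, and the run is handed to
 * do_trimming() the same way as the extent entries above.
 */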
3274static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3275                        u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3276{
3277        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3278        struct btrfs_free_space *entry;
3279        int ret = 0;
3280        int ret2;
3281        u64 bytes;
3282        u64 offset = offset_to_bitmap(ctl, start);
3283
3284        while (offset < end) {
3285                bool next_bitmap = false;
3286                struct btrfs_trim_range trim_entry;
3287
3288                mutex_lock(&ctl->cache_writeout_mutex);
3289                spin_lock(&ctl->tree_lock);
3290
3291                if (ctl->free_space < minlen) {
3292                        spin_unlock(&ctl->tree_lock);
3293                        mutex_unlock(&ctl->cache_writeout_mutex);
3294                        break;
3295                }
3296
3297                entry = tree_search_offset(ctl, offset, 1, 0);
3298                if (!entry) {
3299                        spin_unlock(&ctl->tree_lock);
3300                        mutex_unlock(&ctl->cache_writeout_mutex);
3301                        next_bitmap = true;
3302                        goto next;
3303                }
3304
3305                bytes = minlen;
3306                ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3307                if (ret2 || start >= end) {
3308                        spin_unlock(&ctl->tree_lock);
3309                        mutex_unlock(&ctl->cache_writeout_mutex);
3310                        next_bitmap = true;
3311                        goto next;
3312                }
3313
3314                bytes = min(bytes, end - start);
3315                if (bytes < minlen) {
3316                        spin_unlock(&ctl->tree_lock);
3317                        mutex_unlock(&ctl->cache_writeout_mutex);
3318                        goto next;
3319                }
3320
3321                bitmap_clear_bits(ctl, entry, start, bytes);
3322                if (entry->bytes == 0)
3323                        free_bitmap(ctl, entry);
3324
3325                spin_unlock(&ctl->tree_lock);
3326                trim_entry.start = start;
3327                trim_entry.bytes = bytes;
3328                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3329                mutex_unlock(&ctl->cache_writeout_mutex);
3330
3331                ret = do_trimming(block_group, total_trimmed, start, bytes,
3332                                  start, bytes, &trim_entry);
3333                if (ret)
3334                        break;
3335next:
3336                if (next_bitmap) {
3337                        offset += BITS_PER_BITMAP * ctl->unit;
3338                } else {
3339                        start += bytes;
3340                        if (start >= offset + BITS_PER_BITMAP * ctl->unit)
3341                                offset += BITS_PER_BITMAP * ctl->unit;
3342                }
3343
3344                if (fatal_signal_pending(current)) {
3345                        ret = -ERESTARTSYS;
3346                        break;
3347                }
3348
3349                cond_resched();
3350        }
3351
3352        return ret;
3353}
3354
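/*
 * The trimming counter pins the block group's chunk mapping while a discard
 * is running.  When the last holder drops it and the block group has already
 * been removed, finish the cleanup that removal deferred: drop the extent
 * map and free the remaining free space entries.
 */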
3355void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
3356{
3357        atomic_inc(&cache->trimming);
3358}
3359
3360void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
3361{
3362        struct btrfs_fs_info *fs_info = block_group->fs_info;
3363        struct extent_map_tree *em_tree;
3364        struct extent_map *em;
3365        bool cleanup;
3366
3367        spin_lock(&block_group->lock);
3368        cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3369                   block_group->removed);
3370        spin_unlock(&block_group->lock);
3371
3372        if (cleanup) {
3373                mutex_lock(&fs_info->chunk_mutex);
3374                em_tree = &fs_info->mapping_tree;
3375                write_lock(&em_tree->lock);
3376                em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3377                                           1);
3378                BUG_ON(!em); /* logic error, can't happen */
3379                remove_extent_mapping(em_tree, em);
3380                write_unlock(&em_tree->lock);
3381                mutex_unlock(&fs_info->chunk_mutex);
3382
3383                /* once for us and once for the tree */
3384                free_extent_map(em);
3385                free_extent_map(em);
3386
3387                /*
3388                 * We've left one free space entry and other tasks trimming
3389                 * this block group have each left one entry. Free them.
3390                 */
3391                __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3392        }
3393}
3394
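/*
 * Discard all free space of at least @minlen bytes inside [start, end) of
 * the block group, first from extent entries and then from bitmaps.  The
 * number of bytes actually discarded is returned in *trimmed.
 */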
3395int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3396                           u64 *trimmed, u64 start, u64 end, u64 minlen)
3397{
3398        int ret;
3399
3400        *trimmed = 0;
3401
3402        spin_lock(&block_group->lock);
3403        if (block_group->removed) {
3404                spin_unlock(&block_group->lock);
3405                return 0;
3406        }
3407        btrfs_get_block_group_trimming(block_group);
3408        spin_unlock(&block_group->lock);
3409
3410        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3411        if (ret)
3412                goto out;
3413
3414        ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3415out:
3416        btrfs_put_block_group_trimming(block_group);
3417        return ret;
3418}
3419
3420/*
3421 * Find the left-most item in the cache tree, and then return the
3422 * smallest inode number in the item.
3423 *
3424 * Note: the returned inode number may not be the smallest one in
3425 * the tree, if the left-most item is a bitmap.
3426 */
3427u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3428{
3429        struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3430        struct btrfs_free_space *entry = NULL;
3431        u64 ino = 0;
3432
3433        spin_lock(&ctl->tree_lock);
3434
3435        if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3436                goto out;
3437
3438        entry = rb_entry(rb_first(&ctl->free_space_offset),
3439                         struct btrfs_free_space, offset_index);
3440
3441        if (!entry->bitmap) {
3442                ino = entry->offset;
3443
3444                unlink_free_space(ctl, entry);
3445                entry->offset++;
3446                entry->bytes--;
3447                if (!entry->bytes)
3448                        kmem_cache_free(btrfs_free_space_cachep, entry);
3449                else
3450                        link_free_space(ctl, entry);
3451        } else {
3452                u64 offset = 0;
3453                u64 count = 1;
3454                int ret;
3455
3456                ret = search_bitmap(ctl, entry, &offset, &count, true);
3457                /* Logic error; Should be empty if it can't find anything */
3458                ASSERT(!ret);
3459
3460                ino = offset;
3461                bitmap_clear_bits(ctl, entry, offset, 1);
3462                if (entry->bytes == 0)
3463                        free_bitmap(ctl, entry);
3464        }
3465out:
3466        spin_unlock(&ctl->tree_lock);
3467
3468        return ino;
3469}
3470
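/*
 * Return the inode that backs this root's free inode-number cache, using the
 * cached root->ino_cache_inode when available and otherwise looking it up on
 * disk (and caching it unless the filesystem is closing).
 */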
3471struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3472                                    struct btrfs_path *path)
3473{
3474        struct inode *inode = NULL;
3475
3476        spin_lock(&root->ino_cache_lock);
3477        if (root->ino_cache_inode)
3478                inode = igrab(root->ino_cache_inode);
3479        spin_unlock(&root->ino_cache_lock);
3480        if (inode)
3481                return inode;
3482
3483        inode = __lookup_free_space_inode(root, path, 0);
3484        if (IS_ERR(inode))
3485                return inode;
3486
3487        spin_lock(&root->ino_cache_lock);
3488        if (!btrfs_fs_closing(root->fs_info))
3489                root->ino_cache_inode = igrab(inode);
3490        spin_unlock(&root->ino_cache_lock);
3491
3492        return inode;
3493}
3494
3495int create_free_ino_inode(struct btrfs_root *root,
3496                          struct btrfs_trans_handle *trans,
3497                          struct btrfs_path *path)
3498{
3499        return __create_free_space_inode(root, trans, path,
3500                                         BTRFS_FREE_INO_OBJECTID, 0);
3501}
3502
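/*
 * Load the on-disk free inode-number cache for @root into its free_ino_ctl,
 * provided the inode map cache option is enabled, the filesystem isn't
 * closing and the cache generation matches the root's generation.
 */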
3503int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3504{
3505        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3506        struct btrfs_path *path;
3507        struct inode *inode;
3508        int ret = 0;
3509        u64 root_gen = btrfs_root_generation(&root->root_item);
3510
3511        if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3512                return 0;
3513
3514        /*
3515         * If we're unmounting then just return, since this does a search on the
3516         * normal root and not the commit root and we could deadlock.
3517         */
3518        if (btrfs_fs_closing(fs_info))
3519                return 0;
3520
3521        path = btrfs_alloc_path();
3522        if (!path)
3523                return 0;
3524
3525        inode = lookup_free_ino_inode(root, path);
3526        if (IS_ERR(inode))
3527                goto out;
3528
3529        if (root_gen != BTRFS_I(inode)->generation)
3530                goto out_put;
3531
3532        ret = __load_free_space_cache(root, inode, ctl, path, 0);
3533
3534        if (ret < 0)
3535                btrfs_err(fs_info,
3536                        "failed to load free ino cache for root %llu",
3537                        root->root_key.objectid);
3538out_put:
3539        iput(inode);
3540out:
3541        btrfs_free_path(path);
3542        return ret;
3543}
3544
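/*
 * Write this root's free inode-number cache back to its cache inode and wait
 * for the IO to finish.  The delalloc metadata reservation is released here
 * only if writepages() failed; otherwise it is released when the ordered IO
 * completes.
 */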
3545int btrfs_write_out_ino_cache(struct btrfs_root *root,
3546                              struct btrfs_trans_handle *trans,
3547                              struct btrfs_path *path,
3548                              struct inode *inode)
3549{
3550        struct btrfs_fs_info *fs_info = root->fs_info;
3551        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3552        int ret;
3553        struct btrfs_io_ctl io_ctl;
3554        bool release_metadata = true;
3555
3556        if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3557                return 0;
3558
3559        memset(&io_ctl, 0, sizeof(io_ctl));
3560        ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
3561        if (!ret) {
3562                /*
3563                 * At this point writepages() didn't error out, so our metadata
3564                 * reservation is released when the writeback finishes, at
3565                 * inode.c:btrfs_finish_ordered_io(), whether it finishes
3566                 * with or without an error.
3567                 */
3568                release_metadata = false;
3569                ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
3570        }
3571
3572        if (ret) {
3573                if (release_metadata)
3574                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
3575                                        inode->i_size, true);
3576#ifdef DEBUG
3577                btrfs_err(fs_info,
3578                          "failed to write free ino cache for root %llu",
3579                          root->root_key.objectid);
3580#endif
3581        }
3582
3583        return ret;
3584}
3585
3586#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3587/*
3588 * Use this if you need to make a bitmap or extent entry specifically.  It
3589 * doesn't do any of the merging that add_free_space does, and it acts a lot
3590 * like how the free space cache loading works, so you can get really weird
3591 * configurations.
3592 */
3593int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
3594                              u64 offset, u64 bytes, bool bitmap)
3595{
3596        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3597        struct btrfs_free_space *info = NULL, *bitmap_info;
3598        void *map = NULL;
3599        u64 bytes_added;
3600        int ret;
3601
3602again:
3603        if (!info) {
3604                info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3605                if (!info)
3606                        return -ENOMEM;
3607        }
3608
3609        if (!bitmap) {
3610                spin_lock(&ctl->tree_lock);
3611                info->offset = offset;
3612                info->bytes = bytes;
3613                info->max_extent_size = 0;
3614                ret = link_free_space(ctl, info);
3615                spin_unlock(&ctl->tree_lock);
3616                if (ret)
3617                        kmem_cache_free(btrfs_free_space_cachep, info);
3618                return ret;
3619        }
3620
3621        if (!map) {
3622                map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS);
3623                if (!map) {
3624                        kmem_cache_free(btrfs_free_space_cachep, info);
3625                        return -ENOMEM;
3626                }
3627        }
3628
3629        spin_lock(&ctl->tree_lock);
3630        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3631                                         1, 0);
3632        if (!bitmap_info) {
3633                info->bitmap = map;
3634                map = NULL;
3635                add_new_bitmap(ctl, info, offset);
3636                bitmap_info = info;
3637                info = NULL;
3638        }
3639
3640        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3641
3642        bytes -= bytes_added;
3643        offset += bytes_added;
3644        spin_unlock(&ctl->tree_lock);
3645
3646        if (bytes)
3647                goto again;
3648
3649        if (info)
3650                kmem_cache_free(btrfs_free_space_cachep, info);
3651        if (map)
3652                kmem_cache_free(btrfs_free_space_bitmap_cachep, map);
3653        return 0;
3654}
3655
3656/*
3657 * Checks to see if the given range is in the free space cache.  This is really
3658 * just used to check the absence of space, so if there is free space in the
3659 * range at all we will return 1.
3660 */
3661int test_check_exists(struct btrfs_block_group_cache *cache,
3662                      u64 offset, u64 bytes)
3663{
3664        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3665        struct btrfs_free_space *info;
3666        int ret = 0;
3667
3668        spin_lock(&ctl->tree_lock);
3669        info = tree_search_offset(ctl, offset, 0, 0);
3670        if (!info) {
3671                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3672                                          1, 0);
3673                if (!info)
3674                        goto out;
3675        }
3676
3677have_info:
3678        if (info->bitmap) {
3679                u64 bit_off, bit_bytes;
3680                struct rb_node *n;
3681                struct btrfs_free_space *tmp;
3682
3683                bit_off = offset;
3684                bit_bytes = ctl->unit;
3685                ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3686                if (!ret) {
3687                        if (bit_off == offset) {
3688                                ret = 1;
3689                                goto out;
3690                        } else if (bit_off > offset &&
3691                                   offset + bytes > bit_off) {
3692                                ret = 1;
3693                                goto out;
3694                        }
3695                }
3696
3697                n = rb_prev(&info->offset_index);
3698                while (n) {
3699                        tmp = rb_entry(n, struct btrfs_free_space,
3700                                       offset_index);
3701                        if (tmp->offset + tmp->bytes < offset)
3702                                break;
3703                        if (offset + bytes < tmp->offset) {
3704                                n = rb_prev(&tmp->offset_index);
3705                                continue;
3706                        }
3707                        info = tmp;
3708                        goto have_info;
3709                }
3710
3711                n = rb_next(&info->offset_index);
3712                while (n) {
3713                        tmp = rb_entry(n, struct btrfs_free_space,
3714                                       offset_index);
3715                        if (offset + bytes < tmp->offset)
3716                                break;
3717                        if (tmp->offset + tmp->bytes < offset) {
3718                                n = rb_next(&tmp->offset_index);
3719                                continue;
3720                        }
3721                        info = tmp;
3722                        goto have_info;
3723                }
3724
3725                ret = 0;
3726                goto out;
3727        }
3728
3729        if (info->offset == offset) {
3730                ret = 1;
3731                goto out;
3732        }
3733
3734        if (offset > info->offset && offset < info->offset + info->bytes)
3735                ret = 1;
3736out:
3737        spin_unlock(&ctl->tree_lock);
3738        return ret;
3739}
3740#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3741