linux/fs/btrfs/free-space-cache.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Red Hat.  All rights reserved.
 */

#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
#include "extent_io.h"
#include "inode-map.h"
#include "volumes.h"

#define BITS_PER_BITMAP         (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_32K

struct btrfs_trim_range {
        u64 start;
        u64 bytes;
        struct list_head list;
};

static int link_free_space(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info);
static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
                              struct btrfs_free_space *info);
static int btrfs_wait_cache_io_root(struct btrfs_root *root,
                             struct btrfs_trans_handle *trans,
                             struct btrfs_io_ctl *io_ctl,
                             struct btrfs_path *path);

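/*
 * Look up the free space cache inode for the given offset (the block
 * group's start) via its BTRFS_FREE_SPACE_OBJECTID item in the tree root.
 */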
static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
                                               struct btrfs_path *path,
                                               u64 offset)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key key;
        struct btrfs_key location;
        struct btrfs_disk_key disk_key;
        struct btrfs_free_space_header *header;
        struct extent_buffer *leaf;
        struct inode *inode = NULL;
        int ret;

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
        key.type = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return ERR_PTR(ret);
        if (ret > 0) {
                btrfs_release_path(path);
                return ERR_PTR(-ENOENT);
        }

        leaf = path->nodes[0];
        header = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_free_space_header);
        btrfs_free_space_key(leaf, header, &disk_key);
        btrfs_disk_key_to_cpu(&location, &disk_key);
        btrfs_release_path(path);

        inode = btrfs_iget(fs_info->sb, &location, root, NULL);
        if (IS_ERR(inode))
                return inode;

        mapping_set_gfp_mask(inode->i_mapping,
                        mapping_gfp_constraint(inode->i_mapping,
                        ~(__GFP_FS | __GFP_HIGHMEM)));

        return inode;
}

struct inode *lookup_free_space_inode(struct btrfs_fs_info *fs_info,
                                      struct btrfs_block_group_cache
                                      *block_group, struct btrfs_path *path)
{
        struct inode *inode = NULL;
        u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

        spin_lock(&block_group->lock);
        if (block_group->inode)
                inode = igrab(block_group->inode);
        spin_unlock(&block_group->lock);
        if (inode)
                return inode;

        inode = __lookup_free_space_inode(fs_info->tree_root, path,
                                          block_group->key.objectid);
        if (IS_ERR(inode))
                return inode;

        spin_lock(&block_group->lock);
        if (!((BTRFS_I(inode)->flags & flags) == flags)) {
                btrfs_info(fs_info, "Old style space inode found, converting.");
                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
                        BTRFS_INODE_NODATACOW;
                block_group->disk_cache_state = BTRFS_DC_CLEAR;
        }

        if (!block_group->iref) {
                block_group->inode = igrab(inode);
                block_group->iref = 1;
        }
        spin_unlock(&block_group->lock);

        return inode;
}

static int __create_free_space_inode(struct btrfs_root *root,
                                     struct btrfs_trans_handle *trans,
                                     struct btrfs_path *path,
                                     u64 ino, u64 offset)
{
        struct btrfs_key key;
        struct btrfs_disk_key disk_key;
        struct btrfs_free_space_header *header;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
        u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
        int ret;

        ret = btrfs_insert_empty_inode(trans, root, path, ino);
        if (ret)
                return ret;

        /* We inline CRCs for the free disk space cache */
        if (ino != BTRFS_FREE_INO_OBJECTID)
                flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

        leaf = path->nodes[0];
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_inode_item);
        btrfs_item_key(leaf, &disk_key, path->slots[0]);
        memzero_extent_buffer(leaf, (unsigned long)inode_item,
                             sizeof(*inode_item));
        btrfs_set_inode_generation(leaf, inode_item, trans->transid);
        btrfs_set_inode_size(leaf, inode_item, 0);
        btrfs_set_inode_nbytes(leaf, inode_item, 0);
        btrfs_set_inode_uid(leaf, inode_item, 0);
        btrfs_set_inode_gid(leaf, inode_item, 0);
        btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
        btrfs_set_inode_flags(leaf, inode_item, flags);
        btrfs_set_inode_nlink(leaf, inode_item, 1);
        btrfs_set_inode_transid(leaf, inode_item, trans->transid);
        btrfs_set_inode_block_group(leaf, inode_item, offset);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
        key.type = 0;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(struct btrfs_free_space_header));
        if (ret < 0) {
                btrfs_release_path(path);
                return ret;
        }

        leaf = path->nodes[0];
        header = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_free_space_header);
        memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header));
        btrfs_set_free_space_key(leaf, header, &disk_key);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        return 0;
}

int create_free_space_inode(struct btrfs_fs_info *fs_info,
                            struct btrfs_trans_handle *trans,
                            struct btrfs_block_group_cache *block_group,
                            struct btrfs_path *path)
{
        int ret;
        u64 ino;

        ret = btrfs_find_free_objectid(fs_info->tree_root, &ino);
        if (ret < 0)
                return ret;

        return __create_free_space_inode(fs_info->tree_root, trans, path, ino,
                                         block_group->key.objectid);
}

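/*
 * Check that the given block reservation has enough space to truncate
 * the cache inode and to update it afterwards; returns -ENOSPC if not.
 */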
int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info,
                                       struct btrfs_block_rsv *rsv)
{
        u64 needed_bytes;
        int ret;

        /* 1 for slack space, 1 for updating the inode */
        needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) +
                btrfs_calc_trans_metadata_size(fs_info, 1);

        spin_lock(&rsv->lock);
        if (rsv->reserved < needed_bytes)
                ret = -ENOSPC;
        else
                ret = 0;
        spin_unlock(&rsv->lock);
        return ret;
}

int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
                                    struct btrfs_block_group_cache *block_group,
                                    struct inode *inode)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
        bool locked = false;

        if (block_group) {
                struct btrfs_path *path = btrfs_alloc_path();

                if (!path) {
                        ret = -ENOMEM;
                        goto fail;
                }
                locked = true;
                mutex_lock(&trans->transaction->cache_write_mutex);
                if (!list_empty(&block_group->io_list)) {
                        list_del_init(&block_group->io_list);

                        btrfs_wait_cache_io(trans, block_group, path);
                        btrfs_put_block_group(block_group);
                }

                /*
                 * Now that we've truncated the cache away, it's no longer
                 * set up or written.
                 */
                spin_lock(&block_group->lock);
                block_group->disk_cache_state = BTRFS_DC_CLEAR;
                spin_unlock(&block_group->lock);
                btrfs_free_path(path);
        }

        btrfs_i_size_write(BTRFS_I(inode), 0);
        truncate_pagecache(inode, 0);

        /*
         * We skip the throttling logic for free space cache inodes, so we don't
         * need to check for -EAGAIN.
         */
        ret = btrfs_truncate_inode_items(trans, root, inode,
                                         0, BTRFS_EXTENT_DATA_KEY);
        if (ret)
                goto fail;

        ret = btrfs_update_inode(trans, root, inode);

fail:
        if (locked)
                mutex_unlock(&trans->transaction->cache_write_mutex);
        if (ret)
                btrfs_abort_transaction(trans, ret);

        return ret;
}

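/*
 * Kick off readahead across the whole cache file so the following
 * page-by-page load does not stall on synchronous reads.
 */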
static void readahead_cache(struct inode *inode)
{
        struct file_ra_state *ra;
        unsigned long last_index;

        ra = kzalloc(sizeof(*ra), GFP_NOFS);
        if (!ra)
                return;

        file_ra_state_init(ra, inode->i_mapping);
        last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

        page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);

        kfree(ra);
}

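/*
 * Initialize the I/O control structure used to read or write the cache
 * file.  CRCs are used for everything except the free inode cache, and
 * all per-page CRCs plus the generation must fit in the first page.
 */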
static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
                       int write)
{
        int num_pages;
        int check_crcs = 0;

        num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

        if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FREE_INO_OBJECTID)
                check_crcs = 1;

        /* Make sure we can fit our crcs and generation into the first page */
        if (write && check_crcs &&
            (num_pages * sizeof(u32) + sizeof(u64)) > PAGE_SIZE)
                return -ENOSPC;

        memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));

        io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
        if (!io_ctl->pages)
                return -ENOMEM;

        io_ctl->num_pages = num_pages;
        io_ctl->fs_info = btrfs_sb(inode->i_sb);
        io_ctl->check_crcs = check_crcs;
        io_ctl->inode = inode;

        return 0;
}
ALLOW_ERROR_INJECTION(io_ctl_init, ERRNO);

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
        kfree(io_ctl->pages);
        io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
{
        if (io_ctl->cur) {
                io_ctl->cur = NULL;
                io_ctl->orig = NULL;
        }
}

static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
{
        ASSERT(io_ctl->index < io_ctl->num_pages);
        io_ctl->page = io_ctl->pages[io_ctl->index++];
        io_ctl->cur = page_address(io_ctl->page);
        io_ctl->orig = io_ctl->cur;
        io_ctl->size = PAGE_SIZE;
        if (clear)
                clear_page(io_ctl->cur);
}

static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
{
        int i;

        io_ctl_unmap_page(io_ctl);

        for (i = 0; i < io_ctl->num_pages; i++) {
                if (io_ctl->pages[i]) {
                        ClearPageChecked(io_ctl->pages[i]);
                        unlock_page(io_ctl->pages[i]);
                        put_page(io_ctl->pages[i]);
                }
        }
}

static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
                                int uptodate)
{
        struct page *page;
        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
        int i;

        for (i = 0; i < io_ctl->num_pages; i++) {
                page = find_or_create_page(inode->i_mapping, i, mask);
                if (!page) {
                        io_ctl_drop_pages(io_ctl);
                        return -ENOMEM;
                }
                io_ctl->pages[i] = page;
                if (uptodate && !PageUptodate(page)) {
                        btrfs_readpage(NULL, page);
                        lock_page(page);
                        if (!PageUptodate(page)) {
                                btrfs_err(BTRFS_I(inode)->root->fs_info,
                                           "error reading free space cache");
                                io_ctl_drop_pages(io_ctl);
                                return -EIO;
                        }
                }
        }

        for (i = 0; i < io_ctl->num_pages; i++) {
                clear_page_dirty_for_io(io_ctl->pages[i]);
                set_page_extent_mapped(io_ctl->pages[i]);
        }

        return 0;
}

static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
        __le64 *val;

        io_ctl_map_page(io_ctl, 1);

        /*
         * Skip the csum areas.  If we don't check crcs then we just have a
         * 64bit chunk at the front of the first page.
         */
        if (io_ctl->check_crcs) {
                io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
                io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
        } else {
                io_ctl->cur += sizeof(u64);
                io_ctl->size -= sizeof(u64) * 2;
        }

        val = io_ctl->cur;
        *val = cpu_to_le64(generation);
        io_ctl->cur += sizeof(u64);
}

static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
        __le64 *gen;

        /*
         * Skip the crc area.  If we don't check crcs then we just have a 64bit
         * chunk at the front of the first page.
         */
        if (io_ctl->check_crcs) {
                io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
                io_ctl->size -= sizeof(u64) +
                        (sizeof(u32) * io_ctl->num_pages);
        } else {
                io_ctl->cur += sizeof(u64);
                io_ctl->size -= sizeof(u64) * 2;
        }

        gen = io_ctl->cur;
        if (le64_to_cpu(*gen) != generation) {
                btrfs_err_rl(io_ctl->fs_info,
                        "space cache generation (%llu) does not match inode (%llu)",
                                *gen, generation);
                io_ctl_unmap_page(io_ctl);
                return -EIO;
        }
        io_ctl->cur += sizeof(u64);
        return 0;
}

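/*
 * Compute the CRC of the page just filled and store it in the csum array
 * at the front of the first page; the matching check helper below
 * verifies it on load.
 */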
static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
{
        u32 *tmp;
        u32 crc = ~(u32)0;
        unsigned offset = 0;

        if (!io_ctl->check_crcs) {
                io_ctl_unmap_page(io_ctl);
                return;
        }

        if (index == 0)
                offset = sizeof(u32) * io_ctl->num_pages;

        crc = btrfs_csum_data(io_ctl->orig + offset, crc,
                              PAGE_SIZE - offset);
        btrfs_csum_final(crc, (u8 *)&crc);
        io_ctl_unmap_page(io_ctl);
        tmp = page_address(io_ctl->pages[0]);
        tmp += index;
        *tmp = crc;
}

static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
{
        u32 *tmp, val;
        u32 crc = ~(u32)0;
        unsigned offset = 0;

        if (!io_ctl->check_crcs) {
                io_ctl_map_page(io_ctl, 0);
                return 0;
        }

        if (index == 0)
                offset = sizeof(u32) * io_ctl->num_pages;

        tmp = page_address(io_ctl->pages[0]);
        tmp += index;
        val = *tmp;

        io_ctl_map_page(io_ctl, 0);
        crc = btrfs_csum_data(io_ctl->orig + offset, crc,
                              PAGE_SIZE - offset);
        btrfs_csum_final(crc, (u8 *)&crc);
        if (val != crc) {
                btrfs_err_rl(io_ctl->fs_info,
                        "csum mismatch on free space cache");
                io_ctl_unmap_page(io_ctl);
                return -EIO;
        }

        return 0;
}

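/*
 * Append one on-disk free space entry (extent or bitmap marker) at the
 * current cursor, advancing to the next page once this one fills up.
 */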
static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
                            void *bitmap)
{
        struct btrfs_free_space_entry *entry;

        if (!io_ctl->cur)
                return -ENOSPC;

        entry = io_ctl->cur;
        entry->offset = cpu_to_le64(offset);
        entry->bytes = cpu_to_le64(bytes);
        entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
                BTRFS_FREE_SPACE_EXTENT;
        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
        io_ctl->size -= sizeof(struct btrfs_free_space_entry);

        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
                return 0;

        io_ctl_set_crc(io_ctl, io_ctl->index - 1);

        /* No more pages to map */
        if (io_ctl->index >= io_ctl->num_pages)
                return 0;

        /* map the next page */
        io_ctl_map_page(io_ctl, 1);
        return 0;
}

static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
{
        if (!io_ctl->cur)
                return -ENOSPC;

        /*
         * If we aren't at the start of the current page, unmap this one and
         * map the next one if there is any left.
         */
        if (io_ctl->cur != io_ctl->orig) {
                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
                if (io_ctl->index >= io_ctl->num_pages)
                        return -ENOSPC;
                io_ctl_map_page(io_ctl, 0);
        }

        copy_page(io_ctl->cur, bitmap);
        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
        if (io_ctl->index < io_ctl->num_pages)
                io_ctl_map_page(io_ctl, 0);
        return 0;
}

static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
{
        /*
         * If we're not on the boundary we know we've modified the page and we
         * need to crc the page.
         */
        if (io_ctl->cur != io_ctl->orig)
                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
        else
                io_ctl_unmap_page(io_ctl);

        while (io_ctl->index < io_ctl->num_pages) {
                io_ctl_map_page(io_ctl, 1);
                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
        }
}

static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
                            struct btrfs_free_space *entry, u8 *type)
{
        struct btrfs_free_space_entry *e;
        int ret;

        if (!io_ctl->cur) {
                ret = io_ctl_check_crc(io_ctl, io_ctl->index);
                if (ret)
                        return ret;
        }

        e = io_ctl->cur;
        entry->offset = le64_to_cpu(e->offset);
        entry->bytes = le64_to_cpu(e->bytes);
        *type = e->type;
        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
        io_ctl->size -= sizeof(struct btrfs_free_space_entry);

        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
                return 0;

        io_ctl_unmap_page(io_ctl);

        return 0;
}

static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
                              struct btrfs_free_space *entry)
{
        int ret;

        ret = io_ctl_check_crc(io_ctl, io_ctl->index);
        if (ret)
                return ret;

        copy_page(entry->bitmap, io_ctl->cur);
        io_ctl_unmap_page(io_ctl);

        return 0;
}

/*
 * Since we attach pinned extents after the fact we can have contiguous sections
 * of free space that are split up in entries.  This poses a problem with the
 * tree logging stuff since it could have allocated across what appears to be 2
 * entries since we would have merged the entries when adding the pinned extents
 * back to the free space cache.  So run through the space cache that we just
 * loaded and merge contiguous entries.  This will make the log replay stuff not
 * blow up and it will make for nicer allocator behavior.
 */
static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
{
        struct btrfs_free_space *e, *prev = NULL;
        struct rb_node *n;

again:
        spin_lock(&ctl->tree_lock);
        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
                e = rb_entry(n, struct btrfs_free_space, offset_index);
                if (!prev)
                        goto next;
                if (e->bitmap || prev->bitmap)
                        goto next;
                if (prev->offset + prev->bytes == e->offset) {
                        unlink_free_space(ctl, prev);
                        unlink_free_space(ctl, e);
                        prev->bytes += e->bytes;
                        kmem_cache_free(btrfs_free_space_cachep, e);
                        link_free_space(ctl, prev);
                        prev = NULL;
                        spin_unlock(&ctl->tree_lock);
                        goto again;
                }
next:
                prev = e;
        }
        spin_unlock(&ctl->tree_lock);
}

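/*
 * Read the on-disk cache for one block group into the in-memory rbtree.
 * Returns 1 if a valid cache was loaded, <= 0 if the cache is missing,
 * stale, or unreadable and must be rebuilt.
 */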
static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                                   struct btrfs_free_space_ctl *ctl,
                                   struct btrfs_path *path, u64 offset)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_free_space_header *header;
        struct extent_buffer *leaf;
        struct btrfs_io_ctl io_ctl;
        struct btrfs_key key;
        struct btrfs_free_space *e, *n;
        LIST_HEAD(bitmaps);
        u64 num_entries;
        u64 num_bitmaps;
        u64 generation;
        u8 type;
        int ret = 0;

        /* Nothing in the space cache, goodbye */
        if (!i_size_read(inode))
                return 0;

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
        key.type = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                return 0;
        else if (ret > 0) {
                btrfs_release_path(path);
                return 0;
        }

        ret = -1;

        leaf = path->nodes[0];
        header = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_free_space_header);
        num_entries = btrfs_free_space_entries(leaf, header);
        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
        generation = btrfs_free_space_generation(leaf, header);
        btrfs_release_path(path);

        if (!BTRFS_I(inode)->generation) {
                btrfs_info(fs_info,
                           "the free space cache file (%llu) is invalid, skip it",
                           offset);
                return 0;
        }

        if (BTRFS_I(inode)->generation != generation) {
                btrfs_err(fs_info,
                          "free space inode generation (%llu) did not match free space cache generation (%llu)",
                          BTRFS_I(inode)->generation, generation);
                return 0;
        }

        if (!num_entries)
                return 0;

        ret = io_ctl_init(&io_ctl, inode, 0);
        if (ret)
                return ret;

        readahead_cache(inode);

        ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
        if (ret)
                goto out;

        ret = io_ctl_check_crc(&io_ctl, 0);
        if (ret)
                goto free_cache;

        ret = io_ctl_check_generation(&io_ctl, generation);
        if (ret)
                goto free_cache;

        while (num_entries) {
                e = kmem_cache_zalloc(btrfs_free_space_cachep,
                                      GFP_NOFS);
                if (!e)
                        goto free_cache;

                ret = io_ctl_read_entry(&io_ctl, e, &type);
                if (ret) {
                        kmem_cache_free(btrfs_free_space_cachep, e);
                        goto free_cache;
                }

                if (!e->bytes) {
                        kmem_cache_free(btrfs_free_space_cachep, e);
                        goto free_cache;
                }

                if (type == BTRFS_FREE_SPACE_EXTENT) {
                        spin_lock(&ctl->tree_lock);
                        ret = link_free_space(ctl, e);
                        spin_unlock(&ctl->tree_lock);
                        if (ret) {
                                btrfs_err(fs_info,
                                        "Duplicate entries in free space cache, dumping");
                                kmem_cache_free(btrfs_free_space_cachep, e);
                                goto free_cache;
                        }
                } else {
                        ASSERT(num_bitmaps);
                        num_bitmaps--;
                        e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
                        if (!e->bitmap) {
                                kmem_cache_free(
                                        btrfs_free_space_cachep, e);
                                goto free_cache;
                        }
                        spin_lock(&ctl->tree_lock);
                        ret = link_free_space(ctl, e);
                        ctl->total_bitmaps++;
                        ctl->op->recalc_thresholds(ctl);
                        spin_unlock(&ctl->tree_lock);
                        if (ret) {
                                btrfs_err(fs_info,
                                        "Duplicate entries in free space cache, dumping");
                                kmem_cache_free(btrfs_free_space_cachep, e);
                                goto free_cache;
                        }
                        list_add_tail(&e->list, &bitmaps);
                }

                num_entries--;
        }

        io_ctl_unmap_page(&io_ctl);

        /*
         * The bitmap blocks are stored after all the extent entries, so read
         * them back in the same order in which their entries were added to
         * the cache.
         */
        list_for_each_entry_safe(e, n, &bitmaps, list) {
                list_del_init(&e->list);
                ret = io_ctl_read_bitmap(&io_ctl, e);
                if (ret)
                        goto free_cache;
        }

        io_ctl_drop_pages(&io_ctl);
        merge_space_tree(ctl);
        ret = 1;
out:
        io_ctl_free(&io_ctl);
        return ret;
free_cache:
        io_ctl_drop_pages(&io_ctl);
        __btrfs_remove_free_space_cache(ctl);
        goto out;
}

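/*
 * Load the free space cache for a block group and cross-check the total
 * against the block group's used bytes; on any mismatch or error the
 * on-disk cache is discarded and marked for rebuild.
 */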
int load_free_space_cache(struct btrfs_fs_info *fs_info,
                          struct btrfs_block_group_cache *block_group)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct inode *inode;
        struct btrfs_path *path;
        int ret = 0;
        bool matched;
        u64 used = btrfs_block_group_used(&block_group->item);

        /*
         * If this block group has been marked to be cleared for one reason or
         * another then we can't trust the on disk cache, so just return.
         */
        spin_lock(&block_group->lock);
        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
                spin_unlock(&block_group->lock);
                return 0;
        }
        spin_unlock(&block_group->lock);

        path = btrfs_alloc_path();
        if (!path)
                return 0;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        inode = lookup_free_space_inode(fs_info, block_group, path);
        if (IS_ERR(inode)) {
                btrfs_free_path(path);
                return 0;
        }

        /* We may have converted the inode and made the cache invalid. */
        spin_lock(&block_group->lock);
        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
                spin_unlock(&block_group->lock);
                btrfs_free_path(path);
                goto out;
        }
        spin_unlock(&block_group->lock);

        ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
                                      path, block_group->key.objectid);
        btrfs_free_path(path);
        if (ret <= 0)
                goto out;

        spin_lock(&ctl->tree_lock);
        matched = (ctl->free_space == (block_group->key.offset - used -
                                       block_group->bytes_super));
        spin_unlock(&ctl->tree_lock);

        if (!matched) {
                __btrfs_remove_free_space_cache(ctl);
                btrfs_warn(fs_info,
                           "block group %llu has wrong amount of free space",
                           block_group->key.objectid);
                ret = -1;
        }
out:
        if (ret < 0) {
                /* This cache is bogus, make sure it gets cleared */
                spin_lock(&block_group->lock);
                block_group->disk_cache_state = BTRFS_DC_CLEAR;
                spin_unlock(&block_group->lock);
                ret = 0;

                btrfs_warn(fs_info,
                           "failed to load free space cache for block group %llu, rebuilding it now",
                           block_group->key.objectid);
        }

        iput(inode);
        return ret;
}

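/*
 * Write all extent entries to the cache file, including entries that
 * currently live in the block group's allocation cluster and ranges that
 * are temporarily removed from the rbtree while being trimmed.
 */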
static noinline_for_stack
int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
                              struct btrfs_free_space_ctl *ctl,
                              struct btrfs_block_group_cache *block_group,
                              int *entries, int *bitmaps,
                              struct list_head *bitmap_list)
{
        int ret;
        struct btrfs_free_cluster *cluster = NULL;
        struct btrfs_free_cluster *cluster_locked = NULL;
        struct rb_node *node = rb_first(&ctl->free_space_offset);
        struct btrfs_trim_range *trim_entry;

        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list)) {
                cluster = list_entry(block_group->cluster_list.next,
                                     struct btrfs_free_cluster,
                                     block_group_list);
        }

        if (!node && cluster) {
                cluster_locked = cluster;
                spin_lock(&cluster_locked->lock);
                node = rb_first(&cluster->root);
                cluster = NULL;
        }

        /* Write out the extent entries */
        while (node) {
                struct btrfs_free_space *e;

                e = rb_entry(node, struct btrfs_free_space, offset_index);
                *entries += 1;

                ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
                                       e->bitmap);
                if (ret)
                        goto fail;

                if (e->bitmap) {
                        list_add_tail(&e->list, bitmap_list);
                        *bitmaps += 1;
                }
                node = rb_next(node);
                if (!node && cluster) {
                        node = rb_first(&cluster->root);
                        cluster_locked = cluster;
                        spin_lock(&cluster_locked->lock);
                        cluster = NULL;
                }
        }
        if (cluster_locked) {
                spin_unlock(&cluster_locked->lock);
                cluster_locked = NULL;
        }

        /*
         * Make sure we don't miss any range that was removed from our rbtree
         * because trimming is running. Otherwise after a umount+mount (or crash
         * after committing the transaction) we would leak free space and get
         * an inconsistent free space cache report from fsck.
         */
        list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) {
                ret = io_ctl_add_entry(io_ctl, trim_entry->start,
                                       trim_entry->bytes, NULL);
                if (ret)
                        goto fail;
                *entries += 1;
        }

        return 0;
fail:
        if (cluster_locked)
                spin_unlock(&cluster_locked->lock);
        return -ENOSPC;
}

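/*
 * Update the free space header item so the cache file is recognized as
 * valid: record the entry and bitmap counts and stamp the current
 * transaction's generation.
 */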
static noinline_for_stack int
update_cache_item(struct btrfs_trans_handle *trans,
                  struct btrfs_root *root,
                  struct inode *inode,
                  struct btrfs_path *path, u64 offset,
                  int entries, int bitmaps)
{
        struct btrfs_key key;
        struct btrfs_free_space_header *header;
        struct extent_buffer *leaf;
        int ret;

        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
        key.type = 0;

        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        if (ret < 0) {
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);
                goto fail;
        }
        leaf = path->nodes[0];
        if (ret > 0) {
                struct btrfs_key found_key;
                ASSERT(path->slots[0]);
                path->slots[0]--;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
                    found_key.offset != offset) {
                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
                                         inode->i_size - 1,
                                         EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
                                         NULL);
                        btrfs_release_path(path);
                        goto fail;
                }
        }

        BTRFS_I(inode)->generation = trans->transid;
        header = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_free_space_header);
        btrfs_set_free_space_entries(leaf, header, entries);
        btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
        btrfs_set_free_space_generation(leaf, header, trans->transid);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);

        return 0;

fail:
        return -1;
}

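/*
 * Add the currently pinned extents that fall inside this block group to
 * the cache file; they become free space once the transaction commits,
 * so leaving them out would leak space after a remount.
 */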
static noinline_for_stack int
write_pinned_extent_entries(struct btrfs_fs_info *fs_info,
                            struct btrfs_block_group_cache *block_group,
                            struct btrfs_io_ctl *io_ctl,
                            int *entries)
{
        u64 start, extent_start, extent_end, len;
        struct extent_io_tree *unpin = NULL;
        int ret;

        if (!block_group)
                return 0;

        /*
         * We want to add any pinned extents to our free space cache
         * so we don't leak the space
         *
         * We shouldn't have switched the pinned extents yet so this is the
         * right one
         */
        unpin = fs_info->pinned_extents;

        start = block_group->key.objectid;

        while (start < block_group->key.objectid + block_group->key.offset) {
                ret = find_first_extent_bit(unpin, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY, NULL);
                if (ret)
                        return 0;

                /* This pinned extent is out of our range */
                if (extent_start >= block_group->key.objectid +
                    block_group->key.offset)
                        return 0;

                extent_start = max(extent_start, start);
                extent_end = min(block_group->key.objectid +
                                 block_group->key.offset, extent_end + 1);
                len = extent_end - extent_start;

                *entries += 1;
                ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
                if (ret)
                        return -ENOSPC;

                start = extent_end;
        }

        return 0;
}

static noinline_for_stack int
write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
{
        struct btrfs_free_space *entry, *next;
        int ret;

        /* Write out the bitmaps */
        list_for_each_entry_safe(entry, next, bitmap_list, list) {
                ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
                if (ret)
                        return -ENOSPC;
                list_del_init(&entry->list);
        }

        return 0;
}

static int flush_dirty_cache(struct inode *inode)
{
        int ret;

        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
        if (ret)
                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL);

        return ret;
}

static void noinline_for_stack
cleanup_bitmap_list(struct list_head *bitmap_list)
{
        struct btrfs_free_space *entry, *next;

        list_for_each_entry_safe(entry, next, bitmap_list, list)
                list_del_init(&entry->list);
}

static void noinline_for_stack
cleanup_write_cache_enospc(struct inode *inode,
                           struct btrfs_io_ctl *io_ctl,
                           struct extent_state **cached_state)
{
        io_ctl_drop_pages(io_ctl);
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
                             i_size_read(inode) - 1, cached_state);
}

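/*
 * Wait for the cache file's writeback to finish, then update the cache
 * item so the file is seen as valid, and reflect success or failure in
 * the block group's disk_cache_state.
 */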
static int __btrfs_wait_cache_io(struct btrfs_root *root,
                                 struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group_cache *block_group,
                                 struct btrfs_io_ctl *io_ctl,
                                 struct btrfs_path *path, u64 offset)
{
        int ret;
        struct inode *inode = io_ctl->inode;

        if (!inode)
                return 0;

        /* Flush the dirty pages in the cache file. */
        ret = flush_dirty_cache(inode);
        if (ret)
                goto out;

        /* Update the cache item to tell everyone this cache file is valid. */
        ret = update_cache_item(trans, root, inode, path, offset,
                                io_ctl->entries, io_ctl->bitmaps);
out:
        io_ctl_free(io_ctl);
        if (ret) {
                invalidate_inode_pages2(inode->i_mapping);
                BTRFS_I(inode)->generation = 0;
                if (block_group) {
#ifdef DEBUG
                        btrfs_err(root->fs_info,
                                  "failed to write free space cache for block group %llu",
                                  block_group->key.objectid);
#endif
                }
        }
        btrfs_update_inode(trans, root, inode);

        if (block_group) {
                /* the dirty list is protected by the dirty_bgs_lock */
                spin_lock(&trans->transaction->dirty_bgs_lock);

                /* the disk_cache_state is protected by the block group lock */
                spin_lock(&block_group->lock);

                /*
                 * Only mark this as written if we didn't get put back on
                 * the dirty list while waiting for IO.  Otherwise our
                 * cache state won't be right, and we won't get written again.
                 */
                if (!ret && list_empty(&block_group->dirty_list))
                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
                else if (ret)
                        block_group->disk_cache_state = BTRFS_DC_ERROR;

                spin_unlock(&block_group->lock);
                spin_unlock(&trans->transaction->dirty_bgs_lock);
                io_ctl->inode = NULL;
                iput(inode);
        }

        return ret;
}

static int btrfs_wait_cache_io_root(struct btrfs_root *root,
                                    struct btrfs_trans_handle *trans,
                                    struct btrfs_io_ctl *io_ctl,
                                    struct btrfs_path *path)
{
        return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0);
}

int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
                        struct btrfs_block_group_cache *block_group,
                        struct btrfs_path *path)
{
        return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans,
                                     block_group, &block_group->io_ctl,
                                     path, block_group->key.objectid);
}

/**
 * __btrfs_write_out_cache - write out cached info to an inode
 * @root - the root the inode belongs to
 * @inode - the inode that backs the free space cache file
 * @ctl - the free space cache we are going to write out
 * @block_group - the block_group for this cache if it belongs to a block_group
 * @io_ctl - the io_ctl that tracks the pages while they are under IO
 * @trans - the trans handle
 *
 * This function writes out a free space cache struct to disk for quick recovery
 * on mount.  This will return 0 if it was successful in writing the cache out,
 * or an errno if it was not.
 */
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                                   struct btrfs_free_space_ctl *ctl,
                                   struct btrfs_block_group_cache *block_group,
                                   struct btrfs_io_ctl *io_ctl,
                                   struct btrfs_trans_handle *trans)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_state *cached_state = NULL;
        LIST_HEAD(bitmap_list);
        int entries = 0;
        int bitmaps = 0;
        int ret;
        int must_iput = 0;

        if (!i_size_read(inode))
                return -EIO;

        WARN_ON(io_ctl->pages);
        ret = io_ctl_init(io_ctl, inode, 1);
        if (ret)
                return ret;

        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
                down_write(&block_group->data_rwsem);
                spin_lock(&block_group->lock);
                if (block_group->delalloc_bytes) {
                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
                        spin_unlock(&block_group->lock);
                        up_write(&block_group->data_rwsem);
                        BTRFS_I(inode)->generation = 0;
                        ret = 0;
                        must_iput = 1;
                        goto out;
                }
                spin_unlock(&block_group->lock);
        }

        /* Lock all pages first so we can lock the extent safely. */
        ret = io_ctl_prepare_pages(io_ctl, inode, 0);
        if (ret)
                goto out_unlock;

        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
                         &cached_state);

        io_ctl_set_generation(io_ctl, trans->transid);

        mutex_lock(&ctl->cache_writeout_mutex);
        /* Write out the extent entries in the free space cache */
        spin_lock(&ctl->tree_lock);
        ret = write_cache_extent_entries(io_ctl, ctl,
                                         block_group, &entries, &bitmaps,
                                         &bitmap_list);
        if (ret)
                goto out_nospc_locked;

        /*
         * Some spaces that are freed in the current transaction are pinned,
         * they will be added into free space cache after the transaction is
         * committed, we shouldn't lose them.
         *
         * If this changes while we are working we'll get added back to
         * the dirty list and redo it.  No locking needed
         */
        ret = write_pinned_extent_entries(fs_info, block_group,
                                          io_ctl, &entries);
        if (ret)
                goto out_nospc_locked;

        /*
         * At last, we write out all the bitmaps and keep cache_writeout_mutex
         * locked while doing it because a concurrent trim can be manipulating
         * or freeing the bitmap.
         */
        ret = write_bitmap_entries(io_ctl, &bitmap_list);
        spin_unlock(&ctl->tree_lock);
        mutex_unlock(&ctl->cache_writeout_mutex);
        if (ret)
                goto out_nospc;

        /* Zero out the rest of the pages just to make sure */
        io_ctl_zero_remaining_pages(io_ctl);

        /* Everything is written out, now we dirty the pages in the file. */
        ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0,
                                i_size_read(inode), &cached_state);
        if (ret)
                goto out_nospc;

        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
                up_write(&block_group->data_rwsem);
        /*
         * Release the pages and unlock the extent, we will flush
         * them out later
         */
        io_ctl_drop_pages(io_ctl);

        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
                             i_size_read(inode) - 1, &cached_state);

        /*
         * At this point the pages are under IO and we're happy.  The caller
         * is responsible for waiting on them and updating the cache and the
         * inode.
         */
        io_ctl->entries = entries;
        io_ctl->bitmaps = bitmaps;

        ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
        if (ret)
                goto out;

        return 0;

out:
        io_ctl->inode = NULL;
        io_ctl_free(io_ctl);
        if (ret) {
                invalidate_inode_pages2(inode->i_mapping);
                BTRFS_I(inode)->generation = 0;
        }
        btrfs_update_inode(trans, root, inode);
        if (must_iput)
                iput(inode);
        return ret;

out_nospc_locked:
        cleanup_bitmap_list(&bitmap_list);
        spin_unlock(&ctl->tree_lock);
        mutex_unlock(&ctl->cache_writeout_mutex);

out_nospc:
        cleanup_write_cache_enospc(inode, io_ctl, &cached_state);

out_unlock:
        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
                up_write(&block_group->data_rwsem);

        goto out;
}

int btrfs_write_out_cache(struct btrfs_fs_info *fs_info,
                          struct btrfs_trans_handle *trans,
                          struct btrfs_block_group_cache *block_group,
                          struct btrfs_path *path)
{
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct inode *inode;
        int ret = 0;

        spin_lock(&block_group->lock);
        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
                spin_unlock(&block_group->lock);
                return 0;
        }
        spin_unlock(&block_group->lock);

        inode = lookup_free_space_inode(fs_info, block_group, path);
        if (IS_ERR(inode))
                return 0;

        ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl,
                                block_group, &block_group->io_ctl, trans);
        if (ret) {
#ifdef DEBUG
                btrfs_err(fs_info,
                          "failed to write free space cache for block group %llu",
                          block_group->key.objectid);
#endif
                spin_lock(&block_group->lock);
                block_group->disk_cache_state = BTRFS_DC_ERROR;
                spin_unlock(&block_group->lock);

                block_group->io_ctl.inode = NULL;
                iput(inode);
        }

        /*
         * if ret == 0 the caller is expected to call btrfs_wait_cache_io
         * to wait for IO and put the inode
         */

        return ret;
}

static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
                                          u64 offset)
{
        ASSERT(offset >= bitmap_start);
        offset -= bitmap_start;
        return (unsigned long)(div_u64(offset, unit));
}

static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
        return (unsigned long)(div_u64(bytes, unit));
}

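/*
 * Round an offset down to the start of the bitmap that covers it, i.e.
 * the largest bitmap boundary (a multiple of BITS_PER_BITMAP * ctl->unit
 * bytes from ctl->start) that is <= offset.
 */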
static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
                                   u64 offset)
{
        u64 bitmap_start;
        u64 bytes_per_bitmap;

        bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
        bitmap_start = offset - ctl->start;
        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
        bitmap_start *= bytes_per_bitmap;
        bitmap_start += ctl->start;

        return bitmap_start;
}

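/*
 * Insert a free space entry into the offset-sorted rbtree.  An extent
 * entry and a bitmap entry may share an offset; the extent entry always
 * sorts first (see the comment in the function body for why).
 */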
1417static int tree_insert_offset(struct rb_root *root, u64 offset,
1418                              struct rb_node *node, int bitmap)
1419{
1420        struct rb_node **p = &root->rb_node;
1421        struct rb_node *parent = NULL;
1422        struct btrfs_free_space *info;
1423
1424        while (*p) {
1425                parent = *p;
1426                info = rb_entry(parent, struct btrfs_free_space, offset_index);
1427
1428                if (offset < info->offset) {
1429                        p = &(*p)->rb_left;
1430                } else if (offset > info->offset) {
1431                        p = &(*p)->rb_right;
1432                } else {
1433                        /*
1434                         * we could have a bitmap entry and an extent entry
1435                         * share the same offset.  If this is the case, we want
1436                         * the extent entry to always be found first if we do a
1437                         * linear search through the tree, since we want to have
1438                         * the quickest allocation time, and allocating from an
1439                         * extent is faster than allocating from a bitmap.  So
1440                         * if we're inserting a bitmap and we find an entry at
1441                         * this offset, we want to go right, or after this entry
1442                         * logically.  If we are inserting an extent and we've
1443                         * found a bitmap, we want to go left, or before
1444                         * logically.
1445                         */
1446                        if (bitmap) {
1447                                if (info->bitmap) {
1448                                        WARN_ON_ONCE(1);
1449                                        return -EEXIST;
1450                                }
1451                                p = &(*p)->rb_right;
1452                        } else {
1453                                if (!info->bitmap) {
1454                                        WARN_ON_ONCE(1);
1455                                        return -EEXIST;
1456                                }
1457                                p = &(*p)->rb_left;
1458                        }
1459                }
1460        }
1461
1462        rb_link_node(node, parent, p);
1463        rb_insert_color(node, root);
1464
1465        return 0;
1466}
1467
1468/*
1469 * searches the tree for the given offset.
1470 *
1471 * fuzzy - If this is set, then we are trying to make an allocation, and we just
1472 * want a section that has at least bytes size and comes at or after the given
1473 * offset.
1474 */
1475static struct btrfs_free_space *
1476tree_search_offset(struct btrfs_free_space_ctl *ctl,
1477                   u64 offset, int bitmap_only, int fuzzy)
1478{
1479        struct rb_node *n = ctl->free_space_offset.rb_node;
1480        struct btrfs_free_space *entry, *prev = NULL;
1481
1482        /* find entry that is closest to the 'offset' */
1483        while (1) {
1484                if (!n) {
1485                        entry = NULL;
1486                        break;
1487                }
1488
1489                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1490                prev = entry;
1491
1492                if (offset < entry->offset)
1493                        n = n->rb_left;
1494                else if (offset > entry->offset)
1495                        n = n->rb_right;
1496                else
1497                        break;
1498        }
1499
1500        if (bitmap_only) {
1501                if (!entry)
1502                        return NULL;
1503                if (entry->bitmap)
1504                        return entry;
1505
1506                /*
1507                 * bitmap entry and extent entry may share same offset,
1508                 * in that case, bitmap entry comes after extent entry.
1509                 */
1510                n = rb_next(n);
1511                if (!n)
1512                        return NULL;
1513                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1514                if (entry->offset != offset)
1515                        return NULL;
1516
1517                WARN_ON(!entry->bitmap);
1518                return entry;
1519        } else if (entry) {
1520                if (entry->bitmap) {
1521                        /*
1522                         * if previous extent entry covers the offset,
1523                         * we should return it instead of the bitmap entry
1524                         */
1525                        n = rb_prev(&entry->offset_index);
1526                        if (n) {
1527                                prev = rb_entry(n, struct btrfs_free_space,
1528                                                offset_index);
1529                                if (!prev->bitmap &&
1530                                    prev->offset + prev->bytes > offset)
1531                                        entry = prev;
1532                        }
1533                }
1534                return entry;
1535        }
1536
1537        if (!prev)
1538                return NULL;
1539
1540        /* find the last entry at or before 'offset' */
1541        entry = prev;
1542        if (entry->offset > offset) {
1543                n = rb_prev(&entry->offset_index);
1544                if (n) {
1545                        entry = rb_entry(n, struct btrfs_free_space,
1546                                        offset_index);
1547                        ASSERT(entry->offset <= offset);
1548                } else {
1549                        if (fuzzy)
1550                                return entry;
1551                        else
1552                                return NULL;
1553                }
1554        }
1555
1556        if (entry->bitmap) {
1557                n = rb_prev(&entry->offset_index);
1558                if (n) {
1559                        prev = rb_entry(n, struct btrfs_free_space,
1560                                        offset_index);
1561                        if (!prev->bitmap &&
1562                            prev->offset + prev->bytes > offset)
1563                                return prev;
1564                }
1565                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1566                        return entry;
1567        } else if (entry->offset + entry->bytes > offset)
1568                return entry;
1569
1570        if (!fuzzy)
1571                return NULL;
1572
1573        while (1) {
1574                if (entry->bitmap) {
1575                        if (entry->offset + BITS_PER_BITMAP *
1576                            ctl->unit > offset)
1577                                break;
1578                } else {
1579                        if (entry->offset + entry->bytes > offset)
1580                                break;
1581                }
1582
1583                n = rb_next(&entry->offset_index);
1584                if (!n)
1585                        return NULL;
1586                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1587        }
1588        return entry;
1589}
1590
1591static inline void
1592__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1593                    struct btrfs_free_space *info)
1594{
1595        rb_erase(&info->offset_index, &ctl->free_space_offset);
1596        ctl->free_extents--;
1597}
1598
1599static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1600                              struct btrfs_free_space *info)
1601{
1602        __unlink_free_space(ctl, info);
1603        ctl->free_space -= info->bytes;
1604}
1605
1606static int link_free_space(struct btrfs_free_space_ctl *ctl,
1607                           struct btrfs_free_space *info)
1608{
1609        int ret = 0;
1610
1611        ASSERT(info->bytes || info->bitmap);
1612        ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1613                                 &info->offset_index, (info->bitmap != NULL));
1614        if (ret)
1615                return ret;
1616
1617        ctl->free_space += info->bytes;
1618        ctl->free_extents++;
1619        return ret;
1620}
1621
1622static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1623{
1624        struct btrfs_block_group_cache *block_group = ctl->private;
1625        u64 max_bytes;
1626        u64 bitmap_bytes;
1627        u64 extent_bytes;
1628        u64 size = block_group->key.offset;
1629        u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1630        u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1631
1632        max_bitmaps = max_t(u64, max_bitmaps, 1);
1633
1634        ASSERT(ctl->total_bitmaps <= max_bitmaps);
1635
1636        /*
1637         * The goal is to keep the total amount of memory used per 1gb of space
1638         * at or below 32k, so we need to adjust how much memory we allow to be
1639         * used by extent based free space tracking
1640         */
1641        if (size < SZ_1G)
1642                max_bytes = MAX_CACHE_BYTES_PER_GIG;
1643        else
1644                max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
1645
1646        /*
1647         * we want to account for 1 more bitmap than what we have so we can make
1648         * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1649         * we add more bitmaps.
1650         */
1651        bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit;
1652
1653        if (bitmap_bytes >= max_bytes) {
1654                ctl->extents_thresh = 0;
1655                return;
1656        }
1657
1658        /*
1659         * Cap the extent entry threshold at half of the max bytes we can
1660         * use, or at whatever is left after the bitmaps, whichever is less.
1661         */
1662        extent_bytes = max_bytes - bitmap_bytes;
1663        extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1664
1665        ctl->extents_thresh =
1666                div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1667}
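
/*
 * A worked example of the threshold math above, assuming a 1GiB block
 * group, a 4KiB unit, no bitmaps yet, and (hypothetically) a 72-byte
 * struct btrfs_free_space on x86-64: max_bytes = 32KiB, bitmap_bytes =
 * 4KiB, extent_bytes = min(32KiB - 4KiB, 16KiB) = 16KiB, so
 * extents_thresh = 16384 / 72 = 227.  A compilable sketch:
 */
#include <stdint.h>

static uint64_t demo_extents_thresh(uint64_t size, uint64_t total_bitmaps,
                                    uint64_t unit, uint64_t entry_size)
{
        uint64_t gib = 1ULL << 30;
        uint64_t max_bytes = 32768ULL * (size < gib ? 1 : size / gib);
        uint64_t bitmap_bytes = (total_bitmaps + 1) * unit;
        uint64_t extent_bytes;

        if (bitmap_bytes >= max_bytes)
                return 0;

        extent_bytes = max_bytes - bitmap_bytes;
        if (extent_bytes > max_bytes / 2)
                extent_bytes = max_bytes / 2;
        return extent_bytes / entry_size;
}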
1668
1669static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1670                                       struct btrfs_free_space *info,
1671                                       u64 offset, u64 bytes)
1672{
1673        unsigned long start, count;
1674
1675        start = offset_to_bit(info->offset, ctl->unit, offset);
1676        count = bytes_to_bits(bytes, ctl->unit);
1677        ASSERT(start + count <= BITS_PER_BITMAP);
1678
1679        bitmap_clear(info->bitmap, start, count);
1680
1681        info->bytes -= bytes;
1682}
1683
1684static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1685                              struct btrfs_free_space *info, u64 offset,
1686                              u64 bytes)
1687{
1688        __bitmap_clear_bits(ctl, info, offset, bytes);
1689        ctl->free_space -= bytes;
1690}
1691
1692static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1693                            struct btrfs_free_space *info, u64 offset,
1694                            u64 bytes)
1695{
1696        unsigned long start, count;
1697
1698        start = offset_to_bit(info->offset, ctl->unit, offset);
1699        count = bytes_to_bits(bytes, ctl->unit);
1700        ASSERT(start + count <= BITS_PER_BITMAP);
1701
1702        bitmap_set(info->bitmap, start, count);
1703
1704        info->bytes += bytes;
1705        ctl->free_space += bytes;
1706}
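
/*
 * Sketch of the bit arithmetic used above; offset_to_bit() and
 * bytes_to_bits() (defined earlier in this file) are plain divisions by
 * ctl->unit.  With unit == 4096, an offset 24KiB past the bitmap start
 * and an 8KiB length become bits 6..7.
 */
#include <stdint.h>

static void demo_bit_range(uint64_t bitmap_start, uint64_t unit,
                           uint64_t offset, uint64_t bytes,
                           uint64_t *first_bit, uint64_t *nr_bits)
{
        *first_bit = (offset - bitmap_start) / unit;    /* offset_to_bit() */
        *nr_bits = bytes / unit;                        /* bytes_to_bits() */
}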
1707
1708/*
1709 * If we cannot find a suitable extent, use *bytes to record the size
1710 * of the largest free extent found.
1711 */
1712static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1713                         struct btrfs_free_space *bitmap_info, u64 *offset,
1714                         u64 *bytes, bool for_alloc)
1715{
1716        unsigned long found_bits = 0;
1717        unsigned long max_bits = 0;
1718        unsigned long bits, i;
1719        unsigned long next_zero;
1720        unsigned long extent_bits;
1721
1722        /*
1723         * Skip searching the bitmap if we don't have a contiguous section that
1724         * is large enough for this allocation.
1725         */
1726        if (for_alloc &&
1727            bitmap_info->max_extent_size &&
1728            bitmap_info->max_extent_size < *bytes) {
1729                *bytes = bitmap_info->max_extent_size;
1730                return -1;
1731        }
1732
1733        i = offset_to_bit(bitmap_info->offset, ctl->unit,
1734                          max_t(u64, *offset, bitmap_info->offset));
1735        bits = bytes_to_bits(*bytes, ctl->unit);
1736
1737        for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1738                if (for_alloc && bits == 1) {
1739                        found_bits = 1;
1740                        break;
1741                }
1742                next_zero = find_next_zero_bit(bitmap_info->bitmap,
1743                                               BITS_PER_BITMAP, i);
1744                extent_bits = next_zero - i;
1745                if (extent_bits >= bits) {
1746                        found_bits = extent_bits;
1747                        break;
1748                } else if (extent_bits > max_bits) {
1749                        max_bits = extent_bits;
1750                }
1751                i = next_zero;
1752        }
1753
1754        if (found_bits) {
1755                *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1756                *bytes = (u64)(found_bits) * ctl->unit;
1757                return 0;
1758        }
1759
1760        *bytes = (u64)(max_bits) * ctl->unit;
1761        bitmap_info->max_extent_size = *bytes;
1762        return -1;
1763}
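
/*
 * A condensed user-space analogue of the scan in search_bitmap(), using a
 * plain byte array instead of the kernel bitops: return the first run of
 * at least 'want' consecutive set bits, or -1 after recording the longest
 * run seen (the same role max_extent_size plays above).
 */
static long demo_find_run(const unsigned char *bm, unsigned long nbits,
                          unsigned long want, unsigned long *longest)
{
        unsigned long i, run = 0, start = 0;

        *longest = 0;
        for (i = 0; i < nbits; i++) {
                if (bm[i / 8] & (1u << (i % 8))) {
                        if (!run)
                                start = i;
                        if (++run > *longest)
                                *longest = run;
                        if (run >= want)
                                return (long)start;
                } else {
                        run = 0;
                }
        }
        return -1;
}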
1764
1765/* Cache the size of the largest free extent seen, in bytes, in *max_extent_size */
1766static struct btrfs_free_space *
1767find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1768                unsigned long align, u64 *max_extent_size)
1769{
1770        struct btrfs_free_space *entry;
1771        struct rb_node *node;
1772        u64 tmp;
1773        u64 align_off;
1774        int ret;
1775
1776        if (!ctl->free_space_offset.rb_node)
1777                goto out;
1778
1779        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1780        if (!entry)
1781                goto out;
1782
1783        for (node = &entry->offset_index; node; node = rb_next(node)) {
1784                entry = rb_entry(node, struct btrfs_free_space, offset_index);
1785                if (entry->bytes < *bytes) {
1786                        if (entry->bytes > *max_extent_size)
1787                                *max_extent_size = entry->bytes;
1788                        continue;
1789                }
1790
1791                /* make sure the space returned is big enough
1792                 * to match our requested alignment
1793                 */
1794                if (*bytes >= align) {
1795                        tmp = entry->offset - ctl->start + align - 1;
1796                        tmp = div64_u64(tmp, align);
1797                        tmp = tmp * align + ctl->start;
1798                        align_off = tmp - entry->offset;
1799                } else {
1800                        align_off = 0;
1801                        tmp = entry->offset;
1802                }
1803
1804                if (entry->bytes < *bytes + align_off) {
1805                        if (entry->bytes > *max_extent_size)
1806                                *max_extent_size = entry->bytes;
1807                        continue;
1808                }
1809
1810                if (entry->bitmap) {
1811                        u64 size = *bytes;
1812
1813                        ret = search_bitmap(ctl, entry, &tmp, &size, true);
1814                        if (!ret) {
1815                                *offset = tmp;
1816                                *bytes = size;
1817                                return entry;
1818                        } else if (size > *max_extent_size) {
1819                                *max_extent_size = size;
1820                        }
1821                        continue;
1822                }
1823
1824                *offset = tmp;
1825                *bytes = entry->bytes - align_off;
1826                return entry;
1827        }
1828out:
1829        return NULL;
1830}
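
/*
 * The alignment step in find_free_space() rounds an entry's offset up to
 * the next multiple of 'align', measured from ctl->start.  A sketch with
 * concrete numbers; demo_align_up() is illustrative, not kernel API.
 */
#include <stdint.h>

static uint64_t demo_align_up(uint64_t start, uint64_t offset, uint64_t align)
{
        uint64_t tmp = offset - start + align - 1;

        return tmp / align * align + start;
}

/* e.g. demo_align_up(0, 5000, 4096) == 8192; the 3192-byte gap is align_off */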
1831
1832static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1833                           struct btrfs_free_space *info, u64 offset)
1834{
1835        info->offset = offset_to_bitmap(ctl, offset);
1836        info->bytes = 0;
1837        INIT_LIST_HEAD(&info->list);
1838        link_free_space(ctl, info);
1839        ctl->total_bitmaps++;
1840
1841        ctl->op->recalc_thresholds(ctl);
1842}
1843
1844static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1845                        struct btrfs_free_space *bitmap_info)
1846{
1847        unlink_free_space(ctl, bitmap_info);
1848        kfree(bitmap_info->bitmap);
1849        kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1850        ctl->total_bitmaps--;
1851        ctl->op->recalc_thresholds(ctl);
1852}
1853
1854static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1855                              struct btrfs_free_space *bitmap_info,
1856                              u64 *offset, u64 *bytes)
1857{
1858        u64 end;
1859        u64 search_start, search_bytes;
1860        int ret;
1861
1862again:
1863        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1864
1865        /*
1866         * We need to search for bits in this bitmap.  This bitmap may only
1867         * cover part of the extent, thanks to how we add space, so we need
1868         * to search for as much of it as we can and clear that amount, then
1869         * go searching for the next bit.
1870         */
1871        search_start = *offset;
1872        search_bytes = ctl->unit;
1873        search_bytes = min(search_bytes, end - search_start + 1);
1874        ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1875                            false);
1876        if (ret < 0 || search_start != *offset)
1877                return -EINVAL;
1878
1879        /* We may have found more bits than what we need */
1880        search_bytes = min(search_bytes, *bytes);
1881
1882        /* Cannot clear past the end of the bitmap */
1883        search_bytes = min(search_bytes, end - search_start + 1);
1884
1885        bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1886        *offset += search_bytes;
1887        *bytes -= search_bytes;
1888
1889        if (*bytes) {
1890                struct rb_node *next = rb_next(&bitmap_info->offset_index);
1891                if (!bitmap_info->bytes)
1892                        free_bitmap(ctl, bitmap_info);
1893
1894                /*
1895                 * no entry after this bitmap, but we still have bytes to
1896                 * remove, so something has gone wrong.
1897                 */
1898                if (!next)
1899                        return -EINVAL;
1900
1901                bitmap_info = rb_entry(next, struct btrfs_free_space,
1902                                       offset_index);
1903
1904                /*
1905                 * if the next entry isn't a bitmap we need to return to let the
1906                 * extent stuff do its work.
1907                 */
1908                if (!bitmap_info->bitmap)
1909                        return -EAGAIN;
1910
1911                /*
1912                 * OK, the next item is a bitmap, but it may not actually hold
1913                 * the rest of this free space, so search it for our range; if
1914                 * we don't find it there, return -EAGAIN so everything can be
1915                 * tried over again.
1916                 */
1917                search_start = *offset;
1918                search_bytes = ctl->unit;
1919                ret = search_bitmap(ctl, bitmap_info, &search_start,
1920                                    &search_bytes, false);
1921                if (ret < 0 || search_start != *offset)
1922                        return -EAGAIN;
1923
1924                goto again;
1925        } else if (!bitmap_info->bytes)
1926                free_bitmap(ctl, bitmap_info);
1927
1928        return 0;
1929}
1930
1931static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1932                               struct btrfs_free_space *info, u64 offset,
1933                               u64 bytes)
1934{
1935        u64 bytes_to_set = 0;
1936        u64 end;
1937
1938        end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1939
1940        bytes_to_set = min(end - offset, bytes);
1941
1942        bitmap_set_bits(ctl, info, offset, bytes_to_set);
1943
1944        /*
1945         * We set some bytes, we have no idea what the max extent size is
1946         * anymore.
1947         */
1948        info->max_extent_size = 0;
1949
1950        return bytes_to_set;
1952}
1953
1954static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1955                      struct btrfs_free_space *info)
1956{
1957        struct btrfs_block_group_cache *block_group = ctl->private;
1958        struct btrfs_fs_info *fs_info = block_group->fs_info;
1959        bool forced = false;
1960
1961#ifdef CONFIG_BTRFS_DEBUG
1962        if (btrfs_should_fragment_free_space(block_group))
1963                forced = true;
1964#endif
1965
1966        /*
1967         * If we are below the extents threshold then we can add this as an
1968         * extent, and don't have to deal with the bitmap
1969         */
1970        if (!forced && ctl->free_extents < ctl->extents_thresh) {
1971                /*
1972                 * If this block group has some small extents we don't want to
1973                 * use up all of our free slots in the cache with them, we want
1974                 * to reserve them for larger extents; however, if we have plenty
1975                 * of cache left then go ahead and add them, no sense in adding
1976                 * the overhead of a bitmap if we don't have to.
1977                 */
1978                if (info->bytes <= fs_info->sectorsize * 4) {
1979                        if (ctl->free_extents * 2 <= ctl->extents_thresh)
1980                                return false;
1981                } else {
1982                        return false;
1983                }
1984        }
1985
1986        /*
1987         * The original block groups from mkfs can be really small, like 8
1988         * megabytes, so don't bother with a bitmap for those entries.  However
1989         * some block groups can be smaller than what a bitmap would cover but
1990         * are still large enough that they could overflow the 32k memory limit,
1991         * so still allow those block groups to have a bitmap
1992         * entry.
1993         */
1994        if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
1995                return false;
1996
1997        return true;
1998}
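
/*
 * The heuristic above, condensed into a sketch (not kernel API, and
 * ignoring the debug fragmentation override): small extents start going
 * to bitmaps once the extent cache is half full, everything does once
 * extents_thresh is reached, unless the group is smaller than half of
 * what a single bitmap covers.
 */
#include <stdint.h>

static int demo_use_bitmap(uint64_t free_extents, uint64_t extents_thresh,
                           uint64_t bytes, uint64_t sectorsize,
                           uint64_t group_size, uint64_t bytes_per_bitmap)
{
        if (free_extents < extents_thresh) {
                if (bytes > sectorsize * 4)
                        return 0;
                if (free_extents * 2 <= extents_thresh)
                        return 0;
        }
        return bytes_per_bitmap / 2 <= group_size;
}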
1999
2000static const struct btrfs_free_space_op free_space_op = {
2001        .recalc_thresholds      = recalculate_thresholds,
2002        .use_bitmap             = use_bitmap,
2003};
2004
2005static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
2006                              struct btrfs_free_space *info)
2007{
2008        struct btrfs_free_space *bitmap_info;
2009        struct btrfs_block_group_cache *block_group = NULL;
2010        int added = 0;
2011        u64 bytes, offset, bytes_added;
2012        int ret;
2013
2014        bytes = info->bytes;
2015        offset = info->offset;
2016
2017        if (!ctl->op->use_bitmap(ctl, info))
2018                return 0;
2019
2020        if (ctl->op == &free_space_op)
2021                block_group = ctl->private;
2022again:
2023        /*
2024         * Since we link bitmaps right into the cluster we need to see if we
2025         * have a cluster here, and if so and it has our bitmap we need to add
2026         * the free space to that bitmap.
2027         */
2028        if (block_group && !list_empty(&block_group->cluster_list)) {
2029                struct btrfs_free_cluster *cluster;
2030                struct rb_node *node;
2031                struct btrfs_free_space *entry;
2032
2033                cluster = list_entry(block_group->cluster_list.next,
2034                                     struct btrfs_free_cluster,
2035                                     block_group_list);
2036                spin_lock(&cluster->lock);
2037                node = rb_first(&cluster->root);
2038                if (!node) {
2039                        spin_unlock(&cluster->lock);
2040                        goto no_cluster_bitmap;
2041                }
2042
2043                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2044                if (!entry->bitmap) {
2045                        spin_unlock(&cluster->lock);
2046                        goto no_cluster_bitmap;
2047                }
2048
2049                if (entry->offset == offset_to_bitmap(ctl, offset)) {
2050                        bytes_added = add_bytes_to_bitmap(ctl, entry,
2051                                                          offset, bytes);
2052                        bytes -= bytes_added;
2053                        offset += bytes_added;
2054                }
2055                spin_unlock(&cluster->lock);
2056                if (!bytes) {
2057                        ret = 1;
2058                        goto out;
2059                }
2060        }
2061
2062no_cluster_bitmap:
2063        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2064                                         1, 0);
2065        if (!bitmap_info) {
2066                ASSERT(added == 0);
2067                goto new_bitmap;
2068        }
2069
2070        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
2071        bytes -= bytes_added;
2072        offset += bytes_added;
2073        added = 0;
2074
2075        if (!bytes) {
2076                ret = 1;
2077                goto out;
2078        } else
2079                goto again;
2080
2081new_bitmap:
2082        if (info && info->bitmap) {
2083                add_new_bitmap(ctl, info, offset);
2084                added = 1;
2085                info = NULL;
2086                goto again;
2087        } else {
2088                spin_unlock(&ctl->tree_lock);
2089
2090                /* no pre-allocated info, allocate a new one */
2091                if (!info) {
2092                        info = kmem_cache_zalloc(btrfs_free_space_cachep,
2093                                                 GFP_NOFS);
2094                        if (!info) {
2095                                spin_lock(&ctl->tree_lock);
2096                                ret = -ENOMEM;
2097                                goto out;
2098                        }
2099                }
2100
2101                /* allocate the bitmap */
2102                info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
2103                spin_lock(&ctl->tree_lock);
2104                if (!info->bitmap) {
2105                        ret = -ENOMEM;
2106                        goto out;
2107                }
2108                goto again;
2109        }
2110
2111out:
2112        if (info) {
2113                kfree(info->bitmap);    /* kfree(NULL) is a no-op */
2115                kmem_cache_free(btrfs_free_space_cachep, info);
2116        }
2117
2118        return ret;
2119}
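
/*
 * The effect of the 'again' loop above in miniature, assuming
 * ctl->start == 0 for brevity: a range crossing a bitmap boundary is
 * consumed in chunks of min(end - offset, bytes), exactly as
 * add_bytes_to_bitmap() clamps.  A sketch counting how many bitmaps a
 * range would touch:
 */
#include <stdint.h>

static unsigned int demo_bitmaps_touched(uint64_t offset, uint64_t bytes,
                                         uint64_t bytes_per_bitmap)
{
        unsigned int n = 0;

        while (bytes) {
                uint64_t end = (offset / bytes_per_bitmap + 1) * bytes_per_bitmap;
                uint64_t chunk = end - offset < bytes ? end - offset : bytes;

                offset += chunk;
                bytes -= chunk;
                n++;
        }
        return n;
}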
2120
2121static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
2122                          struct btrfs_free_space *info, bool update_stat)
2123{
2124        struct btrfs_free_space *left_info;
2125        struct btrfs_free_space *right_info;
2126        bool merged = false;
2127        u64 offset = info->offset;
2128        u64 bytes = info->bytes;
2129
2130        /*
2131         * First we want to see if there is free space adjacent to the range we
2132         * are adding; if there is, remove that entry and extend this one to
2133         * cover the entire range.
2134         */
2135        right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
2136        if (right_info && rb_prev(&right_info->offset_index))
2137                left_info = rb_entry(rb_prev(&right_info->offset_index),
2138                                     struct btrfs_free_space, offset_index);
2139        else
2140                left_info = tree_search_offset(ctl, offset - 1, 0, 0);
2141
2142        if (right_info && !right_info->bitmap) {
2143                if (update_stat)
2144                        unlink_free_space(ctl, right_info);
2145                else
2146                        __unlink_free_space(ctl, right_info);
2147                info->bytes += right_info->bytes;
2148                kmem_cache_free(btrfs_free_space_cachep, right_info);
2149                merged = true;
2150        }
2151
2152        if (left_info && !left_info->bitmap &&
2153            left_info->offset + left_info->bytes == offset) {
2154                if (update_stat)
2155                        unlink_free_space(ctl, left_info);
2156                else
2157                        __unlink_free_space(ctl, left_info);
2158                info->offset = left_info->offset;
2159                info->bytes += left_info->bytes;
2160                kmem_cache_free(btrfs_free_space_cachep, left_info);
2161                merged = true;
2162        }
2163
2164        return merged;
2165}
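
/*
 * Merging in miniature: a sketch (plain structs, no rbtree) of absorbing
 * the right neighbour when it starts exactly where the new range ends;
 * the left-neighbour case above is symmetric.
 */
#include <stdint.h>

struct demo_span {
        uint64_t offset;
        uint64_t bytes;
};

static int demo_merge_right(struct demo_span *info,
                            const struct demo_span *right)
{
        if (right->offset != info->offset + info->bytes)
                return 0;
        /* the caller unlinks and frees 'right', as above */
        info->bytes += right->bytes;
        return 1;
}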
2166
2167static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2168                                     struct btrfs_free_space *info,
2169                                     bool update_stat)
2170{
2171        struct btrfs_free_space *bitmap;
2172        unsigned long i;
2173        unsigned long j;
2174        const u64 end = info->offset + info->bytes;
2175        const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2176        u64 bytes;
2177
2178        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2179        if (!bitmap)
2180                return false;
2181
2182        i = offset_to_bit(bitmap->offset, ctl->unit, end);
2183        j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2184        if (j == i)
2185                return false;
2186        bytes = (j - i) * ctl->unit;
2187        info->bytes += bytes;
2188
2189        if (update_stat)
2190                bitmap_clear_bits(ctl, bitmap, end, bytes);
2191        else
2192                __bitmap_clear_bits(ctl, bitmap, end, bytes);
2193
2194        if (!bitmap->bytes)
2195                free_bitmap(ctl, bitmap);
2196
2197        return true;
2198}
2199
2200static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2201                                       struct btrfs_free_space *info,
2202                                       bool update_stat)
2203{
2204        struct btrfs_free_space *bitmap;
2205        u64 bitmap_offset;
2206        unsigned long i;
2207        unsigned long j;
2208        unsigned long prev_j;
2209        u64 bytes;
2210
2211        bitmap_offset = offset_to_bitmap(ctl, info->offset);
2212        /* If we're on a boundary, try the previous logical bitmap. */
2213        if (bitmap_offset == info->offset) {
2214                if (info->offset == 0)
2215                        return false;
2216                bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2217        }
2218
2219        bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2220        if (!bitmap)
2221                return false;
2222
2223        i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2224        j = 0;
2225        prev_j = (unsigned long)-1;
2226        for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2227                if (j > i)
2228                        break;
2229                prev_j = j;
2230        }
2231        if (prev_j == i)
2232                return false;
2233
2234        if (prev_j == (unsigned long)-1)
2235                bytes = (i + 1) * ctl->unit;
2236        else
2237                bytes = (i - prev_j) * ctl->unit;
2238
2239        info->offset -= bytes;
2240        info->bytes += bytes;
2241
2242        if (update_stat)
2243                bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2244        else
2245                __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2246
2247        if (!bitmap->bytes)
2248                free_bitmap(ctl, bitmap);
2249
2250        return true;
2251}
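
/*
 * A user-space sketch of the backward scan above, using a plain byte
 * array: count the consecutive set bits that end at bit i inclusive,
 * which is how far the extent can grow toward the front.
 */
static unsigned long demo_run_ending_at(const unsigned char *bm,
                                        unsigned long i)
{
        unsigned long n = 0;

        while (bm[i / 8] & (1u << (i % 8))) {
                n++;
                if (i == 0)
                        break;
                i--;
        }
        return n;
}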
2252
2253/*
2254 * We prefer always to allocate from extent entries, both for clustered and
2255 * non-clustered allocation requests. So when attempting to add a new extent
2256 * entry, try to see if there's adjacent free space in bitmap entries, and if
2257 * there is, migrate that space from the bitmaps to the extent.
2258 * Like this we get better chances of satisfying space allocation requests
2259 * because we attempt to satisfy them based on a single cache entry, and never
2260 * on 2 or more entries - even if the entries represent a contiguous free space
2261 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2262 * ends).
2263 */
2264static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2265                              struct btrfs_free_space *info,
2266                              bool update_stat)
2267{
2268        /*
2269         * Only work with disconnected entries, as we can change their offset,
2270         * and must be extent entries.
2271         */
2272        ASSERT(!info->bitmap);
2273        ASSERT(RB_EMPTY_NODE(&info->offset_index));
2274
2275        if (ctl->total_bitmaps > 0) {
2276                bool stole_end;
2277                bool stole_front = false;
2278
2279                stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2280                if (ctl->total_bitmaps > 0)
2281                        stole_front = steal_from_bitmap_to_front(ctl, info,
2282                                                                 update_stat);
2283
2284                if (stole_end || stole_front)
2285                        try_merge_free_space(ctl, info, update_stat);
2286        }
2287}
2288
2289int __btrfs_add_free_space(struct btrfs_fs_info *fs_info,
2290                           struct btrfs_free_space_ctl *ctl,
2291                           u64 offset, u64 bytes)
2292{
2293        struct btrfs_free_space *info;
2294        int ret = 0;
2295
2296        info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2297        if (!info)
2298                return -ENOMEM;
2299
2300        info->offset = offset;
2301        info->bytes = bytes;
2302        RB_CLEAR_NODE(&info->offset_index);
2303
2304        spin_lock(&ctl->tree_lock);
2305
2306        if (try_merge_free_space(ctl, info, true))
2307                goto link;
2308
2309        /*
2310         * There was no extent directly to the left or right of this new
2311         * extent, so we know we're going to have to allocate a new entry;
2312         * before we do that, see if we need to drop this into a bitmap.
2313         */
2314        ret = insert_into_bitmap(ctl, info);
2315        if (ret < 0) {
2316                goto out;
2317        } else if (ret) {
2318                ret = 0;
2319                goto out;
2320        }
2321link:
2322        /*
2323         * Only steal free space from adjacent bitmaps if we're sure we're not
2324         * going to add the new free space to existing bitmap entries - because
2325         * that would mean unnecessary work that would be reverted. Therefore
2326         * attempt to steal space from bitmaps if we're adding an extent entry.
2327         */
2328        steal_from_bitmap(ctl, info, true);
2329
2330        ret = link_free_space(ctl, info);
2331        if (ret)
2332                kmem_cache_free(btrfs_free_space_cachep, info);
2333out:
2334        spin_unlock(&ctl->tree_lock);
2335
2336        if (ret) {
2337                btrfs_crit(fs_info, "unable to add free space: %d", ret);
2338                ASSERT(ret != -EEXIST);
2339        }
2340
2341        return ret;
2342}
2343
2344int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2345                            u64 offset, u64 bytes)
2346{
2347        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2348        struct btrfs_free_space *info;
2349        int ret;
2350        bool re_search = false;
2351
2352        spin_lock(&ctl->tree_lock);
2353
2354again:
2355        ret = 0;
2356        if (!bytes)
2357                goto out_lock;
2358
2359        info = tree_search_offset(ctl, offset, 0, 0);
2360        if (!info) {
2361                /*
2362                 * oops didn't find an extent that matched the space we wanted
2363                 * to remove, look for a bitmap instead
2364                 */
2365                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2366                                          1, 0);
2367                if (!info) {
2368                        /*
2369                         * If we found a partial bit of our free space in a
2370                         * bitmap but then couldn't find the other part this may
2371                         * be a problem, so WARN about it.
2372                         */
2373                        WARN_ON(re_search);
2374                        goto out_lock;
2375                }
2376        }
2377
2378        re_search = false;
2379        if (!info->bitmap) {
2380                unlink_free_space(ctl, info);
2381                if (offset == info->offset) {
2382                        u64 to_free = min(bytes, info->bytes);
2383
2384                        info->bytes -= to_free;
2385                        info->offset += to_free;
2386                        if (info->bytes) {
2387                                ret = link_free_space(ctl, info);
2388                                WARN_ON(ret);
2389                        } else {
2390                                kmem_cache_free(btrfs_free_space_cachep, info);
2391                        }
2392
2393                        offset += to_free;
2394                        bytes -= to_free;
2395                        goto again;
2396                } else {
2397                        u64 old_end = info->bytes + info->offset;
2398
2399                        info->bytes = offset - info->offset;
2400                        ret = link_free_space(ctl, info);
2401                        WARN_ON(ret);
2402                        if (ret)
2403                                goto out_lock;
2404
2405                        /* Not enough bytes in this entry to satisfy us */
2406                        if (old_end < offset + bytes) {
2407                                bytes -= old_end - offset;
2408                                offset = old_end;
2409                                goto again;
2410                        } else if (old_end == offset + bytes) {
2411                                /* all done */
2412                                goto out_lock;
2413                        }
2414                        spin_unlock(&ctl->tree_lock);
2415
2416                        ret = btrfs_add_free_space(block_group, offset + bytes,
2417                                                   old_end - (offset + bytes));
2418                        WARN_ON(ret);
2419                        goto out;
2420                }
2421        }
2422
2423        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2424        if (ret == -EAGAIN) {
2425                re_search = true;
2426                goto again;
2427        }
2428out_lock:
2429        spin_unlock(&ctl->tree_lock);
2430out:
2431        return ret;
2432}
2433
2434void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2435                           u64 bytes)
2436{
2437        struct btrfs_fs_info *fs_info = block_group->fs_info;
2438        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2439        struct btrfs_free_space *info;
2440        struct rb_node *n;
2441        int count = 0;
2442
2443        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2444                info = rb_entry(n, struct btrfs_free_space, offset_index);
2445                if (info->bytes >= bytes && !block_group->ro)
2446                        count++;
2447                btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
2448                           info->offset, info->bytes,
2449                           (info->bitmap) ? "yes" : "no");
2450        }
2451        btrfs_info(fs_info, "block group has cluster?: %s",
2452                   list_empty(&block_group->cluster_list) ? "no" : "yes");
2453        btrfs_info(fs_info,
2454                   "%d free space entries at or bigger than %llu bytes", count, bytes);
2455}
2456
2457void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2458{
2459        struct btrfs_fs_info *fs_info = block_group->fs_info;
2460        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2461
2462        spin_lock_init(&ctl->tree_lock);
2463        ctl->unit = fs_info->sectorsize;
2464        ctl->start = block_group->key.objectid;
2465        ctl->private = block_group;
2466        ctl->op = &free_space_op;
2467        INIT_LIST_HEAD(&ctl->trimming_ranges);
2468        mutex_init(&ctl->cache_writeout_mutex);
2469
2470        /*
2471         * we only want to have 32k of ram per block group for keeping
2472         * track of free space, and if we pass 1/2 of that we want to
2473         * start converting things over to using bitmaps
2474         */
2475        ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
2476}
2477
2478/*
2479 * for a given cluster, put all of its extents back into the free
2480 * space cache.  If the block group passed doesn't match the block group
2481 * pointed to by the cluster, someone else raced in and freed the
2482 * cluster already.  In that case, we just return without changing anything
2483 */
2484static int
2485__btrfs_return_cluster_to_free_space(
2486                             struct btrfs_block_group_cache *block_group,
2487                             struct btrfs_free_cluster *cluster)
2488{
2489        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2490        struct btrfs_free_space *entry;
2491        struct rb_node *node;
2492
2493        spin_lock(&cluster->lock);
2494        if (cluster->block_group != block_group)
2495                goto out;
2496
2497        cluster->block_group = NULL;
2498        cluster->window_start = 0;
2499        list_del_init(&cluster->block_group_list);
2500
2501        node = rb_first(&cluster->root);
2502        while (node) {
2503                bool bitmap;
2504
2505                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2506                node = rb_next(&entry->offset_index);
2507                rb_erase(&entry->offset_index, &cluster->root);
2508                RB_CLEAR_NODE(&entry->offset_index);
2509
2510                bitmap = (entry->bitmap != NULL);
2511                if (!bitmap) {
2512                        try_merge_free_space(ctl, entry, false);
2513                        steal_from_bitmap(ctl, entry, false);
2514                }
2515                tree_insert_offset(&ctl->free_space_offset,
2516                                   entry->offset, &entry->offset_index, bitmap);
2517        }
2518        cluster->root = RB_ROOT;
2519
2520out:
2521        spin_unlock(&cluster->lock);
2522        btrfs_put_block_group(block_group);
2523        return 0;
2524}
2525
2526static void __btrfs_remove_free_space_cache_locked(
2527                                struct btrfs_free_space_ctl *ctl)
2528{
2529        struct btrfs_free_space *info;
2530        struct rb_node *node;
2531
2532        while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2533                info = rb_entry(node, struct btrfs_free_space, offset_index);
2534                if (!info->bitmap) {
2535                        unlink_free_space(ctl, info);
2536                        kmem_cache_free(btrfs_free_space_cachep, info);
2537                } else {
2538                        free_bitmap(ctl, info);
2539                }
2540
2541                cond_resched_lock(&ctl->tree_lock);
2542        }
2543}
2544
2545void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2546{
2547        spin_lock(&ctl->tree_lock);
2548        __btrfs_remove_free_space_cache_locked(ctl);
2549        spin_unlock(&ctl->tree_lock);
2550}
2551
2552void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2553{
2554        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2555        struct btrfs_free_cluster *cluster;
2556        struct list_head *head;
2557
2558        spin_lock(&ctl->tree_lock);
2559        while ((head = block_group->cluster_list.next) !=
2560               &block_group->cluster_list) {
2561                cluster = list_entry(head, struct btrfs_free_cluster,
2562                                     block_group_list);
2563
2564                WARN_ON(cluster->block_group != block_group);
2565                __btrfs_return_cluster_to_free_space(block_group, cluster);
2566
2567                cond_resched_lock(&ctl->tree_lock);
2568        }
2569        __btrfs_remove_free_space_cache_locked(ctl);
2570        spin_unlock(&ctl->tree_lock);
2572}
2573
2574u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2575                               u64 offset, u64 bytes, u64 empty_size,
2576                               u64 *max_extent_size)
2577{
2578        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2579        struct btrfs_free_space *entry = NULL;
2580        u64 bytes_search = bytes + empty_size;
2581        u64 ret = 0;
2582        u64 align_gap = 0;
2583        u64 align_gap_len = 0;
2584
2585        spin_lock(&ctl->tree_lock);
2586        entry = find_free_space(ctl, &offset, &bytes_search,
2587                                block_group->full_stripe_len, max_extent_size);
2588        if (!entry)
2589                goto out;
2590
2591        ret = offset;
2592        if (entry->bitmap) {
2593                bitmap_clear_bits(ctl, entry, offset, bytes);
2594                if (!entry->bytes)
2595                        free_bitmap(ctl, entry);
2596        } else {
2597                unlink_free_space(ctl, entry);
2598                align_gap_len = offset - entry->offset;
2599                align_gap = entry->offset;
2600
2601                entry->offset = offset + bytes;
2602                WARN_ON(entry->bytes < bytes + align_gap_len);
2603
2604                entry->bytes -= bytes + align_gap_len;
2605                if (!entry->bytes)
2606                        kmem_cache_free(btrfs_free_space_cachep, entry);
2607                else
2608                        link_free_space(ctl, entry);
2609        }
2610out:
2611        spin_unlock(&ctl->tree_lock);
2612
2613        if (align_gap_len)
2614                __btrfs_add_free_space(block_group->fs_info, ctl,
2615                                       align_gap, align_gap_len);
2616        return ret;
2617}
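
/*
 * The align_gap handling above in miniature, assuming ctl->start == 0:
 * carve an aligned allocation out of a span and hand the unaligned head
 * back to the free pool.  A hedged sketch; demo_carve() is illustrative,
 * not kernel API.
 */
#include <stdint.h>

struct demo_cut {
        uint64_t alloc_off;     /* what the caller gets */
        uint64_t gap_off;       /* head re-added as free space */
        uint64_t gap_len;
};

static struct demo_cut demo_carve(uint64_t span_off, uint64_t align)
{
        struct demo_cut c;

        c.alloc_off = (span_off + align - 1) / align * align;
        c.gap_off = span_off;
        c.gap_len = c.alloc_off - span_off;
        return c;
}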
2618
2619/*
2620 * given a cluster, put all of its extents back into the free space
2621 * cache.  If a block group is passed, this function will only free
2622 * a cluster that belongs to the passed block group.
2623 *
2624 * Otherwise, it'll get a reference on the block group pointed to by the
2625 * cluster and remove the cluster from it.
2626 */
2627int btrfs_return_cluster_to_free_space(
2628                               struct btrfs_block_group_cache *block_group,
2629                               struct btrfs_free_cluster *cluster)
2630{
2631        struct btrfs_free_space_ctl *ctl;
2632        int ret;
2633
2634        /* first, get a safe pointer to the block group */
2635        spin_lock(&cluster->lock);
2636        if (!block_group) {
2637                block_group = cluster->block_group;
2638                if (!block_group) {
2639                        spin_unlock(&cluster->lock);
2640                        return 0;
2641                }
2642        } else if (cluster->block_group != block_group) {
2643                /* someone else has already freed it, don't redo their work */
2644                spin_unlock(&cluster->lock);
2645                return 0;
2646        }
2647        atomic_inc(&block_group->count);
2648        spin_unlock(&cluster->lock);
2649
2650        ctl = block_group->free_space_ctl;
2651
2652        /* now return any extents the cluster had on it */
2653        spin_lock(&ctl->tree_lock);
2654        ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2655        spin_unlock(&ctl->tree_lock);
2656
2657        /* finally drop our ref */
2658        btrfs_put_block_group(block_group);
2659        return ret;
2660}
2661
2662static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2663                                   struct btrfs_free_cluster *cluster,
2664                                   struct btrfs_free_space *entry,
2665                                   u64 bytes, u64 min_start,
2666                                   u64 *max_extent_size)
2667{
2668        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2669        int err;
2670        u64 search_start = min_start;
2671        u64 search_bytes = bytes;
2672        u64 ret = 0;
2676
2677        err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2678        if (err) {
2679                if (search_bytes > *max_extent_size)
2680                        *max_extent_size = search_bytes;
2681                return 0;
2682        }
2683
2684        ret = search_start;
2685        __bitmap_clear_bits(ctl, entry, ret, bytes);
2686
2687        return ret;
2688}
2689
2690/*
2691 * given a cluster, try to allocate 'bytes' from it, returns 0
2692 * if it couldn't find anything suitably large, or a logical disk offset
2693 * if things worked out
2694 */
2695u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2696                             struct btrfs_free_cluster *cluster, u64 bytes,
2697                             u64 min_start, u64 *max_extent_size)
2698{
2699        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2700        struct btrfs_free_space *entry = NULL;
2701        struct rb_node *node;
2702        u64 ret = 0;
2703
2704        spin_lock(&cluster->lock);
2705        if (bytes > cluster->max_size)
2706                goto out;
2707
2708        if (cluster->block_group != block_group)
2709                goto out;
2710
2711        node = rb_first(&cluster->root);
2712        if (!node)
2713                goto out;
2714
2715        entry = rb_entry(node, struct btrfs_free_space, offset_index);
2716        while (1) {
2717                if (entry->bytes < bytes && entry->bytes > *max_extent_size)
2718                        *max_extent_size = entry->bytes;
2719
2720                if (entry->bytes < bytes ||
2721                    (!entry->bitmap && entry->offset < min_start)) {
2722                        node = rb_next(&entry->offset_index);
2723                        if (!node)
2724                                break;
2725                        entry = rb_entry(node, struct btrfs_free_space,
2726                                         offset_index);
2727                        continue;
2728                }
2729
2730                if (entry->bitmap) {
2731                        ret = btrfs_alloc_from_bitmap(block_group,
2732                                                      cluster, entry, bytes,
2733                                                      cluster->window_start,
2734                                                      max_extent_size);
2735                        if (ret == 0) {
2736                                node = rb_next(&entry->offset_index);
2737                                if (!node)
2738                                        break;
2739                                entry = rb_entry(node, struct btrfs_free_space,
2740                                                 offset_index);
2741                                continue;
2742                        }
2743                        cluster->window_start += bytes;
2744                } else {
2745                        ret = entry->offset;
2746
2747                        entry->offset += bytes;
2748                        entry->bytes -= bytes;
2749                }
2750
2751                if (entry->bytes == 0)
2752                        rb_erase(&entry->offset_index, &cluster->root);
2753                break;
2754        }
2755out:
2756        spin_unlock(&cluster->lock);
2757
2758        if (!ret)
2759                return 0;
2760
2761        spin_lock(&ctl->tree_lock);
2762
2763        ctl->free_space -= bytes;
2764        if (entry->bytes == 0) {
2765                ctl->free_extents--;
2766                if (entry->bitmap) {
2767                        kfree(entry->bitmap);
2768                        ctl->total_bitmaps--;
2769                        ctl->op->recalc_thresholds(ctl);
2770                }
2771                kmem_cache_free(btrfs_free_space_cachep, entry);
2772        }
2773
2774        spin_unlock(&ctl->tree_lock);
2775
2776        return ret;
2777}
2778
2779static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2780                                struct btrfs_free_space *entry,
2781                                struct btrfs_free_cluster *cluster,
2782                                u64 offset, u64 bytes,
2783                                u64 cont1_bytes, u64 min_bytes)
2784{
2785        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2786        unsigned long next_zero;
2787        unsigned long i;
2788        unsigned long want_bits;
2789        unsigned long min_bits;
2790        unsigned long found_bits;
2791        unsigned long max_bits = 0;
2792        unsigned long start = 0;
2793        unsigned long total_found = 0;
2794        int ret;
2795
2796        i = offset_to_bit(entry->offset, ctl->unit,
2797                          max_t(u64, offset, entry->offset));
2798        want_bits = bytes_to_bits(bytes, ctl->unit);
2799        min_bits = bytes_to_bits(min_bytes, ctl->unit);
2800
2801        /*
2802         * Don't bother looking for a cluster in this bitmap if it's heavily
2803         * fragmented.
2804         */
2805        if (entry->max_extent_size &&
2806            entry->max_extent_size < cont1_bytes)
2807                return -ENOSPC;
2808again:
2809        found_bits = 0;
2810        for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2811                next_zero = find_next_zero_bit(entry->bitmap,
2812                                               BITS_PER_BITMAP, i);
2813                if (next_zero - i >= min_bits) {
2814                        found_bits = next_zero - i;
2815                        if (found_bits > max_bits)
2816                                max_bits = found_bits;
2817                        break;
2818                }
2819                if (next_zero - i > max_bits)
2820                        max_bits = next_zero - i;
2821                i = next_zero;
2822        }
2823
2824        if (!found_bits) {
2825                entry->max_extent_size = (u64)max_bits * ctl->unit;
2826                return -ENOSPC;
2827        }
2828
2829        if (!total_found) {
2830                start = i;
2831                cluster->max_size = 0;
2832        }
2833
2834        total_found += found_bits;
2835
2836        if (cluster->max_size < found_bits * ctl->unit)
2837                cluster->max_size = found_bits * ctl->unit;
2838
2839        if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2840                i = next_zero + 1;
2841                goto again;
2842        }
2843
2844        cluster->window_start = start * ctl->unit + entry->offset;
2845        rb_erase(&entry->offset_index, &ctl->free_space_offset);
2846        ret = tree_insert_offset(&cluster->root, entry->offset,
2847                                 &entry->offset_index, 1);
2848        ASSERT(!ret); /* -EEXIST; Logic error */
2849
2850        trace_btrfs_setup_cluster(block_group, cluster,
2851                                  total_found * ctl->unit, 1);
2852        return 0;
2853}
2854
2855/*
2856 * This searches the block group for just extents to fill the cluster with.
2857 * Try to find a cluster with at least bytes total bytes, at least one
2858 * extent of cont1_bytes, and other extents of at least min_bytes.
2859 */
2860static noinline int
2861setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2862                        struct btrfs_free_cluster *cluster,
2863                        struct list_head *bitmaps, u64 offset, u64 bytes,
2864                        u64 cont1_bytes, u64 min_bytes)
2865{
2866        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2867        struct btrfs_free_space *first = NULL;
2868        struct btrfs_free_space *entry = NULL;
2869        struct btrfs_free_space *last;
2870        struct rb_node *node;
2871        u64 window_free;
2872        u64 max_extent;
2873        u64 total_size = 0;
2874
2875        entry = tree_search_offset(ctl, offset, 0, 1);
2876        if (!entry)
2877                return -ENOSPC;
2878
2879        /*
2880         * We don't want bitmaps, so just move along until we find a normal
2881         * extent entry.
2882         */
2883        while (entry->bitmap || entry->bytes < min_bytes) {
2884                if (entry->bitmap && list_empty(&entry->list))
2885                        list_add_tail(&entry->list, bitmaps);
2886                node = rb_next(&entry->offset_index);
2887                if (!node)
2888                        return -ENOSPC;
2889                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2890        }
2891
2892        window_free = entry->bytes;
2893        max_extent = entry->bytes;
2894        first = entry;
2895        last = entry;
2896
2897        for (node = rb_next(&entry->offset_index); node;
2898             node = rb_next(&entry->offset_index)) {
2899                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2900
2901                if (entry->bitmap) {
2902                        if (list_empty(&entry->list))
2903                                list_add_tail(&entry->list, bitmaps);
2904                        continue;
2905                }
2906
2907                if (entry->bytes < min_bytes)
2908                        continue;
2909
2910                last = entry;
2911                window_free += entry->bytes;
2912                if (entry->bytes > max_extent)
2913                        max_extent = entry->bytes;
2914        }
2915
2916        if (window_free < bytes || max_extent < cont1_bytes)
2917                return -ENOSPC;
2918
2919        cluster->window_start = first->offset;
2920
2921        node = &first->offset_index;
2922
2923        /*
2924         * now we've found our entries, pull them out of the free space
2925         * cache and put them into the cluster rbtree
2926         */
2927        do {
2928                int ret;
2929
2930                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2931                node = rb_next(&entry->offset_index);
2932                if (entry->bitmap || entry->bytes < min_bytes)
2933                        continue;
2934
2935                rb_erase(&entry->offset_index, &ctl->free_space_offset);
2936                ret = tree_insert_offset(&cluster->root, entry->offset,
2937                                         &entry->offset_index, 0);
2938                total_size += entry->bytes;
2939                ASSERT(!ret); /* -EEXIST; Logic error */
2940        } while (node && entry != last);
2941
2942        cluster->max_size = max_extent;
2943        trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2944        return 0;
2945}
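
/*
 * Illustrative sketch, not part of the original file: the acceptance test
 * that setup_cluster_no_bitmap() applies to its window of extents, restated
 * over a plain sorted array.  The helper name and the array representation
 * are hypothetical; in the real code the extents live in the rbtree.
 */
static bool example_window_acceptable(const u64 *sizes, int nr, u64 bytes,
                                      u64 cont1_bytes, u64 min_bytes)
{
        u64 window_free = 0, max_extent = 0;
        int i;

        for (i = 0; i < nr; i++) {
                /* Entries smaller than min_bytes don't count, as above. */
                if (sizes[i] < min_bytes)
                        continue;
                window_free += sizes[i];
                if (sizes[i] > max_extent)
                        max_extent = sizes[i];
        }
        /* Same test as above: enough total space and one big enough run. */
        return window_free >= bytes && max_extent >= cont1_bytes;
}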
2946
2947/*
2948 * This specifically looks for bitmaps that may work in the cluster, we assume
2949 * that we have already failed to find extents that will work.
2950 */
2951static noinline int
2952setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2953                     struct btrfs_free_cluster *cluster,
2954                     struct list_head *bitmaps, u64 offset, u64 bytes,
2955                     u64 cont1_bytes, u64 min_bytes)
2956{
2957        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2958        struct btrfs_free_space *entry = NULL;
2959        int ret = -ENOSPC;
2960        u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2961
2962        if (ctl->total_bitmaps == 0)
2963                return -ENOSPC;
2964
2965        /*
2966         * The bitmap that covers offset won't be in the list unless offset
2967         * is just its start offset.
2968         */
2969        if (!list_empty(bitmaps))
2970                entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2971
2972        if (!entry || entry->offset != bitmap_offset) {
2973                entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2974                if (entry && list_empty(&entry->list))
2975                        list_add(&entry->list, bitmaps);
2976        }
2977
2978        list_for_each_entry(entry, bitmaps, list) {
2979                if (entry->bytes < bytes)
2980                        continue;
2981                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2982                                           bytes, cont1_bytes, min_bytes);
2983                if (!ret)
2984                        return 0;
2985        }
2986
2987        /*
2988         * The bitmaps list has all the bitmaps that record free space
2989         * starting after offset, so no more search is required.
2990         */
2991        return -ENOSPC;
2992}
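
/*
 * Illustrative sketch, not part of the original file: the rounding used to
 * find the bitmap covering an offset, in the spirit of offset_to_bitmap(),
 * with "start" standing in for ctl->start (the block group start).  With
 * 4KiB pages and a 4KiB unit, one bitmap covers 32768 * 4KiB = 128MiB, so
 * an offset of 130MiB maps to the bitmap starting at 128MiB.
 */
static u64 example_bitmap_start(u64 offset, u64 start, u64 unit)
{
        u64 bytes_per_bitmap = BITS_PER_BITMAP * unit;

        /* Round down to a bitmap boundary, relative to the group start. */
        return start + div64_u64(offset - start, bytes_per_bitmap) *
               bytes_per_bitmap;
}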
2993
2994/*
2995 * Here we try to find a cluster of blocks in a block group.  The goal
2996 * is to find at least bytes + empty_size worth of free space.
2997 * We might not find them all in one contiguous area.
2998 *
2999 * Returns zero and sets up the cluster if things worked out, otherwise
3000 * it returns -ENOSPC.
3001 */
3002int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info,
3003                             struct btrfs_block_group_cache *block_group,
3004                             struct btrfs_free_cluster *cluster,
3005                             u64 offset, u64 bytes, u64 empty_size)
3006{
3007        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3008        struct btrfs_free_space *entry, *tmp;
3009        LIST_HEAD(bitmaps);
3010        u64 min_bytes;
3011        u64 cont1_bytes;
3012        int ret;
3013
3014        /*
3015         * Choose the minimum extent size we'll require for this
3016         * cluster.  For SSD_SPREAD, don't allow any fragmentation.
3017         * For metadata, allow allocations with smaller extents.  For
3018         * data, keep it dense.
3019         */
3020        if (btrfs_test_opt(fs_info, SSD_SPREAD)) {
3021                cont1_bytes = min_bytes = bytes + empty_size;
3022        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
3023                cont1_bytes = bytes;
3024                min_bytes = fs_info->sectorsize;
3025        } else {
3026                cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
3027                min_bytes = fs_info->sectorsize;
3028        }
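        /*
         * Worked example (illustrative): for a data block group with
         * bytes = 1MiB and empty_size = 3MiB, cont1_bytes =
         * max(1MiB, 4MiB >> 2) = 1MiB and min_bytes = sectorsize, so one
         * 1MiB contiguous run is required and the rest of the 4MiB target
         * may be made up of sector-sized pieces.  With SSD_SPREAD both
         * values become the full 4MiB, i.e. no fragmentation at all.
         */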
3029
3030        spin_lock(&ctl->tree_lock);
3031
3032        /*
3033         * If we know we don't have enough space to make a cluster, don't even
3034         * bother doing all the work to try and find one.
3035         */
3036        if (ctl->free_space < bytes) {
3037                spin_unlock(&ctl->tree_lock);
3038                return -ENOSPC;
3039        }
3040
3041        spin_lock(&cluster->lock);
3042
3043        /* someone already found a cluster, hooray */
3044        if (cluster->block_group) {
3045                ret = 0;
3046                goto out;
3047        }
3048
3049        trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
3050                                 min_bytes);
3051
3052        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
3053                                      bytes + empty_size,
3054                                      cont1_bytes, min_bytes);
3055        if (ret)
3056                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
3057                                           offset, bytes + empty_size,
3058                                           cont1_bytes, min_bytes);
3059
3060        /* Clear our temporary list */
3061        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
3062                list_del_init(&entry->list);
3063
3064        if (!ret) {
3065                atomic_inc(&block_group->count);
3066                list_add_tail(&cluster->block_group_list,
3067                              &block_group->cluster_list);
3068                cluster->block_group = block_group;
3069        } else {
3070                trace_btrfs_failed_cluster_setup(block_group);
3071        }
3072out:
3073        spin_unlock(&cluster->lock);
3074        spin_unlock(&ctl->tree_lock);
3075
3076        return ret;
3077}
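
/*
 * Illustrative sketch, not part of the original file: the rough shape of a
 * caller, modelled on how the extent allocator refills and then allocates
 * from a cluster.  Parameter names are hypothetical and error handling is
 * trimmed; a real caller also revalidates cluster->block_group.
 */
static u64 example_alloc_via_cluster(struct btrfs_fs_info *fs_info,
                                     struct btrfs_block_group_cache *bg,
                                     struct btrfs_free_cluster *cluster,
                                     u64 search_start, u64 num_bytes,
                                     u64 empty_size)
{
        u64 offset = 0;

        spin_lock(&cluster->refill_lock);
        if (!btrfs_find_space_cluster(fs_info, bg, cluster, search_start,
                                      num_bytes, empty_size)) {
                u64 max_extent_size = 0;

                /* Carve num_bytes out of the freshly set up cluster. */
                offset = btrfs_alloc_from_cluster(bg, cluster, num_bytes,
                                                  search_start,
                                                  &max_extent_size);
        }
        spin_unlock(&cluster->refill_lock);

        return offset;  /* 0 means nothing was allocated */
}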
3078
3079/*
3080 * Simple code to reset a cluster to an empty, initialized state
3081 */
3082void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3083{
3084        spin_lock_init(&cluster->lock);
3085        spin_lock_init(&cluster->refill_lock);
3086        cluster->root = RB_ROOT;
3087        cluster->max_size = 0;
3088        cluster->fragmented = false;
3089        INIT_LIST_HEAD(&cluster->block_group_list);
3090        cluster->block_group = NULL;
3091}
3092
3093static int do_trimming(struct btrfs_block_group_cache *block_group,
3094                       u64 *total_trimmed, u64 start, u64 bytes,
3095                       u64 reserved_start, u64 reserved_bytes,
3096                       struct btrfs_trim_range *trim_entry)
3097{
3098        struct btrfs_space_info *space_info = block_group->space_info;
3099        struct btrfs_fs_info *fs_info = block_group->fs_info;
3100        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3101        int ret;
3102        int update = 0;
3103        u64 trimmed = 0;
3104
3105        spin_lock(&space_info->lock);
3106        spin_lock(&block_group->lock);
3107        if (!block_group->ro) {
3108                block_group->reserved += reserved_bytes;
3109                space_info->bytes_reserved += reserved_bytes;
3110                update = 1;
3111        }
3112        spin_unlock(&block_group->lock);
3113        spin_unlock(&space_info->lock);
3114
3115        ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
3116        if (!ret)
3117                *total_trimmed += trimmed;
3118
3119        mutex_lock(&ctl->cache_writeout_mutex);
3120        btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
3121        list_del(&trim_entry->list);
3122        mutex_unlock(&ctl->cache_writeout_mutex);
3123
3124        if (update) {
3125                spin_lock(&space_info->lock);
3126                spin_lock(&block_group->lock);
3127                if (block_group->ro)
3128                        space_info->bytes_readonly += reserved_bytes;
3129                block_group->reserved -= reserved_bytes;
3130                space_info->bytes_reserved -= reserved_bytes;
3131                spin_unlock(&space_info->lock);
3132                spin_unlock(&block_group->lock);
3133        }
3134
3135        return ret;
3136}
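
/*
 * Note (illustrative): the trim_entry that the callers below add to
 * ctl->trimming_ranges, under cache_writeout_mutex, is how a concurrent
 * cache writeout learns about ranges that were unlinked from the tree while
 * a discard is in flight; do_trimming() re-adds the reserved range and
 * removes the entry once btrfs_discard_extent() returns.
 */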
3137
3138static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
3139                          u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3140{
3141        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3142        struct btrfs_free_space *entry;
3143        struct rb_node *node;
3144        int ret = 0;
3145        u64 extent_start;
3146        u64 extent_bytes;
3147        u64 bytes;
3148
3149        while (start < end) {
3150                struct btrfs_trim_range trim_entry;
3151
3152                mutex_lock(&ctl->cache_writeout_mutex);
3153                spin_lock(&ctl->tree_lock);
3154
3155                if (ctl->free_space < minlen) {
3156                        spin_unlock(&ctl->tree_lock);
3157                        mutex_unlock(&ctl->cache_writeout_mutex);
3158                        break;
3159                }
3160
3161                entry = tree_search_offset(ctl, start, 0, 1);
3162                if (!entry) {
3163                        spin_unlock(&ctl->tree_lock);
3164                        mutex_unlock(&ctl->cache_writeout_mutex);
3165                        break;
3166                }
3167
3168                /* skip bitmaps */
3169                while (entry->bitmap) {
3170                        node = rb_next(&entry->offset_index);
3171                        if (!node) {
3172                                spin_unlock(&ctl->tree_lock);
3173                                mutex_unlock(&ctl->cache_writeout_mutex);
3174                                goto out;
3175                        }
3176                        entry = rb_entry(node, struct btrfs_free_space,
3177                                         offset_index);
3178                }
3179
3180                if (entry->offset >= end) {
3181                        spin_unlock(&ctl->tree_lock);
3182                        mutex_unlock(&ctl->cache_writeout_mutex);
3183                        break;
3184                }
3185
3186                extent_start = entry->offset;
3187                extent_bytes = entry->bytes;
3188                start = max(start, extent_start);
3189                bytes = min(extent_start + extent_bytes, end) - start;
3190                if (bytes < minlen) {
3191                        spin_unlock(&ctl->tree_lock);
3192                        mutex_unlock(&ctl->cache_writeout_mutex);
3193                        goto next;
3194                }
3195
3196                unlink_free_space(ctl, entry);
3197                kmem_cache_free(btrfs_free_space_cachep, entry);
3198
3199                spin_unlock(&ctl->tree_lock);
3200                trim_entry.start = extent_start;
3201                trim_entry.bytes = extent_bytes;
3202                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3203                mutex_unlock(&ctl->cache_writeout_mutex);
3204
3205                ret = do_trimming(block_group, total_trimmed, start, bytes,
3206                                  extent_start, extent_bytes, &trim_entry);
3207                if (ret)
3208                        break;
3209next:
3210                start += bytes;
3211
3212                if (fatal_signal_pending(current)) {
3213                        ret = -ERESTARTSYS;
3214                        break;
3215                }
3216
3217                cond_resched();
3218        }
3219out:
3220        return ret;
3221}
3222
3223static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3224                        u64 *total_trimmed, u64 start, u64 end, u64 minlen)
3225{
3226        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
3227        struct btrfs_free_space *entry;
3228        int ret = 0;
3229        int ret2;
3230        u64 bytes;
3231        u64 offset = offset_to_bitmap(ctl, start);
3232
3233        while (offset < end) {
3234                bool next_bitmap = false;
3235                struct btrfs_trim_range trim_entry;
3236
3237                mutex_lock(&ctl->cache_writeout_mutex);
3238                spin_lock(&ctl->tree_lock);
3239
3240                if (ctl->free_space < minlen) {
3241                        spin_unlock(&ctl->tree_lock);
3242                        mutex_unlock(&ctl->cache_writeout_mutex);
3243                        break;
3244                }
3245
3246                entry = tree_search_offset(ctl, offset, 1, 0);
3247                if (!entry) {
3248                        spin_unlock(&ctl->tree_lock);
3249                        mutex_unlock(&ctl->cache_writeout_mutex);
3250                        next_bitmap = true;
3251                        goto next;
3252                }
3253
3254                bytes = minlen;
3255                ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3256                if (ret2 || start >= end) {
3257                        spin_unlock(&ctl->tree_lock);
3258                        mutex_unlock(&ctl->cache_writeout_mutex);
3259                        next_bitmap = true;
3260                        goto next;
3261                }
3262
3263                bytes = min(bytes, end - start);
3264                if (bytes < minlen) {
3265                        spin_unlock(&ctl->tree_lock);
3266                        mutex_unlock(&ctl->cache_writeout_mutex);
3267                        goto next;
3268                }
3269
3270                bitmap_clear_bits(ctl, entry, start, bytes);
3271                if (entry->bytes == 0)
3272                        free_bitmap(ctl, entry);
3273
3274                spin_unlock(&ctl->tree_lock);
3275                trim_entry.start = start;
3276                trim_entry.bytes = bytes;
3277                list_add_tail(&trim_entry.list, &ctl->trimming_ranges);
3278                mutex_unlock(&ctl->cache_writeout_mutex);
3279
3280                ret = do_trimming(block_group, total_trimmed, start, bytes,
3281                                  start, bytes, &trim_entry);
3282                if (ret)
3283                        break;
3284next:
3285                if (next_bitmap) {
3286                        offset += BITS_PER_BITMAP * ctl->unit;
3287                } else {
3288                        start += bytes;
3289                        if (start >= offset + BITS_PER_BITMAP * ctl->unit)
3290                                offset += BITS_PER_BITMAP * ctl->unit;
3291                }
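                /*
                 * Worked example (illustrative): with 4KiB pages and a 4KiB
                 * unit, BITS_PER_BITMAP * ctl->unit = 32768 * 4KiB = 128MiB,
                 * so each step here advances to the bitmap covering the next
                 * 128MiB of the block group.
                 */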
3292
3293                if (fatal_signal_pending(current)) {
3294                        ret = -ERESTARTSYS;
3295                        break;
3296                }
3297
3298                cond_resched();
3299        }
3300
3301        return ret;
3302}
3303
3304void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
3305{
3306        atomic_inc(&cache->trimming);
3307}
3308
3309void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
3310{
3311        struct btrfs_fs_info *fs_info = block_group->fs_info;
3312        struct extent_map_tree *em_tree;
3313        struct extent_map *em;
3314        bool cleanup;
3315
3316        spin_lock(&block_group->lock);
3317        cleanup = (atomic_dec_and_test(&block_group->trimming) &&
3318                   block_group->removed);
3319        spin_unlock(&block_group->lock);
3320
3321        if (cleanup) {
3322                mutex_lock(&fs_info->chunk_mutex);
3323                em_tree = &fs_info->mapping_tree.map_tree;
3324                write_lock(&em_tree->lock);
3325                em = lookup_extent_mapping(em_tree, block_group->key.objectid,
3326                                           1);
3327                BUG_ON(!em); /* logic error, can't happen */
3328                /*
3329                 * remove_extent_mapping() will delete us from the pinned_chunks
3330                 * list, which is protected by the chunk mutex.
3331                 */
3332                remove_extent_mapping(em_tree, em);
3333                write_unlock(&em_tree->lock);
3334                mutex_unlock(&fs_info->chunk_mutex);
3335
3336                /* once for us and once for the tree */
3337                free_extent_map(em);
3338                free_extent_map(em);
3339
3340                /*
3341                 * We've left one free space entry, and each of the other
3342                 * tasks trimming this block group has left one too. Free them.
3343                 */
3344                __btrfs_remove_free_space_cache(block_group->free_space_ctl);
3345        }
3346}
3347
3348int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
3349                           u64 *trimmed, u64 start, u64 end, u64 minlen)
3350{
3351        int ret;
3352
3353        *trimmed = 0;
3354
3355        spin_lock(&block_group->lock);
3356        if (block_group->removed) {
3357                spin_unlock(&block_group->lock);
3358                return 0;
3359        }
3360        btrfs_get_block_group_trimming(block_group);
3361        spin_unlock(&block_group->lock);
3362
3363        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
3364        if (ret)
3365                goto out;
3366
3367        ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
3368out:
3369        btrfs_put_block_group_trimming(block_group);
3370        return ret;
3371}
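
/*
 * Illustrative sketch, not part of the original file: this path is normally
 * driven from userspace through the FITRIM ioctl (as fstrim(8) does), e.g.:
 *
 *	#include <stdio.h>
 *	#include <fcntl.h>
 *	#include <limits.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int main(void)
 *	{
 *		struct fstrim_range range = { .start = 0,
 *					      .len = ULLONG_MAX,
 *					      .minlen = 0 };
 *		int fd = open("/mnt/btrfs", O_RDONLY);
 *
 *		if (fd >= 0 && ioctl(fd, FITRIM, &range) == 0)
 *			printf("trimmed %llu bytes\n",
 *			       (unsigned long long)range.len);
 *		return 0;
 *	}
 *
 * The kernel writes the number of discarded bytes back into range.len.
 */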
3372
3373/*
3374 * Find the left-most item in the cache tree, and then return the
3375 * smallest inode number in the item.
3376 *
3377 * Note: the returned inode number may not be the smallest one in
3378 * the tree, if the left-most item is a bitmap.
3379 */
3380u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3381{
3382        struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
3383        struct btrfs_free_space *entry = NULL;
3384        u64 ino = 0;
3385
3386        spin_lock(&ctl->tree_lock);
3387
3388        if (RB_EMPTY_ROOT(&ctl->free_space_offset))
3389                goto out;
3390
3391        entry = rb_entry(rb_first(&ctl->free_space_offset),
3392                         struct btrfs_free_space, offset_index);
3393
3394        if (!entry->bitmap) {
3395                ino = entry->offset;
3396
3397                unlink_free_space(ctl, entry);
3398                entry->offset++;
3399                entry->bytes--;
3400                if (!entry->bytes)
3401                        kmem_cache_free(btrfs_free_space_cachep, entry);
3402                else
3403                        link_free_space(ctl, entry);
3404        } else {
3405                u64 offset = 0;
3406                u64 count = 1;
3407                int ret;
3408
3409                ret = search_bitmap(ctl, entry, &offset, &count, true);
3410                /* Logic error; should be empty if it can't find anything */
3411                ASSERT(!ret);
3412
3413                ino = offset;
3414                bitmap_clear_bits(ctl, entry, offset, 1);
3415                if (entry->bytes == 0)
3416                        free_bitmap(ctl, entry);
3417        }
3418out:
3419        spin_unlock(&ctl->tree_lock);
3420
3421        return ino;
3422}
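
/*
 * Worked example (illustrative): if the left-most entry is a plain extent
 * with offset 256 and bytes 10, the function above returns ino 256 and
 * relinks the entry as offset 257 / bytes 9.  If the left-most entry is a
 * bitmap, its first set bit wins even when a later extent entry holds a
 * smaller ino, which is the caveat in the comment above.
 */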
3423
3424struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3425                                    struct btrfs_path *path)
3426{
3427        struct inode *inode = NULL;
3428
3429        spin_lock(&root->ino_cache_lock);
3430        if (root->ino_cache_inode)
3431                inode = igrab(root->ino_cache_inode);
3432        spin_unlock(&root->ino_cache_lock);
3433        if (inode)
3434                return inode;
3435
3436        inode = __lookup_free_space_inode(root, path, 0);
3437        if (IS_ERR(inode))
3438                return inode;
3439
3440        spin_lock(&root->ino_cache_lock);
3441        if (!btrfs_fs_closing(root->fs_info))
3442                root->ino_cache_inode = igrab(inode);
3443        spin_unlock(&root->ino_cache_lock);
3444
3445        return inode;
3446}
3447
3448int create_free_ino_inode(struct btrfs_root *root,
3449                          struct btrfs_trans_handle *trans,
3450                          struct btrfs_path *path)
3451{
3452        return __create_free_space_inode(root, trans, path,
3453                                         BTRFS_FREE_INO_OBJECTID, 0);
3454}
3455
3456int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3457{
3458        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3459        struct btrfs_path *path;
3460        struct inode *inode;
3461        int ret = 0;
3462        u64 root_gen = btrfs_root_generation(&root->root_item);
3463
3464        if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3465                return 0;
3466
3467        /*
3468         * If we're unmounting then just return, since this searches the
3469         * normal root and not the commit root, and we could deadlock.
3470         */
3471        if (btrfs_fs_closing(fs_info))
3472                return 0;
3473
3474        path = btrfs_alloc_path();
3475        if (!path)
3476                return 0;
3477
3478        inode = lookup_free_ino_inode(root, path);
3479        if (IS_ERR(inode))
3480                goto out;
3481
3482        if (root_gen != BTRFS_I(inode)->generation)
3483                goto out_put;
3484
3485        ret = __load_free_space_cache(root, inode, ctl, path, 0);
3486
3487        if (ret < 0)
3488                btrfs_err(fs_info,
3489                        "failed to load free ino cache for root %llu",
3490                        root->root_key.objectid);
3491out_put:
3492        iput(inode);
3493out:
3494        btrfs_free_path(path);
3495        return ret;
3496}
3497
3498int btrfs_write_out_ino_cache(struct btrfs_root *root,
3499                              struct btrfs_trans_handle *trans,
3500                              struct btrfs_path *path,
3501                              struct inode *inode)
3502{
3503        struct btrfs_fs_info *fs_info = root->fs_info;
3504        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3505        int ret;
3506        struct btrfs_io_ctl io_ctl;
3507        bool release_metadata = true;
3508
3509        if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE))
3510                return 0;
3511
3512        memset(&io_ctl, 0, sizeof(io_ctl));
3513        ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, trans);
3514        if (!ret) {
3515                /*
3516                 * At this point writepages() didn't error out, so our metadata
3517                 * reservation is released when the writeback finishes, at
3518                 * inode.c:btrfs_finish_ordered_io(), regardless of whether it
3519                 * finishes with an error or not.
3520                 */
3521                release_metadata = false;
3522                ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path);
3523        }
3524
3525        if (ret) {
3526                if (release_metadata)
3527                        btrfs_delalloc_release_metadata(BTRFS_I(inode),
3528                                        inode->i_size, true);
3529#ifdef DEBUG
3530                btrfs_err(fs_info,
3531                          "failed to write free ino cache for root %llu",
3532                          root->root_key.objectid);
3533#endif
3534        }
3535
3536        return ret;
3537}
3538
3539#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3540/*
3541 * Use this if you need to make a bitmap or extent entry specifically.  It
3542 * doesn't do any of the merging that add_free_space() does; it acts a lot like
3543 * how the free space cache loading works, so you can get really weird
3544 * configurations.
3545 */
3546int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
3547                              u64 offset, u64 bytes, bool bitmap)
3548{
3549        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3550        struct btrfs_free_space *info = NULL, *bitmap_info;
3551        void *map = NULL;
3552        u64 bytes_added;
3553        int ret;
3554
3555again:
3556        if (!info) {
3557                info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3558                if (!info)
3559                        return -ENOMEM;
3560        }
3561
3562        if (!bitmap) {
3563                spin_lock(&ctl->tree_lock);
3564                info->offset = offset;
3565                info->bytes = bytes;
3566                info->max_extent_size = 0;
3567                ret = link_free_space(ctl, info);
3568                spin_unlock(&ctl->tree_lock);
3569                if (ret)
3570                        kmem_cache_free(btrfs_free_space_cachep, info);
3571                return ret;
3572        }
3573
3574        if (!map) {
3575                map = kzalloc(PAGE_SIZE, GFP_NOFS);
3576                if (!map) {
3577                        kmem_cache_free(btrfs_free_space_cachep, info);
3578                        return -ENOMEM;
3579                }
3580        }
3581
3582        spin_lock(&ctl->tree_lock);
3583        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3584                                         1, 0);
3585        if (!bitmap_info) {
3586                info->bitmap = map;
3587                map = NULL;
3588                add_new_bitmap(ctl, info, offset);
3589                bitmap_info = info;
3590                info = NULL;
3591        }
3592
3593        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3594
3595        bytes -= bytes_added;
3596        offset += bytes_added;
3597        spin_unlock(&ctl->tree_lock);
3598
3599        if (bytes)
3600                goto again;
3601
3602        if (info)
3603                kmem_cache_free(btrfs_free_space_cachep, info);
3604        /* kfree() is a no-op on NULL, no need to check */
3605        kfree(map);
3606        return 0;
3607}
3608
3609/*
3610 * Checks whether the given range is in the free space cache.  This is really
3611 * just used to check for the absence of space, so if there is any free space
3612 * in the range at all we will return 1.
3613 */
3614int test_check_exists(struct btrfs_block_group_cache *cache,
3615                      u64 offset, u64 bytes)
3616{
3617        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3618        struct btrfs_free_space *info;
3619        int ret = 0;
3620
3621        spin_lock(&ctl->tree_lock);
3622        info = tree_search_offset(ctl, offset, 0, 0);
3623        if (!info) {
3624                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3625                                          1, 0);
3626                if (!info)
3627                        goto out;
3628        }
3629
3630have_info:
3631        if (info->bitmap) {
3632                u64 bit_off, bit_bytes;
3633                struct rb_node *n;
3634                struct btrfs_free_space *tmp;
3635
3636                bit_off = offset;
3637                bit_bytes = ctl->unit;
3638                ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3639                if (!ret) {
3640                        if (bit_off == offset) {
3641                                ret = 1;
3642                                goto out;
3643                        } else if (bit_off > offset &&
3644                                   offset + bytes > bit_off) {
3645                                ret = 1;
3646                                goto out;
3647                        }
3648                }
3649
3650                n = rb_prev(&info->offset_index);
3651                while (n) {
3652                        tmp = rb_entry(n, struct btrfs_free_space,
3653                                       offset_index);
3654                        if (tmp->offset + tmp->bytes < offset)
3655                                break;
3656                        if (offset + bytes < tmp->offset) {
3657                                n = rb_prev(&tmp->offset_index);
3658                                continue;
3659                        }
3660                        info = tmp;
3661                        goto have_info;
3662                }
3663
3664                n = rb_next(&info->offset_index);
3665                while (n) {
3666                        tmp = rb_entry(n, struct btrfs_free_space,
3667                                       offset_index);
3668                        if (offset + bytes < tmp->offset)
3669                                break;
3670                        if (tmp->offset + tmp->bytes < offset) {
3671                                n = rb_next(&tmp->offset_index);
3672                                continue;
3673                        }
3674                        info = tmp;
3675                        goto have_info;
3676                }
3677
3678                ret = 0;
3679                goto out;
3680        }
3681
3682        if (info->offset == offset) {
3683                ret = 1;
3684                goto out;
3685        }
3686
3687        if (offset > info->offset && offset < info->offset + info->bytes)
3688                ret = 1;
3689out:
3690        spin_unlock(&ctl->tree_lock);
3691        return ret;
3692}
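
/*
 * Illustrative sketch, not part of the original file: how the sanity tests
 * can combine the two helpers above to build a deliberately unmerged layout
 * and probe it.  Offsets and sizes are arbitrary example values.
 */
static int example_weird_layout(struct btrfs_block_group_cache *cache)
{
        int ret;

        /* A plain 4MiB extent entry at offset 0, no merging applied. */
        ret = test_add_free_space_entry(cache, 0, SZ_4M, false);
        if (ret)
                return ret;

        /* A bitmap entry recording 1MiB immediately after it. */
        ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, true);
        if (ret)
                return ret;

        /* Both pieces should now show up as free space. */
        return test_check_exists(cache, 0, SZ_4M + SZ_1M) ? 0 : -EINVAL;
}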
3693#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3694