linux/fs/btrfs/free-space-cache.c
   1/*
   2 * Copyright (C) 2008 Red Hat.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/pagemap.h>
  20#include <linux/sched.h>
  21#include <linux/slab.h>
  22#include <linux/math64.h>
  23#include <linux/ratelimit.h>
  24#include "ctree.h"
  25#include "free-space-cache.h"
  26#include "transaction.h"
  27#include "disk-io.h"
  28#include "extent_io.h"
  29#include "inode-map.h"
  30
  31#define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
  32#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
  33
  34static int link_free_space(struct btrfs_free_space_ctl *ctl,
  35                           struct btrfs_free_space *info);
  36static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
  37                              struct btrfs_free_space *info);
  38
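/*
 * Look up the free space cache inode for the block group that starts at
 * @offset: find the FREE_SPACE_OBJECTID item keyed by that offset, read the
 * inode key out of the free space header and iget() the inode.
 */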
  39static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
  40                                               struct btrfs_path *path,
  41                                               u64 offset)
  42{
  43        struct btrfs_key key;
  44        struct btrfs_key location;
  45        struct btrfs_disk_key disk_key;
  46        struct btrfs_free_space_header *header;
  47        struct extent_buffer *leaf;
  48        struct inode *inode = NULL;
  49        int ret;
  50
  51        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
  52        key.offset = offset;
  53        key.type = 0;
  54
  55        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  56        if (ret < 0)
  57                return ERR_PTR(ret);
  58        if (ret > 0) {
  59                btrfs_release_path(path);
  60                return ERR_PTR(-ENOENT);
  61        }
  62
  63        leaf = path->nodes[0];
  64        header = btrfs_item_ptr(leaf, path->slots[0],
  65                                struct btrfs_free_space_header);
  66        btrfs_free_space_key(leaf, header, &disk_key);
  67        btrfs_disk_key_to_cpu(&location, &disk_key);
  68        btrfs_release_path(path);
  69
  70        inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
  71        if (!inode)
  72                return ERR_PTR(-ENOENT);
  73        if (IS_ERR(inode))
  74                return inode;
  75        if (is_bad_inode(inode)) {
  76                iput(inode);
  77                return ERR_PTR(-ENOENT);
  78        }
  79
  80        mapping_set_gfp_mask(inode->i_mapping,
  81                        mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
  82
  83        return inode;
  84}
  85
  86struct inode *lookup_free_space_inode(struct btrfs_root *root,
  87                                      struct btrfs_block_group_cache
  88                                      *block_group, struct btrfs_path *path)
  89{
  90        struct inode *inode = NULL;
  91        u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
  92
  93        spin_lock(&block_group->lock);
  94        if (block_group->inode)
  95                inode = igrab(block_group->inode);
  96        spin_unlock(&block_group->lock);
  97        if (inode)
  98                return inode;
  99
 100        inode = __lookup_free_space_inode(root, path,
 101                                          block_group->key.objectid);
 102        if (IS_ERR(inode))
 103                return inode;
 104
 105        spin_lock(&block_group->lock);
 106        if (!((BTRFS_I(inode)->flags & flags) == flags)) {
 107                btrfs_info(root->fs_info,
 108                        "Old style space inode found, converting.");
 109                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
 110                        BTRFS_INODE_NODATACOW;
 111                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 112        }
 113
 114        if (!block_group->iref) {
 115                block_group->inode = igrab(inode);
 116                block_group->iref = 1;
 117        }
 118        spin_unlock(&block_group->lock);
 119
 120        return inode;
 121}
 122
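/*
 * Create the inode item and the free space header item for a new cache file.
 * Block group caches get NODATASUM/NODATACOW because their crcs are inlined
 * in the cache pages; the free-inode cache (BTRFS_FREE_INO_OBJECTID) keeps
 * normal checksumming.
 */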
 123static int __create_free_space_inode(struct btrfs_root *root,
 124                                     struct btrfs_trans_handle *trans,
 125                                     struct btrfs_path *path,
 126                                     u64 ino, u64 offset)
 127{
 128        struct btrfs_key key;
 129        struct btrfs_disk_key disk_key;
 130        struct btrfs_free_space_header *header;
 131        struct btrfs_inode_item *inode_item;
 132        struct extent_buffer *leaf;
 133        u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
 134        int ret;
 135
 136        ret = btrfs_insert_empty_inode(trans, root, path, ino);
 137        if (ret)
 138                return ret;
 139
 140        /* We inline crc's for the free disk space cache */
 141        if (ino != BTRFS_FREE_INO_OBJECTID)
 142                flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 143
 144        leaf = path->nodes[0];
 145        inode_item = btrfs_item_ptr(leaf, path->slots[0],
 146                                    struct btrfs_inode_item);
 147        btrfs_item_key(leaf, &disk_key, path->slots[0]);
 148        memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
 149                             sizeof(*inode_item));
 150        btrfs_set_inode_generation(leaf, inode_item, trans->transid);
 151        btrfs_set_inode_size(leaf, inode_item, 0);
 152        btrfs_set_inode_nbytes(leaf, inode_item, 0);
 153        btrfs_set_inode_uid(leaf, inode_item, 0);
 154        btrfs_set_inode_gid(leaf, inode_item, 0);
 155        btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
 156        btrfs_set_inode_flags(leaf, inode_item, flags);
 157        btrfs_set_inode_nlink(leaf, inode_item, 1);
 158        btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 159        btrfs_set_inode_block_group(leaf, inode_item, offset);
 160        btrfs_mark_buffer_dirty(leaf);
 161        btrfs_release_path(path);
 162
 163        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 164        key.offset = offset;
 165        key.type = 0;
 166
 167        ret = btrfs_insert_empty_item(trans, root, path, &key,
 168                                      sizeof(struct btrfs_free_space_header));
 169        if (ret < 0) {
 170                btrfs_release_path(path);
 171                return ret;
 172        }
 173        leaf = path->nodes[0];
 174        header = btrfs_item_ptr(leaf, path->slots[0],
 175                                struct btrfs_free_space_header);
 176        memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
 177        btrfs_set_free_space_key(leaf, header, &disk_key);
 178        btrfs_mark_buffer_dirty(leaf);
 179        btrfs_release_path(path);
 180
 181        return 0;
 182}
 183
 184int create_free_space_inode(struct btrfs_root *root,
 185                            struct btrfs_trans_handle *trans,
 186                            struct btrfs_block_group_cache *block_group,
 187                            struct btrfs_path *path)
 188{
 189        int ret;
 190        u64 ino;
 191
 192        ret = btrfs_find_free_objectid(root, &ino);
 193        if (ret < 0)
 194                return ret;
 195
 196        return __create_free_space_inode(root, trans, path, ino,
 197                                         block_group->key.objectid);
 198}
 199
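/*
 * Check whether the given block reservation still holds enough metadata
 * space to truncate the cache inode and update it; -ENOSPC otherwise.
 */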
 200int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
 201                                       struct btrfs_block_rsv *rsv)
 202{
 203        u64 needed_bytes;
 204        int ret;
 205
 206        /* 1 for slack space, 1 for updating the inode */
 207        needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
 208                btrfs_calc_trans_metadata_size(root, 1);
 209
 210        spin_lock(&rsv->lock);
 211        if (rsv->reserved < needed_bytes)
 212                ret = -ENOSPC;
 213        else
 214                ret = 0;
 215        spin_unlock(&rsv->lock);
 216        return ret;
 217}
 218
 219int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 220                                    struct btrfs_trans_handle *trans,
 221                                    struct inode *inode)
 222{
 223        int ret = 0;
 224
 225        btrfs_i_size_write(inode, 0);
 226        truncate_pagecache(inode, 0);
 227
 228        /*
 229         * We don't need an orphan item because truncating the free space cache
 230         * will never be split across transactions.
 231         */
 232        ret = btrfs_truncate_inode_items(trans, root, inode,
 233                                         0, BTRFS_EXTENT_DATA_KEY);
 234        if (ret) {
 235                btrfs_abort_transaction(trans, root, ret);
 236                return ret;
 237        }
 238
 239        ret = btrfs_update_inode(trans, root, inode);
 240        if (ret)
 241                btrfs_abort_transaction(trans, root, ret);
 242
 243        return ret;
 244}
 245
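/* Kick off readahead over the whole cache file before we start reading pages. */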
 246static int readahead_cache(struct inode *inode)
 247{
 248        struct file_ra_state *ra;
 249        unsigned long last_index;
 250
 251        ra = kzalloc(sizeof(*ra), GFP_NOFS);
 252        if (!ra)
 253                return -ENOMEM;
 254
 255        file_ra_state_init(ra, inode->i_mapping);
 256        last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 257
 258        page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
 259
 260        kfree(ra);
 261
 262        return 0;
 263}
 264
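/*
 * io_ctl tracks the cache file's page cache pages while the cache is being
 * written out or parsed: the currently kmapped page ('cur'/'orig'), how many
 * bytes are left in it ('size'), the next page index, and whether per-page
 * crcs are stored at the front of the first page ('check_crcs').
 */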
 265struct io_ctl {
 266        void *cur, *orig;
 267        struct page *page;
 268        struct page **pages;
 269        struct btrfs_root *root;
 270        unsigned long size;
 271        int index;
 272        int num_pages;
 273        unsigned check_crcs:1;
 274};
 275
 276static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
 277                       struct btrfs_root *root, int write)
 278{
 279        int num_pages;
 280        int check_crcs = 0;
 281
 282        num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
 283                    PAGE_CACHE_SHIFT;
 284
 285        if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
 286                check_crcs = 1;
 287
 288        /* Make sure we can fit our crcs into the first page */
 289        if (write && check_crcs &&
 290            (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
 291                return -ENOSPC;
 292
 293        memset(io_ctl, 0, sizeof(struct io_ctl));
 294
 295        io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
 296        if (!io_ctl->pages)
 297                return -ENOMEM;
 298
 299        io_ctl->num_pages = num_pages;
 300        io_ctl->root = root;
 301        io_ctl->check_crcs = check_crcs;
 302
 303        return 0;
 304}
 305
 306static void io_ctl_free(struct io_ctl *io_ctl)
 307{
 308        kfree(io_ctl->pages);
 309}
 310
 311static void io_ctl_unmap_page(struct io_ctl *io_ctl)
 312{
 313        if (io_ctl->cur) {
 314                kunmap(io_ctl->page);
 315                io_ctl->cur = NULL;
 316                io_ctl->orig = NULL;
 317        }
 318}
 319
 320static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
 321{
 322        ASSERT(io_ctl->index < io_ctl->num_pages);
 323        io_ctl->page = io_ctl->pages[io_ctl->index++];
 324        io_ctl->cur = kmap(io_ctl->page);
 325        io_ctl->orig = io_ctl->cur;
 326        io_ctl->size = PAGE_CACHE_SIZE;
 327        if (clear)
 328                memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
 329}
 330
 331static void io_ctl_drop_pages(struct io_ctl *io_ctl)
 332{
 333        int i;
 334
 335        io_ctl_unmap_page(io_ctl);
 336
 337        for (i = 0; i < io_ctl->num_pages; i++) {
 338                if (io_ctl->pages[i]) {
 339                        ClearPageChecked(io_ctl->pages[i]);
 340                        unlock_page(io_ctl->pages[i]);
 341                        page_cache_release(io_ctl->pages[i]);
 342                }
 343        }
 344}
 345
 346static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
 347                                int uptodate)
 348{
 349        struct page *page;
 350        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 351        int i;
 352
 353        for (i = 0; i < io_ctl->num_pages; i++) {
 354                page = find_or_create_page(inode->i_mapping, i, mask);
 355                if (!page) {
 356                        io_ctl_drop_pages(io_ctl);
 357                        return -ENOMEM;
 358                }
 359                io_ctl->pages[i] = page;
 360                if (uptodate && !PageUptodate(page)) {
 361                        btrfs_readpage(NULL, page);
 362                        lock_page(page);
 363                        if (!PageUptodate(page)) {
 364                                btrfs_err(BTRFS_I(inode)->root->fs_info,
 365                                           "error reading free space cache");
 366                                io_ctl_drop_pages(io_ctl);
 367                                return -EIO;
 368                        }
 369                }
 370        }
 371
 372        for (i = 0; i < io_ctl->num_pages; i++) {
 373                clear_page_dirty_for_io(io_ctl->pages[i]);
 374                set_page_extent_mapped(io_ctl->pages[i]);
 375        }
 376
 377        return 0;
 378}
 379
 380static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
 381{
 382        __le64 *val;
 383
 384        io_ctl_map_page(io_ctl, 1);
 385
 386        /*
 387         * Skip the csum areas.  If we don't check crcs then we just have a
 388         * 64bit chunk at the front of the first page.
 389         */
 390        if (io_ctl->check_crcs) {
 391                io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
 392                io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
 393        } else {
 394                io_ctl->cur += sizeof(u64);
 395                io_ctl->size -= sizeof(u64) * 2;
 396        }
 397
 398        val = io_ctl->cur;
 399        *val = cpu_to_le64(generation);
 400        io_ctl->cur += sizeof(u64);
 401}
 402
 403static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 404{
 405        __le64 *gen;
 406
 407        /*
 408         * Skip the crc area.  If we don't check crcs then we just have a 64bit
 409         * chunk at the front of the first page.
 410         */
 411        if (io_ctl->check_crcs) {
 412                io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
 413                io_ctl->size -= sizeof(u64) +
 414                        (sizeof(u32) * io_ctl->num_pages);
 415        } else {
 416                io_ctl->cur += sizeof(u64);
 417                io_ctl->size -= sizeof(u64) * 2;
 418        }
 419
 420        gen = io_ctl->cur;
 421        if (le64_to_cpu(*gen) != generation) {
 422                printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
 423                                   "(%Lu) does not match inode (%Lu)\n", *gen,
 424                                   generation);
 425                io_ctl_unmap_page(io_ctl);
 426                return -EIO;
 427        }
 428        io_ctl->cur += sizeof(u64);
 429        return 0;
 430}
 431
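/*
 * Checksum the page we just finished and store the crc32c in the csum array
 * at the start of the first page.  For page 0 the csum array itself is
 * excluded from the checksum.
 */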
 432static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
 433{
 434        u32 *tmp;
 435        u32 crc = ~(u32)0;
 436        unsigned offset = 0;
 437
 438        if (!io_ctl->check_crcs) {
 439                io_ctl_unmap_page(io_ctl);
 440                return;
 441        }
 442
 443        if (index == 0)
 444                offset = sizeof(u32) * io_ctl->num_pages;
 445
 446        crc = btrfs_csum_data(io_ctl->orig + offset, crc,
 447                              PAGE_CACHE_SIZE - offset);
 448        btrfs_csum_final(crc, (char *)&crc);
 449        io_ctl_unmap_page(io_ctl);
 450        tmp = kmap(io_ctl->pages[0]);
 451        tmp += index;
 452        *tmp = crc;
 453        kunmap(io_ctl->pages[0]);
 454}
 455
 456static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
 457{
 458        u32 *tmp, val;
 459        u32 crc = ~(u32)0;
 460        unsigned offset = 0;
 461
 462        if (!io_ctl->check_crcs) {
 463                io_ctl_map_page(io_ctl, 0);
 464                return 0;
 465        }
 466
 467        if (index == 0)
 468                offset = sizeof(u32) * io_ctl->num_pages;
 469
 470        tmp = kmap(io_ctl->pages[0]);
 471        tmp += index;
 472        val = *tmp;
 473        kunmap(io_ctl->pages[0]);
 474
 475        io_ctl_map_page(io_ctl, 0);
 476        crc = btrfs_csum_data(io_ctl->orig + offset, crc,
 477                              PAGE_CACHE_SIZE - offset);
 478        btrfs_csum_final(crc, (char *)&crc);
 479        if (val != crc) {
 480                printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
 481                                   "space cache\n");
 482                io_ctl_unmap_page(io_ctl);
 483                return -EIO;
 484        }
 485
 486        return 0;
 487}
 488
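/*
 * Append one on-disk entry (offset, bytes, extent-or-bitmap type) to the
 * current page, crc'ing it and mapping the next page once this one is full.
 */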
 489static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
 490                            void *bitmap)
 491{
 492        struct btrfs_free_space_entry *entry;
 493
 494        if (!io_ctl->cur)
 495                return -ENOSPC;
 496
 497        entry = io_ctl->cur;
 498        entry->offset = cpu_to_le64(offset);
 499        entry->bytes = cpu_to_le64(bytes);
 500        entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
 501                BTRFS_FREE_SPACE_EXTENT;
 502        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 503        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 504
 505        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 506                return 0;
 507
 508        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 509
 510        /* No more pages to map */
 511        if (io_ctl->index >= io_ctl->num_pages)
 512                return 0;
 513
 514        /* map the next page */
 515        io_ctl_map_page(io_ctl, 1);
 516        return 0;
 517}
 518
 519static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
 520{
 521        if (!io_ctl->cur)
 522                return -ENOSPC;
 523
 524        /*
 525         * If we aren't at the start of the current page, unmap this one and
 526         * map the next one if there is any left.
 527         */
 528        if (io_ctl->cur != io_ctl->orig) {
 529                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 530                if (io_ctl->index >= io_ctl->num_pages)
 531                        return -ENOSPC;
 532                io_ctl_map_page(io_ctl, 0);
 533        }
 534
 535        memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
 536        io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 537        if (io_ctl->index < io_ctl->num_pages)
 538                io_ctl_map_page(io_ctl, 0);
 539        return 0;
 540}
 541
 542static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
 543{
 544        /*
 545         * If we're not on the boundary we know we've modified the page and we
 546         * need to crc the page.
 547         */
 548        if (io_ctl->cur != io_ctl->orig)
 549                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 550        else
 551                io_ctl_unmap_page(io_ctl);
 552
 553        while (io_ctl->index < io_ctl->num_pages) {
 554                io_ctl_map_page(io_ctl, 1);
 555                io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 556        }
 557}
 558
 559static int io_ctl_read_entry(struct io_ctl *io_ctl,
 560                            struct btrfs_free_space *entry, u8 *type)
 561{
 562        struct btrfs_free_space_entry *e;
 563        int ret;
 564
 565        if (!io_ctl->cur) {
 566                ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 567                if (ret)
 568                        return ret;
 569        }
 570
 571        e = io_ctl->cur;
 572        entry->offset = le64_to_cpu(e->offset);
 573        entry->bytes = le64_to_cpu(e->bytes);
 574        *type = e->type;
 575        io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 576        io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 577
 578        if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 579                return 0;
 580
 581        io_ctl_unmap_page(io_ctl);
 582
 583        return 0;
 584}
 585
 586static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
 587                              struct btrfs_free_space *entry)
 588{
 589        int ret;
 590
 591        ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 592        if (ret)
 593                return ret;
 594
 595        memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
 596        io_ctl_unmap_page(io_ctl);
 597
 598        return 0;
 599}
 600
 601/*
  602 * Since we attach pinned extents after the fact, a contiguous section of
  603 * free space can end up split across multiple entries.  That is a problem
  604 * for the tree logging code: an allocation may have spanned what now looks
  605 * like two entries, because those entries would have been merged when the
  606 * pinned extents were added back to the free space cache.  So run through
  607 * the space cache we just loaded and merge contiguous entries.  This keeps
  608 * log replay from blowing up and makes for nicer allocator behavior.
 609 */
 610static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
 611{
 612        struct btrfs_free_space *e, *prev = NULL;
 613        struct rb_node *n;
 614
 615again:
 616        spin_lock(&ctl->tree_lock);
 617        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
 618                e = rb_entry(n, struct btrfs_free_space, offset_index);
 619                if (!prev)
 620                        goto next;
 621                if (e->bitmap || prev->bitmap)
 622                        goto next;
 623                if (prev->offset + prev->bytes == e->offset) {
 624                        unlink_free_space(ctl, prev);
 625                        unlink_free_space(ctl, e);
 626                        prev->bytes += e->bytes;
 627                        kmem_cache_free(btrfs_free_space_cachep, e);
 628                        link_free_space(ctl, prev);
 629                        prev = NULL;
 630                        spin_unlock(&ctl->tree_lock);
 631                        goto again;
 632                }
 633next:
 634                prev = e;
 635        }
 636        spin_unlock(&ctl->tree_lock);
 637}
 638
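/*
 * Read the on-disk cache for one block group back into the in-memory tree:
 * validate the header generation, then link every extent entry and bitmap
 * entry it contains.  Returns 1 if the cache was loaded, 0 if it should be
 * ignored, or a negative errno.
 */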
 639static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 640                                   struct btrfs_free_space_ctl *ctl,
 641                                   struct btrfs_path *path, u64 offset)
 642{
 643        struct btrfs_free_space_header *header;
 644        struct extent_buffer *leaf;
 645        struct io_ctl io_ctl;
 646        struct btrfs_key key;
 647        struct btrfs_free_space *e, *n;
 648        struct list_head bitmaps;
 649        u64 num_entries;
 650        u64 num_bitmaps;
 651        u64 generation;
 652        u8 type;
 653        int ret = 0;
 654
 655        INIT_LIST_HEAD(&bitmaps);
 656
 657        /* Nothing in the space cache, goodbye */
 658        if (!i_size_read(inode))
 659                return 0;
 660
 661        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 662        key.offset = offset;
 663        key.type = 0;
 664
 665        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 666        if (ret < 0)
 667                return 0;
 668        else if (ret > 0) {
 669                btrfs_release_path(path);
 670                return 0;
 671        }
 672
 673        ret = -1;
 674
 675        leaf = path->nodes[0];
 676        header = btrfs_item_ptr(leaf, path->slots[0],
 677                                struct btrfs_free_space_header);
 678        num_entries = btrfs_free_space_entries(leaf, header);
 679        num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
 680        generation = btrfs_free_space_generation(leaf, header);
 681        btrfs_release_path(path);
 682
 683        if (!BTRFS_I(inode)->generation) {
 684                btrfs_info(root->fs_info,
 685                           "The free space cache file (%llu) is invalid. skip it\n",
 686                           offset);
 687                return 0;
 688        }
 689
 690        if (BTRFS_I(inode)->generation != generation) {
 691                btrfs_err(root->fs_info,
 692                        "free space inode generation (%llu) "
 693                        "did not match free space cache generation (%llu)",
 694                        BTRFS_I(inode)->generation, generation);
 695                return 0;
 696        }
 697
 698        if (!num_entries)
 699                return 0;
 700
 701        ret = io_ctl_init(&io_ctl, inode, root, 0);
 702        if (ret)
 703                return ret;
 704
 705        ret = readahead_cache(inode);
 706        if (ret)
 707                goto out;
 708
 709        ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
 710        if (ret)
 711                goto out;
 712
 713        ret = io_ctl_check_crc(&io_ctl, 0);
 714        if (ret)
 715                goto free_cache;
 716
 717        ret = io_ctl_check_generation(&io_ctl, generation);
 718        if (ret)
 719                goto free_cache;
 720
 721        while (num_entries) {
 722                e = kmem_cache_zalloc(btrfs_free_space_cachep,
 723                                      GFP_NOFS);
 724                if (!e)
 725                        goto free_cache;
 726
 727                ret = io_ctl_read_entry(&io_ctl, e, &type);
 728                if (ret) {
 729                        kmem_cache_free(btrfs_free_space_cachep, e);
 730                        goto free_cache;
 731                }
 732
 733                if (!e->bytes) {
 734                        kmem_cache_free(btrfs_free_space_cachep, e);
 735                        goto free_cache;
 736                }
 737
 738                if (type == BTRFS_FREE_SPACE_EXTENT) {
 739                        spin_lock(&ctl->tree_lock);
 740                        ret = link_free_space(ctl, e);
 741                        spin_unlock(&ctl->tree_lock);
 742                        if (ret) {
 743                                btrfs_err(root->fs_info,
 744                                        "Duplicate entries in free space cache, dumping");
 745                                kmem_cache_free(btrfs_free_space_cachep, e);
 746                                goto free_cache;
 747                        }
 748                } else {
 749                        ASSERT(num_bitmaps);
 750                        num_bitmaps--;
 751                        e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
 752                        if (!e->bitmap) {
 753                                kmem_cache_free(
 754                                        btrfs_free_space_cachep, e);
 755                                goto free_cache;
 756                        }
 757                        spin_lock(&ctl->tree_lock);
 758                        ret = link_free_space(ctl, e);
 759                        ctl->total_bitmaps++;
 760                        ctl->op->recalc_thresholds(ctl);
 761                        spin_unlock(&ctl->tree_lock);
 762                        if (ret) {
 763                                btrfs_err(root->fs_info,
 764                                        "Duplicate entries in free space cache, dumping");
 765                                kmem_cache_free(btrfs_free_space_cachep, e);
 766                                goto free_cache;
 767                        }
 768                        list_add_tail(&e->list, &bitmaps);
 769                }
 770
 771                num_entries--;
 772        }
 773
 774        io_ctl_unmap_page(&io_ctl);
 775
 776        /*
  777         * The bitmap payloads are stored after all of the extent entries,
  778         * so read them now for the bitmap entries we queued up above.
 779         */
 780        list_for_each_entry_safe(e, n, &bitmaps, list) {
 781                list_del_init(&e->list);
 782                ret = io_ctl_read_bitmap(&io_ctl, e);
 783                if (ret)
 784                        goto free_cache;
 785        }
 786
 787        io_ctl_drop_pages(&io_ctl);
 788        merge_space_tree(ctl);
 789        ret = 1;
 790out:
 791        io_ctl_free(&io_ctl);
 792        return ret;
 793free_cache:
 794        io_ctl_drop_pages(&io_ctl);
 795        __btrfs_remove_free_space_cache(ctl);
 796        goto out;
 797}
 798
 799int load_free_space_cache(struct btrfs_fs_info *fs_info,
 800                          struct btrfs_block_group_cache *block_group)
 801{
 802        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 803        struct btrfs_root *root = fs_info->tree_root;
 804        struct inode *inode;
 805        struct btrfs_path *path;
 806        int ret = 0;
 807        bool matched;
 808        u64 used = btrfs_block_group_used(&block_group->item);
 809
 810        /*
 811         * If this block group has been marked to be cleared for one reason or
 812         * another then we can't trust the on disk cache, so just return.
 813         */
 814        spin_lock(&block_group->lock);
 815        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 816                spin_unlock(&block_group->lock);
 817                return 0;
 818        }
 819        spin_unlock(&block_group->lock);
 820
 821        path = btrfs_alloc_path();
 822        if (!path)
 823                return 0;
 824        path->search_commit_root = 1;
 825        path->skip_locking = 1;
 826
 827        inode = lookup_free_space_inode(root, block_group, path);
 828        if (IS_ERR(inode)) {
 829                btrfs_free_path(path);
 830                return 0;
 831        }
 832
 833        /* We may have converted the inode and made the cache invalid. */
 834        spin_lock(&block_group->lock);
 835        if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 836                spin_unlock(&block_group->lock);
 837                btrfs_free_path(path);
 838                goto out;
 839        }
 840        spin_unlock(&block_group->lock);
 841
 842        ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
 843                                      path, block_group->key.objectid);
 844        btrfs_free_path(path);
 845        if (ret <= 0)
 846                goto out;
 847
 848        spin_lock(&ctl->tree_lock);
 849        matched = (ctl->free_space == (block_group->key.offset - used -
 850                                       block_group->bytes_super));
 851        spin_unlock(&ctl->tree_lock);
 852
 853        if (!matched) {
 854                __btrfs_remove_free_space_cache(ctl);
 855                btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
 856                        block_group->key.objectid);
 857                ret = -1;
 858        }
 859out:
 860        if (ret < 0) {
 861                /* This cache is bogus, make sure it gets cleared */
 862                spin_lock(&block_group->lock);
 863                block_group->disk_cache_state = BTRFS_DC_CLEAR;
 864                spin_unlock(&block_group->lock);
 865                ret = 0;
 866
 867                btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
 868                        block_group->key.objectid);
 869        }
 870
 871        iput(inode);
 872        return ret;
 873}
 874
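/*
 * Write an entry for every extent and bitmap in the free space tree, plus any
 * extents still held in the block group's allocation cluster.  Bitmap entries
 * are queued on @bitmap_list so their payloads can be written out last.
 */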
 875static noinline_for_stack
 876int write_cache_extent_entries(struct io_ctl *io_ctl,
 877                              struct btrfs_free_space_ctl *ctl,
 878                              struct btrfs_block_group_cache *block_group,
 879                              int *entries, int *bitmaps,
 880                              struct list_head *bitmap_list)
 881{
 882        int ret;
 883        struct btrfs_free_cluster *cluster = NULL;
 884        struct rb_node *node = rb_first(&ctl->free_space_offset);
 885
 886        /* Get the cluster for this block_group if it exists */
 887        if (block_group && !list_empty(&block_group->cluster_list)) {
 888                cluster = list_entry(block_group->cluster_list.next,
 889                                     struct btrfs_free_cluster,
 890                                     block_group_list);
 891        }
 892
 893        if (!node && cluster) {
 894                node = rb_first(&cluster->root);
 895                cluster = NULL;
 896        }
 897
 898        /* Write out the extent entries */
 899        while (node) {
 900                struct btrfs_free_space *e;
 901
 902                e = rb_entry(node, struct btrfs_free_space, offset_index);
 903                *entries += 1;
 904
 905                ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
 906                                       e->bitmap);
 907                if (ret)
 908                        goto fail;
 909
 910                if (e->bitmap) {
 911                        list_add_tail(&e->list, bitmap_list);
 912                        *bitmaps += 1;
 913                }
 914                node = rb_next(node);
 915                if (!node && cluster) {
 916                        node = rb_first(&cluster->root);
 917                        cluster = NULL;
 918                }
 919        }
 920        return 0;
 921fail:
 922        return -ENOSPC;
 923}
 924
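/*
 * Update the free space header item with the final entry and bitmap counts
 * and the current transaction id, so the cache is treated as valid on the
 * next mount.
 */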
 925static noinline_for_stack int
 926update_cache_item(struct btrfs_trans_handle *trans,
 927                  struct btrfs_root *root,
 928                  struct inode *inode,
 929                  struct btrfs_path *path, u64 offset,
 930                  int entries, int bitmaps)
 931{
 932        struct btrfs_key key;
 933        struct btrfs_free_space_header *header;
 934        struct extent_buffer *leaf;
 935        int ret;
 936
 937        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 938        key.offset = offset;
 939        key.type = 0;
 940
 941        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 942        if (ret < 0) {
 943                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
 944                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
 945                                 GFP_NOFS);
 946                goto fail;
 947        }
 948        leaf = path->nodes[0];
 949        if (ret > 0) {
 950                struct btrfs_key found_key;
 951                ASSERT(path->slots[0]);
 952                path->slots[0]--;
 953                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 954                if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
 955                    found_key.offset != offset) {
 956                        clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
 957                                         inode->i_size - 1,
 958                                         EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
 959                                         NULL, GFP_NOFS);
 960                        btrfs_release_path(path);
 961                        goto fail;
 962                }
 963        }
 964
 965        BTRFS_I(inode)->generation = trans->transid;
 966        header = btrfs_item_ptr(leaf, path->slots[0],
 967                                struct btrfs_free_space_header);
 968        btrfs_set_free_space_entries(leaf, header, entries);
 969        btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
 970        btrfs_set_free_space_generation(leaf, header, trans->transid);
 971        btrfs_mark_buffer_dirty(leaf);
 972        btrfs_release_path(path);
 973
 974        return 0;
 975
 976fail:
 977        return -1;
 978}
 979
 980static noinline_for_stack int
 981write_pinned_extent_entries(struct btrfs_root *root,
 982                            struct btrfs_block_group_cache *block_group,
 983                            struct io_ctl *io_ctl,
 984                            int *entries)
 985{
 986        u64 start, extent_start, extent_end, len;
 987        struct extent_io_tree *unpin = NULL;
 988        int ret;
 989
 990        if (!block_group)
 991                return 0;
 992
 993        /*
 994         * We want to add any pinned extents to our free space cache
 995         * so we don't leak the space
 996         *
 997         * We shouldn't have switched the pinned extents yet so this is the
 998         * right one
 999         */
1000        unpin = root->fs_info->pinned_extents;
1001
1002        start = block_group->key.objectid;
1003
1004        while (start < block_group->key.objectid + block_group->key.offset) {
1005                ret = find_first_extent_bit(unpin, start,
1006                                            &extent_start, &extent_end,
1007                                            EXTENT_DIRTY, NULL);
1008                if (ret)
1009                        return 0;
1010
1011                /* This pinned extent is out of our range */
1012                if (extent_start >= block_group->key.objectid +
1013                    block_group->key.offset)
1014                        return 0;
1015
1016                extent_start = max(extent_start, start);
1017                extent_end = min(block_group->key.objectid +
1018                                 block_group->key.offset, extent_end + 1);
1019                len = extent_end - extent_start;
1020
1021                *entries += 1;
1022                ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1023                if (ret)
1024                        return -ENOSPC;
1025
1026                start = extent_end;
1027        }
1028
1029        return 0;
1030}
1031
1032static noinline_for_stack int
1033write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
1034{
1035        struct list_head *pos, *n;
1036        int ret;
1037
1038        /* Write out the bitmaps */
1039        list_for_each_safe(pos, n, bitmap_list) {
1040                struct btrfs_free_space *entry =
1041                        list_entry(pos, struct btrfs_free_space, list);
1042
1043                ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1044                if (ret)
1045                        return -ENOSPC;
1046                list_del_init(&entry->list);
1047        }
1048
1049        return 0;
1050}
1051
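/*
 * Wait for the cache file's ordered extents to finish; on failure drop the
 * dirty/delalloc state so the half-written cache is not trusted.
 */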
1052static int flush_dirty_cache(struct inode *inode)
1053{
1054        int ret;
1055
1056        ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1057        if (ret)
1058                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1059                                 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1060                                 GFP_NOFS);
1061
1062        return ret;
1063}
1064
1065static void noinline_for_stack
1066cleanup_write_cache_enospc(struct inode *inode,
1067                           struct io_ctl *io_ctl,
1068                           struct extent_state **cached_state,
1069                           struct list_head *bitmap_list)
1070{
1071        struct list_head *pos, *n;
1072
1073        list_for_each_safe(pos, n, bitmap_list) {
1074                struct btrfs_free_space *entry =
1075                        list_entry(pos, struct btrfs_free_space, list);
1076                list_del_init(&entry->list);
1077        }
1078        io_ctl_drop_pages(io_ctl);
1079        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1080                             i_size_read(inode) - 1, cached_state,
1081                             GFP_NOFS);
1082}
1083
1084/**
1085 * __btrfs_write_out_cache - write out cached info to an inode
1086 * @root - the root the inode belongs to
1087 * @ctl - the free space cache we are going to write out
1088 * @block_group - the block_group for this cache if it belongs to a block_group
1089 * @trans - the trans handle
1090 * @path - the path to use
1091 * @offset - the offset for the key we'll insert
1092 *
1093 * This function writes out a free space cache struct to disk for quick recovery
 1094 * on mount.  This will return 0 if it was successful in writing the cache out,
1095 * and -1 if it was not.
1096 */
1097static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1098                                   struct btrfs_free_space_ctl *ctl,
1099                                   struct btrfs_block_group_cache *block_group,
1100                                   struct btrfs_trans_handle *trans,
1101                                   struct btrfs_path *path, u64 offset)
1102{
1103        struct extent_state *cached_state = NULL;
1104        struct io_ctl io_ctl;
1105        LIST_HEAD(bitmap_list);
1106        int entries = 0;
1107        int bitmaps = 0;
1108        int ret;
1109
1110        if (!i_size_read(inode))
1111                return -1;
1112
1113        ret = io_ctl_init(&io_ctl, inode, root, 1);
1114        if (ret)
1115                return -1;
1116
1117        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1118                down_write(&block_group->data_rwsem);
1119                spin_lock(&block_group->lock);
1120                if (block_group->delalloc_bytes) {
1121                        block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1122                        spin_unlock(&block_group->lock);
1123                        up_write(&block_group->data_rwsem);
1124                        BTRFS_I(inode)->generation = 0;
1125                        ret = 0;
1126                        goto out;
1127                }
1128                spin_unlock(&block_group->lock);
1129        }
1130
1131        /* Lock all pages first so we can lock the extent safely. */
1132        io_ctl_prepare_pages(&io_ctl, inode, 0);
1133
1134        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1135                         0, &cached_state);
1136
1137        io_ctl_set_generation(&io_ctl, trans->transid);
1138
1139        /* Write out the extent entries in the free space cache */
1140        ret = write_cache_extent_entries(&io_ctl, ctl,
1141                                         block_group, &entries, &bitmaps,
1142                                         &bitmap_list);
1143        if (ret)
1144                goto out_nospc;
1145
1146        /*
 1147          * Some of the space freed in the current transaction is pinned;
 1148          * it will be added back to the free space cache once the
 1149          * transaction commits, so we must not lose it.
1150         */
1151        ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1152        if (ret)
1153                goto out_nospc;
1154
1155        /* At last, we write out all the bitmaps. */
1156        ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1157        if (ret)
1158                goto out_nospc;
1159
1160        /* Zero out the rest of the pages just to make sure */
1161        io_ctl_zero_remaining_pages(&io_ctl);
1162
1163        /* Everything is written out, now we dirty the pages in the file. */
1164        ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
1165                                0, i_size_read(inode), &cached_state);
1166        if (ret)
1167                goto out_nospc;
1168
1169        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1170                up_write(&block_group->data_rwsem);
1171        /*
 1172          * Release the pages and unlock the extent; we will flush
 1173          * them out later.
1174         */
1175        io_ctl_drop_pages(&io_ctl);
1176
1177        unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1178                             i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1179
1180        /* Flush the dirty pages in the cache file. */
1181        ret = flush_dirty_cache(inode);
1182        if (ret)
1183                goto out;
1184
1185        /* Update the cache item to tell everyone this cache file is valid. */
1186        ret = update_cache_item(trans, root, inode, path, offset,
1187                                entries, bitmaps);
1188out:
1189        io_ctl_free(&io_ctl);
1190        if (ret) {
1191                invalidate_inode_pages2(inode->i_mapping);
1192                BTRFS_I(inode)->generation = 0;
1193        }
1194        btrfs_update_inode(trans, root, inode);
1195        return ret;
1196
1197out_nospc:
1198        cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
1199
1200        if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1201                up_write(&block_group->data_rwsem);
1202
1203        goto out;
1204}
1205
1206int btrfs_write_out_cache(struct btrfs_root *root,
1207                          struct btrfs_trans_handle *trans,
1208                          struct btrfs_block_group_cache *block_group,
1209                          struct btrfs_path *path)
1210{
1211        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1212        struct inode *inode;
1213        int ret = 0;
1214
1215        root = root->fs_info->tree_root;
1216
1217        spin_lock(&block_group->lock);
1218        if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1219                spin_unlock(&block_group->lock);
1220                return 0;
1221        }
1222
1223        if (block_group->delalloc_bytes) {
1224                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1225                spin_unlock(&block_group->lock);
1226                return 0;
1227        }
1228        spin_unlock(&block_group->lock);
1229
1230        inode = lookup_free_space_inode(root, block_group, path);
1231        if (IS_ERR(inode))
1232                return 0;
1233
1234        ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
1235                                      path, block_group->key.objectid);
1236        if (ret) {
1237                spin_lock(&block_group->lock);
1238                block_group->disk_cache_state = BTRFS_DC_ERROR;
1239                spin_unlock(&block_group->lock);
1240                ret = 0;
1241#ifdef DEBUG
1242                btrfs_err(root->fs_info,
1243                        "failed to write free space cache for block group %llu",
1244                        block_group->key.objectid);
1245#endif
1246        }
1247
1248        iput(inode);
1249        return ret;
1250}
1251
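/*
 * Helpers for converting between byte offsets/lengths and bit positions/counts
 * inside a bitmap entry (one bit per ctl->unit bytes).
 */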
1252static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1253                                          u64 offset)
1254{
1255        ASSERT(offset >= bitmap_start);
1256        offset -= bitmap_start;
1257        return (unsigned long)(div_u64(offset, unit));
1258}
1259
1260static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1261{
1262        return (unsigned long)(div_u64(bytes, unit));
1263}
1264
1265static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1266                                   u64 offset)
1267{
1268        u64 bitmap_start;
1269        u64 bytes_per_bitmap;
1270
1271        bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1272        bitmap_start = offset - ctl->start;
1273        bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1274        bitmap_start *= bytes_per_bitmap;
1275        bitmap_start += ctl->start;
1276
1277        return bitmap_start;
1278}
1279
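/*
 * Insert a free space entry into the tree indexed by offset.  An extent entry
 * and a bitmap entry may share the same offset; in that case the extent entry
 * must sort first (see the comment below).
 */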
1280static int tree_insert_offset(struct rb_root *root, u64 offset,
1281                              struct rb_node *node, int bitmap)
1282{
1283        struct rb_node **p = &root->rb_node;
1284        struct rb_node *parent = NULL;
1285        struct btrfs_free_space *info;
1286
1287        while (*p) {
1288                parent = *p;
1289                info = rb_entry(parent, struct btrfs_free_space, offset_index);
1290
1291                if (offset < info->offset) {
1292                        p = &(*p)->rb_left;
1293                } else if (offset > info->offset) {
1294                        p = &(*p)->rb_right;
1295                } else {
1296                        /*
1297                         * we could have a bitmap entry and an extent entry
1298                         * share the same offset.  If this is the case, we want
1299                         * the extent entry to always be found first if we do a
1300                         * linear search through the tree, since we want to have
1301                         * the quickest allocation time, and allocating from an
1302                         * extent is faster than allocating from a bitmap.  So
1303                         * if we're inserting a bitmap and we find an entry at
1304                         * this offset, we want to go right, or after this entry
1305                         * logically.  If we are inserting an extent and we've
1306                         * found a bitmap, we want to go left, or before
1307                         * logically.
1308                         */
1309                        if (bitmap) {
1310                                if (info->bitmap) {
1311                                        WARN_ON_ONCE(1);
1312                                        return -EEXIST;
1313                                }
1314                                p = &(*p)->rb_right;
1315                        } else {
1316                                if (!info->bitmap) {
1317                                        WARN_ON_ONCE(1);
1318                                        return -EEXIST;
1319                                }
1320                                p = &(*p)->rb_left;
1321                        }
1322                }
1323        }
1324
1325        rb_link_node(node, parent, p);
1326        rb_insert_color(node, root);
1327
1328        return 0;
1329}
1330
1331/*
1332 * searches the tree for the given offset.
1333 *
 1334 * fuzzy - If this is set, we are trying to make an allocation and just want
 1335 * an entry that covers the given offset or, failing that, the first entry
 1336 * that starts after it.
1337 */
1338static struct btrfs_free_space *
1339tree_search_offset(struct btrfs_free_space_ctl *ctl,
1340                   u64 offset, int bitmap_only, int fuzzy)
1341{
1342        struct rb_node *n = ctl->free_space_offset.rb_node;
1343        struct btrfs_free_space *entry, *prev = NULL;
1344
1345        /* find entry that is closest to the 'offset' */
1346        while (1) {
1347                if (!n) {
1348                        entry = NULL;
1349                        break;
1350                }
1351
1352                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1353                prev = entry;
1354
1355                if (offset < entry->offset)
1356                        n = n->rb_left;
1357                else if (offset > entry->offset)
1358                        n = n->rb_right;
1359                else
1360                        break;
1361        }
1362
1363        if (bitmap_only) {
1364                if (!entry)
1365                        return NULL;
1366                if (entry->bitmap)
1367                        return entry;
1368
1369                /*
1370                 * bitmap entry and extent entry may share same offset,
1371                 * in that case, bitmap entry comes after extent entry.
1372                 */
1373                n = rb_next(n);
1374                if (!n)
1375                        return NULL;
1376                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1377                if (entry->offset != offset)
1378                        return NULL;
1379
1380                WARN_ON(!entry->bitmap);
1381                return entry;
1382        } else if (entry) {
1383                if (entry->bitmap) {
1384                        /*
1385                         * if previous extent entry covers the offset,
1386                         * we should return it instead of the bitmap entry
1387                         */
1388                        n = rb_prev(&entry->offset_index);
1389                        if (n) {
1390                                prev = rb_entry(n, struct btrfs_free_space,
1391                                                offset_index);
1392                                if (!prev->bitmap &&
1393                                    prev->offset + prev->bytes > offset)
1394                                        entry = prev;
1395                        }
1396                }
1397                return entry;
1398        }
1399
1400        if (!prev)
1401                return NULL;
1402
1403        /* find last entry before the 'offset' */
1404        entry = prev;
1405        if (entry->offset > offset) {
1406                n = rb_prev(&entry->offset_index);
1407                if (n) {
1408                        entry = rb_entry(n, struct btrfs_free_space,
1409                                        offset_index);
1410                        ASSERT(entry->offset <= offset);
1411                } else {
1412                        if (fuzzy)
1413                                return entry;
1414                        else
1415                                return NULL;
1416                }
1417        }
1418
1419        if (entry->bitmap) {
1420                n = rb_prev(&entry->offset_index);
1421                if (n) {
1422                        prev = rb_entry(n, struct btrfs_free_space,
1423                                        offset_index);
1424                        if (!prev->bitmap &&
1425                            prev->offset + prev->bytes > offset)
1426                                return prev;
1427                }
1428                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1429                        return entry;
1430        } else if (entry->offset + entry->bytes > offset)
1431                return entry;
1432
1433        if (!fuzzy)
1434                return NULL;
1435
1436        while (1) {
1437                if (entry->bitmap) {
1438                        if (entry->offset + BITS_PER_BITMAP *
1439                            ctl->unit > offset)
1440                                break;
1441                } else {
1442                        if (entry->offset + entry->bytes > offset)
1443                                break;
1444                }
1445
1446                n = rb_next(&entry->offset_index);
1447                if (!n)
1448                        return NULL;
1449                entry = rb_entry(n, struct btrfs_free_space, offset_index);
1450        }
1451        return entry;
1452}
1453
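/*
 * Tree link/unlink helpers.  __unlink_free_space() only removes the entry from
 * the offset tree; unlink_free_space() and link_free_space() also keep
 * ctl->free_space in sync.  Callers must hold ctl->tree_lock.
 */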
1454static inline void
1455__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1456                    struct btrfs_free_space *info)
1457{
1458        rb_erase(&info->offset_index, &ctl->free_space_offset);
1459        ctl->free_extents--;
1460}
1461
1462static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1463                              struct btrfs_free_space *info)
1464{
1465        __unlink_free_space(ctl, info);
1466        ctl->free_space -= info->bytes;
1467}
1468
1469static int link_free_space(struct btrfs_free_space_ctl *ctl,
1470                           struct btrfs_free_space *info)
1471{
1472        int ret = 0;
1473
1474        ASSERT(info->bytes || info->bitmap);
1475        ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1476                                 &info->offset_index, (info->bitmap != NULL));
1477        if (ret)
1478                return ret;
1479
1480        ctl->free_space += info->bytes;
1481        ctl->free_extents++;
1482        return ret;
1483}
1484
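/*
 * Recompute ctl->extents_thresh, the number of extent entries we allow before
 * new free space starts being tracked in bitmaps, so that the cache stays
 * around MAX_CACHE_BYTES_PER_GIG of memory per GiB of block group space.
 */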
1485static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1486{
1487        struct btrfs_block_group_cache *block_group = ctl->private;
1488        u64 max_bytes;
1489        u64 bitmap_bytes;
1490        u64 extent_bytes;
1491        u64 size = block_group->key.offset;
1492        u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1493        int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1494
1495        max_bitmaps = max(max_bitmaps, 1);
1496
1497        ASSERT(ctl->total_bitmaps <= max_bitmaps);
1498
1499        /*
1500         * The goal is to keep the total amount of memory used per 1gb of space
1501         * at or below 32k, so we need to adjust how much memory we allow to be
1502         * used by extent based free space tracking
1503         */
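            /*
             * A rough worked example, assuming 4KiB pages and a 4KiB
             * sectorsize: one bitmap covers BITS_PER_BITMAP * ctl->unit =
             * 32768 * 4KiB = 128MiB of the block group and costs one page
             * (4KiB) of memory, so a 1GiB block group needs at most 8 bitmaps
             * (32KiB) to describe everything with bitmaps alone.  The code
             * below splits that MAX_CACHE_BYTES_PER_GIG budget between bitmap
             * pages and plain extent entries.
             */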
1504        if (size < 1024 * 1024 * 1024)
1505                max_bytes = MAX_CACHE_BYTES_PER_GIG;
1506        else
1507                max_bytes = MAX_CACHE_BYTES_PER_GIG *
1508                        div64_u64(size, 1024 * 1024 * 1024);
1509
1510        /*
1511         * we want to account for 1 more bitmap than what we have so we can make
1512         * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1513         * we add more bitmaps.
1514         */
1515        bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
1516
1517        if (bitmap_bytes >= max_bytes) {
1518                ctl->extents_thresh = 0;
1519                return;
1520        }
1521
1522        /*
1523         * we want the extent entry threshold to always be at most 1/2 the max
1524         * bytes we can have, or whatever is left over, whichever is less.
1525         */
1526        extent_bytes = max_bytes - bitmap_bytes;
1527        extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
1528
1529        ctl->extents_thresh =
1530                div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
1531}
1532
1533static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1534                                       struct btrfs_free_space *info,
1535                                       u64 offset, u64 bytes)
1536{
1537        unsigned long start, count;
1538
1539        start = offset_to_bit(info->offset, ctl->unit, offset);
1540        count = bytes_to_bits(bytes, ctl->unit);
1541        ASSERT(start + count <= BITS_PER_BITMAP);
1542
1543        bitmap_clear(info->bitmap, start, count);
1544
1545        info->bytes -= bytes;
1546}
1547
1548static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1549                              struct btrfs_free_space *info, u64 offset,
1550                              u64 bytes)
1551{
1552        __bitmap_clear_bits(ctl, info, offset, bytes);
1553        ctl->free_space -= bytes;
1554}
1555
1556static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1557                            struct btrfs_free_space *info, u64 offset,
1558                            u64 bytes)
1559{
1560        unsigned long start, count;
1561
1562        start = offset_to_bit(info->offset, ctl->unit, offset);
1563        count = bytes_to_bits(bytes, ctl->unit);
1564        ASSERT(start + count <= BITS_PER_BITMAP);
1565
1566        bitmap_set(info->bitmap, start, count);
1567
1568        info->bytes += bytes;
1569        ctl->free_space += bytes;
1570}
1571
1572/*
1573 * If we cannot find a suitable extent, we will use 'bytes' to record the
1574 * size of the largest free extent we did find.
1575 */
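    /*
     * In short: on entry *offset is where to start looking and *bytes is the
     * minimum size wanted; on success both are updated to the start and
     * length of the run that was found and 0 is returned, otherwise *bytes
     * is set to the largest free run seen and -1 is returned.
     */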
1576static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1577                         struct btrfs_free_space *bitmap_info, u64 *offset,
1578                         u64 *bytes)
1579{
1580        unsigned long found_bits = 0;
1581        unsigned long max_bits = 0;
1582        unsigned long bits, i;
1583        unsigned long next_zero;
1584        unsigned long extent_bits;
1585
1586        i = offset_to_bit(bitmap_info->offset, ctl->unit,
1587                          max_t(u64, *offset, bitmap_info->offset));
1588        bits = bytes_to_bits(*bytes, ctl->unit);
1589
1590        for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1591                next_zero = find_next_zero_bit(bitmap_info->bitmap,
1592                                               BITS_PER_BITMAP, i);
1593                extent_bits = next_zero - i;
1594                if (extent_bits >= bits) {
1595                        found_bits = extent_bits;
1596                        break;
1597                } else if (extent_bits > max_bits) {
1598                        max_bits = extent_bits;
1599                }
1600                i = next_zero;
1601        }
1602
1603        if (found_bits) {
1604                *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1605                *bytes = (u64)(found_bits) * ctl->unit;
1606                return 0;
1607        }
1608
1609        *bytes = (u64)(max_bits) * ctl->unit;
1610        return -1;
1611}
1612
1613/* Cache the size of the largest extent we looked at but could not use, in bytes */
1614static struct btrfs_free_space *
1615find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1616                unsigned long align, u64 *max_extent_size)
1617{
1618        struct btrfs_free_space *entry;
1619        struct rb_node *node;
1620        u64 tmp;
1621        u64 align_off;
1622        int ret;
1623
1624        if (!ctl->free_space_offset.rb_node)
1625                goto out;
1626
1627        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1628        if (!entry)
1629                goto out;
1630
1631        for (node = &entry->offset_index; node; node = rb_next(node)) {
1632                entry = rb_entry(node, struct btrfs_free_space, offset_index);
1633                if (entry->bytes < *bytes) {
1634                        if (entry->bytes > *max_extent_size)
1635                                *max_extent_size = entry->bytes;
1636                        continue;
1637                }
1638
1639                /* make sure the space returned is big enough
1640                 * to match our requested alignment
1641                 */
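                    /*
                     * The math below rounds entry->offset up to the next
                     * 'align' boundary, measured from ctl->start.  For
                     * example (made-up numbers): with ctl->start == 0,
                     * align == 64KiB and entry->offset == 100KiB, tmp ends up
                     * at 128KiB and align_off == 28KiB of the entry is
                     * skipped over.
                     */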
1642                if (*bytes >= align) {
1643                        tmp = entry->offset - ctl->start + align - 1;
1644                        do_div(tmp, align);
1645                        tmp = tmp * align + ctl->start;
1646                        align_off = tmp - entry->offset;
1647                } else {
1648                        align_off = 0;
1649                        tmp = entry->offset;
1650                }
1651
1652                if (entry->bytes < *bytes + align_off) {
1653                        if (entry->bytes > *max_extent_size)
1654                                *max_extent_size = entry->bytes;
1655                        continue;
1656                }
1657
1658                if (entry->bitmap) {
1659                        u64 size = *bytes;
1660
1661                        ret = search_bitmap(ctl, entry, &tmp, &size);
1662                        if (!ret) {
1663                                *offset = tmp;
1664                                *bytes = size;
1665                                return entry;
1666                        } else if (size > *max_extent_size) {
1667                                *max_extent_size = size;
1668                        }
1669                        continue;
1670                }
1671
1672                *offset = tmp;
1673                *bytes = entry->bytes - align_off;
1674                return entry;
1675        }
1676out:
1677        return NULL;
1678}
1679
1680static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1681                           struct btrfs_free_space *info, u64 offset)
1682{
1683        info->offset = offset_to_bitmap(ctl, offset);
1684        info->bytes = 0;
1685        INIT_LIST_HEAD(&info->list);
1686        link_free_space(ctl, info);
1687        ctl->total_bitmaps++;
1688
1689        ctl->op->recalc_thresholds(ctl);
1690}
1691
1692static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1693                        struct btrfs_free_space *bitmap_info)
1694{
1695        unlink_free_space(ctl, bitmap_info);
1696        kfree(bitmap_info->bitmap);
1697        kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1698        ctl->total_bitmaps--;
1699        ctl->op->recalc_thresholds(ctl);
1700}
1701
1702static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1703                              struct btrfs_free_space *bitmap_info,
1704                              u64 *offset, u64 *bytes)
1705{
1706        u64 end;
1707        u64 search_start, search_bytes;
1708        int ret;
1709
1710again:
1711        end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1712
1713        /*
1714         * We need to search for bits in this bitmap.  This bitmap may only
1715         * cover part of the extent, thanks to how we add space, so we need to
1716         * search for as much of it as we can and clear that amount, and then
1717         * go searching for the next bit.
1718         */
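            /*
             * For example, the range being removed may start in this bitmap
             * but spill past its end; we clear the part this bitmap covers,
             * advance *offset and *bytes, and then either loop to the next
             * bitmap or return -EAGAIN when the next entry is a plain extent
             * so the caller can deal with it.
             */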
1719        search_start = *offset;
1720        search_bytes = ctl->unit;
1721        search_bytes = min(search_bytes, end - search_start + 1);
1722        ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
1723        if (ret < 0 || search_start != *offset)
1724                return -EINVAL;
1725
1726        /* We may have found more bits than we need */
1727        search_bytes = min(search_bytes, *bytes);
1728
1729        /* Cannot clear past the end of the bitmap */
1730        search_bytes = min(search_bytes, end - search_start + 1);
1731
1732        bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1733        *offset += search_bytes;
1734        *bytes -= search_bytes;
1735
1736        if (*bytes) {
1737                struct rb_node *next = rb_next(&bitmap_info->offset_index);
1738                if (!bitmap_info->bytes)
1739                        free_bitmap(ctl, bitmap_info);
1740
1741                /*
1742                 * no entry after this bitmap, but we still have bytes to
1743                 * remove, so something has gone wrong.
1744                 */
1745                if (!next)
1746                        return -EINVAL;
1747
1748                bitmap_info = rb_entry(next, struct btrfs_free_space,
1749                                       offset_index);
1750
1751                /*
1752                 * if the next entry isn't a bitmap we need to return to let the
1753                 * extent stuff do its work.
1754                 */
1755                if (!bitmap_info->bitmap)
1756                        return -EAGAIN;
1757
1758                /*
1759                 * Ok the next item is a bitmap, but it may not actually hold
1760                 * the information for the rest of this free space stuff, so
1761                 * look for it, and if we don't find it return so we can try
1762                 * everything over again.
1763                 */
1764                search_start = *offset;
1765                search_bytes = ctl->unit;
1766                ret = search_bitmap(ctl, bitmap_info, &search_start,
1767                                    &search_bytes);
1768                if (ret < 0 || search_start != *offset)
1769                        return -EAGAIN;
1770
1771                goto again;
1772        } else if (!bitmap_info->bytes)
1773                free_bitmap(ctl, bitmap_info);
1774
1775        return 0;
1776}
1777
1778static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1779                               struct btrfs_free_space *info, u64 offset,
1780                               u64 bytes)
1781{
1782        u64 bytes_to_set = 0;
1783        u64 end;
1784
1785        end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1786
1787        bytes_to_set = min(end - offset, bytes);
1788
1789        bitmap_set_bits(ctl, info, offset, bytes_to_set);
1790
1791        return bytes_to_set;
1792
1793}
1794
1795static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1796                      struct btrfs_free_space *info)
1797{
1798        struct btrfs_block_group_cache *block_group = ctl->private;
1799
1800        /*
1801         * If we are below the extents threshold then we can add this as an
1802         * extent, and don't have to deal with the bitmap
1803         */
1804        if (ctl->free_extents < ctl->extents_thresh) {
1805                /*
1806                 * If this block group has some small extents we don't want to
1807                 * use up all of our free slots in the cache with them; we want
1808                 * to reserve those slots for larger extents.  However if we have
1809                 * plenty of cache left then go ahead and add them, no sense in
1810                 * adding the overhead of a bitmap if we don't have to.
1811                 */
1812                if (info->bytes <= block_group->sectorsize * 4) {
1813                        if (ctl->free_extents * 2 <= ctl->extents_thresh)
1814                                return false;
1815                } else {
1816                        return false;
1817                }
1818        }
1819
1820        /*
1821         * The original block groups from mkfs can be really small, like 8
1822         * megabytes, so don't bother with a bitmap for those entries.  However
1823         * some block groups can be smaller than what a bitmap would cover but
1824         * are still large enough that they could overflow the 32k memory limit,
1825         * so still allow those block groups to have a bitmap
1826         * entry.
1827         */
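            /*
             * With 4KiB pages and a 4KiB sectorsize (a common configuration)
             * a single bitmap spans 32768 * 4KiB = 128MiB, so the check below
             * refuses bitmaps for block groups smaller than 64MiB.
             */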
1828        if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
1829                return false;
1830
1831        return true;
1832}
1833
1834static struct btrfs_free_space_op free_space_op = {
1835        .recalc_thresholds      = recalculate_thresholds,
1836        .use_bitmap             = use_bitmap,
1837};
1838
1839static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
1840                              struct btrfs_free_space *info)
1841{
1842        struct btrfs_free_space *bitmap_info;
1843        struct btrfs_block_group_cache *block_group = NULL;
1844        int added = 0;
1845        u64 bytes, offset, bytes_added;
1846        int ret;
1847
1848        bytes = info->bytes;
1849        offset = info->offset;
1850
1851        if (!ctl->op->use_bitmap(ctl, info))
1852                return 0;
1853
1854        if (ctl->op == &free_space_op)
1855                block_group = ctl->private;
1856again:
1857        /*
1858         * Since we link bitmaps right into the cluster we need to see if we
1859         * have a cluster here, and if so and it has our bitmap we need to add
1860         * the free space to that bitmap.
1861         */
1862        if (block_group && !list_empty(&block_group->cluster_list)) {
1863                struct btrfs_free_cluster *cluster;
1864                struct rb_node *node;
1865                struct btrfs_free_space *entry;
1866
1867                cluster = list_entry(block_group->cluster_list.next,
1868                                     struct btrfs_free_cluster,
1869                                     block_group_list);
1870                spin_lock(&cluster->lock);
1871                node = rb_first(&cluster->root);
1872                if (!node) {
1873                        spin_unlock(&cluster->lock);
1874                        goto no_cluster_bitmap;
1875                }
1876
1877                entry = rb_entry(node, struct btrfs_free_space, offset_index);
1878                if (!entry->bitmap) {
1879                        spin_unlock(&cluster->lock);
1880                        goto no_cluster_bitmap;
1881                }
1882
1883                if (entry->offset == offset_to_bitmap(ctl, offset)) {
1884                        bytes_added = add_bytes_to_bitmap(ctl, entry,
1885                                                          offset, bytes);
1886                        bytes -= bytes_added;
1887                        offset += bytes_added;
1888                }
1889                spin_unlock(&cluster->lock);
1890                if (!bytes) {
1891                        ret = 1;
1892                        goto out;
1893                }
1894        }
1895
1896no_cluster_bitmap:
1897        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1898                                         1, 0);
1899        if (!bitmap_info) {
1900                ASSERT(added == 0);
1901                goto new_bitmap;
1902        }
1903
1904        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
1905        bytes -= bytes_added;
1906        offset += bytes_added;
1907        added = 0;
1908
1909        if (!bytes) {
1910                ret = 1;
1911                goto out;
1912        } else
1913                goto again;
1914
1915new_bitmap:
1916        if (info && info->bitmap) {
1917                add_new_bitmap(ctl, info, offset);
1918                added = 1;
1919                info = NULL;
1920                goto again;
1921        } else {
1922                spin_unlock(&ctl->tree_lock);
1923
1924                /* no pre-allocated info, allocate a new one */
1925                if (!info) {
1926                        info = kmem_cache_zalloc(btrfs_free_space_cachep,
1927                                                 GFP_NOFS);
1928                        if (!info) {
1929                                spin_lock(&ctl->tree_lock);
1930                                ret = -ENOMEM;
1931                                goto out;
1932                        }
1933                }
1934
1935                /* allocate the bitmap */
1936                info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
1937                spin_lock(&ctl->tree_lock);
1938                if (!info->bitmap) {
1939                        ret = -ENOMEM;
1940                        goto out;
1941                }
1942                goto again;
1943        }
1944
1945out:
1946        if (info) {
1947                if (info->bitmap)
1948                        kfree(info->bitmap);
1949                kmem_cache_free(btrfs_free_space_cachep, info);
1950        }
1951
1952        return ret;
1953}
1954
1955static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
1956                          struct btrfs_free_space *info, bool update_stat)
1957{
1958        struct btrfs_free_space *left_info;
1959        struct btrfs_free_space *right_info;
1960        bool merged = false;
1961        u64 offset = info->offset;
1962        u64 bytes = info->bytes;
1963
1964        /*
1965         * first we want to see if there is free space adjacent to the range we
1966         * are adding; if there is, remove that struct and extend this one to
1967         * cover the entire range
1968         */
1969        right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
1970        if (right_info && rb_prev(&right_info->offset_index))
1971                left_info = rb_entry(rb_prev(&right_info->offset_index),
1972                                     struct btrfs_free_space, offset_index);
1973        else
1974                left_info = tree_search_offset(ctl, offset - 1, 0, 0);
1975
1976        if (right_info && !right_info->bitmap) {
1977                if (update_stat)
1978                        unlink_free_space(ctl, right_info);
1979                else
1980                        __unlink_free_space(ctl, right_info);
1981                info->bytes += right_info->bytes;
1982                kmem_cache_free(btrfs_free_space_cachep, right_info);
1983                merged = true;
1984        }
1985
1986        if (left_info && !left_info->bitmap &&
1987            left_info->offset + left_info->bytes == offset) {
1988                if (update_stat)
1989                        unlink_free_space(ctl, left_info);
1990                else
1991                        __unlink_free_space(ctl, left_info);
1992                info->offset = left_info->offset;
1993                info->bytes += left_info->bytes;
1994                kmem_cache_free(btrfs_free_space_cachep, left_info);
1995                merged = true;
1996        }
1997
1998        return merged;
1999}
2000
2001int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2002                           u64 offset, u64 bytes)
2003{
2004        struct btrfs_free_space *info;
2005        int ret = 0;
2006
2007        info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2008        if (!info)
2009                return -ENOMEM;
2010
2011        info->offset = offset;
2012        info->bytes = bytes;
2013
2014        spin_lock(&ctl->tree_lock);
2015
2016        if (try_merge_free_space(ctl, info, true))
2017                goto link;
2018
2019        /*
2020         * There was no extent directly to the left or right of this new
2021         * extent, so we know we're going to have to allocate a new entry;
2022         * before we do that, see if we need to drop this into a bitmap
2023         */
2024        ret = insert_into_bitmap(ctl, info);
2025        if (ret < 0) {
2026                goto out;
2027        } else if (ret) {
2028                ret = 0;
2029                goto out;
2030        }
2031link:
2032        ret = link_free_space(ctl, info);
2033        if (ret)
2034                kmem_cache_free(btrfs_free_space_cachep, info);
2035out:
2036        spin_unlock(&ctl->tree_lock);
2037
2038        if (ret) {
2039                printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
2040                ASSERT(ret != -EEXIST);
2041        }
2042
2043        return ret;
2044}
2045
2046int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2047                            u64 offset, u64 bytes)
2048{
2049        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2050        struct btrfs_free_space *info;
2051        int ret;
2052        bool re_search = false;
2053
2054        spin_lock(&ctl->tree_lock);
2055
2056again:
2057        ret = 0;
2058        if (!bytes)
2059                goto out_lock;
2060
2061        info = tree_search_offset(ctl, offset, 0, 0);
2062        if (!info) {
2063                /*
2064                 * oops didn't find an extent that matched the space we wanted
2065                 * to remove, look for a bitmap instead
2066                 */
2067                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2068                                          1, 0);
2069                if (!info) {
2070                        /*
2071                         * If we found a partial bit of our free space in a
2072                         * bitmap but then couldn't find the other part, this may
2073                         * be a problem, so WARN about it.
2074                         */
2075                        WARN_ON(re_search);
2076                        goto out_lock;
2077                }
2078        }
2079
2080        re_search = false;
2081        if (!info->bitmap) {
2082                unlink_free_space(ctl, info);
2083                if (offset == info->offset) {
2084                        u64 to_free = min(bytes, info->bytes);
2085
2086                        info->bytes -= to_free;
2087                        info->offset += to_free;
2088                        if (info->bytes) {
2089                                ret = link_free_space(ctl, info);
2090                                WARN_ON(ret);
2091                        } else {
2092                                kmem_cache_free(btrfs_free_space_cachep, info);
2093                        }
2094
2095                        offset += to_free;
2096                        bytes -= to_free;
2097                        goto again;
2098                } else {
2099                        u64 old_end = info->bytes + info->offset;
2100
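                            /*
                             * Removing from the middle or tail of this
                             * extent: keep the front piece by shrinking
                             * info->bytes, and if anything is left past
                             * offset + bytes, re-add that tail further down
                             * via btrfs_add_free_space().
                             */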
2101                        info->bytes = offset - info->offset;
2102                        ret = link_free_space(ctl, info);
2103                        WARN_ON(ret);
2104                        if (ret)
2105                                goto out_lock;
2106
2107                        /* Not enough bytes in this entry to satisfy us */
2108                        if (old_end < offset + bytes) {
2109                                bytes -= old_end - offset;
2110                                offset = old_end;
2111                                goto again;
2112                        } else if (old_end == offset + bytes) {
2113                                /* all done */
2114                                goto out_lock;
2115                        }
2116                        spin_unlock(&ctl->tree_lock);
2117
2118                        ret = btrfs_add_free_space(block_group, offset + bytes,
2119                                                   old_end - (offset + bytes));
2120                        WARN_ON(ret);
2121                        goto out;
2122                }
2123        }
2124
2125        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2126        if (ret == -EAGAIN) {
2127                re_search = true;
2128                goto again;
2129        }
2130out_lock:
2131        spin_unlock(&ctl->tree_lock);
2132out:
2133        return ret;
2134}
2135
2136void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2137                           u64 bytes)
2138{
2139        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2140        struct btrfs_free_space *info;
2141        struct rb_node *n;
2142        int count = 0;
2143
2144        for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2145                info = rb_entry(n, struct btrfs_free_space, offset_index);
2146                if (info->bytes >= bytes && !block_group->ro)
2147                        count++;
2148                btrfs_crit(block_group->fs_info,
2149                           "entry offset %llu, bytes %llu, bitmap %s",
2150                           info->offset, info->bytes,
2151                       (info->bitmap) ? "yes" : "no");
2152        }
2153        btrfs_info(block_group->fs_info, "block group has cluster?: %s",
2154               list_empty(&block_group->cluster_list) ? "no" : "yes");
2155        btrfs_info(block_group->fs_info,
2156                   "%d blocks of free space at or bigger than %llu bytes", count, bytes);
2157}
2158
2159void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2160{
2161        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2162
2163        spin_lock_init(&ctl->tree_lock);
2164        ctl->unit = block_group->sectorsize;
2165        ctl->start = block_group->key.objectid;
2166        ctl->private = block_group;
2167        ctl->op = &free_space_op;
2168
2169        /*
2170         * we only want to have 32k of ram per block group for keeping
2171         * track of free space, and if we pass 1/2 of that we want to
2172         * start converting things over to using bitmaps
2173         */
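            /*
             * As a rough example: on a 64-bit build struct btrfs_free_space
             * is on the order of 64 bytes, so this starts the threshold at
             * roughly 256 extent entries per block group (16KiB / ~64 bytes).
             * recalculate_thresholds() adjusts it later as bitmaps are added.
             */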
2174        ctl->extents_thresh = ((1024 * 32) / 2) /
2175                                sizeof(struct btrfs_free_space);
2176}
2177
2178/*
2179 * for a given cluster, put all of its extents back into the free
2180 * space cache.  If the block group passed doesn't match the block group
2181 * pointed to by the cluster, someone else raced in and freed the
2182 * cluster already.  In that case, we just return without changing anything
2183 */
2184static int
2185__btrfs_return_cluster_to_free_space(
2186                             struct btrfs_block_group_cache *block_group,
2187                             struct btrfs_free_cluster *cluster)
2188{
2189        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2190        struct btrfs_free_space *entry;
2191        struct rb_node *node;
2192
2193        spin_lock(&cluster->lock);
2194        if (cluster->block_group != block_group)
2195                goto out;
2196
2197        cluster->block_group = NULL;
2198        cluster->window_start = 0;
2199        list_del_init(&cluster->block_group_list);
2200
2201        node = rb_first(&cluster->root);
2202        while (node) {
2203                bool bitmap;
2204
2205                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2206                node = rb_next(&entry->offset_index);
2207                rb_erase(&entry->offset_index, &cluster->root);
2208
2209                bitmap = (entry->bitmap != NULL);
2210                if (!bitmap)
2211                        try_merge_free_space(ctl, entry, false);
2212                tree_insert_offset(&ctl->free_space_offset,
2213                                   entry->offset, &entry->offset_index, bitmap);
2214        }
2215        cluster->root = RB_ROOT;
2216
2217out:
2218        spin_unlock(&cluster->lock);
2219        btrfs_put_block_group(block_group);
2220        return 0;
2221}
2222
2223static void __btrfs_remove_free_space_cache_locked(
2224                                struct btrfs_free_space_ctl *ctl)
2225{
2226        struct btrfs_free_space *info;
2227        struct rb_node *node;
2228
2229        while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2230                info = rb_entry(node, struct btrfs_free_space, offset_index);
2231                if (!info->bitmap) {
2232                        unlink_free_space(ctl, info);
2233                        kmem_cache_free(btrfs_free_space_cachep, info);
2234                } else {
2235                        free_bitmap(ctl, info);
2236                }
2237                if (need_resched()) {
2238                        spin_unlock(&ctl->tree_lock);
2239                        cond_resched();
2240                        spin_lock(&ctl->tree_lock);
2241                }
2242        }
2243}
2244
2245void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2246{
2247        spin_lock(&ctl->tree_lock);
2248        __btrfs_remove_free_space_cache_locked(ctl);
2249        spin_unlock(&ctl->tree_lock);
2250}
2251
2252void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2253{
2254        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2255        struct btrfs_free_cluster *cluster;
2256        struct list_head *head;
2257
2258        spin_lock(&ctl->tree_lock);
2259        while ((head = block_group->cluster_list.next) !=
2260               &block_group->cluster_list) {
2261                cluster = list_entry(head, struct btrfs_free_cluster,
2262                                     block_group_list);
2263
2264                WARN_ON(cluster->block_group != block_group);
2265                __btrfs_return_cluster_to_free_space(block_group, cluster);
2266                if (need_resched()) {
2267                        spin_unlock(&ctl->tree_lock);
2268                        cond_resched();
2269                        spin_lock(&ctl->tree_lock);
2270                }
2271        }
2272        __btrfs_remove_free_space_cache_locked(ctl);
2273        spin_unlock(&ctl->tree_lock);
2274
2275}
2276
2277u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2278                               u64 offset, u64 bytes, u64 empty_size,
2279                               u64 *max_extent_size)
2280{
2281        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2282        struct btrfs_free_space *entry = NULL;
2283        u64 bytes_search = bytes + empty_size;
2284        u64 ret = 0;
2285        u64 align_gap = 0;
2286        u64 align_gap_len = 0;
2287
2288        spin_lock(&ctl->tree_lock);
2289        entry = find_free_space(ctl, &offset, &bytes_search,
2290                                block_group->full_stripe_len, max_extent_size);
2291        if (!entry)
2292                goto out;
2293
2294        ret = offset;
2295        if (entry->bitmap) {
2296                bitmap_clear_bits(ctl, entry, offset, bytes);
2297                if (!entry->bytes)
2298                        free_bitmap(ctl, entry);
2299        } else {
2300                unlink_free_space(ctl, entry);
2301                align_gap_len = offset - entry->offset;
2302                align_gap = entry->offset;
2303
2304                entry->offset = offset + bytes;
2305                WARN_ON(entry->bytes < bytes + align_gap_len);
2306
2307                entry->bytes -= bytes + align_gap_len;
2308                if (!entry->bytes)
2309                        kmem_cache_free(btrfs_free_space_cachep, entry);
2310                else
2311                        link_free_space(ctl, entry);
2312        }
2313out:
2314        spin_unlock(&ctl->tree_lock);
2315
2316        if (align_gap_len)
2317                __btrfs_add_free_space(ctl, align_gap, align_gap_len);
2318        return ret;
2319}
2320
2321/*
2322 * given a cluster, put all of its extents back into the free space
2323 * cache.  If a block group is passed, this function will only free
2324 * a cluster that belongs to the passed block group.
2325 *
2326 * Otherwise, it'll get a reference on the block group pointed to by the
2327 * cluster and remove the cluster from it.
2328 */
2329int btrfs_return_cluster_to_free_space(
2330                               struct btrfs_block_group_cache *block_group,
2331                               struct btrfs_free_cluster *cluster)
2332{
2333        struct btrfs_free_space_ctl *ctl;
2334        int ret;
2335
2336        /* first, get a safe pointer to the block group */
2337        spin_lock(&cluster->lock);
2338        if (!block_group) {
2339                block_group = cluster->block_group;
2340                if (!block_group) {
2341                        spin_unlock(&cluster->lock);
2342                        return 0;
2343                }
2344        } else if (cluster->block_group != block_group) {
2345                /* someone else has already freed it, don't redo their work */
2346                spin_unlock(&cluster->lock);
2347                return 0;
2348        }
2349        atomic_inc(&block_group->count);
2350        spin_unlock(&cluster->lock);
2351
2352        ctl = block_group->free_space_ctl;
2353
2354        /* now return any extents the cluster had on it */
2355        spin_lock(&ctl->tree_lock);
2356        ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2357        spin_unlock(&ctl->tree_lock);
2358
2359        /* finally drop our ref */
2360        btrfs_put_block_group(block_group);
2361        return ret;
2362}
2363
2364static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2365                                   struct btrfs_free_cluster *cluster,
2366                                   struct btrfs_free_space *entry,
2367                                   u64 bytes, u64 min_start,
2368                                   u64 *max_extent_size)
2369{
2370        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2371        int err;
2372        u64 search_start = cluster->window_start;
2373        u64 search_bytes = bytes;
2374        u64 ret = 0;
2375
2376        search_start = min_start;
2377        search_bytes = bytes;
2378
2379        err = search_bitmap(ctl, entry, &search_start, &search_bytes);
2380        if (err) {
2381                if (search_bytes > *max_extent_size)
2382                        *max_extent_size = search_bytes;
2383                return 0;
2384        }
2385
2386        ret = search_start;
2387        __bitmap_clear_bits(ctl, entry, ret, bytes);
2388
2389        return ret;
2390}
2391
2392/*
2393 * given a cluster, try to allocate 'bytes' from it, returns 0
2394 * if it couldn't find anything suitably large, or a logical disk offset
2395 * if things worked out
2396 */
2397u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2398                             struct btrfs_free_cluster *cluster, u64 bytes,
2399                             u64 min_start, u64 *max_extent_size)
2400{
2401        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2402        struct btrfs_free_space *entry = NULL;
2403        struct rb_node *node;
2404        u64 ret = 0;
2405
2406        spin_lock(&cluster->lock);
2407        if (bytes > cluster->max_size)
2408                goto out;
2409
2410        if (cluster->block_group != block_group)
2411                goto out;
2412
2413        node = rb_first(&cluster->root);
2414        if (!node)
2415                goto out;
2416
2417        entry = rb_entry(node, struct btrfs_free_space, offset_index);
2418        while (1) {
2419                if (entry->bytes < bytes && entry->bytes > *max_extent_size)
2420                        *max_extent_size = entry->bytes;
2421
2422                if (entry->bytes < bytes ||
2423                    (!entry->bitmap && entry->offset < min_start)) {
2424                        node = rb_next(&entry->offset_index);
2425                        if (!node)
2426                                break;
2427                        entry = rb_entry(node, struct btrfs_free_space,
2428                                         offset_index);
2429                        continue;
2430                }
2431
2432                if (entry->bitmap) {
2433                        ret = btrfs_alloc_from_bitmap(block_group,
2434                                                      cluster, entry, bytes,
2435                                                      cluster->window_start,
2436                                                      max_extent_size);
2437                        if (ret == 0) {
2438                                node = rb_next(&entry->offset_index);
2439                                if (!node)
2440                                        break;
2441                                entry = rb_entry(node, struct btrfs_free_space,
2442                                                 offset_index);
2443                                continue;
2444                        }
2445                        cluster->window_start += bytes;
2446                } else {
2447                        ret = entry->offset;
2448
2449                        entry->offset += bytes;
2450                        entry->bytes -= bytes;
2451                }
2452
2453                if (entry->bytes == 0)
2454                        rb_erase(&entry->offset_index, &cluster->root);
2455                break;
2456        }
2457out:
2458        spin_unlock(&cluster->lock);
2459
2460        if (!ret)
2461                return 0;
2462
2463        spin_lock(&ctl->tree_lock);
2464
2465        ctl->free_space -= bytes;
2466        if (entry->bytes == 0) {
2467                ctl->free_extents--;
2468                if (entry->bitmap) {
2469                        kfree(entry->bitmap);
2470                        ctl->total_bitmaps--;
2471                        ctl->op->recalc_thresholds(ctl);
2472                }
2473                kmem_cache_free(btrfs_free_space_cachep, entry);
2474        }
2475
2476        spin_unlock(&ctl->tree_lock);
2477
2478        return ret;
2479}
2480
2481static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2482                                struct btrfs_free_space *entry,
2483                                struct btrfs_free_cluster *cluster,
2484                                u64 offset, u64 bytes,
2485                                u64 cont1_bytes, u64 min_bytes)
2486{
2487        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2488        unsigned long next_zero;
2489        unsigned long i;
2490        unsigned long want_bits;
2491        unsigned long min_bits;
2492        unsigned long found_bits;
2493        unsigned long start = 0;
2494        unsigned long total_found = 0;
2495        int ret;
2496
2497        i = offset_to_bit(entry->offset, ctl->unit,
2498                          max_t(u64, offset, entry->offset));
2499        want_bits = bytes_to_bits(bytes, ctl->unit);
2500        min_bits = bytes_to_bits(min_bytes, ctl->unit);
2501
2502again:
2503        found_bits = 0;
2504        for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2505                next_zero = find_next_zero_bit(entry->bitmap,
2506                                               BITS_PER_BITMAP, i);
2507                if (next_zero - i >= min_bits) {
2508                        found_bits = next_zero - i;
2509                        break;
2510                }
2511                i = next_zero;
2512        }
2513
2514        if (!found_bits)
2515                return -ENOSPC;
2516
2517        if (!total_found) {
2518                start = i;
2519                cluster->max_size = 0;
2520        }
2521
2522        total_found += found_bits;
2523
2524        if (cluster->max_size < found_bits * ctl->unit)
2525                cluster->max_size = found_bits * ctl->unit;
2526
2527        if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2528                i = next_zero + 1;
2529                goto again;
2530        }
2531
2532        cluster->window_start = start * ctl->unit + entry->offset;
2533        rb_erase(&entry->offset_index, &ctl->free_space_offset);
2534        ret = tree_insert_offset(&cluster->root, entry->offset,
2535                                 &entry->offset_index, 1);
2536        ASSERT(!ret); /* -EEXIST; Logic error */
2537
2538        trace_btrfs_setup_cluster(block_group, cluster,
2539                                  total_found * ctl->unit, 1);
2540        return 0;
2541}
2542
2543/*
2544 * This searches the block group for just extents to fill the cluster with.
2545 * Try to find a cluster with at least bytes total bytes, at least one
2546 * extent of cont1_bytes, and other extents of at least min_bytes.
2547 */
2548static noinline int
2549setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2550                        struct btrfs_free_cluster *cluster,
2551                        struct list_head *bitmaps, u64 offset, u64 bytes,
2552                        u64 cont1_bytes, u64 min_bytes)
2553{
2554        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2555        struct btrfs_free_space *first = NULL;
2556        struct btrfs_free_space *entry = NULL;
2557        struct btrfs_free_space *last;
2558        struct rb_node *node;
2559        u64 window_free;
2560        u64 max_extent;
2561        u64 total_size = 0;
2562
2563        entry = tree_search_offset(ctl, offset, 0, 1);
2564        if (!entry)
2565                return -ENOSPC;
2566
2567        /*
2568         * We don't want bitmaps, so just move along until we find a normal
2569         * extent entry.
2570         */
2571        while (entry->bitmap || entry->bytes < min_bytes) {
2572                if (entry->bitmap && list_empty(&entry->list))
2573                        list_add_tail(&entry->list, bitmaps);
2574                node = rb_next(&entry->offset_index);
2575                if (!node)
2576                        return -ENOSPC;
2577                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2578        }
2579
2580        window_free = entry->bytes;
2581        max_extent = entry->bytes;
2582        first = entry;
2583        last = entry;
2584
2585        for (node = rb_next(&entry->offset_index); node;
2586             node = rb_next(&entry->offset_index)) {
2587                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2588
2589                if (entry->bitmap) {
2590                        if (list_empty(&entry->list))
2591                                list_add_tail(&entry->list, bitmaps);
2592                        continue;
2593                }
2594
2595                if (entry->bytes < min_bytes)
2596                        continue;
2597
2598                last = entry;
2599                window_free += entry->bytes;
2600                if (entry->bytes > max_extent)
2601                        max_extent = entry->bytes;
2602        }
2603
2604        if (window_free < bytes || max_extent < cont1_bytes)
2605                return -ENOSPC;
2606
2607        cluster->window_start = first->offset;
2608
2609        node = &first->offset_index;
2610
2611        /*
2612         * now we've found our entries, pull them out of the free space
2613         * cache and put them into the cluster rbtree
2614         */
2615        do {
2616                int ret;
2617
2618                entry = rb_entry(node, struct btrfs_free_space, offset_index);
2619                node = rb_next(&entry->offset_index);
2620                if (entry->bitmap || entry->bytes < min_bytes)
2621                        continue;
2622
2623                rb_erase(&entry->offset_index, &ctl->free_space_offset);
2624                ret = tree_insert_offset(&cluster->root, entry->offset,
2625                                         &entry->offset_index, 0);
2626                total_size += entry->bytes;
2627                ASSERT(!ret); /* -EEXIST; Logic error */
2628        } while (node && entry != last);
2629
2630        cluster->max_size = max_extent;
2631        trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2632        return 0;
2633}
2634
2635/*
2636 * This specifically looks for bitmaps that may work in the cluster; we assume
2637 * that we have already failed to find extents that will work.
2638 */
2639static noinline int
2640setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2641                     struct btrfs_free_cluster *cluster,
2642                     struct list_head *bitmaps, u64 offset, u64 bytes,
2643                     u64 cont1_bytes, u64 min_bytes)
2644{
2645        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2646        struct btrfs_free_space *entry;
2647        int ret = -ENOSPC;
2648        u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2649
2650        if (ctl->total_bitmaps == 0)
2651                return -ENOSPC;
2652
2653        /*
2654         * The bitmap that covers offset won't be in the list unless offset
2655         * is just its start offset.
2656         */
2657        entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2658        if (entry->offset != bitmap_offset) {
2659                entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2660                if (entry && list_empty(&entry->list))
2661                        list_add(&entry->list, bitmaps);
2662        }
2663
2664        list_for_each_entry(entry, bitmaps, list) {
2665                if (entry->bytes < bytes)
2666                        continue;
2667                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2668                                           bytes, cont1_bytes, min_bytes);
2669                if (!ret)
2670                        return 0;
2671        }
2672
2673        /*
2674         * The bitmaps list has all the bitmaps that record free space
2675         * starting after offset, so no more search is required.
2676         */
2677        return -ENOSPC;
2678}
2679
2680/*
2681 * here we try to find a cluster of blocks in a block group.  The goal
2682 * is to find at least bytes+empty_size.
2683 * We might not find them all in one contiguous area.
2684 *
2685 * returns zero and sets up cluster if things worked out, otherwise
2686 * it returns -ENOSPC
2687 */
2688int btrfs_find_space_cluster(struct btrfs_root *root,
2689                             struct btrfs_block_group_cache *block_group,
2690                             struct btrfs_free_cluster *cluster,
2691                             u64 offset, u64 bytes, u64 empty_size)
2692{
2693        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2694        struct btrfs_free_space *entry, *tmp;
2695        LIST_HEAD(bitmaps);
2696        u64 min_bytes;
2697        u64 cont1_bytes;
2698        int ret;
2699
2700        /*
2701         * Choose the minimum extent size we'll require for this
2702         * cluster.  For SSD_SPREAD, don't allow any fragmentation.
2703         * For metadata, allow allocations with smaller extents.  For
2704         * data, keep it dense.
2705         */
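            /*
             * For example, a data allocation of bytes = 1MiB with
             * empty_size = 3MiB asks setup_cluster_*() below for 4MiB in
             * total, with at least one extent of cont1_bytes =
             * max(1MiB, 4MiB >> 2) = 1MiB and the rest in pieces of at least
             * one sector.  With SSD_SPREAD the whole 4MiB must be contiguous.
             */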
2706        if (btrfs_test_opt(root, SSD_SPREAD)) {
2707                cont1_bytes = min_bytes = bytes + empty_size;
2708        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2709                cont1_bytes = bytes;
2710                min_bytes = block_group->sectorsize;
2711        } else {
2712                cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
2713                min_bytes = block_group->sectorsize;
2714        }
2715
2716        spin_lock(&ctl->tree_lock);
2717
2718        /*
2719         * If we know we don't have enough space to make a cluster don't even
2720         * bother doing all the work to try and find one.
2721         */
2722        if (ctl->free_space < bytes) {
2723                spin_unlock(&ctl->tree_lock);
2724                return -ENOSPC;
2725        }
2726
2727        spin_lock(&cluster->lock);
2728
2729        /* someone already found a cluster, hooray */
2730        if (cluster->block_group) {
2731                ret = 0;
2732                goto out;
2733        }
2734
2735        trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2736                                 min_bytes);
2737
2738        INIT_LIST_HEAD(&bitmaps);
2739        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2740                                      bytes + empty_size,
2741                                      cont1_bytes, min_bytes);
2742        if (ret)
2743                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2744                                           offset, bytes + empty_size,
2745                                           cont1_bytes, min_bytes);
2746
2747        /* Clear our temporary list */
2748        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
2749                list_del_init(&entry->list);
2750
2751        if (!ret) {
2752                atomic_inc(&block_group->count);
2753                list_add_tail(&cluster->block_group_list,
2754                              &block_group->cluster_list);
2755                cluster->block_group = block_group;
2756        } else {
2757                trace_btrfs_failed_cluster_setup(block_group);
2758        }
2759out:
2760        spin_unlock(&cluster->lock);
2761        spin_unlock(&ctl->tree_lock);
2762
2763        return ret;
2764}
2765
2766/*
2767 * simple code to zero out a cluster
2768 */
2769void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2770{
2771        spin_lock_init(&cluster->lock);
2772        spin_lock_init(&cluster->refill_lock);
2773        cluster->root = RB_ROOT;
2774        cluster->max_size = 0;
2775        INIT_LIST_HEAD(&cluster->block_group_list);
2776        cluster->block_group = NULL;
2777}
2778
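    /*
     * Helper for the trim loops below: the caller has already pulled the
     * range out of the free space cache, so account it as reserved while the
     * discard runs (keeping the space_info counters consistent) and then add
     * it back to the free space cache when the discard is done.
     */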
2779static int do_trimming(struct btrfs_block_group_cache *block_group,
2780                       u64 *total_trimmed, u64 start, u64 bytes,
2781                       u64 reserved_start, u64 reserved_bytes)
2782{
2783        struct btrfs_space_info *space_info = block_group->space_info;
2784        struct btrfs_fs_info *fs_info = block_group->fs_info;
2785        int ret;
2786        int update = 0;
2787        u64 trimmed = 0;
2788
2789        spin_lock(&space_info->lock);
2790        spin_lock(&block_group->lock);
2791        if (!block_group->ro) {
2792                block_group->reserved += reserved_bytes;
2793                space_info->bytes_reserved += reserved_bytes;
2794                update = 1;
2795        }
2796        spin_unlock(&block_group->lock);
2797        spin_unlock(&space_info->lock);
2798
2799        ret = btrfs_error_discard_extent(fs_info->extent_root,
2800                                         start, bytes, &trimmed);
2801        if (!ret)
2802                *total_trimmed += trimmed;
2803
2804        btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2805
2806        if (update) {
2807                spin_lock(&space_info->lock);
2808                spin_lock(&block_group->lock);
2809                if (block_group->ro)
2810                        space_info->bytes_readonly += reserved_bytes;
2811                block_group->reserved -= reserved_bytes;
2812                space_info->bytes_reserved -= reserved_bytes;
2813                spin_unlock(&space_info->lock);
2814                spin_unlock(&block_group->lock);
2815        }
2816
2817        return ret;
2818}
2819
2820static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2821                          u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2822{
2823        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2824        struct btrfs_free_space *entry;
2825        struct rb_node *node;
2826        int ret = 0;
2827        u64 extent_start;
2828        u64 extent_bytes;
2829        u64 bytes;
2830
2831        while (start < end) {
2832                spin_lock(&ctl->tree_lock);
2833
2834                if (ctl->free_space < minlen) {
2835                        spin_unlock(&ctl->tree_lock);
2836                        break;
2837                }
2838
2839                entry = tree_search_offset(ctl, start, 0, 1);
2840                if (!entry) {
2841                        spin_unlock(&ctl->tree_lock);
2842                        break;
2843                }
2844
2845                /* skip bitmaps */
2846                while (entry->bitmap) {
2847                        node = rb_next(&entry->offset_index);
2848                        if (!node) {
2849                                spin_unlock(&ctl->tree_lock);
2850                                goto out;
2851                        }
2852                        entry = rb_entry(node, struct btrfs_free_space,
2853                                         offset_index);
2854                }
2855
2856                if (entry->offset >= end) {
2857                        spin_unlock(&ctl->tree_lock);
2858                        break;
2859                }
2860
2861                extent_start = entry->offset;
2862                extent_bytes = entry->bytes;
2863                start = max(start, extent_start);
2864                bytes = min(extent_start + extent_bytes, end) - start;
2865                if (bytes < minlen) {
2866                        spin_unlock(&ctl->tree_lock);
2867                        goto next;
2868                }
2869
2870                unlink_free_space(ctl, entry);
2871                kmem_cache_free(btrfs_free_space_cachep, entry);
2872
2873                spin_unlock(&ctl->tree_lock);
2874
2875                ret = do_trimming(block_group, total_trimmed, start, bytes,
2876                                  extent_start, extent_bytes);
2877                if (ret)
2878                        break;
2879next:
2880                start += bytes;
2881
2882                if (fatal_signal_pending(current)) {
2883                        ret = -ERESTARTSYS;
2884                        break;
2885                }
2886
2887                cond_resched();
2888        }
2889out:
2890        return ret;
2891}
2892
2893static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
2894                        u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2895{
2896        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2897        struct btrfs_free_space *entry;
2898        int ret = 0;
2899        int ret2;
2900        u64 bytes;
2901        u64 offset = offset_to_bitmap(ctl, start);
2902
2903        while (offset < end) {
2904                bool next_bitmap = false;
2905
2906                spin_lock(&ctl->tree_lock);
2907
2908                if (ctl->free_space < minlen) {
2909                        spin_unlock(&ctl->tree_lock);
2910                        break;
2911                }
2912
2913                entry = tree_search_offset(ctl, offset, 1, 0);
2914                if (!entry) {
2915                        spin_unlock(&ctl->tree_lock);
2916                        next_bitmap = true;
2917                        goto next;
2918                }
2919
2920                bytes = minlen;
2921                ret2 = search_bitmap(ctl, entry, &start, &bytes);
2922                if (ret2 || start >= end) {
2923                        spin_unlock(&ctl->tree_lock);
2924                        next_bitmap = true;
2925                        goto next;
2926                }
2927
2928                bytes = min(bytes, end - start);
2929                if (bytes < minlen) {
2930                        spin_unlock(&ctl->tree_lock);
2931                        goto next;
2932                }
2933
2934                bitmap_clear_bits(ctl, entry, start, bytes);
2935                if (entry->bytes == 0)
2936                        free_bitmap(ctl, entry);
2937
2938                spin_unlock(&ctl->tree_lock);
2939
2940                ret = do_trimming(block_group, total_trimmed, start, bytes,
2941                                  start, bytes);
2942                if (ret)
2943                        break;
2944next:
2945                if (next_bitmap) {
2946                        offset += BITS_PER_BITMAP * ctl->unit;
2947                } else {
2948                        start += bytes;
2949                        if (start >= offset + BITS_PER_BITMAP * ctl->unit)
2950                                offset += BITS_PER_BITMAP * ctl->unit;
2951                }
2952
2953                if (fatal_signal_pending(current)) {
2954                        ret = -ERESTARTSYS;
2955                        break;
2956                }
2957
2958                cond_resched();
2959        }
2960
2961        return ret;
2962}
2963
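/*
 * Discard the free space in [start, end) of a block group: first the space
 * tracked by plain extent entries, then the space tracked by bitmaps.
 * Ranges shorter than @minlen are skipped; the number of bytes actually
 * discarded is returned in @trimmed.
 */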
2964int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2965                           u64 *trimmed, u64 start, u64 end, u64 minlen)
2966{
2967        int ret;
2968
2969        *trimmed = 0;
2970
2971        ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2972        if (ret)
2973                return ret;
2974
2975        ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2976
2977        return ret;
2978}
2979
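/*
 * Illustrative sketch only: the FITRIM ioctl path (btrfs_trim_fs) walks the
 * block groups and calls btrfs_trim_block_group() for the portion of each
 * group that overlaps the requested range, roughly as below.  The variable
 * names and the clamping are assumptions for the example, not code copied
 * from extent-tree.c:
 *
 *	u64 group_trimmed;
 *	u64 start = max(range->start, cache->key.objectid);
 *	u64 end = min(range->start + range->len,
 *		      cache->key.objectid + cache->key.offset);
 *
 *	ret = btrfs_trim_block_group(cache, &group_trimmed, start, end,
 *				     range->minlen);
 *	trimmed += group_trimmed;
 */
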
2980/*
2981 * Find the left-most item in the cache tree, and then return the
2982 * smallest inode number in the item.
2983 *
2984 * Note: the returned inode number may not be the smallest one in
2985 * the tree, if the left-most item is a bitmap.
2986 */
2987u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
2988{
2989        struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
2990        struct btrfs_free_space *entry = NULL;
2991        u64 ino = 0;
2992
2993        spin_lock(&ctl->tree_lock);
2994
2995        if (RB_EMPTY_ROOT(&ctl->free_space_offset))
2996                goto out;
2997
2998        entry = rb_entry(rb_first(&ctl->free_space_offset),
2999                         struct btrfs_free_space, offset_index);
3000
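        /*
         * For a plain extent entry, hand out its first byte as the inode
         * number and shrink the entry by one; for a bitmap entry, hand out
         * the first free offset recorded in the bitmap instead.
         */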
3001        if (!entry->bitmap) {
3002                ino = entry->offset;
3003
3004                unlink_free_space(ctl, entry);
3005                entry->offset++;
3006                entry->bytes--;
3007                if (!entry->bytes)
3008                        kmem_cache_free(btrfs_free_space_cachep, entry);
3009                else
3010                        link_free_space(ctl, entry);
3011        } else {
3012                u64 offset = 0;
3013                u64 count = 1;
3014                int ret;
3015
3016                ret = search_bitmap(ctl, entry, &offset, &count);
3017                /* Logic error; search fails only if the bitmap is empty */
3018                ASSERT(!ret);
3019
3020                ino = offset;
3021                bitmap_clear_bits(ctl, entry, offset, 1);
3022                if (entry->bytes == 0)
3023                        free_bitmap(ctl, entry);
3024        }
3025out:
3026        spin_unlock(&ctl->tree_lock);
3027
3028        return ino;
3029}
3030
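/*
 * Return the inode that backs the free-ino cache for this root, with a
 * reference held.  The first lookup reads it from disk; after that it is
 * cached in root->cache_inode (unless the filesystem is being unmounted).
 */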
3031struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3032                                    struct btrfs_path *path)
3033{
3034        struct inode *inode = NULL;
3035
3036        spin_lock(&root->cache_lock);
3037        if (root->cache_inode)
3038                inode = igrab(root->cache_inode);
3039        spin_unlock(&root->cache_lock);
3040        if (inode)
3041                return inode;
3042
3043        inode = __lookup_free_space_inode(root, path, 0);
3044        if (IS_ERR(inode))
3045                return inode;
3046
3047        spin_lock(&root->cache_lock);
3048        if (!btrfs_fs_closing(root->fs_info))
3049                root->cache_inode = igrab(inode);
3050        spin_unlock(&root->cache_lock);
3051
3052        return inode;
3053}
3054
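/* Create the inode that will back the free-ino cache for this root. */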
3055int create_free_ino_inode(struct btrfs_root *root,
3056                          struct btrfs_trans_handle *trans,
3057                          struct btrfs_path *path)
3058{
3059        return __create_free_space_inode(root, trans, path,
3060                                         BTRFS_FREE_INO_OBJECTID, 0);
3061}
3062
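/*
 * Load the on-disk free-ino cache for @root into root->free_ino_ctl.  The
 * cache is only used when the inode map cache is enabled and its generation
 * matches the root's; if we're unmounting, or the cache is missing or stale,
 * this quietly does nothing.
 */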
3063int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3064{
3065        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3066        struct btrfs_path *path;
3067        struct inode *inode;
3068        int ret = 0;
3069        u64 root_gen = btrfs_root_generation(&root->root_item);
3070
3071        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3072                return 0;
3073
3074        /*
3075         * If we're unmounting then just return, since this does a search on the
3076         * normal root and not the commit root, and we could deadlock.
3077         */
3078        if (btrfs_fs_closing(fs_info))
3079                return 0;
3080
3081        path = btrfs_alloc_path();
3082        if (!path)
3083                return 0;
3084
3085        inode = lookup_free_ino_inode(root, path);
3086        if (IS_ERR(inode))
3087                goto out;
3088
3089        if (root_gen != BTRFS_I(inode)->generation)
3090                goto out_put;
3091
3092        ret = __load_free_space_cache(root, inode, ctl, path, 0);
3093
3094        if (ret < 0)
3095                btrfs_err(fs_info,
3096                        "failed to load free ino cache for root %llu",
3097                        root->root_key.objectid);
3098out_put:
3099        iput(inode);
3100out:
3101        btrfs_free_path(path);
3102        return ret;
3103}
3104
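/*
 * Write root->free_ino_ctl out to the free-ino cache inode.  On failure the
 * delalloc metadata reservation for the cache inode is released and, on
 * DEBUG builds, an error is logged.
 */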
3105int btrfs_write_out_ino_cache(struct btrfs_root *root,
3106                              struct btrfs_trans_handle *trans,
3107                              struct btrfs_path *path,
3108                              struct inode *inode)
3109{
3110        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3111        int ret;
3112
3113        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3114                return 0;
3115
3116        ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
3117        if (ret) {
3118                btrfs_delalloc_release_metadata(inode, inode->i_size);
3119#ifdef DEBUG
3120                btrfs_err(root->fs_info,
3121                        "failed to write free ino cache for root %llu",
3122                        root->root_key.objectid);
3123#endif
3124        }
3125
3126        return ret;
3127}
3128
3129#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3130/*
3131 * Use this if you need to make a bitmap or extent entry specifically.  It
3132 * doesn't do any of the merging that add_free_space does; it acts much like
3133 * the free space cache loading code does, so you can end up with really weird
3134 * configurations.
3135 */
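/*
 * Usage sketch (illustrative only; the offsets and sizes below are made up,
 * not taken from the actual self-tests):
 *
 *	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 0);
 *	if (ret)
 *		return ret;
 *	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024,
 *					1 * 1024 * 1024, 1);
 */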
3136int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
3137                              u64 offset, u64 bytes, bool bitmap)
3138{
3139        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3140        struct btrfs_free_space *info = NULL, *bitmap_info;
3141        void *map = NULL;
3142        u64 bytes_added;
3143        int ret;
3144
3145again:
3146        if (!info) {
3147                info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3148                if (!info)
3149                        return -ENOMEM;
3150        }
3151
3152        if (!bitmap) {
3153                spin_lock(&ctl->tree_lock);
3154                info->offset = offset;
3155                info->bytes = bytes;
3156                ret = link_free_space(ctl, info);
3157                spin_unlock(&ctl->tree_lock);
3158                if (ret)
3159                        kmem_cache_free(btrfs_free_space_cachep, info);
3160                return ret;
3161        }
3162
3163        if (!map) {
3164                map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
3165                if (!map) {
3166                        kmem_cache_free(btrfs_free_space_cachep, info);
3167                        return -ENOMEM;
3168                }
3169        }
3170
3171        spin_lock(&ctl->tree_lock);
3172        bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3173                                         1, 0);
3174        if (!bitmap_info) {
3175                info->bitmap = map;
3176                map = NULL;
3177                add_new_bitmap(ctl, info, offset);
3178                bitmap_info = info;
3179        }
3180
3181        bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3182        bytes -= bytes_added;
3183        offset += bytes_added;
3184        spin_unlock(&ctl->tree_lock);
3185
3186        if (bytes)
3187                goto again;
3188
3189        kfree(map);
3191        return 0;
3192}
3193
3194/*
3195 * Check whether the given range is present in the free space cache.  This is
3196 * really just used to check for the absence of space, so if there is any free
3197 * space in the range at all we return 1.
3198 */
3199int test_check_exists(struct btrfs_block_group_cache *cache,
3200                      u64 offset, u64 bytes)
3201{
3202        struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3203        struct btrfs_free_space *info;
3204        int ret = 0;
3205
3206        spin_lock(&ctl->tree_lock);
3207        info = tree_search_offset(ctl, offset, 0, 0);
3208        if (!info) {
3209                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3210                                          1, 0);
3211                if (!info)
3212                        goto out;
3213        }
3214
3215have_info:
3216        if (info->bitmap) {
3217                u64 bit_off, bit_bytes;
3218                struct rb_node *n;
3219                struct btrfs_free_space *tmp;
3220
3221                bit_off = offset;
3222                bit_bytes = ctl->unit;
3223                ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
3224                if (!ret) {
3225                        if (bit_off == offset) {
3226                                ret = 1;
3227                                goto out;
3228                        } else if (bit_off > offset &&
3229                                   offset + bytes > bit_off) {
3230                                ret = 1;
3231                                goto out;
3232                        }
3233                }
3234
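                /*
                 * This bitmap had no overlapping free space; walk the
                 * neighbouring entries in case the range spills over into
                 * one of them.
                 */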
3235                n = rb_prev(&info->offset_index);
3236                while (n) {
3237                        tmp = rb_entry(n, struct btrfs_free_space,
3238                                       offset_index);
3239                        if (tmp->offset + tmp->bytes < offset)
3240                                break;
3241                        if (offset + bytes < tmp->offset) {
3242                                n = rb_prev(&tmp->offset_index);
3243                                continue;
3244                        }
3245                        info = tmp;
3246                        goto have_info;
3247                }
3248
3249                n = rb_next(&info->offset_index);
3250                while (n) {
3251                        tmp = rb_entry(n, struct btrfs_free_space,
3252                                       offset_index);
3253                        if (offset + bytes < tmp->offset)
3254                                break;
3255                        if (tmp->offset + tmp->bytes < offset) {
3256                                n = rb_next(&tmp->offset_index);
3257                                continue;
3258                        }
3259                        info = tmp;
3260                        goto have_info;
3261                }
3262
3263                goto out;
3264        }
3265
3266        if (info->offset == offset) {
3267                ret = 1;
3268                goto out;
3269        }
3270
3271        if (offset > info->offset && offset < info->offset + info->bytes)
3272                ret = 1;
3273out:
3274        spin_unlock(&ctl->tree_lock);
3275        return ret;
3276}
3277#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3278