linux/fs/ext3/balloc.c
<<
>>
Prefs
   1/*
   2 *  linux/fs/ext3/balloc.c
   3 *
   4 * Copyright (C) 1992, 1993, 1994, 1995
   5 * Remy Card (card@masi.ibp.fr)
   6 * Laboratoire MASI - Institut Blaise Pascal
   7 * Universite Pierre et Marie Curie (Paris VI)
   8 *
   9 *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
  10 *  Big-endian to little-endian byte-swapping/bitmaps by
  11 *        David S. Miller (davem@caip.rutgers.edu), 1995
  12 */
  13
  14#include <linux/time.h>
  15#include <linux/capability.h>
  16#include <linux/fs.h>
  17#include <linux/jbd.h>
  18#include <linux/ext3_fs.h>
  19#include <linux/ext3_jbd.h>
  20#include <linux/quotaops.h>
  21#include <linux/buffer_head.h>
  22
  23/*
  24 * balloc.c contains the blocks allocation and deallocation routines
  25 */
  26
  27/*
  28 * The free blocks are managed by bitmaps.  A file system contains several
  29 * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
  30 * block for inodes, N blocks for the inode table and data blocks.
  31 *
  32 * The file system contains group descriptors which are located after the
  33 * super block.  Each descriptor contains the number of the bitmap block and
  34 * the free blocks count in the block.  The descriptors are loaded in memory
  35 * when a file system is mounted (see ext3_fill_super).
  36 */
  37
  38
  39#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
  40
  41/**
  42 * ext3_get_group_desc() -- load group descriptor from disk
  43 * @sb:                 super block
  44 * @block_group:        given block group
  45 * @bh:                 pointer to the buffer head to store the block
  46 *                      group descriptor
  47 */
  48struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
  49                                             unsigned int block_group,
  50                                             struct buffer_head ** bh)
  51{
  52        unsigned long group_desc;
  53        unsigned long offset;
  54        struct ext3_group_desc * desc;
  55        struct ext3_sb_info *sbi = EXT3_SB(sb);
  56
  57        if (block_group >= sbi->s_groups_count) {
  58                ext3_error (sb, "ext3_get_group_desc",
  59                            "block_group >= groups_count - "
  60                            "block_group = %d, groups_count = %lu",
  61                            block_group, sbi->s_groups_count);
  62
  63                return NULL;
  64        }
  65        smp_rmb();
  66
  67        group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
  68        offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
  69        if (!sbi->s_group_desc[group_desc]) {
  70                ext3_error (sb, "ext3_get_group_desc",
  71                            "Group descriptor not loaded - "
  72                            "block_group = %d, group_desc = %lu, desc = %lu",
  73                             block_group, group_desc, offset);
  74                return NULL;
  75        }
  76
  77        desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
  78        if (bh)
  79                *bh = sbi->s_group_desc[group_desc];
  80        return desc + offset;
  81}
  82
  83static int ext3_valid_block_bitmap(struct super_block *sb,
  84                                        struct ext3_group_desc *desc,
  85                                        unsigned int block_group,
  86                                        struct buffer_head *bh)
  87{
  88        ext3_grpblk_t offset;
  89        ext3_grpblk_t next_zero_bit;
  90        ext3_fsblk_t bitmap_blk;
  91        ext3_fsblk_t group_first_block;
  92
  93        group_first_block = ext3_group_first_block_no(sb, block_group);
  94
  95        /* check whether block bitmap block number is set */
  96        bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
  97        offset = bitmap_blk - group_first_block;
  98        if (!ext3_test_bit(offset, bh->b_data))
  99                /* bad block bitmap */
 100                goto err_out;
 101
 102        /* check whether the inode bitmap block number is set */
 103        bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
 104        offset = bitmap_blk - group_first_block;
 105        if (!ext3_test_bit(offset, bh->b_data))
 106                /* bad block bitmap */
 107                goto err_out;
 108
 109        /* check whether the inode table block number is set */
 110        bitmap_blk = le32_to_cpu(desc->bg_inode_table);
 111        offset = bitmap_blk - group_first_block;
 112        next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
 113                                offset + EXT3_SB(sb)->s_itb_per_group,
 114                                offset);
 115        if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
 116                /* good bitmap for inode tables */
 117                return 1;
 118
 119err_out:
 120        ext3_error(sb, __func__,
 121                        "Invalid block bitmap - "
 122                        "block_group = %d, block = %lu",
 123                        block_group, bitmap_blk);
 124        return 0;
 125}
 126
 127/**
 128 * read_block_bitmap()
 129 * @sb:                 super block
 130 * @block_group:        given block group
 131 *
 132 * Read the bitmap for a given block_group,and validate the
 133 * bits for block/inode/inode tables are set in the bitmaps
 134 *
 135 * Return buffer_head on success or NULL in case of failure.
 136 */
 137static struct buffer_head *
 138read_block_bitmap(struct super_block *sb, unsigned int block_group)
 139{
 140        struct ext3_group_desc * desc;
 141        struct buffer_head * bh = NULL;
 142        ext3_fsblk_t bitmap_blk;
 143
 144        desc = ext3_get_group_desc(sb, block_group, NULL);
 145        if (!desc)
 146                return NULL;
 147        bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
 148        bh = sb_getblk(sb, bitmap_blk);
 149        if (unlikely(!bh)) {
 150                ext3_error(sb, __func__,
 151                            "Cannot read block bitmap - "
 152                            "block_group = %d, block_bitmap = %u",
 153                            block_group, le32_to_cpu(desc->bg_block_bitmap));
 154                return NULL;
 155        }
 156        if (likely(bh_uptodate_or_lock(bh)))
 157                return bh;
 158
 159        if (bh_submit_read(bh) < 0) {
 160                brelse(bh);
 161                ext3_error(sb, __func__,
 162                            "Cannot read block bitmap - "
 163                            "block_group = %d, block_bitmap = %u",
 164                            block_group, le32_to_cpu(desc->bg_block_bitmap));
 165                return NULL;
 166        }
 167        ext3_valid_block_bitmap(sb, desc, block_group, bh);
 168        /*
 169         * file system mounted not to panic on error, continue with corrupt
 170         * bitmap
 171         */
 172        return bh;
 173}
 174/*
 175 * The reservation window structure operations
 176 * --------------------------------------------
 177 * Operations include:
 178 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
 179 *
 180 * We use a red-black tree to represent per-filesystem reservation
 181 * windows.
 182 *
 183 */
 184
 185/**
 186 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
 187 * @rb_root:            root of per-filesystem reservation rb tree
 188 * @verbose:            verbose mode
 189 * @fn:                 function which wishes to dump the reservation map
 190 *
 191 * If verbose is turned on, it will print the whole block reservation
 192 * windows(start, end). Otherwise, it will only print out the "bad" windows,
 193 * those windows that overlap with their immediate neighbors.
 194 */
 195#if 1
 196static void __rsv_window_dump(struct rb_root *root, int verbose,
 197                              const char *fn)
 198{
 199        struct rb_node *n;
 200        struct ext3_reserve_window_node *rsv, *prev;
 201        int bad;
 202
 203restart:
 204        n = rb_first(root);
 205        bad = 0;
 206        prev = NULL;
 207
 208        printk("Block Allocation Reservation Windows Map (%s):\n", fn);
 209        while (n) {
 210                rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
 211                if (verbose)
 212                        printk("reservation window 0x%p "
 213                               "start:  %lu, end:  %lu\n",
 214                               rsv, rsv->rsv_start, rsv->rsv_end);
 215                if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
 216                        printk("Bad reservation %p (start >= end)\n",
 217                               rsv);
 218                        bad = 1;
 219                }
 220                if (prev && prev->rsv_end >= rsv->rsv_start) {
 221                        printk("Bad reservation %p (prev->end >= start)\n",
 222                               rsv);
 223                        bad = 1;
 224                }
 225                if (bad) {
 226                        if (!verbose) {
 227                                printk("Restarting reservation walk in verbose mode\n");
 228                                verbose = 1;
 229                                goto restart;
 230                        }
 231                }
 232                n = rb_next(n);
 233                prev = rsv;
 234        }
 235        printk("Window map complete.\n");
 236        BUG_ON(bad);
 237}
 238#define rsv_window_dump(root, verbose) \
 239        __rsv_window_dump((root), (verbose), __func__)
 240#else
 241#define rsv_window_dump(root, verbose) do {} while (0)
 242#endif
 243
 244/**
 245 * goal_in_my_reservation()
 246 * @rsv:                inode's reservation window
 247 * @grp_goal:           given goal block relative to the allocation block group
 248 * @group:              the current allocation block group
 249 * @sb:                 filesystem super block
 250 *
 251 * Test if the given goal block (group relative) is within the file's
 252 * own block reservation window range.
 253 *
 254 * If the reservation window is outside the goal allocation group, return 0;
 255 * grp_goal (given goal block) could be -1, which means no specific
 256 * goal block. In this case, always return 1.
 257 * If the goal block is within the reservation window, return 1;
 258 * otherwise, return 0;
 259 */
 260static int
 261goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 262                        unsigned int group, struct super_block * sb)
 263{
 264        ext3_fsblk_t group_first_block, group_last_block;
 265
 266        group_first_block = ext3_group_first_block_no(sb, group);
 267        group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
 268
 269        if ((rsv->_rsv_start > group_last_block) ||
 270            (rsv->_rsv_end < group_first_block))
 271                return 0;
 272        if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
 273                || (grp_goal + group_first_block > rsv->_rsv_end)))
 274                return 0;
 275        return 1;
 276}
 277
 278/**
 279 * search_reserve_window()
 280 * @rb_root:            root of reservation tree
 281 * @goal:               target allocation block
 282 *
 283 * Find the reserved window which includes the goal, or the previous one
 284 * if the goal is not in any window.
 285 * Returns NULL if there are no windows or if all windows start after the goal.
 286 */
 287static struct ext3_reserve_window_node *
 288search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 289{
 290        struct rb_node *n = root->rb_node;
 291        struct ext3_reserve_window_node *rsv;
 292
 293        if (!n)
 294                return NULL;
 295
 296        do {
 297                rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
 298
 299                if (goal < rsv->rsv_start)
 300                        n = n->rb_left;
 301                else if (goal > rsv->rsv_end)
 302                        n = n->rb_right;
 303                else
 304                        return rsv;
 305        } while (n);
 306        /*
 307         * We've fallen off the end of the tree: the goal wasn't inside
 308         * any particular node.  OK, the previous node must be to one
 309         * side of the interval containing the goal.  If it's the RHS,
 310         * we need to back up one.
 311         */
 312        if (rsv->rsv_start > goal) {
 313                n = rb_prev(&rsv->rsv_node);
 314                rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
 315        }
 316        return rsv;
 317}
 318
 319/**
 320 * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
 321 * @sb:                 super block
 322 * @rsv:                reservation window to add
 323 *
 324 * Must be called with rsv_lock hold.
 325 */
 326void ext3_rsv_window_add(struct super_block *sb,
 327                    struct ext3_reserve_window_node *rsv)
 328{
 329        struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
 330        struct rb_node *node = &rsv->rsv_node;
 331        ext3_fsblk_t start = rsv->rsv_start;
 332
 333        struct rb_node ** p = &root->rb_node;
 334        struct rb_node * parent = NULL;
 335        struct ext3_reserve_window_node *this;
 336
 337        while (*p)
 338        {
 339                parent = *p;
 340                this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);
 341
 342                if (start < this->rsv_start)
 343                        p = &(*p)->rb_left;
 344                else if (start > this->rsv_end)
 345                        p = &(*p)->rb_right;
 346                else {
 347                        rsv_window_dump(root, 1);
 348                        BUG();
 349                }
 350        }
 351
 352        rb_link_node(node, parent, p);
 353        rb_insert_color(node, root);
 354}
 355
 356/**
 357 * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
 358 * @sb:                 super block
 359 * @rsv:                reservation window to remove
 360 *
 361 * Mark the block reservation window as not allocated, and unlink it
 362 * from the filesystem reservation window rb tree. Must be called with
 363 * rsv_lock hold.
 364 */
 365static void rsv_window_remove(struct super_block *sb,
 366                              struct ext3_reserve_window_node *rsv)
 367{
 368        rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 369        rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 370        rsv->rsv_alloc_hit = 0;
 371        rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
 372}
 373
 374/*
 375 * rsv_is_empty() -- Check if the reservation window is allocated.
 376 * @rsv:                given reservation window to check
 377 *
 378 * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
 379 */
 380static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
 381{
 382        /* a valid reservation end block could not be 0 */
 383        return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 384}
 385
 386/**
 387 * ext3_init_block_alloc_info()
 388 * @inode:              file inode structure
 389 *
 390 * Allocate and initialize the  reservation window structure, and
 391 * link the window to the ext3 inode structure at last
 392 *
 393 * The reservation window structure is only dynamically allocated
 394 * and linked to ext3 inode the first time the open file
 395 * needs a new block. So, before every ext3_new_block(s) call, for
 396 * regular files, we should check whether the reservation window
 397 * structure exists or not. In the latter case, this function is called.
 398 * Fail to do so will result in block reservation being turned off for that
 399 * open file.
 400 *
 401 * This function is called from ext3_get_blocks_handle(), also called
 402 * when setting the reservation window size through ioctl before the file
 403 * is open for write (needs block allocation).
 404 *
 405 * Needs truncate_mutex protection prior to call this function.
 406 */
 407void ext3_init_block_alloc_info(struct inode *inode)
 408{
 409        struct ext3_inode_info *ei = EXT3_I(inode);
 410        struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
 411        struct super_block *sb = inode->i_sb;
 412
 413        block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
 414        if (block_i) {
 415                struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;
 416
 417                rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 418                rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 419
 420                /*
 421                 * if filesystem is mounted with NORESERVATION, the goal
 422                 * reservation window size is set to zero to indicate
 423                 * block reservation is off
 424                 */
 425                if (!test_opt(sb, RESERVATION))
 426                        rsv->rsv_goal_size = 0;
 427                else
 428                        rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
 429                rsv->rsv_alloc_hit = 0;
 430                block_i->last_alloc_logical_block = 0;
 431                block_i->last_alloc_physical_block = 0;
 432        }
 433        ei->i_block_alloc_info = block_i;
 434}
 435
 436/**
 437 * ext3_discard_reservation()
 438 * @inode:              inode
 439 *
 440 * Discard(free) block reservation window on last file close, or truncate
 441 * or at last iput().
 442 *
 443 * It is being called in three cases:
 444 *      ext3_release_file(): last writer close the file
 445 *      ext3_clear_inode(): last iput(), when nobody link to this file.
 446 *      ext3_truncate(): when the block indirect map is about to change.
 447 *
 448 */
 449void ext3_discard_reservation(struct inode *inode)
 450{
 451        struct ext3_inode_info *ei = EXT3_I(inode);
 452        struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
 453        struct ext3_reserve_window_node *rsv;
 454        spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;
 455
 456        if (!block_i)
 457                return;
 458
 459        rsv = &block_i->rsv_window_node;
 460        if (!rsv_is_empty(&rsv->rsv_window)) {
 461                spin_lock(rsv_lock);
 462                if (!rsv_is_empty(&rsv->rsv_window))
 463                        rsv_window_remove(inode->i_sb, rsv);
 464                spin_unlock(rsv_lock);
 465        }
 466}
 467
 468/**
 469 * ext3_free_blocks_sb() -- Free given blocks and update quota
 470 * @handle:                     handle to this transaction
 471 * @sb:                         super block
 472 * @block:                      start physcial block to free
 473 * @count:                      number of blocks to free
 474 * @pdquot_freed_blocks:        pointer to quota
 475 */
 476void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
 477                         ext3_fsblk_t block, unsigned long count,
 478                         unsigned long *pdquot_freed_blocks)
 479{
 480        struct buffer_head *bitmap_bh = NULL;
 481        struct buffer_head *gd_bh;
 482        unsigned long block_group;
 483        ext3_grpblk_t bit;
 484        unsigned long i;
 485        unsigned long overflow;
 486        struct ext3_group_desc * desc;
 487        struct ext3_super_block * es;
 488        struct ext3_sb_info *sbi;
 489        int err = 0, ret;
 490        ext3_grpblk_t group_freed;
 491
 492        *pdquot_freed_blocks = 0;
 493        sbi = EXT3_SB(sb);
 494        es = sbi->s_es;
 495        if (block < le32_to_cpu(es->s_first_data_block) ||
 496            block + count < block ||
 497            block + count > le32_to_cpu(es->s_blocks_count)) {
 498                ext3_error (sb, "ext3_free_blocks",
 499                            "Freeing blocks not in datazone - "
 500                            "block = "E3FSBLK", count = %lu", block, count);
 501                goto error_return;
 502        }
 503
 504        ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);
 505
 506do_more:
 507        overflow = 0;
 508        block_group = (block - le32_to_cpu(es->s_first_data_block)) /
 509                      EXT3_BLOCKS_PER_GROUP(sb);
 510        bit = (block - le32_to_cpu(es->s_first_data_block)) %
 511                      EXT3_BLOCKS_PER_GROUP(sb);
 512        /*
 513         * Check to see if we are freeing blocks across a group
 514         * boundary.
 515         */
 516        if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
 517                overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
 518                count -= overflow;
 519        }
 520        brelse(bitmap_bh);
 521        bitmap_bh = read_block_bitmap(sb, block_group);
 522        if (!bitmap_bh)
 523                goto error_return;
 524        desc = ext3_get_group_desc (sb, block_group, &gd_bh);
 525        if (!desc)
 526                goto error_return;
 527
 528        if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
 529            in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
 530            in_range (block, le32_to_cpu(desc->bg_inode_table),
 531                      sbi->s_itb_per_group) ||
 532            in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
 533                      sbi->s_itb_per_group)) {
 534                ext3_error (sb, "ext3_free_blocks",
 535                            "Freeing blocks in system zones - "
 536                            "Block = "E3FSBLK", count = %lu",
 537                            block, count);
 538                goto error_return;
 539        }
 540
 541        /*
 542         * We are about to start releasing blocks in the bitmap,
 543         * so we need undo access.
 544         */
 545        /* @@@ check errors */
 546        BUFFER_TRACE(bitmap_bh, "getting undo access");
 547        err = ext3_journal_get_undo_access(handle, bitmap_bh);
 548        if (err)
 549                goto error_return;
 550
 551        /*
 552         * We are about to modify some metadata.  Call the journal APIs
 553         * to unshare ->b_data if a currently-committing transaction is
 554         * using it
 555         */
 556        BUFFER_TRACE(gd_bh, "get_write_access");
 557        err = ext3_journal_get_write_access(handle, gd_bh);
 558        if (err)
 559                goto error_return;
 560
 561        jbd_lock_bh_state(bitmap_bh);
 562
 563        for (i = 0, group_freed = 0; i < count; i++) {
 564                /*
 565                 * An HJ special.  This is expensive...
 566                 */
 567#ifdef CONFIG_JBD_DEBUG
 568                jbd_unlock_bh_state(bitmap_bh);
 569                {
 570                        struct buffer_head *debug_bh;
 571                        debug_bh = sb_find_get_block(sb, block + i);
 572                        if (debug_bh) {
 573                                BUFFER_TRACE(debug_bh, "Deleted!");
 574                                if (!bh2jh(bitmap_bh)->b_committed_data)
 575                                        BUFFER_TRACE(debug_bh,
 576                                                "No commited data in bitmap");
 577                                BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
 578                                __brelse(debug_bh);
 579                        }
 580                }
 581                jbd_lock_bh_state(bitmap_bh);
 582#endif
 583                if (need_resched()) {
 584                        jbd_unlock_bh_state(bitmap_bh);
 585                        cond_resched();
 586                        jbd_lock_bh_state(bitmap_bh);
 587                }
 588                /* @@@ This prevents newly-allocated data from being
 589                 * freed and then reallocated within the same
 590                 * transaction.
 591                 *
 592                 * Ideally we would want to allow that to happen, but to
 593                 * do so requires making journal_forget() capable of
 594                 * revoking the queued write of a data block, which
 595                 * implies blocking on the journal lock.  *forget()
 596                 * cannot block due to truncate races.
 597                 *
 598                 * Eventually we can fix this by making journal_forget()
 599                 * return a status indicating whether or not it was able
 600                 * to revoke the buffer.  On successful revoke, it is
 601                 * safe not to set the allocation bit in the committed
 602                 * bitmap, because we know that there is no outstanding
 603                 * activity on the buffer any more and so it is safe to
 604                 * reallocate it.
 605                 */
 606                BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
 607                J_ASSERT_BH(bitmap_bh,
 608                                bh2jh(bitmap_bh)->b_committed_data != NULL);
 609                ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
 610                                bh2jh(bitmap_bh)->b_committed_data);
 611
 612                /*
 613                 * We clear the bit in the bitmap after setting the committed
 614                 * data bit, because this is the reverse order to that which
 615                 * the allocator uses.
 616                 */
 617                BUFFER_TRACE(bitmap_bh, "clear bit");
 618                if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
 619                                                bit + i, bitmap_bh->b_data)) {
 620                        jbd_unlock_bh_state(bitmap_bh);
 621                        ext3_error(sb, __func__,
 622                                "bit already cleared for block "E3FSBLK,
 623                                 block + i);
 624                        jbd_lock_bh_state(bitmap_bh);
 625                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
 626                } else {
 627                        group_freed++;
 628                }
 629        }
 630        jbd_unlock_bh_state(bitmap_bh);
 631
 632        spin_lock(sb_bgl_lock(sbi, block_group));
 633        le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
 634        spin_unlock(sb_bgl_lock(sbi, block_group));
 635        percpu_counter_add(&sbi->s_freeblocks_counter, count);
 636
 637        /* We dirtied the bitmap block */
 638        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 639        err = ext3_journal_dirty_metadata(handle, bitmap_bh);
 640
 641        /* And the group descriptor block */
 642        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
 643        ret = ext3_journal_dirty_metadata(handle, gd_bh);
 644        if (!err) err = ret;
 645        *pdquot_freed_blocks += group_freed;
 646
 647        if (overflow && !err) {
 648                block += count;
 649                count = overflow;
 650                goto do_more;
 651        }
 652
 653error_return:
 654        brelse(bitmap_bh);
 655        ext3_std_error(sb, err);
 656        return;
 657}
 658
 659/**
 660 * ext3_free_blocks() -- Free given blocks and update quota
 661 * @handle:             handle for this transaction
 662 * @inode:              inode
 663 * @block:              start physical block to free
 664 * @count:              number of blocks to count
 665 */
 666void ext3_free_blocks(handle_t *handle, struct inode *inode,
 667                        ext3_fsblk_t block, unsigned long count)
 668{
 669        struct super_block * sb;
 670        unsigned long dquot_freed_blocks;
 671
 672        sb = inode->i_sb;
 673        if (!sb) {
 674                printk ("ext3_free_blocks: nonexistent device");
 675                return;
 676        }
 677        ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
 678        if (dquot_freed_blocks)
 679                vfs_dq_free_block(inode, dquot_freed_blocks);
 680        return;
 681}
 682
 683/**
 684 * ext3_test_allocatable()
 685 * @nr:                 given allocation block group
 686 * @bh:                 bufferhead contains the bitmap of the given block group
 687 *
 688 * For ext3 allocations, we must not reuse any blocks which are
 689 * allocated in the bitmap buffer's "last committed data" copy.  This
 690 * prevents deletes from freeing up the page for reuse until we have
 691 * committed the delete transaction.
 692 *
 693 * If we didn't do this, then deleting something and reallocating it as
 694 * data would allow the old block to be overwritten before the
 695 * transaction committed (because we force data to disk before commit).
 696 * This would lead to corruption if we crashed between overwriting the
 697 * data and committing the delete.
 698 *
 699 * @@@ We may want to make this allocation behaviour conditional on
 700 * data-writes at some point, and disable it for metadata allocations or
 701 * sync-data inodes.
 702 */
 703static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 704{
 705        int ret;
 706        struct journal_head *jh = bh2jh(bh);
 707
 708        if (ext3_test_bit(nr, bh->b_data))
 709                return 0;
 710
 711        jbd_lock_bh_state(bh);
 712        if (!jh->b_committed_data)
 713                ret = 1;
 714        else
 715                ret = !ext3_test_bit(nr, jh->b_committed_data);
 716        jbd_unlock_bh_state(bh);
 717        return ret;
 718}
 719
 720/**
 721 * bitmap_search_next_usable_block()
 722 * @start:              the starting block (group relative) of the search
 723 * @bh:                 bufferhead contains the block group bitmap
 724 * @maxblocks:          the ending block (group relative) of the reservation
 725 *
 726 * The bitmap search --- search forward alternately through the actual
 727 * bitmap on disk and the last-committed copy in journal, until we find a
 728 * bit free in both bitmaps.
 729 */
 730static ext3_grpblk_t
 731bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 732                                        ext3_grpblk_t maxblocks)
 733{
 734        ext3_grpblk_t next;
 735        struct journal_head *jh = bh2jh(bh);
 736
 737        while (start < maxblocks) {
 738                next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
 739                if (next >= maxblocks)
 740                        return -1;
 741                if (ext3_test_allocatable(next, bh))
 742                        return next;
 743                jbd_lock_bh_state(bh);
 744                if (jh->b_committed_data)
 745                        start = ext3_find_next_zero_bit(jh->b_committed_data,
 746                                                        maxblocks, next);
 747                jbd_unlock_bh_state(bh);
 748        }
 749        return -1;
 750}
 751
 752/**
 753 * find_next_usable_block()
 754 * @start:              the starting block (group relative) to find next
 755 *                      allocatable block in bitmap.
 756 * @bh:                 bufferhead contains the block group bitmap
 757 * @maxblocks:          the ending block (group relative) for the search
 758 *
 759 * Find an allocatable block in a bitmap.  We honor both the bitmap and
 760 * its last-committed copy (if that exists), and perform the "most
 761 * appropriate allocation" algorithm of looking for a free block near
 762 * the initial goal; then for a free byte somewhere in the bitmap; then
 763 * for any free bit in the bitmap.
 764 */
 765static ext3_grpblk_t
 766find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 767                        ext3_grpblk_t maxblocks)
 768{
 769        ext3_grpblk_t here, next;
 770        char *p, *r;
 771
 772        if (start > 0) {
 773                /*
 774                 * The goal was occupied; search forward for a free
 775                 * block within the next XX blocks.
 776                 *
 777                 * end_goal is more or less random, but it has to be
 778                 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
 779                 * next 64-bit boundary is simple..
 780                 */
 781                ext3_grpblk_t end_goal = (start + 63) & ~63;
 782                if (end_goal > maxblocks)
 783                        end_goal = maxblocks;
 784                here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
 785                if (here < end_goal && ext3_test_allocatable(here, bh))
 786                        return here;
 787                ext3_debug("Bit not found near goal\n");
 788        }
 789
 790        here = start;
 791        if (here < 0)
 792                here = 0;
 793
 794        p = ((char *)bh->b_data) + (here >> 3);
 795        r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
 796        next = (r - ((char *)bh->b_data)) << 3;
 797
 798        if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
 799                return next;
 800
 801        /*
 802         * The bitmap search --- search forward alternately through the actual
 803         * bitmap and the last-committed copy until we find a bit free in
 804         * both
 805         */
 806        here = bitmap_search_next_usable_block(here, bh, maxblocks);
 807        return here;
 808}
 809
 810/**
 811 * claim_block()
 812 * @block:              the free block (group relative) to allocate
 813 * @bh:                 the bufferhead containts the block group bitmap
 814 *
 815 * We think we can allocate this block in this bitmap.  Try to set the bit.
 816 * If that succeeds then check that nobody has allocated and then freed the
 817 * block since we saw that is was not marked in b_committed_data.  If it _was_
 818 * allocated and freed then clear the bit in the bitmap again and return
 819 * zero (failure).
 820 */
 821static inline int
 822claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 823{
 824        struct journal_head *jh = bh2jh(bh);
 825        int ret;
 826
 827        if (ext3_set_bit_atomic(lock, block, bh->b_data))
 828                return 0;
 829        jbd_lock_bh_state(bh);
 830        if (jh->b_committed_data && ext3_test_bit(block,jh->b_committed_data)) {
 831                ext3_clear_bit_atomic(lock, block, bh->b_data);
 832                ret = 0;
 833        } else {
 834                ret = 1;
 835        }
 836        jbd_unlock_bh_state(bh);
 837        return ret;
 838}
 839
 840/**
 841 * ext3_try_to_allocate()
 842 * @sb:                 superblock
 843 * @handle:             handle to this transaction
 844 * @group:              given allocation block group
 845 * @bitmap_bh:          bufferhead holds the block bitmap
 846 * @grp_goal:           given target block within the group
 847 * @count:              target number of blocks to allocate
 848 * @my_rsv:             reservation window
 849 *
 850 * Attempt to allocate blocks within a give range. Set the range of allocation
 851 * first, then find the first free bit(s) from the bitmap (within the range),
 852 * and at last, allocate the blocks by claiming the found free bit as allocated.
 853 *
 854 * To set the range of this allocation:
 855 *      if there is a reservation window, only try to allocate block(s) from the
 856 *      file's own reservation window;
 857 *      Otherwise, the allocation range starts from the give goal block, ends at
 858 *      the block group's last block.
 859 *
 860 * If we failed to allocate the desired block then we may end up crossing to a
 861 * new bitmap.  In that case we must release write access to the old one via
 862 * ext3_journal_release_buffer(), else we'll run out of credits.
 863 */
 864static ext3_grpblk_t
 865ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
 866                        struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
 867                        unsigned long *count, struct ext3_reserve_window *my_rsv)
 868{
 869        ext3_fsblk_t group_first_block;
 870        ext3_grpblk_t start, end;
 871        unsigned long num = 0;
 872
 873        /* we do allocation within the reservation window if we have a window */
 874        if (my_rsv) {
 875                group_first_block = ext3_group_first_block_no(sb, group);
 876                if (my_rsv->_rsv_start >= group_first_block)
 877                        start = my_rsv->_rsv_start - group_first_block;
 878                else
 879                        /* reservation window cross group boundary */
 880                        start = 0;
 881                end = my_rsv->_rsv_end - group_first_block + 1;
 882                if (end > EXT3_BLOCKS_PER_GROUP(sb))
 883                        /* reservation window crosses group boundary */
 884                        end = EXT3_BLOCKS_PER_GROUP(sb);
 885                if ((start <= grp_goal) && (grp_goal < end))
 886                        start = grp_goal;
 887                else
 888                        grp_goal = -1;
 889        } else {
 890                if (grp_goal > 0)
 891                        start = grp_goal;
 892                else
 893                        start = 0;
 894                end = EXT3_BLOCKS_PER_GROUP(sb);
 895        }
 896
 897        BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));
 898
 899repeat:
 900        if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
 901                grp_goal = find_next_usable_block(start, bitmap_bh, end);
 902                if (grp_goal < 0)
 903                        goto fail_access;
 904                if (!my_rsv) {
 905                        int i;
 906
 907                        for (i = 0; i < 7 && grp_goal > start &&
 908                                        ext3_test_allocatable(grp_goal - 1,
 909                                                                bitmap_bh);
 910                                        i++, grp_goal--)
 911                                ;
 912                }
 913        }
 914        start = grp_goal;
 915
 916        if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
 917                grp_goal, bitmap_bh)) {
 918                /*
 919                 * The block was allocated by another thread, or it was
 920                 * allocated and then freed by another thread
 921                 */
 922                start++;
 923                grp_goal++;
 924                if (start >= end)
 925                        goto fail_access;
 926                goto repeat;
 927        }
 928        num++;
 929        grp_goal++;
 930        while (num < *count && grp_goal < end
 931                && ext3_test_allocatable(grp_goal, bitmap_bh)
 932                && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
 933                                grp_goal, bitmap_bh)) {
 934                num++;
 935                grp_goal++;
 936        }
 937        *count = num;
 938        return grp_goal - num;
 939fail_access:
 940        *count = num;
 941        return -1;
 942}
 943
 944/**
 945 *      find_next_reservable_window():
 946 *              find a reservable space within the given range.
 947 *              It does not allocate the reservation window for now:
 948 *              alloc_new_reservation() will do the work later.
 949 *
 950 *      @search_head: the head of the searching list;
 951 *              This is not necessarily the list head of the whole filesystem
 952 *
 953 *              We have both head and start_block to assist the search
 954 *              for the reservable space. The list starts from head,
 955 *              but we will shift to the place where start_block is,
 956 *              then start from there, when looking for a reservable space.
 957 *
 958 *      @size: the target new reservation window size
 959 *
 960 *      @group_first_block: the first block we consider to start
 961 *                      the real search from
 962 *
 963 *      @last_block:
 964 *              the maximum block number that our goal reservable space
 965 *              could start from. This is normally the last block in this
 966 *              group. The search will end when we found the start of next
 967 *              possible reservable space is out of this boundary.
 968 *              This could handle the cross boundary reservation window
 969 *              request.
 970 *
 971 *      basically we search from the given range, rather than the whole
 972 *      reservation double linked list, (start_block, last_block)
 973 *      to find a free region that is of my size and has not
 974 *      been reserved.
 975 *
 976 */
 977static int find_next_reservable_window(
 978                                struct ext3_reserve_window_node *search_head,
 979                                struct ext3_reserve_window_node *my_rsv,
 980                                struct super_block * sb,
 981                                ext3_fsblk_t start_block,
 982                                ext3_fsblk_t last_block)
 983{
 984        struct rb_node *next;
 985        struct ext3_reserve_window_node *rsv, *prev;
 986        ext3_fsblk_t cur;
 987        int size = my_rsv->rsv_goal_size;
 988
 989        /* TODO: make the start of the reservation window byte-aligned */
 990        /* cur = *start_block & ~7;*/
 991        cur = start_block;
 992        rsv = search_head;
 993        if (!rsv)
 994                return -1;
 995
 996        while (1) {
 997                if (cur <= rsv->rsv_end)
 998                        cur = rsv->rsv_end + 1;
 999
1000                /* TODO?
1001                 * in the case we could not find a reservable space
1002                 * that is what is expected, during the re-search, we could
1003                 * remember what's the largest reservable space we could have
1004                 * and return that one.
1005                 *
1006                 * For now it will fail if we could not find the reservable
1007                 * space with expected-size (or more)...
1008                 */
1009                if (cur > last_block)
1010                        return -1;              /* fail */
1011
1012                prev = rsv;
1013                next = rb_next(&rsv->rsv_node);
1014                rsv = rb_entry(next,struct ext3_reserve_window_node,rsv_node);
1015
1016                /*
1017                 * Reached the last reservation, we can just append to the
1018                 * previous one.
1019                 */
1020                if (!next)
1021                        break;
1022
1023                if (cur + size <= rsv->rsv_start) {
1024                        /*
1025                         * Found a reserveable space big enough.  We could
1026                         * have a reservation across the group boundary here
1027                         */
1028                        break;
1029                }
1030        }
1031        /*
1032         * we come here either :
1033         * when we reach the end of the whole list,
1034         * and there is empty reservable space after last entry in the list.
1035         * append it to the end of the list.
1036         *
1037         * or we found one reservable space in the middle of the list,
1038         * return the reservation window that we could append to.
1039         * succeed.
1040         */
1041
1042        if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
1043                rsv_window_remove(sb, my_rsv);
1044
1045        /*
1046         * Let's book the whole avaliable window for now.  We will check the
1047         * disk bitmap later and then, if there are free blocks then we adjust
1048         * the window size if it's larger than requested.
1049         * Otherwise, we will remove this node from the tree next time
1050         * call find_next_reservable_window.
1051         */
1052        my_rsv->rsv_start = cur;
1053        my_rsv->rsv_end = cur + size - 1;
1054        my_rsv->rsv_alloc_hit = 0;
1055
1056        if (prev != my_rsv)
1057                ext3_rsv_window_add(sb, my_rsv);
1058
1059        return 0;
1060}
1061
1062/**
1063 *      alloc_new_reservation()--allocate a new reservation window
1064 *
1065 *              To make a new reservation, we search part of the filesystem
1066 *              reservation list (the list that inside the group). We try to
1067 *              allocate a new reservation window near the allocation goal,
1068 *              or the beginning of the group, if there is no goal.
1069 *
1070 *              We first find a reservable space after the goal, then from
1071 *              there, we check the bitmap for the first free block after
1072 *              it. If there is no free block until the end of group, then the
1073 *              whole group is full, we failed. Otherwise, check if the free
1074 *              block is inside the expected reservable space, if so, we
1075 *              succeed.
1076 *              If the first free block is outside the reservable space, then
1077 *              start from the first free block, we search for next available
1078 *              space, and go on.
1079 *
1080 *      on succeed, a new reservation will be found and inserted into the list
1081 *      It contains at least one free block, and it does not overlap with other
1082 *      reservation windows.
1083 *
1084 *      failed: we failed to find a reservation window in this group
1085 *
1086 *      @rsv: the reservation
1087 *
1088 *      @grp_goal: The goal (group-relative).  It is where the search for a
1089 *              free reservable space should start from.
1090 *              if we have a grp_goal(grp_goal >0 ), then start from there,
1091 *              no grp_goal(grp_goal = -1), we start from the first block
1092 *              of the group.
1093 *
1094 *      @sb: the super block
1095 *      @group: the group we are trying to allocate in
1096 *      @bitmap_bh: the block group block bitmap
1097 *
1098 */
1099static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
1100                ext3_grpblk_t grp_goal, struct super_block *sb,
1101                unsigned int group, struct buffer_head *bitmap_bh)
1102{
1103        struct ext3_reserve_window_node *search_head;
1104        ext3_fsblk_t group_first_block, group_end_block, start_block;
1105        ext3_grpblk_t first_free_block;
1106        struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
1107        unsigned long size;
1108        int ret;
1109        spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1110
1111        group_first_block = ext3_group_first_block_no(sb, group);
1112        group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1113
1114        if (grp_goal < 0)
1115                start_block = group_first_block;
1116        else
1117                start_block = grp_goal + group_first_block;
1118
1119        size = my_rsv->rsv_goal_size;
1120
1121        if (!rsv_is_empty(&my_rsv->rsv_window)) {
1122                /*
1123                 * if the old reservation is cross group boundary
1124                 * and if the goal is inside the old reservation window,
1125                 * we will come here when we just failed to allocate from
1126                 * the first part of the window. We still have another part
1127                 * that belongs to the next group. In this case, there is no
1128                 * point to discard our window and try to allocate a new one
1129                 * in this group(which will fail). we should
1130                 * keep the reservation window, just simply move on.
1131                 *
1132                 * Maybe we could shift the start block of the reservation
1133                 * window to the first block of next group.
1134                 */
1135
1136                if ((my_rsv->rsv_start <= group_end_block) &&
1137                                (my_rsv->rsv_end > group_end_block) &&
1138                                (start_block >= my_rsv->rsv_start))
1139                        return -1;
1140
1141                if ((my_rsv->rsv_alloc_hit >
1142                     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
1143                        /*
1144                         * if the previously allocation hit ratio is
1145                         * greater than 1/2, then we double the size of
1146                         * the reservation window the next time,
1147                         * otherwise we keep the same size window
1148                         */
1149                        size = size * 2;
1150                        if (size > EXT3_MAX_RESERVE_BLOCKS)
1151                                size = EXT3_MAX_RESERVE_BLOCKS;
1152                        my_rsv->rsv_goal_size= size;
1153                }
1154        }
1155
1156        spin_lock(rsv_lock);
1157        /*
1158         * shift the search start to the window near the goal block
1159         */
1160        search_head = search_reserve_window(fs_rsv_root, start_block);
1161
1162        /*
1163         * find_next_reservable_window() simply finds a reservable window
1164         * inside the given range(start_block, group_end_block).
1165         *
1166         * To make sure the reservation window has a free bit inside it, we
1167         * need to check the bitmap after we found a reservable window.
1168         */
1169retry:
1170        ret = find_next_reservable_window(search_head, my_rsv, sb,
1171                                                start_block, group_end_block);
1172
1173        if (ret == -1) {
1174                if (!rsv_is_empty(&my_rsv->rsv_window))
1175                        rsv_window_remove(sb, my_rsv);
1176                spin_unlock(rsv_lock);
1177                return -1;
1178        }
1179
1180        /*
1181         * On success, find_next_reservable_window() returns the
1182         * reservation window where there is a reservable space after it.
1183         * Before we reserve this reservable space, we need
1184         * to make sure there is at least a free block inside this region.
1185         *
1186         * searching the first free bit on the block bitmap and copy of
1187         * last committed bitmap alternatively, until we found a allocatable
1188         * block. Search start from the start block of the reservable space
1189         * we just found.
1190         */
1191        spin_unlock(rsv_lock);
1192        first_free_block = bitmap_search_next_usable_block(
1193                        my_rsv->rsv_start - group_first_block,
1194                        bitmap_bh, group_end_block - group_first_block + 1);
1195
1196        if (first_free_block < 0) {
1197                /*
1198                 * no free block left on the bitmap, no point
1199                 * to reserve the space. return failed.
1200                 */
1201                spin_lock(rsv_lock);
1202                if (!rsv_is_empty(&my_rsv->rsv_window))
1203                        rsv_window_remove(sb, my_rsv);
1204                spin_unlock(rsv_lock);
1205                return -1;              /* failed */
1206        }
1207
1208        start_block = first_free_block + group_first_block;
1209        /*
1210         * check if the first free block is within the
1211         * free space we just reserved
1212         */
1213        if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
1214                return 0;               /* success */
1215        /*
1216         * if the first free bit we found is out of the reservable space
1217         * continue search for next reservable space,
1218         * start from where the free block is,
1219         * we also shift the list head to where we stopped last time
1220         */
1221        search_head = my_rsv;
1222        spin_lock(rsv_lock);
1223        goto retry;
1224}
1225
1226/**
1227 * try_to_extend_reservation()
1228 * @my_rsv:             given reservation window
1229 * @sb:                 super block
1230 * @size:               the delta to extend
1231 *
1232 * Attempt to expand the reservation window large enough to have
1233 * required number of free blocks
1234 *
1235 * Since ext3_try_to_allocate() will always allocate blocks within
1236 * the reservation window range, if the window size is too small,
1237 * multiple blocks allocation has to stop at the end of the reservation
1238 * window. To make this more efficient, given the total number of
1239 * blocks needed and the current size of the window, we try to
1240 * expand the reservation window size if necessary on a best-effort
1241 * basis before ext3_new_blocks() tries to allocate blocks,
1242 */
1243static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
1244                        struct super_block *sb, int size)
1245{
1246        struct ext3_reserve_window_node *next_rsv;
1247        struct rb_node *next;
1248        spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
1249
1250        if (!spin_trylock(rsv_lock))
1251                return;
1252
1253        next = rb_next(&my_rsv->rsv_node);
1254
1255        if (!next)
1256                my_rsv->rsv_end += size;
1257        else {
1258                next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);
1259
1260                if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
1261                        my_rsv->rsv_end += size;
1262                else
1263                        my_rsv->rsv_end = next_rsv->rsv_start - 1;
1264        }
1265        spin_unlock(rsv_lock);
1266}
1267
1268/**
1269 * ext3_try_to_allocate_with_rsv()
1270 * @sb:                 superblock
1271 * @handle:             handle to this transaction
1272 * @group:              given allocation block group
1273 * @bitmap_bh:          bufferhead holds the block bitmap
1274 * @grp_goal:           given target block within the group
1275 * @count:              target number of blocks to allocate
1276 * @my_rsv:             reservation window
1277 * @errp:               pointer to store the error code
1278 *
1279 * This is the main function used to allocate a new block and its reservation
1280 * window.
1281 *
1282 * Each time when a new block allocation is need, first try to allocate from
1283 * its own reservation.  If it does not have a reservation window, instead of
1284 * looking for a free bit on bitmap first, then look up the reservation list to
1285 * see if it is inside somebody else's reservation window, we try to allocate a
1286 * reservation window for it starting from the goal first. Then do the block
1287 * allocation within the reservation window.
1288 *
1289 * This will avoid keeping on searching the reservation list again and
1290 * again when somebody is looking for a free block (without
1291 * reservation), and there are lots of free blocks, but they are all
1292 * being reserved.
1293 *
1294 * We use a red-black tree for the per-filesystem reservation list.
1295 *
1296 */
1297static ext3_grpblk_t
1298ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
1299                        unsigned int group, struct buffer_head *bitmap_bh,
1300                        ext3_grpblk_t grp_goal,
1301                        struct ext3_reserve_window_node * my_rsv,
1302                        unsigned long *count, int *errp)
1303{
1304        ext3_fsblk_t group_first_block, group_last_block;
1305        ext3_grpblk_t ret = 0;
1306        int fatal;
1307        unsigned long num = *count;
1308
1309        *errp = 0;
1310
1311        /*
1312         * Make sure we use undo access for the bitmap, because it is critical
1313         * that we do the frozen_data COW on bitmap buffers in all cases even
1314         * if the buffer is in BJ_Forget state in the committing transaction.
1315         */
1316        BUFFER_TRACE(bitmap_bh, "get undo access for new block");
1317        fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
1318        if (fatal) {
1319                *errp = fatal;
1320                return -1;
1321        }
1322
1323        /*
1324         * we don't deal with reservation when
1325         * filesystem is mounted without reservation
1326         * or the file is not a regular file
1327         * or last attempt to allocate a block with reservation turned on failed
1328         */
1329        if (my_rsv == NULL ) {
1330                ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1331                                                grp_goal, count, NULL);
1332                goto out;
1333        }
1334        /*
1335         * grp_goal is a group relative block number (if there is a goal)
1336         * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
1337         * first block is a filesystem wide block number
1338         * first block is the block number of the first block in this group
1339         */
1340        group_first_block = ext3_group_first_block_no(sb, group);
1341        group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
1342
1343        /*
1344         * Basically we will allocate a new block from inode's reservation
1345         * window.
1346         *
1347         * We need to allocate a new reservation window, if:
1348         * a) inode does not have a reservation window; or
1349         * b) last attempt to allocate a block from existing reservation
1350         *    failed; or
1351         * c) we come here with a goal and with a reservation window
1352         *
1353         * We do not need to allocate a new reservation window if we come here
1354         * at the beginning with a goal and the goal is inside the window, or
1355         * we don't have a goal but already have a reservation window.
1356         * then we could go to allocate from the reservation window directly.
1357         */
1358        while (1) {
1359                if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
1360                        !goal_in_my_reservation(&my_rsv->rsv_window,
1361                                                grp_goal, group, sb)) {
1362                        if (my_rsv->rsv_goal_size < *count)
1363                                my_rsv->rsv_goal_size = *count;
1364                        ret = alloc_new_reservation(my_rsv, grp_goal, sb,
1365                                                        group, bitmap_bh);
1366                        if (ret < 0)
1367                                break;                  /* failed */
1368
1369                        if (!goal_in_my_reservation(&my_rsv->rsv_window,
1370                                                        grp_goal, group, sb))
1371                                grp_goal = -1;
1372                } else if (grp_goal >= 0) {
1373                        int curr = my_rsv->rsv_end -
1374                                        (grp_goal + group_first_block) + 1;
1375
1376                        if (curr < *count)
1377                                try_to_extend_reservation(my_rsv, sb,
1378                                                        *count - curr);
1379                }
1380
1381                if ((my_rsv->rsv_start > group_last_block) ||
1382                                (my_rsv->rsv_end < group_first_block)) {
1383                        rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
1384                        BUG();
1385                }
1386                ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
1387                                           grp_goal, &num, &my_rsv->rsv_window);
1388                if (ret >= 0) {
1389                        my_rsv->rsv_alloc_hit += num;
1390                        *count = num;
1391                        break;                          /* succeed */
1392                }
1393                num = *count;
1394        }
1395out:
1396        if (ret >= 0) {
1397                BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
1398                                        "bitmap block");
1399                fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
1400                if (fatal) {
1401                        *errp = fatal;
1402                        return -1;
1403                }
1404                return ret;
1405        }
1406
1407        BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
1408        ext3_journal_release_buffer(handle, bitmap_bh);
1409        return ret;
1410}
1411
1412/**
1413 * ext3_has_free_blocks()
1414 * @sbi:                in-core super block structure.
1415 *
1416 * Check if filesystem has at least 1 free block available for allocation.
1417 */
1418static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1419{
1420        ext3_fsblk_t free_blocks, root_blocks;
1421
1422        free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
1423        root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
1424        if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
1425                sbi->s_resuid != current_fsuid() &&
1426                (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
1427                return 0;
1428        }
1429        return 1;
1430}
1431
1432/**
1433 * ext3_should_retry_alloc()
1434 * @sb:                 super block
1435 * @retries             number of attemps has been made
1436 *
1437 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1438 * it is profitable to retry the operation, this function will wait
1439 * for the current or commiting transaction to complete, and then
1440 * return TRUE.
1441 *
1442 * if the total number of retries exceed three times, return FALSE.
1443 */
1444int ext3_should_retry_alloc(struct super_block *sb, int *retries)
1445{
1446        if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
1447                return 0;
1448
1449        jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
1450
1451        return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
1452}
1453
1454/**
1455 * ext3_new_blocks() -- core block(s) allocation function
1456 * @handle:             handle to this transaction
1457 * @inode:              file inode
1458 * @goal:               given target block(filesystem wide)
1459 * @count:              target number of blocks to allocate
1460 * @errp:               error code
1461 *
1462 * ext3_new_blocks uses a goal block to assist allocation.  It tries to
1463 * allocate block(s) from the block group contains the goal block first. If that
1464 * fails, it will try to allocate block(s) from other block groups without
1465 * any specific goal block.
1466 *
1467 */
1468ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1469                        ext3_fsblk_t goal, unsigned long *count, int *errp)
1470{
1471        struct buffer_head *bitmap_bh = NULL;
1472        struct buffer_head *gdp_bh;
1473        int group_no;
1474        int goal_group;
1475        ext3_grpblk_t grp_target_blk;   /* blockgroup relative goal block */
1476        ext3_grpblk_t grp_alloc_blk;    /* blockgroup-relative allocated block*/
1477        ext3_fsblk_t ret_block;         /* filesyetem-wide allocated block */
1478        int bgi;                        /* blockgroup iteration index */
1479        int fatal = 0, err;
1480        int performed_allocation = 0;
1481        ext3_grpblk_t free_blocks;      /* number of free blocks in a group */
1482        struct super_block *sb;
1483        struct ext3_group_desc *gdp;
1484        struct ext3_super_block *es;
1485        struct ext3_sb_info *sbi;
1486        struct ext3_reserve_window_node *my_rsv = NULL;
1487        struct ext3_block_alloc_info *block_i;
1488        unsigned short windowsz = 0;
1489#ifdef EXT3FS_DEBUG
1490        static int goal_hits, goal_attempts;
1491#endif
1492        unsigned long ngroups;
1493        unsigned long num = *count;
1494
1495        *errp = -ENOSPC;
1496        sb = inode->i_sb;
1497        if (!sb) {
1498                printk("ext3_new_block: nonexistent device");
1499                return 0;
1500        }
1501
1502        /*
1503         * Check quota for allocation of this block.
1504         */
1505        if (vfs_dq_alloc_block(inode, num)) {
1506                *errp = -EDQUOT;
1507                return 0;
1508        }
1509
1510        sbi = EXT3_SB(sb);
1511        es = EXT3_SB(sb)->s_es;
1512        ext3_debug("goal=%lu.\n", goal);
1513        /*
1514         * Allocate a block from reservation only when
1515         * filesystem is mounted with reservation(default,-o reservation), and
1516         * it's a regular file, and
1517         * the desired window size is greater than 0 (One could use ioctl
1518         * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
1519         * reservation on that particular file)
1520         */
1521        block_i = EXT3_I(inode)->i_block_alloc_info;
1522        if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1523                my_rsv = &block_i->rsv_window_node;
1524
1525        if (!ext3_has_free_blocks(sbi)) {
1526                *errp = -ENOSPC;
1527                goto out;
1528        }
1529
1530        /*
1531         * First, test whether the goal block is free.
1532         */
1533        if (goal < le32_to_cpu(es->s_first_data_block) ||
1534            goal >= le32_to_cpu(es->s_blocks_count))
1535                goal = le32_to_cpu(es->s_first_data_block);
1536        group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
1537                        EXT3_BLOCKS_PER_GROUP(sb);
1538        goal_group = group_no;
1539retry_alloc:
1540        gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1541        if (!gdp)
1542                goto io_error;
1543
1544        free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1545        /*
1546         * if there is not enough free blocks to make a new resevation
1547         * turn off reservation for this allocation
1548         */
1549        if (my_rsv && (free_blocks < windowsz)
1550                && (free_blocks > 0)
1551                && (rsv_is_empty(&my_rsv->rsv_window)))
1552                my_rsv = NULL;
1553
1554        if (free_blocks > 0) {
1555                grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
1556                                EXT3_BLOCKS_PER_GROUP(sb));
1557                bitmap_bh = read_block_bitmap(sb, group_no);
1558                if (!bitmap_bh)
1559                        goto io_error;
1560                grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1561                                        group_no, bitmap_bh, grp_target_blk,
1562                                        my_rsv, &num, &fatal);
1563                if (fatal)
1564                        goto out;
1565                if (grp_alloc_blk >= 0)
1566                        goto allocated;
1567        }
1568
1569        ngroups = EXT3_SB(sb)->s_groups_count;
1570        smp_rmb();
1571
1572        /*
1573         * Now search the rest of the groups.  We assume that
1574         * group_no and gdp correctly point to the last group visited.
1575         */
1576        for (bgi = 0; bgi < ngroups; bgi++) {
1577                group_no++;
1578                if (group_no >= ngroups)
1579                        group_no = 0;
1580                gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1581                if (!gdp)
1582                        goto io_error;
1583                free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1584                /*
1585                 * skip this group if the number of
1586                 * free blocks is less than half of the reservation
1587                 * window size.
1588                 */
1589                if (my_rsv && (free_blocks <= (windowsz/2)))
1590                        continue;
1591
1592                brelse(bitmap_bh);
1593                bitmap_bh = read_block_bitmap(sb, group_no);
1594                if (!bitmap_bh)
1595                        goto io_error;
1596                /*
1597                 * try to allocate block(s) from this group, without a goal(-1).
1598                 */
1599                grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1600                                        group_no, bitmap_bh, -1, my_rsv,
1601                                        &num, &fatal);
1602                if (fatal)
1603                        goto out;
1604                if (grp_alloc_blk >= 0)
1605                        goto allocated;
1606        }
1607        /*
1608         * We may end up a bogus ealier ENOSPC error due to
1609         * filesystem is "full" of reservations, but
1610         * there maybe indeed free blocks avaliable on disk
1611         * In this case, we just forget about the reservations
1612         * just do block allocation as without reservations.
1613         */
1614        if (my_rsv) {
1615                my_rsv = NULL;
1616                windowsz = 0;
1617                group_no = goal_group;
1618                goto retry_alloc;
1619        }
1620        /* No space left on the device */
1621        *errp = -ENOSPC;
1622        goto out;
1623
1624allocated:
1625
1626        ext3_debug("using block group %d(%d)\n",
1627                        group_no, gdp->bg_free_blocks_count);
1628
1629        BUFFER_TRACE(gdp_bh, "get_write_access");
1630        fatal = ext3_journal_get_write_access(handle, gdp_bh);
1631        if (fatal)
1632                goto out;
1633
1634        ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
1635
1636        if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
1637            in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
1638            in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
1639                      EXT3_SB(sb)->s_itb_per_group) ||
1640            in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
1641                      EXT3_SB(sb)->s_itb_per_group)) {
1642                ext3_error(sb, "ext3_new_block",
1643                            "Allocating block in system zone - "
1644                            "blocks from "E3FSBLK", length %lu",
1645                             ret_block, num);
1646                /*
1647                 * claim_block() marked the blocks we allocated as in use. So we
1648                 * may want to selectively mark some of the blocks as free.
1649                 */
1650                goto retry_alloc;
1651        }
1652
1653        performed_allocation = 1;
1654
1655#ifdef CONFIG_JBD_DEBUG
1656        {
1657                struct buffer_head *debug_bh;
1658
1659                /* Record bitmap buffer state in the newly allocated block */
1660                debug_bh = sb_find_get_block(sb, ret_block);
1661                if (debug_bh) {
1662                        BUFFER_TRACE(debug_bh, "state when allocated");
1663                        BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1664                        brelse(debug_bh);
1665                }
1666        }
1667        jbd_lock_bh_state(bitmap_bh);
1668        spin_lock(sb_bgl_lock(sbi, group_no));
1669        if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1670                int i;
1671
1672                for (i = 0; i < num; i++) {
1673                        if (ext3_test_bit(grp_alloc_blk+i,
1674                                        bh2jh(bitmap_bh)->b_committed_data)) {
1675                                printk("%s: block was unexpectedly set in "
1676                                        "b_committed_data\n", __func__);
1677                        }
1678                }
1679        }
1680        ext3_debug("found bit %d\n", grp_alloc_blk);
1681        spin_unlock(sb_bgl_lock(sbi, group_no));
1682        jbd_unlock_bh_state(bitmap_bh);
1683#endif
1684
1685        if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
1686                ext3_error(sb, "ext3_new_block",
1687                            "block("E3FSBLK") >= blocks count(%d) - "
1688                            "block_group = %d, es == %p ", ret_block,
1689                        le32_to_cpu(es->s_blocks_count), group_no, es);
1690                goto out;
1691        }
1692
1693        /*
1694         * It is up to the caller to add the new buffer to a journal
1695         * list of some description.  We don't know in advance whether
1696         * the caller wants to use it as metadata or data.
1697         */
1698        ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
1699                        ret_block, goal_hits, goal_attempts);
1700
1701        spin_lock(sb_bgl_lock(sbi, group_no));
1702        le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1703        spin_unlock(sb_bgl_lock(sbi, group_no));
1704        percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1705
1706        BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1707        err = ext3_journal_dirty_metadata(handle, gdp_bh);
1708        if (!fatal)
1709                fatal = err;
1710
1711        if (fatal)
1712                goto out;
1713
1714        *errp = 0;
1715        brelse(bitmap_bh);
1716        vfs_dq_free_block(inode, *count-num);
1717        *count = num;
1718        return ret_block;
1719
1720io_error:
1721        *errp = -EIO;
1722out:
1723        if (fatal) {
1724                *errp = fatal;
1725                ext3_std_error(sb, fatal);
1726        }
1727        /*
1728         * Undo the block allocation
1729         */
1730        if (!performed_allocation)
1731                vfs_dq_free_block(inode, *count);
1732        brelse(bitmap_bh);
1733        return 0;
1734}
1735
1736ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
1737                        ext3_fsblk_t goal, int *errp)
1738{
1739        unsigned long count = 1;
1740
1741        return ext3_new_blocks(handle, inode, goal, &count, errp);
1742}
1743
1744/**
1745 * ext3_count_free_blocks() -- count filesystem free blocks
1746 * @sb:         superblock
1747 *
1748 * Adds up the number of free blocks from each block group.
1749 */
1750ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
1751{
1752        ext3_fsblk_t desc_count;
1753        struct ext3_group_desc *gdp;
1754        int i;
1755        unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
1756#ifdef EXT3FS_DEBUG
1757        struct ext3_super_block *es;
1758        ext3_fsblk_t bitmap_count;
1759        unsigned long x;
1760        struct buffer_head *bitmap_bh = NULL;
1761
1762        es = EXT3_SB(sb)->s_es;
1763        desc_count = 0;
1764        bitmap_count = 0;
1765        gdp = NULL;
1766
1767        smp_rmb();
1768        for (i = 0; i < ngroups; i++) {
1769                gdp = ext3_get_group_desc(sb, i, NULL);
1770                if (!gdp)
1771                        continue;
1772                desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1773                brelse(bitmap_bh);
1774                bitmap_bh = read_block_bitmap(sb, i);
1775                if (bitmap_bh == NULL)
1776                        continue;
1777
1778                x = ext3_count_free(bitmap_bh, sb->s_blocksize);
1779                printk("group %d: stored = %d, counted = %lu\n",
1780                        i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1781                bitmap_count += x;
1782        }
1783        brelse(bitmap_bh);
1784        printk("ext3_count_free_blocks: stored = "E3FSBLK
1785                ", computed = "E3FSBLK", "E3FSBLK"\n",
1786               le32_to_cpu(es->s_free_blocks_count),
1787                desc_count, bitmap_count);
1788        return bitmap_count;
1789#else
1790        desc_count = 0;
1791        smp_rmb();
1792        for (i = 0; i < ngroups; i++) {
1793                gdp = ext3_get_group_desc(sb, i, NULL);
1794                if (!gdp)
1795                        continue;
1796                desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1797        }
1798
1799        return desc_count;
1800#endif
1801}
1802
1803static inline int test_root(int a, int b)
1804{
1805        int num = b;
1806
1807        while (a > num)
1808                num *= b;
1809        return num == a;
1810}
1811
1812static int ext3_group_sparse(int group)
1813{
1814        if (group <= 1)
1815                return 1;
1816        if (!(group & 1))
1817                return 0;
1818        return (test_root(group, 7) || test_root(group, 5) ||
1819                test_root(group, 3));
1820}
1821
1822/**
1823 *      ext3_bg_has_super - number of blocks used by the superblock in group
1824 *      @sb: superblock for filesystem
1825 *      @group: group number to check
1826 *
1827 *      Return the number of blocks used by the superblock (primary or backup)
1828 *      in this group.  Currently this will be only 0 or 1.
1829 */
1830int ext3_bg_has_super(struct super_block *sb, int group)
1831{
1832        if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
1833                                EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1834                        !ext3_group_sparse(group))
1835                return 0;
1836        return 1;
1837}
1838
1839static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
1840{
1841        unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1842        unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
1843        unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;
1844
1845        if (group == first || group == first + 1 || group == last)
1846                return 1;
1847        return 0;
1848}
1849
1850static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
1851{
1852        return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
1853}
1854
1855/**
1856 *      ext3_bg_num_gdb - number of blocks used by the group table in group
1857 *      @sb: superblock for filesystem
1858 *      @group: group number to check
1859 *
1860 *      Return the number of blocks used by the group descriptor table
1861 *      (primary or backup) in this group.  In the future there may be a
1862 *      different number of descriptor blocks in each group.
1863 */
1864unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1865{
1866        unsigned long first_meta_bg =
1867                        le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
1868        unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1869
1870        if (!EXT3_HAS_INCOMPAT_FEATURE(sb,EXT3_FEATURE_INCOMPAT_META_BG) ||
1871                        metagroup < first_meta_bg)
1872                return ext3_bg_num_gdb_nometa(sb,group);
1873
1874        return ext3_bg_num_gdb_meta(sb,group);
1875
1876}
1877