linux/fs/ocfs2/suballoc.c
<<
>>
Prefs
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * suballoc.c
   5 *
   6 * metadata alloc and free
   7 * Inspired by ext3 block groups.
   8 *
   9 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  10 *
  11 * This program is free software; you can redistribute it and/or
  12 * modify it under the terms of the GNU General Public
  13 * License as published by the Free Software Foundation; either
  14 * version 2 of the License, or (at your option) any later version.
  15 *
  16 * This program is distributed in the hope that it will be useful,
  17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 * General Public License for more details.
  20 *
  21 * You should have received a copy of the GNU General Public
  22 * License along with this program; if not, write to the
  23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 * Boston, MA 02111-1307, USA.
  25 */
  26
  27#include <linux/fs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/highmem.h>
  31
  32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
  33#include <cluster/masklog.h>
  34
  35#include "ocfs2.h"
  36
  37#include "alloc.h"
  38#include "blockcheck.h"
  39#include "dlmglue.h"
  40#include "inode.h"
  41#include "journal.h"
  42#include "localalloc.h"
  43#include "suballoc.h"
  44#include "super.h"
  45#include "sysfile.h"
  46#include "uptodate.h"
  47
  48#include "buffer_head_io.h"
  49
  50#define NOT_ALLOC_NEW_GROUP             0
  51#define ALLOC_NEW_GROUP                 0x1
  52#define ALLOC_GROUPS_FROM_GLOBAL        0x2
  53
  54#define OCFS2_MAX_INODES_TO_STEAL       1024
  55
  56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
  57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
  58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
  59static int ocfs2_block_group_fill(handle_t *handle,
  60                                  struct inode *alloc_inode,
  61                                  struct buffer_head *bg_bh,
  62                                  u64 group_blkno,
  63                                  u16 my_chain,
  64                                  struct ocfs2_chain_list *cl);
  65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
  66                                   struct inode *alloc_inode,
  67                                   struct buffer_head *bh,
  68                                   u64 max_block,
  69                                   u64 *last_alloc_group,
  70                                   int flags);
  71
  72static int ocfs2_cluster_group_search(struct inode *inode,
  73                                      struct buffer_head *group_bh,
  74                                      u32 bits_wanted, u32 min_bits,
  75                                      u64 max_block,
  76                                      u16 *bit_off, u16 *bits_found);
  77static int ocfs2_block_group_search(struct inode *inode,
  78                                    struct buffer_head *group_bh,
  79                                    u32 bits_wanted, u32 min_bits,
  80                                    u64 max_block,
  81                                    u16 *bit_off, u16 *bits_found);
  82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
  83                                     struct ocfs2_alloc_context *ac,
  84                                     handle_t *handle,
  85                                     u32 bits_wanted,
  86                                     u32 min_bits,
  87                                     u16 *bit_off,
  88                                     unsigned int *num_bits,
  89                                     u64 *bg_blkno);
  90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
  91                                         int nr);
  92static inline int ocfs2_block_group_set_bits(handle_t *handle,
  93                                             struct inode *alloc_inode,
  94                                             struct ocfs2_group_desc *bg,
  95                                             struct buffer_head *group_bh,
  96                                             unsigned int bit_off,
  97                                             unsigned int num_bits);
  98static inline int ocfs2_block_group_clear_bits(handle_t *handle,
  99                                               struct inode *alloc_inode,
 100                                               struct ocfs2_group_desc *bg,
 101                                               struct buffer_head *group_bh,
 102                                               unsigned int bit_off,
 103                                               unsigned int num_bits);
 104
 105static int ocfs2_relink_block_group(handle_t *handle,
 106                                    struct inode *alloc_inode,
 107                                    struct buffer_head *fe_bh,
 108                                    struct buffer_head *bg_bh,
 109                                    struct buffer_head *prev_bg_bh,
 110                                    u16 chain);
 111static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
 112                                                     u32 wanted);
 113static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 114                                                   u64 bg_blkno,
 115                                                   u16 bg_bit_off);
 116static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 117                                                u64 data_blkno,
 118                                                u64 *bg_blkno,
 119                                                u16 *bg_bit_off);
 120static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 121                                             u32 bits_wanted, u64 max_block,
 122                                             int flags,
 123                                             struct ocfs2_alloc_context **ac);
 124
 125void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 126{
 127        struct inode *inode = ac->ac_inode;
 128
 129        if (inode) {
 130                if (ac->ac_which != OCFS2_AC_USE_LOCAL)
 131                        ocfs2_inode_unlock(inode, 1);
 132
 133                mutex_unlock(&inode->i_mutex);
 134
 135                iput(inode);
 136                ac->ac_inode = NULL;
 137        }
 138        brelse(ac->ac_bh);
 139        ac->ac_bh = NULL;
 140}
 141
 142void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
 143{
 144        ocfs2_free_ac_resource(ac);
 145        kfree(ac);
 146}
 147
 148static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 149{
 150        return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 151}
 152
 153#define do_error(fmt, ...)                                              \
 154        do{                                                             \
 155                if (clean_error)                                        \
 156                        mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
 157                else                                                    \
 158                        ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
 159        } while (0)
 160
 161static int ocfs2_validate_gd_self(struct super_block *sb,
 162                                  struct buffer_head *bh,
 163                                  int clean_error)
 164{
 165        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 166
 167        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
 168                do_error("Group descriptor #%llu has bad signature %.*s",
 169                         (unsigned long long)bh->b_blocknr, 7,
 170                         gd->bg_signature);
 171                return -EINVAL;
 172        }
 173
 174        if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
 175                do_error("Group descriptor #%llu has an invalid bg_blkno "
 176                         "of %llu",
 177                         (unsigned long long)bh->b_blocknr,
 178                         (unsigned long long)le64_to_cpu(gd->bg_blkno));
 179                return -EINVAL;
 180        }
 181
 182        if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
 183                do_error("Group descriptor #%llu has an invalid "
 184                         "fs_generation of #%u",
 185                         (unsigned long long)bh->b_blocknr,
 186                         le32_to_cpu(gd->bg_generation));
 187                return -EINVAL;
 188        }
 189
 190        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
 191                do_error("Group descriptor #%llu has bit count %u but "
 192                         "claims that %u are free",
 193                         (unsigned long long)bh->b_blocknr,
 194                         le16_to_cpu(gd->bg_bits),
 195                         le16_to_cpu(gd->bg_free_bits_count));
 196                return -EINVAL;
 197        }
 198
 199        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
 200                do_error("Group descriptor #%llu has bit count %u but "
 201                         "max bitmap bits of %u",
 202                         (unsigned long long)bh->b_blocknr,
 203                         le16_to_cpu(gd->bg_bits),
 204                         8 * le16_to_cpu(gd->bg_size));
 205                return -EINVAL;
 206        }
 207
 208        return 0;
 209}
 210
 211static int ocfs2_validate_gd_parent(struct super_block *sb,
 212                                    struct ocfs2_dinode *di,
 213                                    struct buffer_head *bh,
 214                                    int clean_error)
 215{
 216        unsigned int max_bits;
 217        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 218
 219        if (di->i_blkno != gd->bg_parent_dinode) {
 220                do_error("Group descriptor #%llu has bad parent "
 221                         "pointer (%llu, expected %llu)",
 222                         (unsigned long long)bh->b_blocknr,
 223                         (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
 224                         (unsigned long long)le64_to_cpu(di->i_blkno));
 225                return -EINVAL;
 226        }
 227
 228        max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 229        if (le16_to_cpu(gd->bg_bits) > max_bits) {
 230                do_error("Group descriptor #%llu has bit count of %u",
 231                         (unsigned long long)bh->b_blocknr,
 232                         le16_to_cpu(gd->bg_bits));
 233                return -EINVAL;
 234        }
 235
 236        if (le16_to_cpu(gd->bg_chain) >=
 237            le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
 238                do_error("Group descriptor #%llu has bad chain %u",
 239                         (unsigned long long)bh->b_blocknr,
 240                         le16_to_cpu(gd->bg_chain));
 241                return -EINVAL;
 242        }
 243
 244        return 0;
 245}
 246
 247#undef do_error
 248
 249/*
 250 * This version only prints errors.  It does not fail the filesystem, and
 251 * exists only for resize.
 252 */
 253int ocfs2_check_group_descriptor(struct super_block *sb,
 254                                 struct ocfs2_dinode *di,
 255                                 struct buffer_head *bh)
 256{
 257        int rc;
 258        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 259
 260        BUG_ON(!buffer_uptodate(bh));
 261
 262        /*
 263         * If the ecc fails, we return the error but otherwise
 264         * leave the filesystem running.  We know any error is
 265         * local to this block.
 266         */
 267        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 268        if (rc) {
 269                mlog(ML_ERROR,
 270                     "Checksum failed for group descriptor %llu\n",
 271                     (unsigned long long)bh->b_blocknr);
 272        } else
 273                rc = ocfs2_validate_gd_self(sb, bh, 1);
 274        if (!rc)
 275                rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
 276
 277        return rc;
 278}
 279
 280static int ocfs2_validate_group_descriptor(struct super_block *sb,
 281                                           struct buffer_head *bh)
 282{
 283        int rc;
 284        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 285
 286        mlog(0, "Validating group descriptor %llu\n",
 287             (unsigned long long)bh->b_blocknr);
 288
 289        BUG_ON(!buffer_uptodate(bh));
 290
 291        /*
 292         * If the ecc fails, we return the error but otherwise
 293         * leave the filesystem running.  We know any error is
 294         * local to this block.
 295         */
 296        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 297        if (rc)
 298                return rc;
 299
 300        /*
 301         * Errors after here are fatal.
 302         */
 303
 304        return ocfs2_validate_gd_self(sb, bh, 0);
 305}
 306
 307int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
 308                                u64 gd_blkno, struct buffer_head **bh)
 309{
 310        int rc;
 311        struct buffer_head *tmp = *bh;
 312
 313        rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
 314                              ocfs2_validate_group_descriptor);
 315        if (rc)
 316                goto out;
 317
 318        rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
 319        if (rc) {
 320                brelse(tmp);
 321                goto out;
 322        }
 323
 324        /* If ocfs2_read_block() got us a new bh, pass it up. */
 325        if (!*bh)
 326                *bh = tmp;
 327
 328out:
 329        return rc;
 330}
 331
 332static int ocfs2_block_group_fill(handle_t *handle,
 333                                  struct inode *alloc_inode,
 334                                  struct buffer_head *bg_bh,
 335                                  u64 group_blkno,
 336                                  u16 my_chain,
 337                                  struct ocfs2_chain_list *cl)
 338{
 339        int status = 0;
 340        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 341        struct super_block * sb = alloc_inode->i_sb;
 342
 343        mlog_entry_void();
 344
 345        if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
 346                ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
 347                            "b_blocknr (%llu)",
 348                            (unsigned long long)group_blkno,
 349                            (unsigned long long) bg_bh->b_blocknr);
 350                status = -EIO;
 351                goto bail;
 352        }
 353
 354        status = ocfs2_journal_access_gd(handle,
 355                                         INODE_CACHE(alloc_inode),
 356                                         bg_bh,
 357                                         OCFS2_JOURNAL_ACCESS_CREATE);
 358        if (status < 0) {
 359                mlog_errno(status);
 360                goto bail;
 361        }
 362
 363        memset(bg, 0, sb->s_blocksize);
 364        strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
 365        bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
 366        bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
 367        bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
 368        bg->bg_chain = cpu_to_le16(my_chain);
 369        bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
 370        bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
 371        bg->bg_blkno = cpu_to_le64(group_blkno);
 372        /* set the 1st bit in the bitmap to account for the descriptor block */
 373        ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
 374        bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
 375
 376        status = ocfs2_journal_dirty(handle, bg_bh);
 377        if (status < 0)
 378                mlog_errno(status);
 379
 380        /* There is no need to zero out or otherwise initialize the
 381         * other blocks in a group - All valid FS metadata in a block
 382         * group stores the superblock fs_generation value at
 383         * allocation time. */
 384
 385bail:
 386        mlog_exit(status);
 387        return status;
 388}
 389
 390static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 391{
 392        u16 curr, best;
 393
 394        best = curr = 0;
 395        while (curr < le16_to_cpu(cl->cl_count)) {
 396                if (le32_to_cpu(cl->cl_recs[best].c_total) >
 397                    le32_to_cpu(cl->cl_recs[curr].c_total))
 398                        best = curr;
 399                curr++;
 400        }
 401        return best;
 402}
 403
 404/*
 405 * We expect the block group allocator to already be locked.
 406 */
 407static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 408                                   struct inode *alloc_inode,
 409                                   struct buffer_head *bh,
 410                                   u64 max_block,
 411                                   u64 *last_alloc_group,
 412                                   int flags)
 413{
 414        int status, credits;
 415        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
 416        struct ocfs2_chain_list *cl;
 417        struct ocfs2_alloc_context *ac = NULL;
 418        handle_t *handle = NULL;
 419        u32 bit_off, num_bits;
 420        u16 alloc_rec;
 421        u64 bg_blkno;
 422        struct buffer_head *bg_bh = NULL;
 423        struct ocfs2_group_desc *bg;
 424
 425        BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
 426
 427        mlog_entry_void();
 428
 429        cl = &fe->id2.i_chain;
 430        status = ocfs2_reserve_clusters_with_limit(osb,
 431                                                   le16_to_cpu(cl->cl_cpg),
 432                                                   max_block, flags, &ac);
 433        if (status < 0) {
 434                if (status != -ENOSPC)
 435                        mlog_errno(status);
 436                goto bail;
 437        }
 438
 439        credits = ocfs2_calc_group_alloc_credits(osb->sb,
 440                                                 le16_to_cpu(cl->cl_cpg));
 441        handle = ocfs2_start_trans(osb, credits);
 442        if (IS_ERR(handle)) {
 443                status = PTR_ERR(handle);
 444                handle = NULL;
 445                mlog_errno(status);
 446                goto bail;
 447        }
 448
 449        if (last_alloc_group && *last_alloc_group != 0) {
 450                mlog(0, "use old allocation group %llu for block group alloc\n",
 451                     (unsigned long long)*last_alloc_group);
 452                ac->ac_last_group = *last_alloc_group;
 453        }
 454        status = ocfs2_claim_clusters(osb,
 455                                      handle,
 456                                      ac,
 457                                      le16_to_cpu(cl->cl_cpg),
 458                                      &bit_off,
 459                                      &num_bits);
 460        if (status < 0) {
 461                if (status != -ENOSPC)
 462                        mlog_errno(status);
 463                goto bail;
 464        }
 465
 466        alloc_rec = ocfs2_find_smallest_chain(cl);
 467
 468        /* setup the group */
 469        bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 470        mlog(0, "new descriptor, record %u, at block %llu\n",
 471             alloc_rec, (unsigned long long)bg_blkno);
 472
 473        bg_bh = sb_getblk(osb->sb, bg_blkno);
 474        if (!bg_bh) {
 475                status = -EIO;
 476                mlog_errno(status);
 477                goto bail;
 478        }
 479        ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 480
 481        status = ocfs2_block_group_fill(handle,
 482                                        alloc_inode,
 483                                        bg_bh,
 484                                        bg_blkno,
 485                                        alloc_rec,
 486                                        cl);
 487        if (status < 0) {
 488                mlog_errno(status);
 489                goto bail;
 490        }
 491
 492        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 493
 494        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 495                                         bh, OCFS2_JOURNAL_ACCESS_WRITE);
 496        if (status < 0) {
 497                mlog_errno(status);
 498                goto bail;
 499        }
 500
 501        le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
 502                     le16_to_cpu(bg->bg_free_bits_count));
 503        le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
 504        cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
 505        if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
 506                le16_add_cpu(&cl->cl_next_free_rec, 1);
 507
 508        le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
 509                                        le16_to_cpu(bg->bg_free_bits_count));
 510        le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
 511        le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
 512
 513        status = ocfs2_journal_dirty(handle, bh);
 514        if (status < 0) {
 515                mlog_errno(status);
 516                goto bail;
 517        }
 518
 519        spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
 520        OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 521        fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
 522                                             le32_to_cpu(fe->i_clusters)));
 523        spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
 524        i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
 525        alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
 526
 527        status = 0;
 528
 529        /* save the new last alloc group so that the caller can cache it. */
 530        if (last_alloc_group)
 531                *last_alloc_group = ac->ac_last_group;
 532
 533bail:
 534        if (handle)
 535                ocfs2_commit_trans(osb, handle);
 536
 537        if (ac)
 538                ocfs2_free_alloc_context(ac);
 539
 540        brelse(bg_bh);
 541
 542        mlog_exit(status);
 543        return status;
 544}
 545
 546static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 547                                       struct ocfs2_alloc_context *ac,
 548                                       int type,
 549                                       u32 slot,
 550                                       u64 *last_alloc_group,
 551                                       int flags)
 552{
 553        int status;
 554        u32 bits_wanted = ac->ac_bits_wanted;
 555        struct inode *alloc_inode;
 556        struct buffer_head *bh = NULL;
 557        struct ocfs2_dinode *fe;
 558        u32 free_bits;
 559
 560        mlog_entry_void();
 561
 562        alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
 563        if (!alloc_inode) {
 564                mlog_errno(-EINVAL);
 565                return -EINVAL;
 566        }
 567
 568        mutex_lock(&alloc_inode->i_mutex);
 569
 570        status = ocfs2_inode_lock(alloc_inode, &bh, 1);
 571        if (status < 0) {
 572                mutex_unlock(&alloc_inode->i_mutex);
 573                iput(alloc_inode);
 574
 575                mlog_errno(status);
 576                return status;
 577        }
 578
 579        ac->ac_inode = alloc_inode;
 580        ac->ac_alloc_slot = slot;
 581
 582        fe = (struct ocfs2_dinode *) bh->b_data;
 583
 584        /* The bh was validated by the inode read inside
 585         * ocfs2_inode_lock().  Any corruption is a code bug. */
 586        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 587
 588        if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 589                ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 590                            (unsigned long long)le64_to_cpu(fe->i_blkno));
 591                status = -EIO;
 592                goto bail;
 593        }
 594
 595        free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
 596                le32_to_cpu(fe->id1.bitmap1.i_used);
 597
 598        if (bits_wanted > free_bits) {
 599                /* cluster bitmap never grows */
 600                if (ocfs2_is_cluster_bitmap(alloc_inode)) {
 601                        mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
 602                             bits_wanted, free_bits);
 603                        status = -ENOSPC;
 604                        goto bail;
 605                }
 606
 607                if (!(flags & ALLOC_NEW_GROUP)) {
 608                        mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
 609                             "and we don't alloc a new group for it.\n",
 610                             slot, bits_wanted, free_bits);
 611                        status = -ENOSPC;
 612                        goto bail;
 613                }
 614
 615                status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
 616                                                 ac->ac_max_block,
 617                                                 last_alloc_group, flags);
 618                if (status < 0) {
 619                        if (status != -ENOSPC)
 620                                mlog_errno(status);
 621                        goto bail;
 622                }
 623                atomic_inc(&osb->alloc_stats.bg_extends);
 624
 625                /* You should never ask for this much metadata */
 626                BUG_ON(bits_wanted >
 627                       (le32_to_cpu(fe->id1.bitmap1.i_total)
 628                        - le32_to_cpu(fe->id1.bitmap1.i_used)));
 629        }
 630
 631        get_bh(bh);
 632        ac->ac_bh = bh;
 633bail:
 634        brelse(bh);
 635
 636        mlog_exit(status);
 637        return status;
 638}
 639
 640int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 641                                      int blocks,
 642                                      struct ocfs2_alloc_context **ac)
 643{
 644        int status;
 645        u32 slot;
 646
 647        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 648        if (!(*ac)) {
 649                status = -ENOMEM;
 650                mlog_errno(status);
 651                goto bail;
 652        }
 653
 654        (*ac)->ac_bits_wanted = blocks;
 655        (*ac)->ac_which = OCFS2_AC_USE_META;
 656        slot = osb->slot_num;
 657        (*ac)->ac_group_search = ocfs2_block_group_search;
 658
 659        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
 660                                             EXTENT_ALLOC_SYSTEM_INODE,
 661                                             slot, NULL, ALLOC_NEW_GROUP);
 662        if (status < 0) {
 663                if (status != -ENOSPC)
 664                        mlog_errno(status);
 665                goto bail;
 666        }
 667
 668        status = 0;
 669bail:
 670        if ((status < 0) && *ac) {
 671                ocfs2_free_alloc_context(*ac);
 672                *ac = NULL;
 673        }
 674
 675        mlog_exit(status);
 676        return status;
 677}
 678
 679int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 680                               struct ocfs2_extent_list *root_el,
 681                               struct ocfs2_alloc_context **ac)
 682{
 683        return ocfs2_reserve_new_metadata_blocks(osb,
 684                                        ocfs2_extend_meta_needed(root_el),
 685                                        ac);
 686}
 687
 688static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
 689                                              struct ocfs2_alloc_context *ac)
 690{
 691        int i, status = -ENOSPC;
 692        s16 slot = ocfs2_get_inode_steal_slot(osb);
 693
 694        /* Start to steal inodes from the first slot after ours. */
 695        if (slot == OCFS2_INVALID_SLOT)
 696                slot = osb->slot_num + 1;
 697
 698        for (i = 0; i < osb->max_slots; i++, slot++) {
 699                if (slot == osb->max_slots)
 700                        slot = 0;
 701
 702                if (slot == osb->slot_num)
 703                        continue;
 704
 705                status = ocfs2_reserve_suballoc_bits(osb, ac,
 706                                                     INODE_ALLOC_SYSTEM_INODE,
 707                                                     slot, NULL,
 708                                                     NOT_ALLOC_NEW_GROUP);
 709                if (status >= 0) {
 710                        ocfs2_set_inode_steal_slot(osb, slot);
 711                        break;
 712                }
 713
 714                ocfs2_free_ac_resource(ac);
 715        }
 716
 717        return status;
 718}
 719
 720int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 721                            struct ocfs2_alloc_context **ac)
 722{
 723        int status;
 724        s16 slot = ocfs2_get_inode_steal_slot(osb);
 725        u64 alloc_group;
 726
 727        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 728        if (!(*ac)) {
 729                status = -ENOMEM;
 730                mlog_errno(status);
 731                goto bail;
 732        }
 733
 734        (*ac)->ac_bits_wanted = 1;
 735        (*ac)->ac_which = OCFS2_AC_USE_INODE;
 736
 737        (*ac)->ac_group_search = ocfs2_block_group_search;
 738
 739        /*
 740         * stat(2) can't handle i_ino > 32bits, so we tell the
 741         * lower levels not to allocate us a block group past that
 742         * limit.  The 'inode64' mount option avoids this behavior.
 743         */
 744        if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
 745                (*ac)->ac_max_block = (u32)~0U;
 746
 747        /*
 748         * slot is set when we successfully steal inode from other nodes.
 749         * It is reset in 3 places:
 750         * 1. when we flush the truncate log
 751         * 2. when we complete local alloc recovery.
 752         * 3. when we successfully allocate from our own slot.
 753         * After it is set, we will go on stealing inodes until we find the
 754         * need to check our slots to see whether there is some space for us.
 755         */
 756        if (slot != OCFS2_INVALID_SLOT &&
 757            atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
 758                goto inode_steal;
 759
 760        atomic_set(&osb->s_num_inodes_stolen, 0);
 761        alloc_group = osb->osb_inode_alloc_group;
 762        status = ocfs2_reserve_suballoc_bits(osb, *ac,
 763                                             INODE_ALLOC_SYSTEM_INODE,
 764                                             osb->slot_num,
 765                                             &alloc_group,
 766                                             ALLOC_NEW_GROUP |
 767                                             ALLOC_GROUPS_FROM_GLOBAL);
 768        if (status >= 0) {
 769                status = 0;
 770
 771                spin_lock(&osb->osb_lock);
 772                osb->osb_inode_alloc_group = alloc_group;
 773                spin_unlock(&osb->osb_lock);
 774                mlog(0, "after reservation, new allocation group is "
 775                     "%llu\n", (unsigned long long)alloc_group);
 776
 777                /*
 778                 * Some inodes must be freed by us, so try to allocate
 779                 * from our own next time.
 780                 */
 781                if (slot != OCFS2_INVALID_SLOT)
 782                        ocfs2_init_inode_steal_slot(osb);
 783                goto bail;
 784        } else if (status < 0 && status != -ENOSPC) {
 785                mlog_errno(status);
 786                goto bail;
 787        }
 788
 789        ocfs2_free_ac_resource(*ac);
 790
 791inode_steal:
 792        status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
 793        atomic_inc(&osb->s_num_inodes_stolen);
 794        if (status < 0) {
 795                if (status != -ENOSPC)
 796                        mlog_errno(status);
 797                goto bail;
 798        }
 799
 800        status = 0;
 801bail:
 802        if ((status < 0) && *ac) {
 803                ocfs2_free_alloc_context(*ac);
 804                *ac = NULL;
 805        }
 806
 807        mlog_exit(status);
 808        return status;
 809}
 810
 811/* local alloc code has to do the same thing, so rather than do this
 812 * twice.. */
 813int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 814                                      struct ocfs2_alloc_context *ac)
 815{
 816        int status;
 817
 818        ac->ac_which = OCFS2_AC_USE_MAIN;
 819        ac->ac_group_search = ocfs2_cluster_group_search;
 820
 821        status = ocfs2_reserve_suballoc_bits(osb, ac,
 822                                             GLOBAL_BITMAP_SYSTEM_INODE,
 823                                             OCFS2_INVALID_SLOT, NULL,
 824                                             ALLOC_NEW_GROUP);
 825        if (status < 0 && status != -ENOSPC) {
 826                mlog_errno(status);
 827                goto bail;
 828        }
 829
 830bail:
 831        return status;
 832}
 833
 834/* Callers don't need to care which bitmap (local alloc or main) to
 835 * use so we figure it out for them, but unfortunately this clutters
 836 * things a bit. */
 837static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 838                                             u32 bits_wanted, u64 max_block,
 839                                             int flags,
 840                                             struct ocfs2_alloc_context **ac)
 841{
 842        int status;
 843
 844        mlog_entry_void();
 845
 846        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 847        if (!(*ac)) {
 848                status = -ENOMEM;
 849                mlog_errno(status);
 850                goto bail;
 851        }
 852
 853        (*ac)->ac_bits_wanted = bits_wanted;
 854        (*ac)->ac_max_block = max_block;
 855
 856        status = -ENOSPC;
 857        if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
 858            ocfs2_alloc_should_use_local(osb, bits_wanted)) {
 859                status = ocfs2_reserve_local_alloc_bits(osb,
 860                                                        bits_wanted,
 861                                                        *ac);
 862                if (status == -EFBIG) {
 863                        /* The local alloc window is outside ac_max_block.
 864                         * use the main bitmap. */
 865                        status = -ENOSPC;
 866                } else if ((status < 0) && (status != -ENOSPC)) {
 867                        mlog_errno(status);
 868                        goto bail;
 869                }
 870        }
 871
 872        if (status == -ENOSPC) {
 873                status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
 874                if (status < 0) {
 875                        if (status != -ENOSPC)
 876                                mlog_errno(status);
 877                        goto bail;
 878                }
 879        }
 880
 881        status = 0;
 882bail:
 883        if ((status < 0) && *ac) {
 884                ocfs2_free_alloc_context(*ac);
 885                *ac = NULL;
 886        }
 887
 888        mlog_exit(status);
 889        return status;
 890}
 891
 892int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 893                           u32 bits_wanted,
 894                           struct ocfs2_alloc_context **ac)
 895{
 896        return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
 897                                                 ALLOC_NEW_GROUP, ac);
 898}
 899
 900/*
 901 * More or less lifted from ext3. I'll leave their description below:
 902 *
 903 * "For ext3 allocations, we must not reuse any blocks which are
 904 * allocated in the bitmap buffer's "last committed data" copy.  This
 905 * prevents deletes from freeing up the page for reuse until we have
 906 * committed the delete transaction.
 907 *
 908 * If we didn't do this, then deleting something and reallocating it as
 909 * data would allow the old block to be overwritten before the
 910 * transaction committed (because we force data to disk before commit).
 911 * This would lead to corruption if we crashed between overwriting the
 912 * data and committing the delete.
 913 *
 914 * @@@ We may want to make this allocation behaviour conditional on
 915 * data-writes at some point, and disable it for metadata allocations or
 916 * sync-data inodes."
 917 *
 918 * Note: OCFS2 already does this differently for metadata vs data
 919 * allocations, as those bitmaps are separate and undo access is never
 920 * called on a metadata group descriptor.
 921 */
 922static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 923                                         int nr)
 924{
 925        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 926        int ret;
 927
 928        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
 929                return 0;
 930
 931        if (!buffer_jbd(bg_bh))
 932                return 1;
 933
 934        jbd_lock_bh_state(bg_bh);
 935        bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
 936        if (bg)
 937                ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
 938        else
 939                ret = 1;
 940        jbd_unlock_bh_state(bg_bh);
 941
 942        return ret;
 943}
 944
 945static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 946                                             struct buffer_head *bg_bh,
 947                                             unsigned int bits_wanted,
 948                                             unsigned int total_bits,
 949                                             u16 *bit_off,
 950                                             u16 *bits_found)
 951{
 952        void *bitmap;
 953        u16 best_offset, best_size;
 954        int offset, start, found, status = 0;
 955        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 956
 957        /* Callers got this descriptor from
 958         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 959        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 960
 961        found = start = best_offset = best_size = 0;
 962        bitmap = bg->bg_bitmap;
 963
 964        while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
 965                if (offset == total_bits)
 966                        break;
 967
 968                if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
 969                        /* We found a zero, but we can't use it as it
 970                         * hasn't been put to disk yet! */
 971                        found = 0;
 972                        start = offset + 1;
 973                } else if (offset == start) {
 974                        /* we found a zero */
 975                        found++;
 976                        /* move start to the next bit to test */
 977                        start++;
 978                } else {
 979                        /* got a zero after some ones */
 980                        found = 1;
 981                        start = offset + 1;
 982                }
 983                if (found > best_size) {
 984                        best_size = found;
 985                        best_offset = start - found;
 986                }
 987                /* we got everything we needed */
 988                if (found == bits_wanted) {
 989                        /* mlog(0, "Found it all!\n"); */
 990                        break;
 991                }
 992        }
 993
 994        /* XXX: I think the first clause is equivalent to the second
 995         *      - jlbec */
 996        if (found == bits_wanted) {
 997                *bit_off = start - found;
 998                *bits_found = found;
 999        } else if (best_size) {
1000                *bit_off = best_offset;
1001                *bits_found = best_size;
1002        } else {
1003                status = -ENOSPC;
1004                /* No error log here -- see the comment above
1005                 * ocfs2_test_bg_bit_allocatable */
1006        }
1007
1008        return status;
1009}
1010
1011static inline int ocfs2_block_group_set_bits(handle_t *handle,
1012                                             struct inode *alloc_inode,
1013                                             struct ocfs2_group_desc *bg,
1014                                             struct buffer_head *group_bh,
1015                                             unsigned int bit_off,
1016                                             unsigned int num_bits)
1017{
1018        int status;
1019        void *bitmap = bg->bg_bitmap;
1020        int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1021
1022        mlog_entry_void();
1023
1024        /* All callers get the descriptor via
1025         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1026        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1027        BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1028
1029        mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1030             num_bits);
1031
1032        if (ocfs2_is_cluster_bitmap(alloc_inode))
1033                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1034
1035        status = ocfs2_journal_access_gd(handle,
1036                                         INODE_CACHE(alloc_inode),
1037                                         group_bh,
1038                                         journal_type);
1039        if (status < 0) {
1040                mlog_errno(status);
1041                goto bail;
1042        }
1043
1044        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1045
1046        while(num_bits--)
1047                ocfs2_set_bit(bit_off++, bitmap);
1048
1049        status = ocfs2_journal_dirty(handle,
1050                                     group_bh);
1051        if (status < 0) {
1052                mlog_errno(status);
1053                goto bail;
1054        }
1055
1056bail:
1057        mlog_exit(status);
1058        return status;
1059}
1060
1061/* find the one with the most empty bits */
1062static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1063{
1064        u16 curr, best;
1065
1066        BUG_ON(!cl->cl_next_free_rec);
1067
1068        best = curr = 0;
1069        while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1070                if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1071                    le32_to_cpu(cl->cl_recs[best].c_free))
1072                        best = curr;
1073                curr++;
1074        }
1075
1076        BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1077        return best;
1078}
1079
1080static int ocfs2_relink_block_group(handle_t *handle,
1081                                    struct inode *alloc_inode,
1082                                    struct buffer_head *fe_bh,
1083                                    struct buffer_head *bg_bh,
1084                                    struct buffer_head *prev_bg_bh,
1085                                    u16 chain)
1086{
1087        int status;
1088        /* there is a really tiny chance the journal calls could fail,
1089         * but we wouldn't want inconsistent blocks in *any* case. */
1090        u64 fe_ptr, bg_ptr, prev_bg_ptr;
1091        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1092        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1093        struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1094
1095        /* The caller got these descriptors from
1096         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1097        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1098        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1099
1100        mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1101             (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1102             (unsigned long long)le64_to_cpu(bg->bg_blkno),
1103             (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1104
1105        fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1106        bg_ptr = le64_to_cpu(bg->bg_next_group);
1107        prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1108
1109        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1110                                         prev_bg_bh,
1111                                         OCFS2_JOURNAL_ACCESS_WRITE);
1112        if (status < 0) {
1113                mlog_errno(status);
1114                goto out_rollback;
1115        }
1116
1117        prev_bg->bg_next_group = bg->bg_next_group;
1118
1119        status = ocfs2_journal_dirty(handle, prev_bg_bh);
1120        if (status < 0) {
1121                mlog_errno(status);
1122                goto out_rollback;
1123        }
1124
1125        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1126                                         bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1127        if (status < 0) {
1128                mlog_errno(status);
1129                goto out_rollback;
1130        }
1131
1132        bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1133
1134        status = ocfs2_journal_dirty(handle, bg_bh);
1135        if (status < 0) {
1136                mlog_errno(status);
1137                goto out_rollback;
1138        }
1139
1140        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1141                                         fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1142        if (status < 0) {
1143                mlog_errno(status);
1144                goto out_rollback;
1145        }
1146
1147        fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1148
1149        status = ocfs2_journal_dirty(handle, fe_bh);
1150        if (status < 0) {
1151                mlog_errno(status);
1152                goto out_rollback;
1153        }
1154
1155        status = 0;
1156out_rollback:
1157        if (status < 0) {
1158                fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1159                bg->bg_next_group = cpu_to_le64(bg_ptr);
1160                prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1161        }
1162
1163        mlog_exit(status);
1164        return status;
1165}
1166
1167static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1168                                                     u32 wanted)
1169{
1170        return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1171}
1172
1173/* return 0 on success, -ENOSPC to keep searching and any other < 0
1174 * value on error. */
1175static int ocfs2_cluster_group_search(struct inode *inode,
1176                                      struct buffer_head *group_bh,
1177                                      u32 bits_wanted, u32 min_bits,
1178                                      u64 max_block,
1179                                      u16 *bit_off, u16 *bits_found)
1180{
1181        int search = -ENOSPC;
1182        int ret;
1183        u64 blkoff;
1184        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1185        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1186        u16 tmp_off, tmp_found;
1187        unsigned int max_bits, gd_cluster_off;
1188
1189        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1190
1191        if (gd->bg_free_bits_count) {
1192                max_bits = le16_to_cpu(gd->bg_bits);
1193
1194                /* Tail groups in cluster bitmaps which aren't cpg
1195                 * aligned are prone to partial extention by a failed
1196                 * fs resize. If the file system resize never got to
1197                 * update the dinode cluster count, then we don't want
1198                 * to trust any clusters past it, regardless of what
1199                 * the group descriptor says. */
1200                gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1201                                                          le64_to_cpu(gd->bg_blkno));
1202                if ((gd_cluster_off + max_bits) >
1203                    OCFS2_I(inode)->ip_clusters) {
1204                        max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1205                        mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1206                             (unsigned long long)le64_to_cpu(gd->bg_blkno),
1207                             le16_to_cpu(gd->bg_bits),
1208                             OCFS2_I(inode)->ip_clusters, max_bits);
1209                }
1210
1211                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1212                                                        group_bh, bits_wanted,
1213                                                        max_bits,
1214                                                        &tmp_off, &tmp_found);
1215                if (ret)
1216                        return ret;
1217
1218                if (max_block) {
1219                        blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1220                                                          gd_cluster_off +
1221                                                          tmp_off + tmp_found);
1222                        mlog(0, "Checking %llu against %llu\n",
1223                             (unsigned long long)blkoff,
1224                             (unsigned long long)max_block);
1225                        if (blkoff > max_block)
1226                                return -ENOSPC;
1227                }
1228
1229                /* ocfs2_block_group_find_clear_bits() might
1230                 * return success, but we still want to return
1231                 * -ENOSPC unless it found the minimum number
1232                 * of bits. */
1233                if (min_bits <= tmp_found) {
1234                        *bit_off = tmp_off;
1235                        *bits_found = tmp_found;
1236                        search = 0; /* success */
1237                } else if (tmp_found) {
1238                        /*
1239                         * Don't show bits which we'll be returning
1240                         * for allocation to the local alloc bitmap.
1241                         */
1242                        ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1243                }
1244        }
1245
1246        return search;
1247}
1248
1249static int ocfs2_block_group_search(struct inode *inode,
1250                                    struct buffer_head *group_bh,
1251                                    u32 bits_wanted, u32 min_bits,
1252                                    u64 max_block,
1253                                    u16 *bit_off, u16 *bits_found)
1254{
1255        int ret = -ENOSPC;
1256        u64 blkoff;
1257        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1258
1259        BUG_ON(min_bits != 1);
1260        BUG_ON(ocfs2_is_cluster_bitmap(inode));
1261
1262        if (bg->bg_free_bits_count) {
1263                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1264                                                        group_bh, bits_wanted,
1265                                                        le16_to_cpu(bg->bg_bits),
1266                                                        bit_off, bits_found);
1267                if (!ret && max_block) {
1268                        blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1269                                *bits_found;
1270                        mlog(0, "Checking %llu against %llu\n",
1271                             (unsigned long long)blkoff,
1272                             (unsigned long long)max_block);
1273                        if (blkoff > max_block)
1274                                ret = -ENOSPC;
1275                }
1276        }
1277
1278        return ret;
1279}
1280
1281static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1282                                       handle_t *handle,
1283                                       struct buffer_head *di_bh,
1284                                       u32 num_bits,
1285                                       u16 chain)
1286{
1287        int ret;
1288        u32 tmp_used;
1289        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1290        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1291
1292        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1293                                      OCFS2_JOURNAL_ACCESS_WRITE);
1294        if (ret < 0) {
1295                mlog_errno(ret);
1296                goto out;
1297        }
1298
1299        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1300        di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1301        le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1302
1303        ret = ocfs2_journal_dirty(handle, di_bh);
1304        if (ret < 0)
1305                mlog_errno(ret);
1306
1307out:
1308        return ret;
1309}
1310
1311static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1312                                  handle_t *handle,
1313                                  u32 bits_wanted,
1314                                  u32 min_bits,
1315                                  u16 *bit_off,
1316                                  unsigned int *num_bits,
1317                                  u64 gd_blkno,
1318                                  u16 *bits_left)
1319{
1320        int ret;
1321        u16 found;
1322        struct buffer_head *group_bh = NULL;
1323        struct ocfs2_group_desc *gd;
1324        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1325        struct inode *alloc_inode = ac->ac_inode;
1326
1327        ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1328                                          &group_bh);
1329        if (ret < 0) {
1330                mlog_errno(ret);
1331                return ret;
1332        }
1333
1334        gd = (struct ocfs2_group_desc *) group_bh->b_data;
1335        ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1336                                  ac->ac_max_block, bit_off, &found);
1337        if (ret < 0) {
1338                if (ret != -ENOSPC)
1339                        mlog_errno(ret);
1340                goto out;
1341        }
1342
1343        *num_bits = found;
1344
1345        ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1346                                               *num_bits,
1347                                               le16_to_cpu(gd->bg_chain));
1348        if (ret < 0) {
1349                mlog_errno(ret);
1350                goto out;
1351        }
1352
1353        ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1354                                         *bit_off, *num_bits);
1355        if (ret < 0)
1356                mlog_errno(ret);
1357
1358        *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1359
1360out:
1361        brelse(group_bh);
1362
1363        return ret;
1364}
1365
1366static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1367                              handle_t *handle,
1368                              u32 bits_wanted,
1369                              u32 min_bits,
1370                              u16 *bit_off,
1371                              unsigned int *num_bits,
1372                              u64 *bg_blkno,
1373                              u16 *bits_left)
1374{
1375        int status;
1376        u16 chain, tmp_bits;
1377        u32 tmp_used;
1378        u64 next_group;
1379        struct inode *alloc_inode = ac->ac_inode;
1380        struct buffer_head *group_bh = NULL;
1381        struct buffer_head *prev_group_bh = NULL;
1382        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1383        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1384        struct ocfs2_group_desc *bg;
1385
1386        chain = ac->ac_chain;
1387        mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1388             bits_wanted, chain,
1389             (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1390
1391        status = ocfs2_read_group_descriptor(alloc_inode, fe,
1392                                             le64_to_cpu(cl->cl_recs[chain].c_blkno),
1393                                             &group_bh);
1394        if (status < 0) {
1395                mlog_errno(status);
1396                goto bail;
1397        }
1398        bg = (struct ocfs2_group_desc *) group_bh->b_data;
1399
1400        status = -ENOSPC;
1401        /* for now, the chain search is a bit simplistic. We just use
1402         * the 1st group with any empty bits. */
1403        while ((status = ac->ac_group_search(alloc_inode, group_bh,
1404                                             bits_wanted, min_bits,
1405                                             ac->ac_max_block, bit_off,
1406                                             &tmp_bits)) == -ENOSPC) {
1407                if (!bg->bg_next_group)
1408                        break;
1409
1410                brelse(prev_group_bh);
1411                prev_group_bh = NULL;
1412
1413                next_group = le64_to_cpu(bg->bg_next_group);
1414                prev_group_bh = group_bh;
1415                group_bh = NULL;
1416                status = ocfs2_read_group_descriptor(alloc_inode, fe,
1417                                                     next_group, &group_bh);
1418                if (status < 0) {
1419                        mlog_errno(status);
1420                        goto bail;
1421                }
1422                bg = (struct ocfs2_group_desc *) group_bh->b_data;
1423        }
1424        if (status < 0) {
1425                if (status != -ENOSPC)
1426                        mlog_errno(status);
1427                goto bail;
1428        }
1429
1430        mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1431             tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1432
1433        *num_bits = tmp_bits;
1434
1435        BUG_ON(*num_bits == 0);
1436
1437        /*
1438         * Keep track of previous block descriptor read. When
1439         * we find a target, if we have read more than X
1440         * number of descriptors, and the target is reasonably
1441         * empty, relink him to top of his chain.
1442         *
1443         * We've read 0 extra blocks and only send one more to
1444         * the transaction, yet the next guy to search has a
1445         * much easier time.
1446         *
1447         * Do this *after* figuring out how many bits we're taking out
1448         * of our target group.
1449         */
1450        if (ac->ac_allow_chain_relink &&
1451            (prev_group_bh) &&
1452            (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1453                status = ocfs2_relink_block_group(handle, alloc_inode,
1454                                                  ac->ac_bh, group_bh,
1455                                                  prev_group_bh, chain);
1456                if (status < 0) {
1457                        mlog_errno(status);
1458                        goto bail;
1459                }
1460        }
1461
1462        /* Ok, claim our bits now: set the info on dinode, chainlist
1463         * and then the group */
1464        status = ocfs2_journal_access_di(handle,
1465                                         INODE_CACHE(alloc_inode),
1466                                         ac->ac_bh,
1467                                         OCFS2_JOURNAL_ACCESS_WRITE);
1468        if (status < 0) {
1469                mlog_errno(status);
1470                goto bail;
1471        }
1472
1473        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1474        fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1475        le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1476
1477        status = ocfs2_journal_dirty(handle,
1478                                     ac->ac_bh);
1479        if (status < 0) {
1480                mlog_errno(status);
1481                goto bail;
1482        }
1483
1484        status = ocfs2_block_group_set_bits(handle,
1485                                            alloc_inode,
1486                                            bg,
1487                                            group_bh,
1488                                            *bit_off,
1489                                            *num_bits);
1490        if (status < 0) {
1491                mlog_errno(status);
1492                goto bail;
1493        }
1494
1495        mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1496             (unsigned long long)le64_to_cpu(fe->i_blkno));
1497
1498        *bg_blkno = le64_to_cpu(bg->bg_blkno);
1499        *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1500bail:
1501        brelse(group_bh);
1502        brelse(prev_group_bh);
1503
1504        mlog_exit(status);
1505        return status;
1506}
1507
1508/* will give out up to bits_wanted contiguous bits. */
1509static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1510                                     struct ocfs2_alloc_context *ac,
1511                                     handle_t *handle,
1512                                     u32 bits_wanted,
1513                                     u32 min_bits,
1514                                     u16 *bit_off,
1515                                     unsigned int *num_bits,
1516                                     u64 *bg_blkno)
1517{
1518        int status;
1519        u16 victim, i;
1520        u16 bits_left = 0;
1521        u64 hint_blkno = ac->ac_last_group;
1522        struct ocfs2_chain_list *cl;
1523        struct ocfs2_dinode *fe;
1524
1525        mlog_entry_void();
1526
1527        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1528        BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1529        BUG_ON(!ac->ac_bh);
1530
1531        fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1532
1533        /* The bh was validated by the inode read during
1534         * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1535        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1536
1537        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1538            le32_to_cpu(fe->id1.bitmap1.i_total)) {
1539                ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1540                            "bits but only %u total.",
1541                            (unsigned long long)le64_to_cpu(fe->i_blkno),
1542                            le32_to_cpu(fe->id1.bitmap1.i_used),
1543                            le32_to_cpu(fe->id1.bitmap1.i_total));
1544                status = -EIO;
1545                goto bail;
1546        }
1547
1548        if (hint_blkno) {
1549                /* Attempt to short-circuit the usual search mechanism
1550                 * by jumping straight to the most recently used
1551                 * allocation group. This helps us mantain some
1552                 * contiguousness across allocations. */
1553                status = ocfs2_search_one_group(ac, handle, bits_wanted,
1554                                                min_bits, bit_off, num_bits,
1555                                                hint_blkno, &bits_left);
1556                if (!status) {
1557                        /* Be careful to update *bg_blkno here as the
1558                         * caller is expecting it to be filled in, and
1559                         * ocfs2_search_one_group() won't do that for
1560                         * us. */
1561                        *bg_blkno = hint_blkno;
1562                        goto set_hint;
1563                }
1564                if (status < 0 && status != -ENOSPC) {
1565                        mlog_errno(status);
1566                        goto bail;
1567                }
1568        }
1569
1570        cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1571
1572        victim = ocfs2_find_victim_chain(cl);
1573        ac->ac_chain = victim;
1574        ac->ac_allow_chain_relink = 1;
1575
1576        status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1577                                    num_bits, bg_blkno, &bits_left);
1578        if (!status)
1579                goto set_hint;
1580        if (status < 0 && status != -ENOSPC) {
1581                mlog_errno(status);
1582                goto bail;
1583        }
1584
1585        mlog(0, "Search of victim chain %u came up with nothing, "
1586             "trying all chains now.\n", victim);
1587
1588        /* If we didn't pick a good victim, then just default to
1589         * searching each chain in order. Don't allow chain relinking
1590         * because we only calculate enough journal credits for one
1591         * relink per alloc. */
1592        ac->ac_allow_chain_relink = 0;
1593        for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1594                if (i == victim)
1595                        continue;
1596                if (!cl->cl_recs[i].c_free)
1597                        continue;
1598
1599                ac->ac_chain = i;
1600                status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1601                                            bit_off, num_bits, bg_blkno,
1602                                            &bits_left);
1603                if (!status)
1604                        break;
1605                if (status < 0 && status != -ENOSPC) {
1606                        mlog_errno(status);
1607                        goto bail;
1608                }
1609        }
1610
1611set_hint:
1612        if (status != -ENOSPC) {
1613                /* If the next search of this group is not likely to
1614                 * yield a suitable extent, then we reset the last
1615                 * group hint so as to not waste a disk read */
1616                if (bits_left < min_bits)
1617                        ac->ac_last_group = 0;
1618                else
1619                        ac->ac_last_group = *bg_blkno;
1620        }
1621
1622bail:
1623        mlog_exit(status);
1624        return status;
1625}
1626
1627int ocfs2_claim_metadata(struct ocfs2_super *osb,
1628                         handle_t *handle,
1629                         struct ocfs2_alloc_context *ac,
1630                         u32 bits_wanted,
1631                         u16 *suballoc_bit_start,
1632                         unsigned int *num_bits,
1633                         u64 *blkno_start)
1634{
1635        int status;
1636        u64 bg_blkno;
1637
1638        BUG_ON(!ac);
1639        BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1640        BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1641
1642        status = ocfs2_claim_suballoc_bits(osb,
1643                                           ac,
1644                                           handle,
1645                                           bits_wanted,
1646                                           1,
1647                                           suballoc_bit_start,
1648                                           num_bits,
1649                                           &bg_blkno);
1650        if (status < 0) {
1651                mlog_errno(status);
1652                goto bail;
1653        }
1654        atomic_inc(&osb->alloc_stats.bg_allocs);
1655
1656        *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1657        ac->ac_bits_given += (*num_bits);
1658        status = 0;
1659bail:
1660        mlog_exit(status);
1661        return status;
1662}
1663
1664static void ocfs2_init_inode_ac_group(struct inode *dir,
1665                                      struct buffer_head *parent_fe_bh,
1666                                      struct ocfs2_alloc_context *ac)
1667{
1668        struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1669        /*
1670         * Try to allocate inodes from some specific group.
1671         *
1672         * If the parent dir has recorded the last group used in allocation,
1673         * cool, use it. Otherwise if we try to allocate new inode from the
1674         * same slot the parent dir belongs to, use the same chunk.
1675         *
1676         * We are very careful here to avoid the mistake of setting
1677         * ac_last_group to a group descriptor from a different (unlocked) slot.
1678         */
1679        if (OCFS2_I(dir)->ip_last_used_group &&
1680            OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1681                ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1682        else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1683                ac->ac_last_group = ocfs2_which_suballoc_group(
1684                                        le64_to_cpu(fe->i_blkno),
1685                                        le16_to_cpu(fe->i_suballoc_bit));
1686}
1687
1688static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1689                                             struct ocfs2_alloc_context *ac)
1690{
1691        OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1692        OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1693}
1694
1695int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1696                          handle_t *handle,
1697                          struct inode *dir,
1698                          struct buffer_head *parent_fe_bh,
1699                          struct ocfs2_alloc_context *ac,
1700                          u16 *suballoc_bit,
1701                          u64 *fe_blkno)
1702{
1703        int status;
1704        unsigned int num_bits;
1705        u64 bg_blkno;
1706
1707        mlog_entry_void();
1708
1709        BUG_ON(!ac);
1710        BUG_ON(ac->ac_bits_given != 0);
1711        BUG_ON(ac->ac_bits_wanted != 1);
1712        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1713
1714        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1715
1716        status = ocfs2_claim_suballoc_bits(osb,
1717                                           ac,
1718                                           handle,
1719                                           1,
1720                                           1,
1721                                           suballoc_bit,
1722                                           &num_bits,
1723                                           &bg_blkno);
1724        if (status < 0) {
1725                mlog_errno(status);
1726                goto bail;
1727        }
1728        atomic_inc(&osb->alloc_stats.bg_allocs);
1729
1730        BUG_ON(num_bits != 1);
1731
1732        *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1733        ac->ac_bits_given++;
1734        ocfs2_save_inode_ac_group(dir, ac);
1735        status = 0;
1736bail:
1737        mlog_exit(status);
1738        return status;
1739}
1740
1741/* translate a group desc. blkno and it's bitmap offset into
1742 * disk cluster offset. */
1743static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1744                                                   u64 bg_blkno,
1745                                                   u16 bg_bit_off)
1746{
1747        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1748        u32 cluster = 0;
1749
1750        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1751
1752        if (bg_blkno != osb->first_cluster_group_blkno)
1753                cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1754        cluster += (u32) bg_bit_off;
1755        return cluster;
1756}
1757
1758/* given a cluster offset, calculate which block group it belongs to
1759 * and return that block offset. */
1760u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1761{
1762        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1763        u32 group_no;
1764
1765        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1766
1767        group_no = cluster / osb->bitmap_cpg;
1768        if (!group_no)
1769                return osb->first_cluster_group_blkno;
1770        return ocfs2_clusters_to_blocks(inode->i_sb,
1771                                        group_no * osb->bitmap_cpg);
1772}
1773
1774/* given the block number of a cluster start, calculate which cluster
1775 * group and descriptor bitmap offset that corresponds to. */
1776static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1777                                                u64 data_blkno,
1778                                                u64 *bg_blkno,
1779                                                u16 *bg_bit_off)
1780{
1781        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1782        u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1783
1784        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1785
1786        *bg_blkno = ocfs2_which_cluster_group(inode,
1787                                              data_cluster);
1788
1789        if (*bg_blkno == osb->first_cluster_group_blkno)
1790                *bg_bit_off = (u16) data_cluster;
1791        else
1792                *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1793                                                             data_blkno - *bg_blkno);
1794}
1795
1796/*
1797 * min_bits - minimum contiguous chunk from this total allocation we
1798 * can handle. set to what we asked for originally for a full
1799 * contig. allocation, set to '1' to indicate we can deal with extents
1800 * of any size.
1801 */
1802int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1803                           handle_t *handle,
1804                           struct ocfs2_alloc_context *ac,
1805                           u32 min_clusters,
1806                           u32 max_clusters,
1807                           u32 *cluster_start,
1808                           u32 *num_clusters)
1809{
1810        int status;
1811        unsigned int bits_wanted = max_clusters;
1812        u64 bg_blkno = 0;
1813        u16 bg_bit_off;
1814
1815        mlog_entry_void();
1816
1817        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1818
1819        BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1820               && ac->ac_which != OCFS2_AC_USE_MAIN);
1821
1822        if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1823                status = ocfs2_claim_local_alloc_bits(osb,
1824                                                      handle,
1825                                                      ac,
1826                                                      bits_wanted,
1827                                                      cluster_start,
1828                                                      num_clusters);
1829                if (!status)
1830                        atomic_inc(&osb->alloc_stats.local_data);
1831        } else {
1832                if (min_clusters > (osb->bitmap_cpg - 1)) {
1833                        /* The only paths asking for contiguousness
1834                         * should know about this already. */
1835                        mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1836                             "group bitmap size %u!\n", min_clusters,
1837                             osb->bitmap_cpg);
1838                        status = -ENOSPC;
1839                        goto bail;
1840                }
1841                /* clamp the current request down to a realistic size. */
1842                if (bits_wanted > (osb->bitmap_cpg - 1))
1843                        bits_wanted = osb->bitmap_cpg - 1;
1844
1845                status = ocfs2_claim_suballoc_bits(osb,
1846                                                   ac,
1847                                                   handle,
1848                                                   bits_wanted,
1849                                                   min_clusters,
1850                                                   &bg_bit_off,
1851                                                   num_clusters,
1852                                                   &bg_blkno);
1853                if (!status) {
1854                        *cluster_start =
1855                                ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1856                                                                 bg_blkno,
1857                                                                 bg_bit_off);
1858                        atomic_inc(&osb->alloc_stats.bitmap_data);
1859                }
1860        }
1861        if (status < 0) {
1862                if (status != -ENOSPC)
1863                        mlog_errno(status);
1864                goto bail;
1865        }
1866
1867        ac->ac_bits_given += *num_clusters;
1868
1869bail:
1870        mlog_exit(status);
1871        return status;
1872}
1873
1874int ocfs2_claim_clusters(struct ocfs2_super *osb,
1875                         handle_t *handle,
1876                         struct ocfs2_alloc_context *ac,
1877                         u32 min_clusters,
1878                         u32 *cluster_start,
1879                         u32 *num_clusters)
1880{
1881        unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1882
1883        return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1884                                      bits_wanted, cluster_start, num_clusters);
1885}
1886
1887static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1888                                               struct inode *alloc_inode,
1889                                               struct ocfs2_group_desc *bg,
1890                                               struct buffer_head *group_bh,
1891                                               unsigned int bit_off,
1892                                               unsigned int num_bits)
1893{
1894        int status;
1895        unsigned int tmp;
1896        int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1897        struct ocfs2_group_desc *undo_bg = NULL;
1898        int cluster_bitmap = 0;
1899
1900        mlog_entry_void();
1901
1902        /* The caller got this descriptor from
1903         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1904        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1905
1906        mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1907
1908        if (ocfs2_is_cluster_bitmap(alloc_inode))
1909                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1910
1911        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1912                                         group_bh, journal_type);
1913        if (status < 0) {
1914                mlog_errno(status);
1915                goto bail;
1916        }
1917
1918        if (ocfs2_is_cluster_bitmap(alloc_inode))
1919                cluster_bitmap = 1;
1920
1921        if (cluster_bitmap) {
1922                jbd_lock_bh_state(group_bh);
1923                undo_bg = (struct ocfs2_group_desc *)
1924                                        bh2jh(group_bh)->b_committed_data;
1925                BUG_ON(!undo_bg);
1926        }
1927
1928        tmp = num_bits;
1929        while(tmp--) {
1930                ocfs2_clear_bit((bit_off + tmp),
1931                                (unsigned long *) bg->bg_bitmap);
1932                if (cluster_bitmap)
1933                        ocfs2_set_bit(bit_off + tmp,
1934                                      (unsigned long *) undo_bg->bg_bitmap);
1935        }
1936        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1937
1938        if (cluster_bitmap)
1939                jbd_unlock_bh_state(group_bh);
1940
1941        status = ocfs2_journal_dirty(handle, group_bh);
1942        if (status < 0)
1943                mlog_errno(status);
1944bail:
1945        return status;
1946}
1947
1948/*
1949 * expects the suballoc inode to already be locked.
1950 */
1951int ocfs2_free_suballoc_bits(handle_t *handle,
1952                             struct inode *alloc_inode,
1953                             struct buffer_head *alloc_bh,
1954                             unsigned int start_bit,
1955                             u64 bg_blkno,
1956                             unsigned int count)
1957{
1958        int status = 0;
1959        u32 tmp_used;
1960        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1961        struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1962        struct buffer_head *group_bh = NULL;
1963        struct ocfs2_group_desc *group;
1964
1965        mlog_entry_void();
1966
1967        /* The alloc_bh comes from ocfs2_free_dinode() or
1968         * ocfs2_free_clusters().  The callers have all locked the
1969         * allocator and gotten alloc_bh from the lock call.  This
1970         * validates the dinode buffer.  Any corruption that has happended
1971         * is a code bug. */
1972        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1973        BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1974
1975        mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1976             (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1977             (unsigned long long)bg_blkno, start_bit);
1978
1979        status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
1980                                             &group_bh);
1981        if (status < 0) {
1982                mlog_errno(status);
1983                goto bail;
1984        }
1985        group = (struct ocfs2_group_desc *) group_bh->b_data;
1986
1987        BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1988
1989        status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1990                                              group, group_bh,
1991                                              start_bit, count);
1992        if (status < 0) {
1993                mlog_errno(status);
1994                goto bail;
1995        }
1996
1997        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1998                                         alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1999        if (status < 0) {
2000                mlog_errno(status);
2001                goto bail;
2002        }
2003
2004        le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2005                     count);
2006        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2007        fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2008
2009        status = ocfs2_journal_dirty(handle, alloc_bh);
2010        if (status < 0) {
2011                mlog_errno(status);
2012                goto bail;
2013        }
2014
2015bail:
2016        brelse(group_bh);
2017
2018        mlog_exit(status);
2019        return status;
2020}
2021
2022int ocfs2_free_dinode(handle_t *handle,
2023                      struct inode *inode_alloc_inode,
2024                      struct buffer_head *inode_alloc_bh,
2025                      struct ocfs2_dinode *di)
2026{
2027        u64 blk = le64_to_cpu(di->i_blkno);
2028        u16 bit = le16_to_cpu(di->i_suballoc_bit);
2029        u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2030
2031        return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2032                                        inode_alloc_bh, bit, bg_blkno, 1);
2033}
2034
2035int ocfs2_free_clusters(handle_t *handle,
2036                       struct inode *bitmap_inode,
2037                       struct buffer_head *bitmap_bh,
2038                       u64 start_blk,
2039                       unsigned int num_clusters)
2040{
2041        int status;
2042        u16 bg_start_bit;
2043        u64 bg_blkno;
2044        struct ocfs2_dinode *fe;
2045
2046        /* You can't ever have a contiguous set of clusters
2047         * bigger than a block group bitmap so we never have to worry
2048         * about looping on them. */
2049
2050        mlog_entry_void();
2051
2052        /* This is expensive. We can safely remove once this stuff has
2053         * gotten tested really well. */
2054        BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2055
2056        fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2057
2058        ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2059                                     &bg_start_bit);
2060
2061        mlog(0, "want to free %u clusters starting at block %llu\n",
2062             num_clusters, (unsigned long long)start_blk);
2063        mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2064             (unsigned long long)bg_blkno, bg_start_bit);
2065
2066        status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2067                                          bg_start_bit, bg_blkno,
2068                                          num_clusters);
2069        if (status < 0) {
2070                mlog_errno(status);
2071                goto out;
2072        }
2073
2074        ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2075                                         num_clusters);
2076
2077out:
2078        mlog_exit(status);
2079        return status;
2080}
2081
2082static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2083{
2084        printk("Block Group:\n");
2085        printk("bg_signature:       %s\n", bg->bg_signature);
2086        printk("bg_size:            %u\n", bg->bg_size);
2087        printk("bg_bits:            %u\n", bg->bg_bits);
2088        printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2089        printk("bg_chain:           %u\n", bg->bg_chain);
2090        printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2091        printk("bg_next_group:      %llu\n",
2092               (unsigned long long)bg->bg_next_group);
2093        printk("bg_parent_dinode:   %llu\n",
2094               (unsigned long long)bg->bg_parent_dinode);
2095        printk("bg_blkno:           %llu\n",
2096               (unsigned long long)bg->bg_blkno);
2097}
2098
2099static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2100{
2101        int i;
2102
2103        printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2104        printk("i_signature:                  %s\n", fe->i_signature);
2105        printk("i_size:                       %llu\n",
2106               (unsigned long long)fe->i_size);
2107        printk("i_clusters:                   %u\n", fe->i_clusters);
2108        printk("i_generation:                 %u\n",
2109               le32_to_cpu(fe->i_generation));
2110        printk("id1.bitmap1.i_used:           %u\n",
2111               le32_to_cpu(fe->id1.bitmap1.i_used));
2112        printk("id1.bitmap1.i_total:          %u\n",
2113               le32_to_cpu(fe->id1.bitmap1.i_total));
2114        printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2115        printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2116        printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2117        printk("id2.i_chain.cl_next_free_rec: %u\n",
2118               fe->id2.i_chain.cl_next_free_rec);
2119        for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2120                printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2121                       fe->id2.i_chain.cl_recs[i].c_free);
2122                printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2123                       fe->id2.i_chain.cl_recs[i].c_total);
2124                printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2125                       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2126        }
2127}
2128
2129/*
2130 * For a given allocation, determine which allocators will need to be
2131 * accessed, and lock them, reserving the appropriate number of bits.
2132 *
2133 * Sparse file systems call this from ocfs2_write_begin_nolock()
2134 * and ocfs2_allocate_unwritten_extents().
2135 *
2136 * File systems which don't support holes call this from
2137 * ocfs2_extend_allocation().
2138 */
2139int ocfs2_lock_allocators(struct inode *inode,
2140                          struct ocfs2_extent_tree *et,
2141                          u32 clusters_to_add, u32 extents_to_split,
2142                          struct ocfs2_alloc_context **data_ac,
2143                          struct ocfs2_alloc_context **meta_ac)
2144{
2145        int ret = 0, num_free_extents;
2146        unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2147        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2148
2149        *meta_ac = NULL;
2150        if (data_ac)
2151                *data_ac = NULL;
2152
2153        BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2154
2155        num_free_extents = ocfs2_num_free_extents(osb, et);
2156        if (num_free_extents < 0) {
2157                ret = num_free_extents;
2158                mlog_errno(ret);
2159                goto out;
2160        }
2161
2162        /*
2163         * Sparse allocation file systems need to be more conservative
2164         * with reserving room for expansion - the actual allocation
2165         * happens while we've got a journal handle open so re-taking
2166         * a cluster lock (because we ran out of room for another
2167         * extent) will violate ordering rules.
2168         *
2169         * Most of the time we'll only be seeing this 1 cluster at a time
2170         * anyway.
2171         *
2172         * Always lock for any unwritten extents - we might want to
2173         * add blocks during a split.
2174         */
2175        if (!num_free_extents ||
2176            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2177                ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2178                if (ret < 0) {
2179                        if (ret != -ENOSPC)
2180                                mlog_errno(ret);
2181                        goto out;
2182                }
2183        }
2184
2185        if (clusters_to_add == 0)
2186                goto out;
2187
2188        ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2189        if (ret < 0) {
2190                if (ret != -ENOSPC)
2191                        mlog_errno(ret);
2192                goto out;
2193        }
2194
2195out:
2196        if (ret) {
2197                if (*meta_ac) {
2198                        ocfs2_free_alloc_context(*meta_ac);
2199                        *meta_ac = NULL;
2200                }
2201
2202                /*
2203                 * We cannot have an error and a non null *data_ac.
2204                 */
2205        }
2206
2207        return ret;
2208}
2209
2210/*
2211 * Read the inode specified by blkno to get suballoc_slot and
2212 * suballoc_bit.
2213 */
2214static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2215                                       u16 *suballoc_slot, u16 *suballoc_bit)
2216{
2217        int status;
2218        struct buffer_head *inode_bh = NULL;
2219        struct ocfs2_dinode *inode_fe;
2220
2221        mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2222
2223        /* dirty read disk */
2224        status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2225        if (status < 0) {
2226                mlog(ML_ERROR, "read block %llu failed %d\n",
2227                     (unsigned long long)blkno, status);
2228                goto bail;
2229        }
2230
2231        inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2232        if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2233                mlog(ML_ERROR, "invalid inode %llu requested\n",
2234                     (unsigned long long)blkno);
2235                status = -EINVAL;
2236                goto bail;
2237        }
2238
2239        if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2240            (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2241                mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2242                     (unsigned long long)blkno,
2243                     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2244                status = -EINVAL;
2245                goto bail;
2246        }
2247
2248        if (suballoc_slot)
2249                *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2250        if (suballoc_bit)
2251                *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2252
2253bail:
2254        brelse(inode_bh);
2255
2256        mlog_exit(status);
2257        return status;
2258}
2259
2260/*
2261 * test whether bit is SET in allocator bitmap or not.  on success, 0
2262 * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2263 * is returned and *res is meaningless.  Call this after you have
2264 * cluster locked against suballoc, or you may get a result based on
2265 * non-up2date contents
2266 */
2267static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2268                                   struct inode *suballoc,
2269                                   struct buffer_head *alloc_bh, u64 blkno,
2270                                   u16 bit, int *res)
2271{
2272        struct ocfs2_dinode *alloc_fe;
2273        struct ocfs2_group_desc *group;
2274        struct buffer_head *group_bh = NULL;
2275        u64 bg_blkno;
2276        int status;
2277
2278        mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2279                   (unsigned int)bit);
2280
2281        alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2282        if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2283                mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2284                     (unsigned int)bit,
2285                     ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2286                status = -EINVAL;
2287                goto bail;
2288        }
2289
2290        bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2291        status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2292                                             &group_bh);
2293        if (status < 0) {
2294                mlog(ML_ERROR, "read group %llu failed %d\n",
2295                     (unsigned long long)bg_blkno, status);
2296                goto bail;
2297        }
2298
2299        group = (struct ocfs2_group_desc *) group_bh->b_data;
2300        *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2301
2302bail:
2303        brelse(group_bh);
2304
2305        mlog_exit(status);
2306        return status;
2307}
2308
2309/*
2310 * Test if the bit representing this inode (blkno) is set in the
2311 * suballocator.
2312 *
2313 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2314 *
2315 * In the event of failure, a negative value is returned and *res is
2316 * meaningless.
2317 *
2318 * Callers must make sure to hold nfs_sync_lock to prevent
2319 * ocfs2_delete_inode() on another node from accessing the same
2320 * suballocator concurrently.
2321 */
2322int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2323{
2324        int status;
2325        u16 suballoc_bit = 0, suballoc_slot = 0;
2326        struct inode *inode_alloc_inode;
2327        struct buffer_head *alloc_bh = NULL;
2328
2329        mlog_entry("blkno: %llu", (unsigned long long)blkno);
2330
2331        status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2332                                             &suballoc_bit);
2333        if (status < 0) {
2334                mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2335                goto bail;
2336        }
2337
2338        inode_alloc_inode =
2339                ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2340                                            suballoc_slot);
2341        if (!inode_alloc_inode) {
2342                /* the error code could be inaccurate, but we are not able to
2343                 * get the correct one. */
2344                status = -EINVAL;
2345                mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2346                     (u32)suballoc_slot);
2347                goto bail;
2348        }
2349
2350        mutex_lock(&inode_alloc_inode->i_mutex);
2351        status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2352        if (status < 0) {
2353                mutex_unlock(&inode_alloc_inode->i_mutex);
2354                mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2355                     (u32)suballoc_slot, status);
2356                goto bail;
2357        }
2358
2359        status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2360                                         blkno, suballoc_bit, res);
2361        if (status < 0)
2362                mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2363
2364        ocfs2_inode_unlock(inode_alloc_inode, 0);
2365        mutex_unlock(&inode_alloc_inode->i_mutex);
2366
2367        iput(inode_alloc_inode);
2368        brelse(alloc_bh);
2369bail:
2370        mlog_exit(status);
2371        return status;
2372}
2373