linux/fs/ocfs2/suballoc.c
<<
>>
Prefs
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * suballoc.c
   5 *
   6 * metadata alloc and free
   7 * Inspired by ext3 block groups.
   8 *
   9 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  10 *
  11 * This program is free software; you can redistribute it and/or
  12 * modify it under the terms of the GNU General Public
  13 * License as published by the Free Software Foundation; either
  14 * version 2 of the License, or (at your option) any later version.
  15 *
  16 * This program is distributed in the hope that it will be useful,
  17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 * General Public License for more details.
  20 *
  21 * You should have received a copy of the GNU General Public
  22 * License along with this program; if not, write to the
  23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 * Boston, MA 02111-1307, USA.
  25 */
  26
  27#include <linux/fs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/highmem.h>
  31
  32#include <cluster/masklog.h>
  33
  34#include "ocfs2.h"
  35
  36#include "alloc.h"
  37#include "blockcheck.h"
  38#include "dlmglue.h"
  39#include "inode.h"
  40#include "journal.h"
  41#include "localalloc.h"
  42#include "suballoc.h"
  43#include "super.h"
  44#include "sysfile.h"
  45#include "uptodate.h"
  46#include "ocfs2_trace.h"
  47
  48#include "buffer_head_io.h"
  49
  50#define NOT_ALLOC_NEW_GROUP             0
  51#define ALLOC_NEW_GROUP                 0x1
  52#define ALLOC_GROUPS_FROM_GLOBAL        0x2
  53
  54#define OCFS2_MAX_TO_STEAL              1024
  55
  56struct ocfs2_suballoc_result {
  57        u64             sr_bg_blkno;    /* The bg we allocated from.  Set
  58                                           to 0 when a block group is
  59                                           contiguous. */
  60        u64             sr_bg_stable_blkno; /*
  61                                             * Doesn't change, always
  62                                             * set to target block
  63                                             * group descriptor
  64                                             * block.
  65                                             */
  66        u64             sr_blkno;       /* The first allocated block */
  67        unsigned int    sr_bit_offset;  /* The bit in the bg */
  68        unsigned int    sr_bits;        /* How many bits we claimed */
  69};
  70
  71static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
  72{
  73        if (res->sr_blkno == 0)
  74                return 0;
  75
  76        if (res->sr_bg_blkno)
  77                return res->sr_bg_blkno;
  78
  79        return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
  80}
  81
  82static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
  83static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
  84static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
  85static int ocfs2_block_group_fill(handle_t *handle,
  86                                  struct inode *alloc_inode,
  87                                  struct buffer_head *bg_bh,
  88                                  u64 group_blkno,
  89                                  unsigned int group_clusters,
  90                                  u16 my_chain,
  91                                  struct ocfs2_chain_list *cl);
  92static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
  93                                   struct inode *alloc_inode,
  94                                   struct buffer_head *bh,
  95                                   u64 max_block,
  96                                   u64 *last_alloc_group,
  97                                   int flags);
  98
  99static int ocfs2_cluster_group_search(struct inode *inode,
 100                                      struct buffer_head *group_bh,
 101                                      u32 bits_wanted, u32 min_bits,
 102                                      u64 max_block,
 103                                      struct ocfs2_suballoc_result *res);
 104static int ocfs2_block_group_search(struct inode *inode,
 105                                    struct buffer_head *group_bh,
 106                                    u32 bits_wanted, u32 min_bits,
 107                                    u64 max_block,
 108                                    struct ocfs2_suballoc_result *res);
 109static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 110                                     handle_t *handle,
 111                                     u32 bits_wanted,
 112                                     u32 min_bits,
 113                                     struct ocfs2_suballoc_result *res);
 114static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 115                                         int nr);
 116static int ocfs2_relink_block_group(handle_t *handle,
 117                                    struct inode *alloc_inode,
 118                                    struct buffer_head *fe_bh,
 119                                    struct buffer_head *bg_bh,
 120                                    struct buffer_head *prev_bg_bh,
 121                                    u16 chain);
 122static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
 123                                                     u32 wanted);
 124static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 125                                                   u64 bg_blkno,
 126                                                   u16 bg_bit_off);
 127static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 128                                                u64 data_blkno,
 129                                                u64 *bg_blkno,
 130                                                u16 *bg_bit_off);
 131static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 132                                             u32 bits_wanted, u64 max_block,
 133                                             int flags,
 134                                             struct ocfs2_alloc_context **ac);
 135
 136void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 137{
 138        struct inode *inode = ac->ac_inode;
 139
 140        if (inode) {
 141                if (ac->ac_which != OCFS2_AC_USE_LOCAL)
 142                        ocfs2_inode_unlock(inode, 1);
 143
 144                mutex_unlock(&inode->i_mutex);
 145
 146                iput(inode);
 147                ac->ac_inode = NULL;
 148        }
 149        brelse(ac->ac_bh);
 150        ac->ac_bh = NULL;
 151        ac->ac_resv = NULL;
 152        if (ac->ac_find_loc_priv) {
 153                kfree(ac->ac_find_loc_priv);
 154                ac->ac_find_loc_priv = NULL;
 155        }
 156}
 157
 158void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
 159{
 160        ocfs2_free_ac_resource(ac);
 161        kfree(ac);
 162}
 163
 164static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 165{
 166        return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 167}
 168
 169#define do_error(fmt, ...)                                              \
 170        do{                                                             \
 171                if (resize)                                     \
 172                        mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);        \
 173                else                                                    \
 174                        ocfs2_error(sb, fmt, ##__VA_ARGS__);            \
 175        } while (0)
 176
 177static int ocfs2_validate_gd_self(struct super_block *sb,
 178                                  struct buffer_head *bh,
 179                                  int resize)
 180{
 181        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 182
 183        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
 184                do_error("Group descriptor #%llu has bad signature %.*s",
 185                         (unsigned long long)bh->b_blocknr, 7,
 186                         gd->bg_signature);
 187                return -EINVAL;
 188        }
 189
 190        if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
 191                do_error("Group descriptor #%llu has an invalid bg_blkno "
 192                         "of %llu",
 193                         (unsigned long long)bh->b_blocknr,
 194                         (unsigned long long)le64_to_cpu(gd->bg_blkno));
 195                return -EINVAL;
 196        }
 197
 198        if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
 199                do_error("Group descriptor #%llu has an invalid "
 200                         "fs_generation of #%u",
 201                         (unsigned long long)bh->b_blocknr,
 202                         le32_to_cpu(gd->bg_generation));
 203                return -EINVAL;
 204        }
 205
 206        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
 207                do_error("Group descriptor #%llu has bit count %u but "
 208                         "claims that %u are free",
 209                         (unsigned long long)bh->b_blocknr,
 210                         le16_to_cpu(gd->bg_bits),
 211                         le16_to_cpu(gd->bg_free_bits_count));
 212                return -EINVAL;
 213        }
 214
 215        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
 216                do_error("Group descriptor #%llu has bit count %u but "
 217                         "max bitmap bits of %u",
 218                         (unsigned long long)bh->b_blocknr,
 219                         le16_to_cpu(gd->bg_bits),
 220                         8 * le16_to_cpu(gd->bg_size));
 221                return -EINVAL;
 222        }
 223
 224        return 0;
 225}
 226
 227static int ocfs2_validate_gd_parent(struct super_block *sb,
 228                                    struct ocfs2_dinode *di,
 229                                    struct buffer_head *bh,
 230                                    int resize)
 231{
 232        unsigned int max_bits;
 233        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 234
 235        if (di->i_blkno != gd->bg_parent_dinode) {
 236                do_error("Group descriptor #%llu has bad parent "
 237                         "pointer (%llu, expected %llu)",
 238                         (unsigned long long)bh->b_blocknr,
 239                         (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
 240                         (unsigned long long)le64_to_cpu(di->i_blkno));
 241                return -EINVAL;
 242        }
 243
 244        max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 245        if (le16_to_cpu(gd->bg_bits) > max_bits) {
 246                do_error("Group descriptor #%llu has bit count of %u",
 247                         (unsigned long long)bh->b_blocknr,
 248                         le16_to_cpu(gd->bg_bits));
 249                return -EINVAL;
 250        }
 251
 252        /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
 253        if ((le16_to_cpu(gd->bg_chain) >
 254             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
 255            ((le16_to_cpu(gd->bg_chain) ==
 256             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
 257                do_error("Group descriptor #%llu has bad chain %u",
 258                         (unsigned long long)bh->b_blocknr,
 259                         le16_to_cpu(gd->bg_chain));
 260                return -EINVAL;
 261        }
 262
 263        return 0;
 264}
 265
 266#undef do_error
 267
 268/*
 269 * This version only prints errors.  It does not fail the filesystem, and
 270 * exists only for resize.
 271 */
 272int ocfs2_check_group_descriptor(struct super_block *sb,
 273                                 struct ocfs2_dinode *di,
 274                                 struct buffer_head *bh)
 275{
 276        int rc;
 277        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 278
 279        BUG_ON(!buffer_uptodate(bh));
 280
 281        /*
 282         * If the ecc fails, we return the error but otherwise
 283         * leave the filesystem running.  We know any error is
 284         * local to this block.
 285         */
 286        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 287        if (rc) {
 288                mlog(ML_ERROR,
 289                     "Checksum failed for group descriptor %llu\n",
 290                     (unsigned long long)bh->b_blocknr);
 291        } else
 292                rc = ocfs2_validate_gd_self(sb, bh, 1);
 293        if (!rc)
 294                rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
 295
 296        return rc;
 297}
 298
 299static int ocfs2_validate_group_descriptor(struct super_block *sb,
 300                                           struct buffer_head *bh)
 301{
 302        int rc;
 303        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 304
 305        trace_ocfs2_validate_group_descriptor(
 306                                        (unsigned long long)bh->b_blocknr);
 307
 308        BUG_ON(!buffer_uptodate(bh));
 309
 310        /*
 311         * If the ecc fails, we return the error but otherwise
 312         * leave the filesystem running.  We know any error is
 313         * local to this block.
 314         */
 315        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 316        if (rc)
 317                return rc;
 318
 319        /*
 320         * Errors after here are fatal.
 321         */
 322
 323        return ocfs2_validate_gd_self(sb, bh, 0);
 324}
 325
 326int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
 327                                u64 gd_blkno, struct buffer_head **bh)
 328{
 329        int rc;
 330        struct buffer_head *tmp = *bh;
 331
 332        rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
 333                              ocfs2_validate_group_descriptor);
 334        if (rc)
 335                goto out;
 336
 337        rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
 338        if (rc) {
 339                brelse(tmp);
 340                goto out;
 341        }
 342
 343        /* If ocfs2_read_block() got us a new bh, pass it up. */
 344        if (!*bh)
 345                *bh = tmp;
 346
 347out:
 348        return rc;
 349}
 350
 351static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
 352                                          struct ocfs2_group_desc *bg,
 353                                          struct ocfs2_chain_list *cl,
 354                                          u64 p_blkno, unsigned int clusters)
 355{
 356        struct ocfs2_extent_list *el = &bg->bg_list;
 357        struct ocfs2_extent_rec *rec;
 358
 359        BUG_ON(!ocfs2_supports_discontig_bg(osb));
 360        if (!el->l_next_free_rec)
 361                el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
 362        rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
 363        rec->e_blkno = cpu_to_le64(p_blkno);
 364        rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
 365                                  le16_to_cpu(cl->cl_bpc));
 366        rec->e_leaf_clusters = cpu_to_le16(clusters);
 367        le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
 368        le16_add_cpu(&bg->bg_free_bits_count,
 369                     clusters * le16_to_cpu(cl->cl_bpc));
 370        le16_add_cpu(&el->l_next_free_rec, 1);
 371}
 372
 373static int ocfs2_block_group_fill(handle_t *handle,
 374                                  struct inode *alloc_inode,
 375                                  struct buffer_head *bg_bh,
 376                                  u64 group_blkno,
 377                                  unsigned int group_clusters,
 378                                  u16 my_chain,
 379                                  struct ocfs2_chain_list *cl)
 380{
 381        int status = 0;
 382        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 383        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 384        struct super_block * sb = alloc_inode->i_sb;
 385
 386        if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
 387                ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
 388                            "b_blocknr (%llu)",
 389                            (unsigned long long)group_blkno,
 390                            (unsigned long long) bg_bh->b_blocknr);
 391                status = -EIO;
 392                goto bail;
 393        }
 394
 395        status = ocfs2_journal_access_gd(handle,
 396                                         INODE_CACHE(alloc_inode),
 397                                         bg_bh,
 398                                         OCFS2_JOURNAL_ACCESS_CREATE);
 399        if (status < 0) {
 400                mlog_errno(status);
 401                goto bail;
 402        }
 403
 404        memset(bg, 0, sb->s_blocksize);
 405        strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
 406        bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
 407        bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
 408                                                osb->s_feature_incompat));
 409        bg->bg_chain = cpu_to_le16(my_chain);
 410        bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
 411        bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
 412        bg->bg_blkno = cpu_to_le64(group_blkno);
 413        if (group_clusters == le16_to_cpu(cl->cl_cpg))
 414                bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
 415        else
 416                ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
 417                                              group_clusters);
 418
 419        /* set the 1st bit in the bitmap to account for the descriptor block */
 420        ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
 421        bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
 422
 423        ocfs2_journal_dirty(handle, bg_bh);
 424
 425        /* There is no need to zero out or otherwise initialize the
 426         * other blocks in a group - All valid FS metadata in a block
 427         * group stores the superblock fs_generation value at
 428         * allocation time. */
 429
 430bail:
 431        if (status)
 432                mlog_errno(status);
 433        return status;
 434}
 435
 436static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 437{
 438        u16 curr, best;
 439
 440        best = curr = 0;
 441        while (curr < le16_to_cpu(cl->cl_count)) {
 442                if (le32_to_cpu(cl->cl_recs[best].c_total) >
 443                    le32_to_cpu(cl->cl_recs[curr].c_total))
 444                        best = curr;
 445                curr++;
 446        }
 447        return best;
 448}
 449
 450static struct buffer_head *
 451ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
 452                               struct inode *alloc_inode,
 453                               struct ocfs2_alloc_context *ac,
 454                               struct ocfs2_chain_list *cl)
 455{
 456        int status;
 457        u32 bit_off, num_bits;
 458        u64 bg_blkno;
 459        struct buffer_head *bg_bh;
 460        unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
 461
 462        status = ocfs2_claim_clusters(handle, ac,
 463                                      le16_to_cpu(cl->cl_cpg), &bit_off,
 464                                      &num_bits);
 465        if (status < 0) {
 466                if (status != -ENOSPC)
 467                        mlog_errno(status);
 468                goto bail;
 469        }
 470
 471        /* setup the group */
 472        bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 473        trace_ocfs2_block_group_alloc_contig(
 474             (unsigned long long)bg_blkno, alloc_rec);
 475
 476        bg_bh = sb_getblk(osb->sb, bg_blkno);
 477        if (!bg_bh) {
 478                status = -ENOMEM;
 479                mlog_errno(status);
 480                goto bail;
 481        }
 482        ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 483
 484        status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
 485                                        bg_blkno, num_bits, alloc_rec, cl);
 486        if (status < 0) {
 487                brelse(bg_bh);
 488                mlog_errno(status);
 489        }
 490
 491bail:
 492        return status ? ERR_PTR(status) : bg_bh;
 493}
 494
 495static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
 496                                        handle_t *handle,
 497                                        struct ocfs2_alloc_context *ac,
 498                                        unsigned int min_bits,
 499                                        u32 *bit_off, u32 *num_bits)
 500{
 501        int status = 0;
 502
 503        while (min_bits) {
 504                status = ocfs2_claim_clusters(handle, ac, min_bits,
 505                                              bit_off, num_bits);
 506                if (status != -ENOSPC)
 507                        break;
 508
 509                min_bits >>= 1;
 510        }
 511
 512        return status;
 513}
 514
 515static int ocfs2_block_group_grow_discontig(handle_t *handle,
 516                                            struct inode *alloc_inode,
 517                                            struct buffer_head *bg_bh,
 518                                            struct ocfs2_alloc_context *ac,
 519                                            struct ocfs2_chain_list *cl,
 520                                            unsigned int min_bits)
 521{
 522        int status;
 523        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 524        struct ocfs2_group_desc *bg =
 525                (struct ocfs2_group_desc *)bg_bh->b_data;
 526        unsigned int needed = le16_to_cpu(cl->cl_cpg) -
 527                         le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
 528        u32 p_cpos, clusters;
 529        u64 p_blkno;
 530        struct ocfs2_extent_list *el = &bg->bg_list;
 531
 532        status = ocfs2_journal_access_gd(handle,
 533                                         INODE_CACHE(alloc_inode),
 534                                         bg_bh,
 535                                         OCFS2_JOURNAL_ACCESS_CREATE);
 536        if (status < 0) {
 537                mlog_errno(status);
 538                goto bail;
 539        }
 540
 541        while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
 542                                le16_to_cpu(el->l_count))) {
 543                if (min_bits > needed)
 544                        min_bits = needed;
 545                status = ocfs2_block_group_claim_bits(osb, handle, ac,
 546                                                      min_bits, &p_cpos,
 547                                                      &clusters);
 548                if (status < 0) {
 549                        if (status != -ENOSPC)
 550                                mlog_errno(status);
 551                        goto bail;
 552                }
 553                p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
 554                ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
 555                                              clusters);
 556
 557                min_bits = clusters;
 558                needed = le16_to_cpu(cl->cl_cpg) -
 559                         le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
 560        }
 561
 562        if (needed > 0) {
 563                /*
 564                 * We have used up all the extent rec but can't fill up
 565                 * the cpg. So bail out.
 566                 */
 567                status = -ENOSPC;
 568                goto bail;
 569        }
 570
 571        ocfs2_journal_dirty(handle, bg_bh);
 572
 573bail:
 574        return status;
 575}
 576
 577static void ocfs2_bg_alloc_cleanup(handle_t *handle,
 578                                   struct ocfs2_alloc_context *cluster_ac,
 579                                   struct inode *alloc_inode,
 580                                   struct buffer_head *bg_bh)
 581{
 582        int i, ret;
 583        struct ocfs2_group_desc *bg;
 584        struct ocfs2_extent_list *el;
 585        struct ocfs2_extent_rec *rec;
 586
 587        if (!bg_bh)
 588                return;
 589
 590        bg = (struct ocfs2_group_desc *)bg_bh->b_data;
 591        el = &bg->bg_list;
 592        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 593                rec = &el->l_recs[i];
 594                ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
 595                                          cluster_ac->ac_bh,
 596                                          le64_to_cpu(rec->e_blkno),
 597                                          le16_to_cpu(rec->e_leaf_clusters));
 598                if (ret)
 599                        mlog_errno(ret);
 600                /* Try all the clusters to free */
 601        }
 602
 603        ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
 604        brelse(bg_bh);
 605}
 606
 607static struct buffer_head *
 608ocfs2_block_group_alloc_discontig(handle_t *handle,
 609                                  struct inode *alloc_inode,
 610                                  struct ocfs2_alloc_context *ac,
 611                                  struct ocfs2_chain_list *cl)
 612{
 613        int status;
 614        u32 bit_off, num_bits;
 615        u64 bg_blkno;
 616        unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
 617        struct buffer_head *bg_bh = NULL;
 618        unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
 619        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 620
 621        if (!ocfs2_supports_discontig_bg(osb)) {
 622                status = -ENOSPC;
 623                goto bail;
 624        }
 625
 626        status = ocfs2_extend_trans(handle,
 627                                    ocfs2_calc_bg_discontig_credits(osb->sb));
 628        if (status) {
 629                mlog_errno(status);
 630                goto bail;
 631        }
 632
 633        /*
 634         * We're going to be grabbing from multiple cluster groups.
 635         * We don't have enough credits to relink them all, and the
 636         * cluster groups will be staying in cache for the duration of
 637         * this operation.
 638         */
 639        ac->ac_disable_chain_relink = 1;
 640
 641        /* Claim the first region */
 642        status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
 643                                              &bit_off, &num_bits);
 644        if (status < 0) {
 645                if (status != -ENOSPC)
 646                        mlog_errno(status);
 647                goto bail;
 648        }
 649        min_bits = num_bits;
 650
 651        /* setup the group */
 652        bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 653        trace_ocfs2_block_group_alloc_discontig(
 654                                (unsigned long long)bg_blkno, alloc_rec);
 655
 656        bg_bh = sb_getblk(osb->sb, bg_blkno);
 657        if (!bg_bh) {
 658                status = -ENOMEM;
 659                mlog_errno(status);
 660                goto bail;
 661        }
 662        ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 663
 664        status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
 665                                        bg_blkno, num_bits, alloc_rec, cl);
 666        if (status < 0) {
 667                mlog_errno(status);
 668                goto bail;
 669        }
 670
 671        status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
 672                                                  bg_bh, ac, cl, min_bits);
 673        if (status)
 674                mlog_errno(status);
 675
 676bail:
 677        if (status)
 678                ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
 679        return status ? ERR_PTR(status) : bg_bh;
 680}
 681
 682/*
 683 * We expect the block group allocator to already be locked.
 684 */
 685static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 686                                   struct inode *alloc_inode,
 687                                   struct buffer_head *bh,
 688                                   u64 max_block,
 689                                   u64 *last_alloc_group,
 690                                   int flags)
 691{
 692        int status, credits;
 693        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
 694        struct ocfs2_chain_list *cl;
 695        struct ocfs2_alloc_context *ac = NULL;
 696        handle_t *handle = NULL;
 697        u16 alloc_rec;
 698        struct buffer_head *bg_bh = NULL;
 699        struct ocfs2_group_desc *bg;
 700
 701        BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
 702
 703        cl = &fe->id2.i_chain;
 704        status = ocfs2_reserve_clusters_with_limit(osb,
 705                                                   le16_to_cpu(cl->cl_cpg),
 706                                                   max_block, flags, &ac);
 707        if (status < 0) {
 708                if (status != -ENOSPC)
 709                        mlog_errno(status);
 710                goto bail;
 711        }
 712
 713        credits = ocfs2_calc_group_alloc_credits(osb->sb,
 714                                                 le16_to_cpu(cl->cl_cpg));
 715        handle = ocfs2_start_trans(osb, credits);
 716        if (IS_ERR(handle)) {
 717                status = PTR_ERR(handle);
 718                handle = NULL;
 719                mlog_errno(status);
 720                goto bail;
 721        }
 722
 723        if (last_alloc_group && *last_alloc_group != 0) {
 724                trace_ocfs2_block_group_alloc(
 725                                (unsigned long long)*last_alloc_group);
 726                ac->ac_last_group = *last_alloc_group;
 727        }
 728
 729        bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
 730                                               ac, cl);
 731        if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
 732                bg_bh = ocfs2_block_group_alloc_discontig(handle,
 733                                                          alloc_inode,
 734                                                          ac, cl);
 735        if (IS_ERR(bg_bh)) {
 736                status = PTR_ERR(bg_bh);
 737                bg_bh = NULL;
 738                if (status != -ENOSPC)
 739                        mlog_errno(status);
 740                goto bail;
 741        }
 742        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 743
 744        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 745                                         bh, OCFS2_JOURNAL_ACCESS_WRITE);
 746        if (status < 0) {
 747                mlog_errno(status);
 748                goto bail;
 749        }
 750
 751        alloc_rec = le16_to_cpu(bg->bg_chain);
 752        le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
 753                     le16_to_cpu(bg->bg_free_bits_count));
 754        le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
 755                     le16_to_cpu(bg->bg_bits));
 756        cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
 757        if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
 758                le16_add_cpu(&cl->cl_next_free_rec, 1);
 759
 760        le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
 761                                        le16_to_cpu(bg->bg_free_bits_count));
 762        le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
 763        le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
 764
 765        ocfs2_journal_dirty(handle, bh);
 766
 767        spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
 768        OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 769        fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
 770                                             le32_to_cpu(fe->i_clusters)));
 771        spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
 772        i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
 773        alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
 774        ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
 775
 776        status = 0;
 777
 778        /* save the new last alloc group so that the caller can cache it. */
 779        if (last_alloc_group)
 780                *last_alloc_group = ac->ac_last_group;
 781
 782bail:
 783        if (handle)
 784                ocfs2_commit_trans(osb, handle);
 785
 786        if (ac)
 787                ocfs2_free_alloc_context(ac);
 788
 789        brelse(bg_bh);
 790
 791        if (status)
 792                mlog_errno(status);
 793        return status;
 794}
 795
 796static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 797                                       struct ocfs2_alloc_context *ac,
 798                                       int type,
 799                                       u32 slot,
 800                                       u64 *last_alloc_group,
 801                                       int flags)
 802{
 803        int status;
 804        u32 bits_wanted = ac->ac_bits_wanted;
 805        struct inode *alloc_inode;
 806        struct buffer_head *bh = NULL;
 807        struct ocfs2_dinode *fe;
 808        u32 free_bits;
 809
 810        alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
 811        if (!alloc_inode) {
 812                mlog_errno(-EINVAL);
 813                return -EINVAL;
 814        }
 815
 816        mutex_lock(&alloc_inode->i_mutex);
 817
 818        status = ocfs2_inode_lock(alloc_inode, &bh, 1);
 819        if (status < 0) {
 820                mutex_unlock(&alloc_inode->i_mutex);
 821                iput(alloc_inode);
 822
 823                mlog_errno(status);
 824                return status;
 825        }
 826
 827        ac->ac_inode = alloc_inode;
 828        ac->ac_alloc_slot = slot;
 829
 830        fe = (struct ocfs2_dinode *) bh->b_data;
 831
 832        /* The bh was validated by the inode read inside
 833         * ocfs2_inode_lock().  Any corruption is a code bug. */
 834        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 835
 836        if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 837                ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 838                            (unsigned long long)le64_to_cpu(fe->i_blkno));
 839                status = -EIO;
 840                goto bail;
 841        }
 842
 843        free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
 844                le32_to_cpu(fe->id1.bitmap1.i_used);
 845
 846        if (bits_wanted > free_bits) {
 847                /* cluster bitmap never grows */
 848                if (ocfs2_is_cluster_bitmap(alloc_inode)) {
 849                        trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
 850                                                                free_bits);
 851                        status = -ENOSPC;
 852                        goto bail;
 853                }
 854
 855                if (!(flags & ALLOC_NEW_GROUP)) {
 856                        trace_ocfs2_reserve_suballoc_bits_no_new_group(
 857                                                slot, bits_wanted, free_bits);
 858                        status = -ENOSPC;
 859                        goto bail;
 860                }
 861
 862                status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
 863                                                 ac->ac_max_block,
 864                                                 last_alloc_group, flags);
 865                if (status < 0) {
 866                        if (status != -ENOSPC)
 867                                mlog_errno(status);
 868                        goto bail;
 869                }
 870                atomic_inc(&osb->alloc_stats.bg_extends);
 871
 872                /* You should never ask for this much metadata */
 873                BUG_ON(bits_wanted >
 874                       (le32_to_cpu(fe->id1.bitmap1.i_total)
 875                        - le32_to_cpu(fe->id1.bitmap1.i_used)));
 876        }
 877
 878        get_bh(bh);
 879        ac->ac_bh = bh;
 880bail:
 881        brelse(bh);
 882
 883        if (status)
 884                mlog_errno(status);
 885        return status;
 886}
 887
 888static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
 889{
 890        spin_lock(&osb->osb_lock);
 891        osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
 892        spin_unlock(&osb->osb_lock);
 893        atomic_set(&osb->s_num_inodes_stolen, 0);
 894}
 895
 896static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
 897{
 898        spin_lock(&osb->osb_lock);
 899        osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
 900        spin_unlock(&osb->osb_lock);
 901        atomic_set(&osb->s_num_meta_stolen, 0);
 902}
 903
 904void ocfs2_init_steal_slots(struct ocfs2_super *osb)
 905{
 906        ocfs2_init_inode_steal_slot(osb);
 907        ocfs2_init_meta_steal_slot(osb);
 908}
 909
 910static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
 911{
 912        spin_lock(&osb->osb_lock);
 913        if (type == INODE_ALLOC_SYSTEM_INODE)
 914                osb->s_inode_steal_slot = slot;
 915        else if (type == EXTENT_ALLOC_SYSTEM_INODE)
 916                osb->s_meta_steal_slot = slot;
 917        spin_unlock(&osb->osb_lock);
 918}
 919
 920static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
 921{
 922        int slot = OCFS2_INVALID_SLOT;
 923
 924        spin_lock(&osb->osb_lock);
 925        if (type == INODE_ALLOC_SYSTEM_INODE)
 926                slot = osb->s_inode_steal_slot;
 927        else if (type == EXTENT_ALLOC_SYSTEM_INODE)
 928                slot = osb->s_meta_steal_slot;
 929        spin_unlock(&osb->osb_lock);
 930
 931        return slot;
 932}
 933
 934static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 935{
 936        return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
 937}
 938
 939static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
 940{
 941        return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
 942}
 943
 944static int ocfs2_steal_resource(struct ocfs2_super *osb,
 945                                struct ocfs2_alloc_context *ac,
 946                                int type)
 947{
 948        int i, status = -ENOSPC;
 949        int slot = __ocfs2_get_steal_slot(osb, type);
 950
 951        /* Start to steal resource from the first slot after ours. */
 952        if (slot == OCFS2_INVALID_SLOT)
 953                slot = osb->slot_num + 1;
 954
 955        for (i = 0; i < osb->max_slots; i++, slot++) {
 956                if (slot == osb->max_slots)
 957                        slot = 0;
 958
 959                if (slot == osb->slot_num)
 960                        continue;
 961
 962                status = ocfs2_reserve_suballoc_bits(osb, ac,
 963                                                     type,
 964                                                     (u32)slot, NULL,
 965                                                     NOT_ALLOC_NEW_GROUP);
 966                if (status >= 0) {
 967                        __ocfs2_set_steal_slot(osb, slot, type);
 968                        break;
 969                }
 970
 971                ocfs2_free_ac_resource(ac);
 972        }
 973
 974        return status;
 975}
 976
 977static int ocfs2_steal_inode(struct ocfs2_super *osb,
 978                             struct ocfs2_alloc_context *ac)
 979{
 980        return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
 981}
 982
 983static int ocfs2_steal_meta(struct ocfs2_super *osb,
 984                            struct ocfs2_alloc_context *ac)
 985{
 986        return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
 987}
 988
 989int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 990                                      int blocks,
 991                                      struct ocfs2_alloc_context **ac)
 992{
 993        int status;
 994        int slot = ocfs2_get_meta_steal_slot(osb);
 995
 996        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 997        if (!(*ac)) {
 998                status = -ENOMEM;
 999                mlog_errno(status);
1000                goto bail;
1001        }
1002
1003        (*ac)->ac_bits_wanted = blocks;
1004        (*ac)->ac_which = OCFS2_AC_USE_META;
1005        (*ac)->ac_group_search = ocfs2_block_group_search;
1006
1007        if (slot != OCFS2_INVALID_SLOT &&
1008                atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
1009                goto extent_steal;
1010
1011        atomic_set(&osb->s_num_meta_stolen, 0);
1012        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
1013                                             EXTENT_ALLOC_SYSTEM_INODE,
1014                                             (u32)osb->slot_num, NULL,
1015                                             ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
1016
1017
1018        if (status >= 0) {
1019                status = 0;
1020                if (slot != OCFS2_INVALID_SLOT)
1021                        ocfs2_init_meta_steal_slot(osb);
1022                goto bail;
1023        } else if (status < 0 && status != -ENOSPC) {
1024                mlog_errno(status);
1025                goto bail;
1026        }
1027
1028        ocfs2_free_ac_resource(*ac);
1029
1030extent_steal:
1031        status = ocfs2_steal_meta(osb, *ac);
1032        atomic_inc(&osb->s_num_meta_stolen);
1033        if (status < 0) {
1034                if (status != -ENOSPC)
1035                        mlog_errno(status);
1036                goto bail;
1037        }
1038
1039        status = 0;
1040bail:
1041        if ((status < 0) && *ac) {
1042                ocfs2_free_alloc_context(*ac);
1043                *ac = NULL;
1044        }
1045
1046        if (status)
1047                mlog_errno(status);
1048        return status;
1049}
1050
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed() reports
 * for @root_el.  Results are those of
 * ocfs2_reserve_new_metadata_blocks().
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
                               struct ocfs2_extent_list *root_el,
                               struct ocfs2_alloc_context **ac)
{
        int blocks = ocfs2_extend_meta_needed(root_el);

        return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1059
1060int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1061                            struct ocfs2_alloc_context **ac)
1062{
1063        int status;
1064        int slot = ocfs2_get_inode_steal_slot(osb);
1065        u64 alloc_group;
1066
1067        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1068        if (!(*ac)) {
1069                status = -ENOMEM;
1070                mlog_errno(status);
1071                goto bail;
1072        }
1073
1074        (*ac)->ac_bits_wanted = 1;
1075        (*ac)->ac_which = OCFS2_AC_USE_INODE;
1076
1077        (*ac)->ac_group_search = ocfs2_block_group_search;
1078
1079        /*
1080         * stat(2) can't handle i_ino > 32bits, so we tell the
1081         * lower levels not to allocate us a block group past that
1082         * limit.  The 'inode64' mount option avoids this behavior.
1083         */
1084        if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1085                (*ac)->ac_max_block = (u32)~0U;
1086
1087        /*
1088         * slot is set when we successfully steal inode from other nodes.
1089         * It is reset in 3 places:
1090         * 1. when we flush the truncate log
1091         * 2. when we complete local alloc recovery.
1092         * 3. when we successfully allocate from our own slot.
1093         * After it is set, we will go on stealing inodes until we find the
1094         * need to check our slots to see whether there is some space for us.
1095         */
1096        if (slot != OCFS2_INVALID_SLOT &&
1097            atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1098                goto inode_steal;
1099
1100        atomic_set(&osb->s_num_inodes_stolen, 0);
1101        alloc_group = osb->osb_inode_alloc_group;
1102        status = ocfs2_reserve_suballoc_bits(osb, *ac,
1103                                             INODE_ALLOC_SYSTEM_INODE,
1104                                             (u32)osb->slot_num,
1105                                             &alloc_group,
1106                                             ALLOC_NEW_GROUP |
1107                                             ALLOC_GROUPS_FROM_GLOBAL);
1108        if (status >= 0) {
1109                status = 0;
1110
1111                spin_lock(&osb->osb_lock);
1112                osb->osb_inode_alloc_group = alloc_group;
1113                spin_unlock(&osb->osb_lock);
1114                trace_ocfs2_reserve_new_inode_new_group(
1115                        (unsigned long long)alloc_group);
1116
1117                /*
1118                 * Some inodes must be freed by us, so try to allocate
1119                 * from our own next time.
1120                 */
1121                if (slot != OCFS2_INVALID_SLOT)
1122                        ocfs2_init_inode_steal_slot(osb);
1123                goto bail;
1124        } else if (status < 0 && status != -ENOSPC) {
1125                mlog_errno(status);
1126                goto bail;
1127        }
1128
1129        ocfs2_free_ac_resource(*ac);
1130
1131inode_steal:
1132        status = ocfs2_steal_inode(osb, *ac);
1133        atomic_inc(&osb->s_num_inodes_stolen);
1134        if (status < 0) {
1135                if (status != -ENOSPC)
1136                        mlog_errno(status);
1137                goto bail;
1138        }
1139
1140        status = 0;
1141bail:
1142        if ((status < 0) && *ac) {
1143                ocfs2_free_alloc_context(*ac);
1144                *ac = NULL;
1145        }
1146
1147        if (status)
1148                mlog_errno(status);
1149        return status;
1150}
1151
1152/* local alloc code has to do the same thing, so rather than do this
1153 * twice.. */
1154int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1155                                      struct ocfs2_alloc_context *ac)
1156{
1157        int status;
1158
1159        ac->ac_which = OCFS2_AC_USE_MAIN;
1160        ac->ac_group_search = ocfs2_cluster_group_search;
1161
1162        status = ocfs2_reserve_suballoc_bits(osb, ac,
1163                                             GLOBAL_BITMAP_SYSTEM_INODE,
1164                                             OCFS2_INVALID_SLOT, NULL,
1165                                             ALLOC_NEW_GROUP);
1166        if (status < 0 && status != -ENOSPC) {
1167                mlog_errno(status);
1168                goto bail;
1169        }
1170
1171bail:
1172        return status;
1173}
1174
1175/* Callers don't need to care which bitmap (local alloc or main) to
1176 * use so we figure it out for them, but unfortunately this clutters
1177 * things a bit. */
1178static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1179                                             u32 bits_wanted, u64 max_block,
1180                                             int flags,
1181                                             struct ocfs2_alloc_context **ac)
1182{
1183        int status;
1184
1185        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1186        if (!(*ac)) {
1187                status = -ENOMEM;
1188                mlog_errno(status);
1189                goto bail;
1190        }
1191
1192        (*ac)->ac_bits_wanted = bits_wanted;
1193        (*ac)->ac_max_block = max_block;
1194
1195        status = -ENOSPC;
1196        if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1197            ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1198                status = ocfs2_reserve_local_alloc_bits(osb,
1199                                                        bits_wanted,
1200                                                        *ac);
1201                if ((status < 0) && (status != -ENOSPC)) {
1202                        mlog_errno(status);
1203                        goto bail;
1204                }
1205        }
1206
1207        if (status == -ENOSPC) {
1208                status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1209                if (status < 0) {
1210                        if (status != -ENOSPC)
1211                                mlog_errno(status);
1212                        goto bail;
1213                }
1214        }
1215
1216        status = 0;
1217bail:
1218        if ((status < 0) && *ac) {
1219                ocfs2_free_alloc_context(*ac);
1220                *ac = NULL;
1221        }
1222
1223        if (status)
1224                mlog_errno(status);
1225        return status;
1226}
1227
1228int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1229                           u32 bits_wanted,
1230                           struct ocfs2_alloc_context **ac)
1231{
1232        return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1233                                                 ALLOC_NEW_GROUP, ac);
1234}
1235
1236/*
1237 * More or less lifted from ext3. I'll leave their description below:
1238 *
1239 * "For ext3 allocations, we must not reuse any blocks which are
1240 * allocated in the bitmap buffer's "last committed data" copy.  This
1241 * prevents deletes from freeing up the page for reuse until we have
1242 * committed the delete transaction.
1243 *
1244 * If we didn't do this, then deleting something and reallocating it as
1245 * data would allow the old block to be overwritten before the
1246 * transaction committed (because we force data to disk before commit).
1247 * This would lead to corruption if we crashed between overwriting the
1248 * data and committing the delete.
1249 *
1250 * @@@ We may want to make this allocation behaviour conditional on
1251 * data-writes at some point, and disable it for metadata allocations or
1252 * sync-data inodes."
1253 *
1254 * Note: OCFS2 already does this differently for metadata vs data
1255 * allocations, as those bitmaps are separate and undo access is never
1256 * called on a metadata group descriptor.
1257 */
1258static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1259                                         int nr)
1260{
1261        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1262        int ret;
1263
1264        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1265                return 0;
1266
1267        if (!buffer_jbd(bg_bh))
1268                return 1;
1269
1270        jbd_lock_bh_state(bg_bh);
1271        bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1272        if (bg)
1273                ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1274        else
1275                ret = 1;
1276        jbd_unlock_bh_state(bg_bh);
1277
1278        return ret;
1279}
1280
1281static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1282                                             struct buffer_head *bg_bh,
1283                                             unsigned int bits_wanted,
1284                                             unsigned int total_bits,
1285                                             struct ocfs2_suballoc_result *res)
1286{
1287        void *bitmap;
1288        u16 best_offset, best_size;
1289        int offset, start, found, status = 0;
1290        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1291
1292        /* Callers got this descriptor from
1293         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1294        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1295
1296        found = start = best_offset = best_size = 0;
1297        bitmap = bg->bg_bitmap;
1298
1299        while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1300                if (offset == total_bits)
1301                        break;
1302
1303                if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1304                        /* We found a zero, but we can't use it as it
1305                         * hasn't been put to disk yet! */
1306                        found = 0;
1307                        start = offset + 1;
1308                } else if (offset == start) {
1309                        /* we found a zero */
1310                        found++;
1311                        /* move start to the next bit to test */
1312                        start++;
1313                } else {
1314                        /* got a zero after some ones */
1315                        found = 1;
1316                        start = offset + 1;
1317                }
1318                if (found > best_size) {
1319                        best_size = found;
1320                        best_offset = start - found;
1321                }
1322                /* we got everything we needed */
1323                if (found == bits_wanted) {
1324                        /* mlog(0, "Found it all!\n"); */
1325                        break;
1326                }
1327        }
1328
1329        if (best_size) {
1330                res->sr_bit_offset = best_offset;
1331                res->sr_bits = best_size;
1332        } else {
1333                status = -ENOSPC;
1334                /* No error log here -- see the comment above
1335                 * ocfs2_test_bg_bit_allocatable */
1336        }
1337
1338        return status;
1339}
1340
1341int ocfs2_block_group_set_bits(handle_t *handle,
1342                                             struct inode *alloc_inode,
1343                                             struct ocfs2_group_desc *bg,
1344                                             struct buffer_head *group_bh,
1345                                             unsigned int bit_off,
1346                                             unsigned int num_bits)
1347{
1348        int status;
1349        void *bitmap = bg->bg_bitmap;
1350        int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1351
1352        /* All callers get the descriptor via
1353         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1354        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1355        BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1356
1357        trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1358
1359        if (ocfs2_is_cluster_bitmap(alloc_inode))
1360                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1361
1362        status = ocfs2_journal_access_gd(handle,
1363                                         INODE_CACHE(alloc_inode),
1364                                         group_bh,
1365                                         journal_type);
1366        if (status < 0) {
1367                mlog_errno(status);
1368                goto bail;
1369        }
1370
1371        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1372        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1373                ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
1374                            " count %u but claims %u are freed. num_bits %d",
1375                            (unsigned long long)le64_to_cpu(bg->bg_blkno),
1376                            le16_to_cpu(bg->bg_bits),
1377                            le16_to_cpu(bg->bg_free_bits_count), num_bits);
1378                return -EROFS;
1379        }
1380        while(num_bits--)
1381                ocfs2_set_bit(bit_off++, bitmap);
1382
1383        ocfs2_journal_dirty(handle, group_bh);
1384
1385bail:
1386        return status;
1387}
1388
1389/* find the one with the most empty bits */
1390static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1391{
1392        u16 curr, best;
1393
1394        BUG_ON(!cl->cl_next_free_rec);
1395
1396        best = curr = 0;
1397        while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1398                if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1399                    le32_to_cpu(cl->cl_recs[best].c_free))
1400                        best = curr;
1401                curr++;
1402        }
1403
1404        BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1405        return best;
1406}
1407
1408static int ocfs2_relink_block_group(handle_t *handle,
1409                                    struct inode *alloc_inode,
1410                                    struct buffer_head *fe_bh,
1411                                    struct buffer_head *bg_bh,
1412                                    struct buffer_head *prev_bg_bh,
1413                                    u16 chain)
1414{
1415        int status;
1416        /* there is a really tiny chance the journal calls could fail,
1417         * but we wouldn't want inconsistent blocks in *any* case. */
1418        u64 bg_ptr, prev_bg_ptr;
1419        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1420        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1421        struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1422
1423        /* The caller got these descriptors from
1424         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1425        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1426        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1427
1428        trace_ocfs2_relink_block_group(
1429                (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1430                (unsigned long long)le64_to_cpu(bg->bg_blkno),
1431                (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1432
1433        bg_ptr = le64_to_cpu(bg->bg_next_group);
1434        prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1435
1436        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1437                                         prev_bg_bh,
1438                                         OCFS2_JOURNAL_ACCESS_WRITE);
1439        if (status < 0)
1440                goto out;
1441
1442        prev_bg->bg_next_group = bg->bg_next_group;
1443        ocfs2_journal_dirty(handle, prev_bg_bh);
1444
1445        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1446                                         bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1447        if (status < 0)
1448                goto out_rollback_prev_bg;
1449
1450        bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1451        ocfs2_journal_dirty(handle, bg_bh);
1452
1453        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1454                                         fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1455        if (status < 0)
1456                goto out_rollback_bg;
1457
1458        fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1459        ocfs2_journal_dirty(handle, fe_bh);
1460
1461out:
1462        if (status < 0)
1463                mlog_errno(status);
1464        return status;
1465
1466out_rollback_bg:
1467        bg->bg_next_group = cpu_to_le64(bg_ptr);
1468out_rollback_prev_bg:
1469        prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1470        goto out;
1471}
1472
1473static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1474                                                     u32 wanted)
1475{
1476        return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1477}
1478
1479/* return 0 on success, -ENOSPC to keep searching and any other < 0
1480 * value on error. */
1481static int ocfs2_cluster_group_search(struct inode *inode,
1482                                      struct buffer_head *group_bh,
1483                                      u32 bits_wanted, u32 min_bits,
1484                                      u64 max_block,
1485                                      struct ocfs2_suballoc_result *res)
1486{
1487        int search = -ENOSPC;
1488        int ret;
1489        u64 blkoff;
1490        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1491        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1492        unsigned int max_bits, gd_cluster_off;
1493
1494        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1495
1496        if (gd->bg_free_bits_count) {
1497                max_bits = le16_to_cpu(gd->bg_bits);
1498
1499                /* Tail groups in cluster bitmaps which aren't cpg
1500                 * aligned are prone to partial extension by a failed
1501                 * fs resize. If the file system resize never got to
1502                 * update the dinode cluster count, then we don't want
1503                 * to trust any clusters past it, regardless of what
1504                 * the group descriptor says. */
1505                gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1506                                                          le64_to_cpu(gd->bg_blkno));
1507                if ((gd_cluster_off + max_bits) >
1508                    OCFS2_I(inode)->ip_clusters) {
1509                        max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1510                        trace_ocfs2_cluster_group_search_wrong_max_bits(
1511                                (unsigned long long)le64_to_cpu(gd->bg_blkno),
1512                                le16_to_cpu(gd->bg_bits),
1513                                OCFS2_I(inode)->ip_clusters, max_bits);
1514                }
1515
1516                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1517                                                        group_bh, bits_wanted,
1518                                                        max_bits, res);
1519                if (ret)
1520                        return ret;
1521
1522                if (max_block) {
1523                        blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1524                                                          gd_cluster_off +
1525                                                          res->sr_bit_offset +
1526                                                          res->sr_bits);
1527                        trace_ocfs2_cluster_group_search_max_block(
1528                                (unsigned long long)blkoff,
1529                                (unsigned long long)max_block);
1530                        if (blkoff > max_block)
1531                                return -ENOSPC;
1532                }
1533
1534                /* ocfs2_block_group_find_clear_bits() might
1535                 * return success, but we still want to return
1536                 * -ENOSPC unless it found the minimum number
1537                 * of bits. */
1538                if (min_bits <= res->sr_bits)
1539                        search = 0; /* success */
1540                else if (res->sr_bits) {
1541                        /*
1542                         * Don't show bits which we'll be returning
1543                         * for allocation to the local alloc bitmap.
1544                         */
1545                        ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1546                }
1547        }
1548
1549        return search;
1550}
1551
1552static int ocfs2_block_group_search(struct inode *inode,
1553                                    struct buffer_head *group_bh,
1554                                    u32 bits_wanted, u32 min_bits,
1555                                    u64 max_block,
1556                                    struct ocfs2_suballoc_result *res)
1557{
1558        int ret = -ENOSPC;
1559        u64 blkoff;
1560        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1561
1562        BUG_ON(min_bits != 1);
1563        BUG_ON(ocfs2_is_cluster_bitmap(inode));
1564
1565        if (bg->bg_free_bits_count) {
1566                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1567                                                        group_bh, bits_wanted,
1568                                                        le16_to_cpu(bg->bg_bits),
1569                                                        res);
1570                if (!ret && max_block) {
1571                        blkoff = le64_to_cpu(bg->bg_blkno) +
1572                                res->sr_bit_offset + res->sr_bits;
1573                        trace_ocfs2_block_group_search_max_block(
1574                                (unsigned long long)blkoff,
1575                                (unsigned long long)max_block);
1576                        if (blkoff > max_block)
1577                                ret = -ENOSPC;
1578                }
1579        }
1580
1581        return ret;
1582}
1583
1584int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1585                                       handle_t *handle,
1586                                       struct buffer_head *di_bh,
1587                                       u32 num_bits,
1588                                       u16 chain)
1589{
1590        int ret;
1591        u32 tmp_used;
1592        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1593        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1594
1595        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1596                                      OCFS2_JOURNAL_ACCESS_WRITE);
1597        if (ret < 0) {
1598                mlog_errno(ret);
1599                goto out;
1600        }
1601
1602        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1603        di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1604        le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1605        ocfs2_journal_dirty(handle, di_bh);
1606
1607out:
1608        return ret;
1609}
1610
1611void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1612                                       struct buffer_head *di_bh,
1613                                       u32 num_bits,
1614                                       u16 chain)
1615{
1616        u32 tmp_used;
1617        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1618        struct ocfs2_chain_list *cl;
1619
1620        cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1621        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1622        di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1623        le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1624}
1625
1626static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1627                                         struct ocfs2_extent_rec *rec,
1628                                         struct ocfs2_chain_list *cl)
1629{
1630        unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1631        unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1632        unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1633
1634        if (res->sr_bit_offset < bitoff)
1635                return 0;
1636        if (res->sr_bit_offset >= (bitoff + bitcount))
1637                return 0;
1638        res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1639                (res->sr_bit_offset - bitoff);
1640        if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1641                res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1642        return 1;
1643}
1644
1645static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1646                                          struct ocfs2_group_desc *bg,
1647                                          struct ocfs2_suballoc_result *res)
1648{
1649        int i;
1650        u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1651        struct ocfs2_extent_rec *rec;
1652        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1653        struct ocfs2_chain_list *cl = &di->id2.i_chain;
1654
1655        if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1656                res->sr_blkno = 0;
1657                return;
1658        }
1659
1660        res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1661        res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1662        if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1663            !bg->bg_list.l_next_free_rec)
1664                return;
1665
1666        for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1667                rec = &bg->bg_list.l_recs[i];
1668                if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1669                        res->sr_bg_blkno = bg_blkno;  /* Restore */
1670                        break;
1671                }
1672        }
1673}
1674
1675static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1676                                  handle_t *handle,
1677                                  u32 bits_wanted,
1678                                  u32 min_bits,
1679                                  struct ocfs2_suballoc_result *res,
1680                                  u16 *bits_left)
1681{
1682        int ret;
1683        struct buffer_head *group_bh = NULL;
1684        struct ocfs2_group_desc *gd;
1685        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1686        struct inode *alloc_inode = ac->ac_inode;
1687
1688        ret = ocfs2_read_group_descriptor(alloc_inode, di,
1689                                          res->sr_bg_blkno, &group_bh);
1690        if (ret < 0) {
1691                mlog_errno(ret);
1692                return ret;
1693        }
1694
1695        gd = (struct ocfs2_group_desc *) group_bh->b_data;
1696        ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1697                                  ac->ac_max_block, res);
1698        if (ret < 0) {
1699                if (ret != -ENOSPC)
1700                        mlog_errno(ret);
1701                goto out;
1702        }
1703
1704        if (!ret)
1705                ocfs2_bg_discontig_fix_result(ac, gd, res);
1706
1707        /*
1708         * sr_bg_blkno might have been changed by
1709         * ocfs2_bg_discontig_fix_result
1710         */
1711        res->sr_bg_stable_blkno = group_bh->b_blocknr;
1712
1713        if (ac->ac_find_loc_only)
1714                goto out_loc_only;
1715
1716        ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1717                                               res->sr_bits,
1718                                               le16_to_cpu(gd->bg_chain));
1719        if (ret < 0) {
1720                mlog_errno(ret);
1721                goto out;
1722        }
1723
1724        ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1725                                         res->sr_bit_offset, res->sr_bits);
1726        if (ret < 0) {
1727                ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1728                                               res->sr_bits,
1729                                               le16_to_cpu(gd->bg_chain));
1730                mlog_errno(ret);
1731        }
1732
1733out_loc_only:
1734        *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1735
1736out:
1737        brelse(group_bh);
1738
1739        return ret;
1740}
1741
1742static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1743                              handle_t *handle,
1744                              u32 bits_wanted,
1745                              u32 min_bits,
1746                              struct ocfs2_suballoc_result *res,
1747                              u16 *bits_left)
1748{
1749        int status;
1750        u16 chain;
1751        u64 next_group;
1752        struct inode *alloc_inode = ac->ac_inode;
1753        struct buffer_head *group_bh = NULL;
1754        struct buffer_head *prev_group_bh = NULL;
1755        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1756        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1757        struct ocfs2_group_desc *bg;
1758
1759        chain = ac->ac_chain;
1760        trace_ocfs2_search_chain_begin(
1761                (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1762                bits_wanted, chain);
1763
1764        status = ocfs2_read_group_descriptor(alloc_inode, fe,
1765                                             le64_to_cpu(cl->cl_recs[chain].c_blkno),
1766                                             &group_bh);
1767        if (status < 0) {
1768                mlog_errno(status);
1769                goto bail;
1770        }
1771        bg = (struct ocfs2_group_desc *) group_bh->b_data;
1772
1773        status = -ENOSPC;
1774        /* for now, the chain search is a bit simplistic. We just use
1775         * the 1st group with any empty bits. */
1776        while ((status = ac->ac_group_search(alloc_inode, group_bh,
1777                                             bits_wanted, min_bits,
1778                                             ac->ac_max_block,
1779                                             res)) == -ENOSPC) {
1780                if (!bg->bg_next_group)
1781                        break;
1782
1783                brelse(prev_group_bh);
1784                prev_group_bh = NULL;
1785
1786                next_group = le64_to_cpu(bg->bg_next_group);
1787                prev_group_bh = group_bh;
1788                group_bh = NULL;
1789                status = ocfs2_read_group_descriptor(alloc_inode, fe,
1790                                                     next_group, &group_bh);
1791                if (status < 0) {
1792                        mlog_errno(status);
1793                        goto bail;
1794                }
1795                bg = (struct ocfs2_group_desc *) group_bh->b_data;
1796        }
1797        if (status < 0) {
1798                if (status != -ENOSPC)
1799                        mlog_errno(status);
1800                goto bail;
1801        }
1802
1803        trace_ocfs2_search_chain_succ(
1804                (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1805
1806        res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1807
1808        BUG_ON(res->sr_bits == 0);
1809        if (!status)
1810                ocfs2_bg_discontig_fix_result(ac, bg, res);
1811
1812        /*
1813         * sr_bg_blkno might have been changed by
1814         * ocfs2_bg_discontig_fix_result
1815         */
1816        res->sr_bg_stable_blkno = group_bh->b_blocknr;
1817
1818        /*
1819         * Keep track of previous block descriptor read. When
1820         * we find a target, if we have read more than X
1821         * number of descriptors, and the target is reasonably
1822         * empty, relink him to top of his chain.
1823         *
1824         * We've read 0 extra blocks and only send one more to
1825         * the transaction, yet the next guy to search has a
1826         * much easier time.
1827         *
1828         * Do this *after* figuring out how many bits we're taking out
1829         * of our target group.
1830         */
1831        if (!ac->ac_disable_chain_relink &&
1832            (prev_group_bh) &&
1833            (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1834                status = ocfs2_relink_block_group(handle, alloc_inode,
1835                                                  ac->ac_bh, group_bh,
1836                                                  prev_group_bh, chain);
1837                if (status < 0) {
1838                        mlog_errno(status);
1839                        goto bail;
1840                }
1841        }
1842
1843        if (ac->ac_find_loc_only)
1844                goto out_loc_only;
1845
1846        status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1847                                                  ac->ac_bh, res->sr_bits,
1848                                                  chain);
1849        if (status) {
1850                mlog_errno(status);
1851                goto bail;
1852        }
1853
1854        status = ocfs2_block_group_set_bits(handle,
1855                                            alloc_inode,
1856                                            bg,
1857                                            group_bh,
1858                                            res->sr_bit_offset,
1859                                            res->sr_bits);
1860        if (status < 0) {
1861                ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1862                                        ac->ac_bh, res->sr_bits, chain);
1863                mlog_errno(status);
1864                goto bail;
1865        }
1866
1867        trace_ocfs2_search_chain_end(
1868                        (unsigned long long)le64_to_cpu(fe->i_blkno),
1869                        res->sr_bits);
1870
1871out_loc_only:
1872        *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1873bail:
1874        brelse(group_bh);
1875        brelse(prev_group_bh);
1876
1877        if (status)
1878                mlog_errno(status);
1879        return status;
1880}
1881
1882/* will give out up to bits_wanted contiguous bits. */
1883static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1884                                     handle_t *handle,
1885                                     u32 bits_wanted,
1886                                     u32 min_bits,
1887                                     struct ocfs2_suballoc_result *res)
1888{
1889        int status;
1890        u16 victim, i;
1891        u16 bits_left = 0;
1892        u64 hint = ac->ac_last_group;
1893        struct ocfs2_chain_list *cl;
1894        struct ocfs2_dinode *fe;
1895
1896        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1897        BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1898        BUG_ON(!ac->ac_bh);
1899
1900        fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1901
1902        /* The bh was validated by the inode read during
1903         * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1904        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1905
1906        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1907            le32_to_cpu(fe->id1.bitmap1.i_total)) {
1908                ocfs2_error(ac->ac_inode->i_sb,
1909                            "Chain allocator dinode %llu has %u used "
1910                            "bits but only %u total.",
1911                            (unsigned long long)le64_to_cpu(fe->i_blkno),
1912                            le32_to_cpu(fe->id1.bitmap1.i_used),
1913                            le32_to_cpu(fe->id1.bitmap1.i_total));
1914                status = -EIO;
1915                goto bail;
1916        }
1917
1918        res->sr_bg_blkno = hint;
1919        if (res->sr_bg_blkno) {
1920                /* Attempt to short-circuit the usual search mechanism
1921                 * by jumping straight to the most recently used
1922                 * allocation group. This helps us maintain some
1923                 * contiguousness across allocations. */
1924                status = ocfs2_search_one_group(ac, handle, bits_wanted,
1925                                                min_bits, res, &bits_left);
1926                if (!status)
1927                        goto set_hint;
1928                if (status < 0 && status != -ENOSPC) {
1929                        mlog_errno(status);
1930                        goto bail;
1931                }
1932        }
1933
1934        cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1935
1936        victim = ocfs2_find_victim_chain(cl);
1937        ac->ac_chain = victim;
1938
1939        status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1940                                    res, &bits_left);
1941        if (!status) {
1942                hint = ocfs2_group_from_res(res);
1943                goto set_hint;
1944        }
1945        if (status < 0 && status != -ENOSPC) {
1946                mlog_errno(status);
1947                goto bail;
1948        }
1949
1950        trace_ocfs2_claim_suballoc_bits(victim);
1951
1952        /* If we didn't pick a good victim, then just default to
1953         * searching each chain in order. Don't allow chain relinking
1954         * because we only calculate enough journal credits for one
1955         * relink per alloc. */
1956        ac->ac_disable_chain_relink = 1;
1957        for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1958                if (i == victim)
1959                        continue;
1960                if (!cl->cl_recs[i].c_free)
1961                        continue;
1962
1963                ac->ac_chain = i;
1964                status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1965                                            res, &bits_left);
1966                if (!status) {
1967                        hint = ocfs2_group_from_res(res);
1968                        break;
1969                }
1970                if (status < 0 && status != -ENOSPC) {
1971                        mlog_errno(status);
1972                        goto bail;
1973                }
1974        }
1975
1976set_hint:
1977        if (status != -ENOSPC) {
1978                /* If the next search of this group is not likely to
1979                 * yield a suitable extent, then we reset the last
1980                 * group hint so as to not waste a disk read */
1981                if (bits_left < min_bits)
1982                        ac->ac_last_group = 0;
1983                else
1984                        ac->ac_last_group = hint;
1985        }
1986
1987bail:
1988        if (status)
1989                mlog_errno(status);
1990        return status;
1991}
1992
1993int ocfs2_claim_metadata(handle_t *handle,
1994                         struct ocfs2_alloc_context *ac,
1995                         u32 bits_wanted,
1996                         u64 *suballoc_loc,
1997                         u16 *suballoc_bit_start,
1998                         unsigned int *num_bits,
1999                         u64 *blkno_start)
2000{
2001        int status;
2002        struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2003
2004        BUG_ON(!ac);
2005        BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2006        BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2007
2008        status = ocfs2_claim_suballoc_bits(ac,
2009                                           handle,
2010                                           bits_wanted,
2011                                           1,
2012                                           &res);
2013        if (status < 0) {
2014                mlog_errno(status);
2015                goto bail;
2016        }
2017        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2018
2019        *suballoc_loc = res.sr_bg_blkno;
2020        *suballoc_bit_start = res.sr_bit_offset;
2021        *blkno_start = res.sr_blkno;
2022        ac->ac_bits_given += res.sr_bits;
2023        *num_bits = res.sr_bits;
2024        status = 0;
2025bail:
2026        if (status)
2027                mlog_errno(status);
2028        return status;
2029}
2030
2031static void ocfs2_init_inode_ac_group(struct inode *dir,
2032                                      struct buffer_head *parent_di_bh,
2033                                      struct ocfs2_alloc_context *ac)
2034{
2035        struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2036        /*
2037         * Try to allocate inodes from some specific group.
2038         *
2039         * If the parent dir has recorded the last group used in allocation,
2040         * cool, use it. Otherwise if we try to allocate new inode from the
2041         * same slot the parent dir belongs to, use the same chunk.
2042         *
2043         * We are very careful here to avoid the mistake of setting
2044         * ac_last_group to a group descriptor from a different (unlocked) slot.
2045         */
2046        if (OCFS2_I(dir)->ip_last_used_group &&
2047            OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2048                ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2049        else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2050                if (di->i_suballoc_loc)
2051                        ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2052                else
2053                        ac->ac_last_group = ocfs2_which_suballoc_group(
2054                                        le64_to_cpu(di->i_blkno),
2055                                        le16_to_cpu(di->i_suballoc_bit));
2056        }
2057}
2058
2059static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2060                                             struct ocfs2_alloc_context *ac)
2061{
2062        OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2063        OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2064}
2065
2066int ocfs2_find_new_inode_loc(struct inode *dir,
2067                             struct buffer_head *parent_fe_bh,
2068                             struct ocfs2_alloc_context *ac,
2069                             u64 *fe_blkno)
2070{
2071        int ret;
2072        handle_t *handle = NULL;
2073        struct ocfs2_suballoc_result *res;
2074
2075        BUG_ON(!ac);
2076        BUG_ON(ac->ac_bits_given != 0);
2077        BUG_ON(ac->ac_bits_wanted != 1);
2078        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2079
2080        res = kzalloc(sizeof(*res), GFP_NOFS);
2081        if (res == NULL) {
2082                ret = -ENOMEM;
2083                mlog_errno(ret);
2084                goto out;
2085        }
2086
2087        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2088
2089        /*
2090         * The handle started here is for chain relink. Alternatively,
2091         * we could just disable relink for these calls.
2092         */
2093        handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2094        if (IS_ERR(handle)) {
2095                ret = PTR_ERR(handle);
2096                handle = NULL;
2097                mlog_errno(ret);
2098                goto out;
2099        }
2100
2101        /*
2102         * This will instruct ocfs2_claim_suballoc_bits and
2103         * ocfs2_search_one_group to search but save actual allocation
2104         * for later.
2105         */
2106        ac->ac_find_loc_only = 1;
2107
2108        ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2109        if (ret < 0) {
2110                mlog_errno(ret);
2111                goto out;
2112        }
2113
2114        ac->ac_find_loc_priv = res;
2115        *fe_blkno = res->sr_blkno;
2116        ocfs2_update_inode_fsync_trans(handle, dir, 0);
2117out:
2118        if (handle)
2119                ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2120
2121        if (ret)
2122                kfree(res);
2123
2124        return ret;
2125}
2126
2127int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2128                                 struct inode *dir,
2129                                 struct ocfs2_alloc_context *ac,
2130                                 u64 *suballoc_loc,
2131                                 u16 *suballoc_bit,
2132                                 u64 di_blkno)
2133{
2134        int ret;
2135        u16 chain;
2136        struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2137        struct buffer_head *bg_bh = NULL;
2138        struct ocfs2_group_desc *bg;
2139        struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2140
2141        /*
2142         * Since di_blkno is being passed back in, we check for any
2143         * inconsistencies which may have happened between
2144         * calls. These are code bugs as di_blkno is not expected to
2145         * change once returned from ocfs2_find_new_inode_loc()
2146         */
2147        BUG_ON(res->sr_blkno != di_blkno);
2148
2149        ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2150                                          res->sr_bg_stable_blkno, &bg_bh);
2151        if (ret) {
2152                mlog_errno(ret);
2153                goto out;
2154        }
2155
2156        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2157        chain = le16_to_cpu(bg->bg_chain);
2158
2159        ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2160                                               ac->ac_bh, res->sr_bits,
2161                                               chain);
2162        if (ret) {
2163                mlog_errno(ret);
2164                goto out;
2165        }
2166
2167        ret = ocfs2_block_group_set_bits(handle,
2168                                         ac->ac_inode,
2169                                         bg,
2170                                         bg_bh,
2171                                         res->sr_bit_offset,
2172                                         res->sr_bits);
2173        if (ret < 0) {
2174                ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2175                                               ac->ac_bh, res->sr_bits, chain);
2176                mlog_errno(ret);
2177                goto out;
2178        }
2179
2180        trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2181                                           res->sr_bits);
2182
2183        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2184
2185        BUG_ON(res->sr_bits != 1);
2186
2187        *suballoc_loc = res->sr_bg_blkno;
2188        *suballoc_bit = res->sr_bit_offset;
2189        ac->ac_bits_given++;
2190        ocfs2_save_inode_ac_group(dir, ac);
2191
2192out:
2193        brelse(bg_bh);
2194
2195        return ret;
2196}
2197
2198int ocfs2_claim_new_inode(handle_t *handle,
2199                          struct inode *dir,
2200                          struct buffer_head *parent_fe_bh,
2201                          struct ocfs2_alloc_context *ac,
2202                          u64 *suballoc_loc,
2203                          u16 *suballoc_bit,
2204                          u64 *fe_blkno)
2205{
2206        int status;
2207        struct ocfs2_suballoc_result res;
2208
2209        BUG_ON(!ac);
2210        BUG_ON(ac->ac_bits_given != 0);
2211        BUG_ON(ac->ac_bits_wanted != 1);
2212        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2213
2214        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2215
2216        status = ocfs2_claim_suballoc_bits(ac,
2217                                           handle,
2218                                           1,
2219                                           1,
2220                                           &res);
2221        if (status < 0) {
2222                mlog_errno(status);
2223                goto bail;
2224        }
2225        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2226
2227        BUG_ON(res.sr_bits != 1);
2228
2229        *suballoc_loc = res.sr_bg_blkno;
2230        *suballoc_bit = res.sr_bit_offset;
2231        *fe_blkno = res.sr_blkno;
2232        ac->ac_bits_given++;
2233        ocfs2_save_inode_ac_group(dir, ac);
2234        status = 0;
2235bail:
2236        if (status)
2237                mlog_errno(status);
2238        return status;
2239}
2240
2241/* translate a group desc. blkno and it's bitmap offset into
2242 * disk cluster offset. */
2243static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2244                                                   u64 bg_blkno,
2245                                                   u16 bg_bit_off)
2246{
2247        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2248        u32 cluster = 0;
2249
2250        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2251
2252        if (bg_blkno != osb->first_cluster_group_blkno)
2253                cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2254        cluster += (u32) bg_bit_off;
2255        return cluster;
2256}
2257
2258/* given a cluster offset, calculate which block group it belongs to
2259 * and return that block offset. */
2260u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2261{
2262        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2263        u32 group_no;
2264
2265        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2266
2267        group_no = cluster / osb->bitmap_cpg;
2268        if (!group_no)
2269                return osb->first_cluster_group_blkno;
2270        return ocfs2_clusters_to_blocks(inode->i_sb,
2271                                        group_no * osb->bitmap_cpg);
2272}
2273
2274/* given the block number of a cluster start, calculate which cluster
2275 * group and descriptor bitmap offset that corresponds to. */
2276static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2277                                                u64 data_blkno,
2278                                                u64 *bg_blkno,
2279                                                u16 *bg_bit_off)
2280{
2281        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2282        u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2283
2284        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2285
2286        *bg_blkno = ocfs2_which_cluster_group(inode,
2287                                              data_cluster);
2288
2289        if (*bg_blkno == osb->first_cluster_group_blkno)
2290                *bg_bit_off = (u16) data_cluster;
2291        else
2292                *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2293                                                             data_blkno - *bg_blkno);
2294}
2295
2296/*
2297 * min_bits - minimum contiguous chunk from this total allocation we
2298 * can handle. set to what we asked for originally for a full
2299 * contig. allocation, set to '1' to indicate we can deal with extents
2300 * of any size.
2301 */
2302int __ocfs2_claim_clusters(handle_t *handle,
2303                           struct ocfs2_alloc_context *ac,
2304                           u32 min_clusters,
2305                           u32 max_clusters,
2306                           u32 *cluster_start,
2307                           u32 *num_clusters)
2308{
2309        int status;
2310        unsigned int bits_wanted = max_clusters;
2311        struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2312        struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2313
2314        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2315
2316        BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2317               && ac->ac_which != OCFS2_AC_USE_MAIN);
2318
2319        if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2320                WARN_ON(min_clusters > 1);
2321
2322                status = ocfs2_claim_local_alloc_bits(osb,
2323                                                      handle,
2324                                                      ac,
2325                                                      bits_wanted,
2326                                                      cluster_start,
2327                                                      num_clusters);
2328                if (!status)
2329                        atomic_inc(&osb->alloc_stats.local_data);
2330        } else {
2331                if (min_clusters > (osb->bitmap_cpg - 1)) {
2332                        /* The only paths asking for contiguousness
2333                         * should know about this already. */
2334                        mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2335                             "group bitmap size %u!\n", min_clusters,
2336                             osb->bitmap_cpg);
2337                        status = -ENOSPC;
2338                        goto bail;
2339                }
2340                /* clamp the current request down to a realistic size. */
2341                if (bits_wanted > (osb->bitmap_cpg - 1))
2342                        bits_wanted = osb->bitmap_cpg - 1;
2343
2344                status = ocfs2_claim_suballoc_bits(ac,
2345                                                   handle,
2346                                                   bits_wanted,
2347                                                   min_clusters,
2348                                                   &res);
2349                if (!status) {
2350                        BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2351                        *cluster_start =
2352                                ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2353                                                                 res.sr_bg_blkno,
2354                                                                 res.sr_bit_offset);
2355                        atomic_inc(&osb->alloc_stats.bitmap_data);
2356                        *num_clusters = res.sr_bits;
2357                }
2358        }
2359        if (status < 0) {
2360                if (status != -ENOSPC)
2361                        mlog_errno(status);
2362                goto bail;
2363        }
2364
2365        ac->ac_bits_given += *num_clusters;
2366
2367bail:
2368        if (status)
2369                mlog_errno(status);
2370        return status;
2371}
2372
2373int ocfs2_claim_clusters(handle_t *handle,
2374                         struct ocfs2_alloc_context *ac,
2375                         u32 min_clusters,
2376                         u32 *cluster_start,
2377                         u32 *num_clusters)
2378{
2379        unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2380
2381        return __ocfs2_claim_clusters(handle, ac, min_clusters,
2382                                      bits_wanted, cluster_start, num_clusters);
2383}
2384
2385static int ocfs2_block_group_clear_bits(handle_t *handle,
2386                                        struct inode *alloc_inode,
2387                                        struct ocfs2_group_desc *bg,
2388                                        struct buffer_head *group_bh,
2389                                        unsigned int bit_off,
2390                                        unsigned int num_bits,
2391                                        void (*undo_fn)(unsigned int bit,
2392                                                        unsigned long *bmap))
2393{
2394        int status;
2395        unsigned int tmp;
2396        struct ocfs2_group_desc *undo_bg = NULL;
2397
2398        /* The caller got this descriptor from
2399         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2400        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2401
2402        trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2403
2404        BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2405        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2406                                         group_bh,
2407                                         undo_fn ?
2408                                         OCFS2_JOURNAL_ACCESS_UNDO :
2409                                         OCFS2_JOURNAL_ACCESS_WRITE);
2410        if (status < 0) {
2411                mlog_errno(status);
2412                goto bail;
2413        }
2414
2415        if (undo_fn) {
2416                jbd_lock_bh_state(group_bh);
2417                undo_bg = (struct ocfs2_group_desc *)
2418                                        bh2jh(group_bh)->b_committed_data;
2419                BUG_ON(!undo_bg);
2420        }
2421
2422        tmp = num_bits;
2423        while(tmp--) {
2424                ocfs2_clear_bit((bit_off + tmp),
2425                                (unsigned long *) bg->bg_bitmap);
2426                if (undo_fn)
2427                        undo_fn(bit_off + tmp,
2428                                (unsigned long *) undo_bg->bg_bitmap);
2429        }
2430        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2431        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2432                ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
2433                            " count %u but claims %u are freed. num_bits %d",
2434                            (unsigned long long)le64_to_cpu(bg->bg_blkno),
2435                            le16_to_cpu(bg->bg_bits),
2436                            le16_to_cpu(bg->bg_free_bits_count), num_bits);
2437                return -EROFS;
2438        }
2439
2440        if (undo_fn)
2441                jbd_unlock_bh_state(group_bh);
2442
2443        ocfs2_journal_dirty(handle, group_bh);
2444bail:
2445        return status;
2446}
2447
2448/*
2449 * expects the suballoc inode to already be locked.
2450 */
2451static int _ocfs2_free_suballoc_bits(handle_t *handle,
2452                                     struct inode *alloc_inode,
2453                                     struct buffer_head *alloc_bh,
2454                                     unsigned int start_bit,
2455                                     u64 bg_blkno,
2456                                     unsigned int count,
2457                                     void (*undo_fn)(unsigned int bit,
2458                                                     unsigned long *bitmap))
2459{
2460        int status = 0;
2461        u32 tmp_used;
2462        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2463        struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2464        struct buffer_head *group_bh = NULL;
2465        struct ocfs2_group_desc *group;
2466
2467        /* The alloc_bh comes from ocfs2_free_dinode() or
2468         * ocfs2_free_clusters().  The callers have all locked the
2469         * allocator and gotten alloc_bh from the lock call.  This
2470         * validates the dinode buffer.  Any corruption that has happened
2471         * is a code bug. */
2472        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2473        BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2474
2475        trace_ocfs2_free_suballoc_bits(
2476                (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2477                (unsigned long long)bg_blkno,
2478                start_bit, count);
2479
2480        status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2481                                             &group_bh);
2482        if (status < 0) {
2483                mlog_errno(status);
2484                goto bail;
2485        }
2486        group = (struct ocfs2_group_desc *) group_bh->b_data;
2487
2488        BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2489
2490        status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2491                                              group, group_bh,
2492                                              start_bit, count, undo_fn);
2493        if (status < 0) {
2494                mlog_errno(status);
2495                goto bail;
2496        }
2497
2498        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2499                                         alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2500        if (status < 0) {
2501                mlog_errno(status);
2502                goto bail;
2503        }
2504
2505        le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2506                     count);
2507        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2508        fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2509        ocfs2_journal_dirty(handle, alloc_bh);
2510
2511bail:
2512        brelse(group_bh);
2513
2514        if (status)
2515                mlog_errno(status);
2516        return status;
2517}
2518
2519int ocfs2_free_suballoc_bits(handle_t *handle,
2520                             struct inode *alloc_inode,
2521                             struct buffer_head *alloc_bh,
2522                             unsigned int start_bit,
2523                             u64 bg_blkno,
2524                             unsigned int count)
2525{
2526        return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2527                                         start_bit, bg_blkno, count, NULL);
2528}
2529
2530int ocfs2_free_dinode(handle_t *handle,
2531                      struct inode *inode_alloc_inode,
2532                      struct buffer_head *inode_alloc_bh,
2533                      struct ocfs2_dinode *di)
2534{
2535        u64 blk = le64_to_cpu(di->i_blkno);
2536        u16 bit = le16_to_cpu(di->i_suballoc_bit);
2537        u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2538
2539        if (di->i_suballoc_loc)
2540                bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2541        return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2542                                        inode_alloc_bh, bit, bg_blkno, 1);
2543}
2544
2545static int _ocfs2_free_clusters(handle_t *handle,
2546                                struct inode *bitmap_inode,
2547                                struct buffer_head *bitmap_bh,
2548                                u64 start_blk,
2549                                unsigned int num_clusters,
2550                                void (*undo_fn)(unsigned int bit,
2551                                                unsigned long *bitmap))
2552{
2553        int status;
2554        u16 bg_start_bit;
2555        u64 bg_blkno;
2556        struct ocfs2_dinode *fe;
2557
2558        /* You can't ever have a contiguous set of clusters
2559         * bigger than a block group bitmap so we never have to worry
2560         * about looping on them.
2561         * This is expensive. We can safely remove once this stuff has
2562         * gotten tested really well. */
2563        BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2564
2565        fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2566
2567        ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2568                                     &bg_start_bit);
2569
2570        trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2571                        (unsigned long long)start_blk,
2572                        bg_start_bit, num_clusters);
2573
2574        status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2575                                           bg_start_bit, bg_blkno,
2576                                           num_clusters, undo_fn);
2577        if (status < 0) {
2578                mlog_errno(status);
2579                goto out;
2580        }
2581
2582        ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2583                                         num_clusters);
2584
2585out:
2586        if (status)
2587                mlog_errno(status);
2588        return status;
2589}
2590
2591int ocfs2_free_clusters(handle_t *handle,
2592                        struct inode *bitmap_inode,
2593                        struct buffer_head *bitmap_bh,
2594                        u64 start_blk,
2595                        unsigned int num_clusters)
2596{
2597        return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2598                                    start_blk, num_clusters,
2599                                    _ocfs2_set_bit);
2600}
2601
2602/*
2603 * Give never-used clusters back to the global bitmap.  We don't need
2604 * to protect these bits in the undo buffer.
2605 */
2606int ocfs2_release_clusters(handle_t *handle,
2607                           struct inode *bitmap_inode,
2608                           struct buffer_head *bitmap_bh,
2609                           u64 start_blk,
2610                           unsigned int num_clusters)
2611{
2612        return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2613                                    start_blk, num_clusters,
2614                                    _ocfs2_clear_bit);
2615}
2616
2617static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2618{
2619        printk("Block Group:\n");
2620        printk("bg_signature:       %s\n", bg->bg_signature);
2621        printk("bg_size:            %u\n", bg->bg_size);
2622        printk("bg_bits:            %u\n", bg->bg_bits);
2623        printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2624        printk("bg_chain:           %u\n", bg->bg_chain);
2625        printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2626        printk("bg_next_group:      %llu\n",
2627               (unsigned long long)bg->bg_next_group);
2628        printk("bg_parent_dinode:   %llu\n",
2629               (unsigned long long)bg->bg_parent_dinode);
2630        printk("bg_blkno:           %llu\n",
2631               (unsigned long long)bg->bg_blkno);
2632}
2633
2634static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2635{
2636        int i;
2637
2638        printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2639        printk("i_signature:                  %s\n", fe->i_signature);
2640        printk("i_size:                       %llu\n",
2641               (unsigned long long)fe->i_size);
2642        printk("i_clusters:                   %u\n", fe->i_clusters);
2643        printk("i_generation:                 %u\n",
2644               le32_to_cpu(fe->i_generation));
2645        printk("id1.bitmap1.i_used:           %u\n",
2646               le32_to_cpu(fe->id1.bitmap1.i_used));
2647        printk("id1.bitmap1.i_total:          %u\n",
2648               le32_to_cpu(fe->id1.bitmap1.i_total));
2649        printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
2650        printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
2651        printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
2652        printk("id2.i_chain.cl_next_free_rec: %u\n",
2653               fe->id2.i_chain.cl_next_free_rec);
2654        for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2655                printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2656                       fe->id2.i_chain.cl_recs[i].c_free);
2657                printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2658                       fe->id2.i_chain.cl_recs[i].c_total);
2659                printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2660                       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2661        }
2662}
2663
2664/*
2665 * For a given allocation, determine which allocators will need to be
2666 * accessed, and lock them, reserving the appropriate number of bits.
2667 *
2668 * Sparse file systems call this from ocfs2_write_begin_nolock()
2669 * and ocfs2_allocate_unwritten_extents().
2670 *
2671 * File systems which don't support holes call this from
2672 * ocfs2_extend_allocation().
2673 */
2674int ocfs2_lock_allocators(struct inode *inode,
2675                          struct ocfs2_extent_tree *et,
2676                          u32 clusters_to_add, u32 extents_to_split,
2677                          struct ocfs2_alloc_context **data_ac,
2678                          struct ocfs2_alloc_context **meta_ac)
2679{
2680        int ret = 0, num_free_extents;
2681        unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2682        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2683
2684        *meta_ac = NULL;
2685        if (data_ac)
2686                *data_ac = NULL;
2687
2688        BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2689
2690        num_free_extents = ocfs2_num_free_extents(osb, et);
2691        if (num_free_extents < 0) {
2692                ret = num_free_extents;
2693                mlog_errno(ret);
2694                goto out;
2695        }
2696
2697        /*
2698         * Sparse allocation file systems need to be more conservative
2699         * with reserving room for expansion - the actual allocation
2700         * happens while we've got a journal handle open so re-taking
2701         * a cluster lock (because we ran out of room for another
2702         * extent) will violate ordering rules.
2703         *
2704         * Most of the time we'll only be seeing this 1 cluster at a time
2705         * anyway.
2706         *
2707         * Always lock for any unwritten extents - we might want to
2708         * add blocks during a split.
2709         */
2710        if (!num_free_extents ||
2711            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2712                ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2713                if (ret < 0) {
2714                        if (ret != -ENOSPC)
2715                                mlog_errno(ret);
2716                        goto out;
2717                }
2718        }
2719
2720        if (clusters_to_add == 0)
2721                goto out;
2722
2723        ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2724        if (ret < 0) {
2725                if (ret != -ENOSPC)
2726                        mlog_errno(ret);
2727                goto out;
2728        }
2729
2730out:
2731        if (ret) {
2732                if (*meta_ac) {
2733                        ocfs2_free_alloc_context(*meta_ac);
2734                        *meta_ac = NULL;
2735                }
2736
2737                /*
2738                 * We cannot have an error and a non null *data_ac.
2739                 */
2740        }
2741
2742        return ret;
2743}
2744
2745/*
2746 * Read the inode specified by blkno to get suballoc_slot and
2747 * suballoc_bit.
2748 */
2749static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2750                                       u16 *suballoc_slot, u64 *group_blkno,
2751                                       u16 *suballoc_bit)
2752{
2753        int status;
2754        struct buffer_head *inode_bh = NULL;
2755        struct ocfs2_dinode *inode_fe;
2756
2757        trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2758
2759        /* dirty read disk */
2760        status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2761        if (status < 0) {
2762                mlog(ML_ERROR, "read block %llu failed %d\n",
2763                     (unsigned long long)blkno, status);
2764                goto bail;
2765        }
2766
2767        inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2768        if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2769                mlog(ML_ERROR, "invalid inode %llu requested\n",
2770                     (unsigned long long)blkno);
2771                status = -EINVAL;
2772                goto bail;
2773        }
2774
2775        if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2776            (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2777                mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2778                     (unsigned long long)blkno,
2779                     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2780                status = -EINVAL;
2781                goto bail;
2782        }
2783
2784        if (suballoc_slot)
2785                *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2786        if (suballoc_bit)
2787                *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2788        if (group_blkno)
2789                *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2790
2791bail:
2792        brelse(inode_bh);
2793
2794        if (status)
2795                mlog_errno(status);
2796        return status;
2797}
2798
2799/*
2800 * test whether bit is SET in allocator bitmap or not.  on success, 0
2801 * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2802 * is returned and *res is meaningless.  Call this after you have
2803 * cluster locked against suballoc, or you may get a result based on
2804 * non-up2date contents
2805 */
2806static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2807                                   struct inode *suballoc,
2808                                   struct buffer_head *alloc_bh,
2809                                   u64 group_blkno, u64 blkno,
2810                                   u16 bit, int *res)
2811{
2812        struct ocfs2_dinode *alloc_di;
2813        struct ocfs2_group_desc *group;
2814        struct buffer_head *group_bh = NULL;
2815        u64 bg_blkno;
2816        int status;
2817
2818        trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2819                                      (unsigned int)bit);
2820
2821        alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2822        if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2823                mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2824                     (unsigned int)bit,
2825                     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2826                status = -EINVAL;
2827                goto bail;
2828        }
2829
2830        bg_blkno = group_blkno ? group_blkno :
2831                   ocfs2_which_suballoc_group(blkno, bit);
2832        status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2833                                             &group_bh);
2834        if (status < 0) {
2835                mlog(ML_ERROR, "read group %llu failed %d\n",
2836                     (unsigned long long)bg_blkno, status);
2837                goto bail;
2838        }
2839
2840        group = (struct ocfs2_group_desc *) group_bh->b_data;
2841        *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2842
2843bail:
2844        brelse(group_bh);
2845
2846        if (status)
2847                mlog_errno(status);
2848        return status;
2849}
2850
2851/*
2852 * Test if the bit representing this inode (blkno) is set in the
2853 * suballocator.
2854 *
2855 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2856 *
2857 * In the event of failure, a negative value is returned and *res is
2858 * meaningless.
2859 *
2860 * Callers must make sure to hold nfs_sync_lock to prevent
2861 * ocfs2_delete_inode() on another node from accessing the same
2862 * suballocator concurrently.
2863 */
2864int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2865{
2866        int status;
2867        u64 group_blkno = 0;
2868        u16 suballoc_bit = 0, suballoc_slot = 0;
2869        struct inode *inode_alloc_inode;
2870        struct buffer_head *alloc_bh = NULL;
2871
2872        trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2873
2874        status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2875                                             &group_blkno, &suballoc_bit);
2876        if (status < 0) {
2877                mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2878                goto bail;
2879        }
2880
2881        inode_alloc_inode =
2882                ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2883                                            suballoc_slot);
2884        if (!inode_alloc_inode) {
2885                /* the error code could be inaccurate, but we are not able to
2886                 * get the correct one. */
2887                status = -EINVAL;
2888                mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2889                     (u32)suballoc_slot);
2890                goto bail;
2891        }
2892
2893        mutex_lock(&inode_alloc_inode->i_mutex);
2894        status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2895        if (status < 0) {
2896                mutex_unlock(&inode_alloc_inode->i_mutex);
2897                iput(inode_alloc_inode);
2898                mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2899                     (u32)suballoc_slot, status);
2900                goto bail;
2901        }
2902
2903        status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2904                                         group_blkno, blkno, suballoc_bit, res);
2905        if (status < 0)
2906                mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2907
2908        ocfs2_inode_unlock(inode_alloc_inode, 0);
2909        mutex_unlock(&inode_alloc_inode->i_mutex);
2910
2911        iput(inode_alloc_inode);
2912        brelse(alloc_bh);
2913bail:
2914        if (status)
2915                mlog_errno(status);
2916        return status;
2917}
2918