linux/fs/ocfs2/suballoc.c
<<
>>
Prefs
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * suballoc.c
   5 *
   6 * metadata alloc and free
   7 * Inspired by ext3 block groups.
   8 *
   9 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  10 *
  11 * This program is free software; you can redistribute it and/or
  12 * modify it under the terms of the GNU General Public
  13 * License as published by the Free Software Foundation; either
  14 * version 2 of the License, or (at your option) any later version.
  15 *
  16 * This program is distributed in the hope that it will be useful,
  17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19 * General Public License for more details.
  20 *
  21 * You should have received a copy of the GNU General Public
  22 * License along with this program; if not, write to the
  23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 * Boston, MA 021110-1307, USA.
  25 */
  26
  27#include <linux/fs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/highmem.h>
  31
  32#include <cluster/masklog.h>
  33
  34#include "ocfs2.h"
  35
  36#include "alloc.h"
  37#include "blockcheck.h"
  38#include "dlmglue.h"
  39#include "inode.h"
  40#include "journal.h"
  41#include "localalloc.h"
  42#include "suballoc.h"
  43#include "super.h"
  44#include "sysfile.h"
  45#include "uptodate.h"
  46#include "ocfs2_trace.h"
  47
  48#include "buffer_head_io.h"
  49
  50#define NOT_ALLOC_NEW_GROUP             0
  51#define ALLOC_NEW_GROUP                 0x1
  52#define ALLOC_GROUPS_FROM_GLOBAL        0x2
  53
  54#define OCFS2_MAX_TO_STEAL              1024
  55
  56struct ocfs2_suballoc_result {
  57        u64             sr_bg_blkno;    /* The bg we allocated from.  Set
  58                                           to 0 when a block group is
  59                                           contiguous. */
  60        u64             sr_bg_stable_blkno; /*
  61                                             * Doesn't change, always
  62                                             * set to target block
  63                                             * group descriptor
  64                                             * block.
  65                                             */
  66        u64             sr_blkno;       /* The first allocated block */
  67        unsigned int    sr_bit_offset;  /* The bit in the bg */
  68        unsigned int    sr_bits;        /* How many bits we claimed */
  69};
  70
  71static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
  72{
  73        if (res->sr_blkno == 0)
  74                return 0;
  75
  76        if (res->sr_bg_blkno)
  77                return res->sr_bg_blkno;
  78
  79        return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
  80}
  81
  82static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
  83static int ocfs2_block_group_fill(handle_t *handle,
  84                                  struct inode *alloc_inode,
  85                                  struct buffer_head *bg_bh,
  86                                  u64 group_blkno,
  87                                  unsigned int group_clusters,
  88                                  u16 my_chain,
  89                                  struct ocfs2_chain_list *cl);
  90static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
  91                                   struct inode *alloc_inode,
  92                                   struct buffer_head *bh,
  93                                   u64 max_block,
  94                                   u64 *last_alloc_group,
  95                                   int flags);
  96
  97static int ocfs2_cluster_group_search(struct inode *inode,
  98                                      struct buffer_head *group_bh,
  99                                      u32 bits_wanted, u32 min_bits,
 100                                      u64 max_block,
 101                                      struct ocfs2_suballoc_result *res);
 102static int ocfs2_block_group_search(struct inode *inode,
 103                                    struct buffer_head *group_bh,
 104                                    u32 bits_wanted, u32 min_bits,
 105                                    u64 max_block,
 106                                    struct ocfs2_suballoc_result *res);
 107static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 108                                     handle_t *handle,
 109                                     u32 bits_wanted,
 110                                     u32 min_bits,
 111                                     struct ocfs2_suballoc_result *res);
 112static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
 113                                         int nr);
 114static int ocfs2_relink_block_group(handle_t *handle,
 115                                    struct inode *alloc_inode,
 116                                    struct buffer_head *fe_bh,
 117                                    struct buffer_head *bg_bh,
 118                                    struct buffer_head *prev_bg_bh,
 119                                    u16 chain);
 120static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
 121                                                     u32 wanted);
 122static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 123                                                   u64 bg_blkno,
 124                                                   u16 bg_bit_off);
 125static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 126                                                u64 data_blkno,
 127                                                u64 *bg_blkno,
 128                                                u16 *bg_bit_off);
 129static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 130                                             u32 bits_wanted, u64 max_block,
 131                                             int flags,
 132                                             struct ocfs2_alloc_context **ac);
 133
 134void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 135{
 136        struct inode *inode = ac->ac_inode;
 137
 138        if (inode) {
 139                if (ac->ac_which != OCFS2_AC_USE_LOCAL)
 140                        ocfs2_inode_unlock(inode, 1);
 141
 142                inode_unlock(inode);
 143
 144                iput(inode);
 145                ac->ac_inode = NULL;
 146        }
 147        brelse(ac->ac_bh);
 148        ac->ac_bh = NULL;
 149        ac->ac_resv = NULL;
 150        kfree(ac->ac_find_loc_priv);
 151        ac->ac_find_loc_priv = NULL;
 152}
 153
 154void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
 155{
 156        ocfs2_free_ac_resource(ac);
 157        kfree(ac);
 158}
 159
 160static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 161{
 162        return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 163}
 164
 165#define do_error(fmt, ...)                                              \
 166do {                                                                    \
 167        if (resize)                                                     \
 168                mlog(ML_ERROR, fmt, ##__VA_ARGS__);                     \
 169        else                                                            \
 170                return ocfs2_error(sb, fmt, ##__VA_ARGS__);             \
 171} while (0)
 172
 173static int ocfs2_validate_gd_self(struct super_block *sb,
 174                                  struct buffer_head *bh,
 175                                  int resize)
 176{
 177        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 178
 179        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
 180                do_error("Group descriptor #%llu has bad signature %.*s\n",
 181                         (unsigned long long)bh->b_blocknr, 7,
 182                         gd->bg_signature);
 183        }
 184
 185        if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
 186                do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
 187                         (unsigned long long)bh->b_blocknr,
 188                         (unsigned long long)le64_to_cpu(gd->bg_blkno));
 189        }
 190
 191        if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
 192                do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
 193                         (unsigned long long)bh->b_blocknr,
 194                         le32_to_cpu(gd->bg_generation));
 195        }
 196
 197        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
 198                do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
 199                         (unsigned long long)bh->b_blocknr,
 200                         le16_to_cpu(gd->bg_bits),
 201                         le16_to_cpu(gd->bg_free_bits_count));
 202        }
 203
 204        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
 205                do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
 206                         (unsigned long long)bh->b_blocknr,
 207                         le16_to_cpu(gd->bg_bits),
 208                         8 * le16_to_cpu(gd->bg_size));
 209        }
 210
 211        return 0;
 212}
 213
 214static int ocfs2_validate_gd_parent(struct super_block *sb,
 215                                    struct ocfs2_dinode *di,
 216                                    struct buffer_head *bh,
 217                                    int resize)
 218{
 219        unsigned int max_bits;
 220        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 221
 222        if (di->i_blkno != gd->bg_parent_dinode) {
 223                do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
 224                         (unsigned long long)bh->b_blocknr,
 225                         (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
 226                         (unsigned long long)le64_to_cpu(di->i_blkno));
 227        }
 228
 229        max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 230        if (le16_to_cpu(gd->bg_bits) > max_bits) {
 231                do_error("Group descriptor #%llu has bit count of %u\n",
 232                         (unsigned long long)bh->b_blocknr,
 233                         le16_to_cpu(gd->bg_bits));
 234        }
 235
 236        /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
 237        if ((le16_to_cpu(gd->bg_chain) >
 238             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
 239            ((le16_to_cpu(gd->bg_chain) ==
 240             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
 241                do_error("Group descriptor #%llu has bad chain %u\n",
 242                         (unsigned long long)bh->b_blocknr,
 243                         le16_to_cpu(gd->bg_chain));
 244        }
 245
 246        return 0;
 247}
 248
 249#undef do_error
 250
 251/*
 252 * This version only prints errors.  It does not fail the filesystem, and
 253 * exists only for resize.
 254 */
 255int ocfs2_check_group_descriptor(struct super_block *sb,
 256                                 struct ocfs2_dinode *di,
 257                                 struct buffer_head *bh)
 258{
 259        int rc;
 260        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 261
 262        BUG_ON(!buffer_uptodate(bh));
 263
 264        /*
 265         * If the ecc fails, we return the error but otherwise
 266         * leave the filesystem running.  We know any error is
 267         * local to this block.
 268         */
 269        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 270        if (rc) {
 271                mlog(ML_ERROR,
 272                     "Checksum failed for group descriptor %llu\n",
 273                     (unsigned long long)bh->b_blocknr);
 274        } else
 275                rc = ocfs2_validate_gd_self(sb, bh, 1);
 276        if (!rc)
 277                rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
 278
 279        return rc;
 280}
 281
 282static int ocfs2_validate_group_descriptor(struct super_block *sb,
 283                                           struct buffer_head *bh)
 284{
 285        int rc;
 286        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 287
 288        trace_ocfs2_validate_group_descriptor(
 289                                        (unsigned long long)bh->b_blocknr);
 290
 291        BUG_ON(!buffer_uptodate(bh));
 292
 293        /*
 294         * If the ecc fails, we return the error but otherwise
 295         * leave the filesystem running.  We know any error is
 296         * local to this block.
 297         */
 298        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 299        if (rc)
 300                return rc;
 301
 302        /*
 303         * Errors after here are fatal.
 304         */
 305
 306        return ocfs2_validate_gd_self(sb, bh, 0);
 307}
 308
 309int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
 310                                u64 gd_blkno, struct buffer_head **bh)
 311{
 312        int rc;
 313        struct buffer_head *tmp = *bh;
 314
 315        rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
 316                              ocfs2_validate_group_descriptor);
 317        if (rc)
 318                goto out;
 319
 320        rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
 321        if (rc) {
 322                brelse(tmp);
 323                goto out;
 324        }
 325
 326        /* If ocfs2_read_block() got us a new bh, pass it up. */
 327        if (!*bh)
 328                *bh = tmp;
 329
 330out:
 331        return rc;
 332}
 333
 334static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
 335                                          struct ocfs2_group_desc *bg,
 336                                          struct ocfs2_chain_list *cl,
 337                                          u64 p_blkno, unsigned int clusters)
 338{
 339        struct ocfs2_extent_list *el = &bg->bg_list;
 340        struct ocfs2_extent_rec *rec;
 341
 342        BUG_ON(!ocfs2_supports_discontig_bg(osb));
 343        if (!el->l_next_free_rec)
 344                el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
 345        rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
 346        rec->e_blkno = cpu_to_le64(p_blkno);
 347        rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
 348                                  le16_to_cpu(cl->cl_bpc));
 349        rec->e_leaf_clusters = cpu_to_le16(clusters);
 350        le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
 351        le16_add_cpu(&bg->bg_free_bits_count,
 352                     clusters * le16_to_cpu(cl->cl_bpc));
 353        le16_add_cpu(&el->l_next_free_rec, 1);
 354}
 355
 356static int ocfs2_block_group_fill(handle_t *handle,
 357                                  struct inode *alloc_inode,
 358                                  struct buffer_head *bg_bh,
 359                                  u64 group_blkno,
 360                                  unsigned int group_clusters,
 361                                  u16 my_chain,
 362                                  struct ocfs2_chain_list *cl)
 363{
 364        int status = 0;
 365        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 366        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 367        struct super_block * sb = alloc_inode->i_sb;
 368
 369        if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
 370                status = ocfs2_error(alloc_inode->i_sb,
 371                                     "group block (%llu) != b_blocknr (%llu)\n",
 372                                     (unsigned long long)group_blkno,
 373                                     (unsigned long long) bg_bh->b_blocknr);
 374                goto bail;
 375        }
 376
 377        status = ocfs2_journal_access_gd(handle,
 378                                         INODE_CACHE(alloc_inode),
 379                                         bg_bh,
 380                                         OCFS2_JOURNAL_ACCESS_CREATE);
 381        if (status < 0) {
 382                mlog_errno(status);
 383                goto bail;
 384        }
 385
 386        memset(bg, 0, sb->s_blocksize);
 387        strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
 388        bg->bg_generation = cpu_to_le32(osb->fs_generation);
 389        bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
 390                                                osb->s_feature_incompat));
 391        bg->bg_chain = cpu_to_le16(my_chain);
 392        bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
 393        bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
 394        bg->bg_blkno = cpu_to_le64(group_blkno);
 395        if (group_clusters == le16_to_cpu(cl->cl_cpg))
 396                bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
 397        else
 398                ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
 399                                              group_clusters);
 400
 401        /* set the 1st bit in the bitmap to account for the descriptor block */
 402        ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
 403        bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
 404
 405        ocfs2_journal_dirty(handle, bg_bh);
 406
 407        /* There is no need to zero out or otherwise initialize the
 408         * other blocks in a group - All valid FS metadata in a block
 409         * group stores the superblock fs_generation value at
 410         * allocation time. */
 411
 412bail:
 413        if (status)
 414                mlog_errno(status);
 415        return status;
 416}
 417
 418static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 419{
 420        u16 curr, best;
 421
 422        best = curr = 0;
 423        while (curr < le16_to_cpu(cl->cl_count)) {
 424                if (le32_to_cpu(cl->cl_recs[best].c_total) >
 425                    le32_to_cpu(cl->cl_recs[curr].c_total))
 426                        best = curr;
 427                curr++;
 428        }
 429        return best;
 430}
 431
 432static struct buffer_head *
 433ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
 434                               struct inode *alloc_inode,
 435                               struct ocfs2_alloc_context *ac,
 436                               struct ocfs2_chain_list *cl)
 437{
 438        int status;
 439        u32 bit_off, num_bits;
 440        u64 bg_blkno;
 441        struct buffer_head *bg_bh;
 442        unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
 443
 444        status = ocfs2_claim_clusters(handle, ac,
 445                                      le16_to_cpu(cl->cl_cpg), &bit_off,
 446                                      &num_bits);
 447        if (status < 0) {
 448                if (status != -ENOSPC)
 449                        mlog_errno(status);
 450                goto bail;
 451        }
 452
 453        /* setup the group */
 454        bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 455        trace_ocfs2_block_group_alloc_contig(
 456             (unsigned long long)bg_blkno, alloc_rec);
 457
 458        bg_bh = sb_getblk(osb->sb, bg_blkno);
 459        if (!bg_bh) {
 460                status = -ENOMEM;
 461                mlog_errno(status);
 462                goto bail;
 463        }
 464        ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 465
 466        status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
 467                                        bg_blkno, num_bits, alloc_rec, cl);
 468        if (status < 0) {
 469                brelse(bg_bh);
 470                mlog_errno(status);
 471        }
 472
 473bail:
 474        return status ? ERR_PTR(status) : bg_bh;
 475}
 476
 477static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
 478                                        handle_t *handle,
 479                                        struct ocfs2_alloc_context *ac,
 480                                        unsigned int min_bits,
 481                                        u32 *bit_off, u32 *num_bits)
 482{
 483        int status = 0;
 484
 485        while (min_bits) {
 486                status = ocfs2_claim_clusters(handle, ac, min_bits,
 487                                              bit_off, num_bits);
 488                if (status != -ENOSPC)
 489                        break;
 490
 491                min_bits >>= 1;
 492        }
 493
 494        return status;
 495}
 496
 497static int ocfs2_block_group_grow_discontig(handle_t *handle,
 498                                            struct inode *alloc_inode,
 499                                            struct buffer_head *bg_bh,
 500                                            struct ocfs2_alloc_context *ac,
 501                                            struct ocfs2_chain_list *cl,
 502                                            unsigned int min_bits)
 503{
 504        int status;
 505        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 506        struct ocfs2_group_desc *bg =
 507                (struct ocfs2_group_desc *)bg_bh->b_data;
 508        unsigned int needed = le16_to_cpu(cl->cl_cpg) -
 509                         le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
 510        u32 p_cpos, clusters;
 511        u64 p_blkno;
 512        struct ocfs2_extent_list *el = &bg->bg_list;
 513
 514        status = ocfs2_journal_access_gd(handle,
 515                                         INODE_CACHE(alloc_inode),
 516                                         bg_bh,
 517                                         OCFS2_JOURNAL_ACCESS_CREATE);
 518        if (status < 0) {
 519                mlog_errno(status);
 520                goto bail;
 521        }
 522
 523        while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
 524                                le16_to_cpu(el->l_count))) {
 525                if (min_bits > needed)
 526                        min_bits = needed;
 527                status = ocfs2_block_group_claim_bits(osb, handle, ac,
 528                                                      min_bits, &p_cpos,
 529                                                      &clusters);
 530                if (status < 0) {
 531                        if (status != -ENOSPC)
 532                                mlog_errno(status);
 533                        goto bail;
 534                }
 535                p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
 536                ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
 537                                              clusters);
 538
 539                min_bits = clusters;
 540                needed = le16_to_cpu(cl->cl_cpg) -
 541                         le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
 542        }
 543
 544        if (needed > 0) {
 545                /*
 546                 * We have used up all the extent rec but can't fill up
 547                 * the cpg. So bail out.
 548                 */
 549                status = -ENOSPC;
 550                goto bail;
 551        }
 552
 553        ocfs2_journal_dirty(handle, bg_bh);
 554
 555bail:
 556        return status;
 557}
 558
 559static void ocfs2_bg_alloc_cleanup(handle_t *handle,
 560                                   struct ocfs2_alloc_context *cluster_ac,
 561                                   struct inode *alloc_inode,
 562                                   struct buffer_head *bg_bh)
 563{
 564        int i, ret;
 565        struct ocfs2_group_desc *bg;
 566        struct ocfs2_extent_list *el;
 567        struct ocfs2_extent_rec *rec;
 568
 569        if (!bg_bh)
 570                return;
 571
 572        bg = (struct ocfs2_group_desc *)bg_bh->b_data;
 573        el = &bg->bg_list;
 574        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 575                rec = &el->l_recs[i];
 576                ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
 577                                          cluster_ac->ac_bh,
 578                                          le64_to_cpu(rec->e_blkno),
 579                                          le16_to_cpu(rec->e_leaf_clusters));
 580                if (ret)
 581                        mlog_errno(ret);
 582                /* Try all the clusters to free */
 583        }
 584
 585        ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
 586        brelse(bg_bh);
 587}
 588
 589static struct buffer_head *
 590ocfs2_block_group_alloc_discontig(handle_t *handle,
 591                                  struct inode *alloc_inode,
 592                                  struct ocfs2_alloc_context *ac,
 593                                  struct ocfs2_chain_list *cl)
 594{
 595        int status;
 596        u32 bit_off, num_bits;
 597        u64 bg_blkno;
 598        unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
 599        struct buffer_head *bg_bh = NULL;
 600        unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
 601        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 602
 603        if (!ocfs2_supports_discontig_bg(osb)) {
 604                status = -ENOSPC;
 605                goto bail;
 606        }
 607
 608        status = ocfs2_extend_trans(handle,
 609                                    ocfs2_calc_bg_discontig_credits(osb->sb));
 610        if (status) {
 611                mlog_errno(status);
 612                goto bail;
 613        }
 614
 615        /*
 616         * We're going to be grabbing from multiple cluster groups.
 617         * We don't have enough credits to relink them all, and the
 618         * cluster groups will be staying in cache for the duration of
 619         * this operation.
 620         */
 621        ac->ac_disable_chain_relink = 1;
 622
 623        /* Claim the first region */
 624        status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
 625                                              &bit_off, &num_bits);
 626        if (status < 0) {
 627                if (status != -ENOSPC)
 628                        mlog_errno(status);
 629                goto bail;
 630        }
 631        min_bits = num_bits;
 632
 633        /* setup the group */
 634        bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 635        trace_ocfs2_block_group_alloc_discontig(
 636                                (unsigned long long)bg_blkno, alloc_rec);
 637
 638        bg_bh = sb_getblk(osb->sb, bg_blkno);
 639        if (!bg_bh) {
 640                status = -ENOMEM;
 641                mlog_errno(status);
 642                goto bail;
 643        }
 644        ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 645
 646        status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
 647                                        bg_blkno, num_bits, alloc_rec, cl);
 648        if (status < 0) {
 649                mlog_errno(status);
 650                goto bail;
 651        }
 652
 653        status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
 654                                                  bg_bh, ac, cl, min_bits);
 655        if (status)
 656                mlog_errno(status);
 657
 658bail:
 659        if (status)
 660                ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
 661        return status ? ERR_PTR(status) : bg_bh;
 662}
 663
 664/*
 665 * We expect the block group allocator to already be locked.
 666 */
 667static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 668                                   struct inode *alloc_inode,
 669                                   struct buffer_head *bh,
 670                                   u64 max_block,
 671                                   u64 *last_alloc_group,
 672                                   int flags)
 673{
 674        int status, credits;
 675        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
 676        struct ocfs2_chain_list *cl;
 677        struct ocfs2_alloc_context *ac = NULL;
 678        handle_t *handle = NULL;
 679        u16 alloc_rec;
 680        struct buffer_head *bg_bh = NULL;
 681        struct ocfs2_group_desc *bg;
 682
 683        BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
 684
 685        cl = &fe->id2.i_chain;
 686        status = ocfs2_reserve_clusters_with_limit(osb,
 687                                                   le16_to_cpu(cl->cl_cpg),
 688                                                   max_block, flags, &ac);
 689        if (status < 0) {
 690                if (status != -ENOSPC)
 691                        mlog_errno(status);
 692                goto bail;
 693        }
 694
 695        credits = ocfs2_calc_group_alloc_credits(osb->sb,
 696                                                 le16_to_cpu(cl->cl_cpg));
 697        handle = ocfs2_start_trans(osb, credits);
 698        if (IS_ERR(handle)) {
 699                status = PTR_ERR(handle);
 700                handle = NULL;
 701                mlog_errno(status);
 702                goto bail;
 703        }
 704
 705        if (last_alloc_group && *last_alloc_group != 0) {
 706                trace_ocfs2_block_group_alloc(
 707                                (unsigned long long)*last_alloc_group);
 708                ac->ac_last_group = *last_alloc_group;
 709        }
 710
 711        bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
 712                                               ac, cl);
 713        if (IS_ERR(bg_bh) && (PTR_ERR(bg_bh) == -ENOSPC))
 714                bg_bh = ocfs2_block_group_alloc_discontig(handle,
 715                                                          alloc_inode,
 716                                                          ac, cl);
 717        if (IS_ERR(bg_bh)) {
 718                status = PTR_ERR(bg_bh);
 719                bg_bh = NULL;
 720                if (status != -ENOSPC)
 721                        mlog_errno(status);
 722                goto bail;
 723        }
 724        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 725
 726        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 727                                         bh, OCFS2_JOURNAL_ACCESS_WRITE);
 728        if (status < 0) {
 729                mlog_errno(status);
 730                goto bail;
 731        }
 732
 733        alloc_rec = le16_to_cpu(bg->bg_chain);
 734        le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
 735                     le16_to_cpu(bg->bg_free_bits_count));
 736        le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
 737                     le16_to_cpu(bg->bg_bits));
 738        cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
 739        if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
 740                le16_add_cpu(&cl->cl_next_free_rec, 1);
 741
 742        le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
 743                                        le16_to_cpu(bg->bg_free_bits_count));
 744        le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
 745        le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
 746
 747        ocfs2_journal_dirty(handle, bh);
 748
 749        spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
 750        OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 751        fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
 752                                             le32_to_cpu(fe->i_clusters)));
 753        spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
 754        i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
 755        alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
 756        ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
 757
 758        status = 0;
 759
 760        /* save the new last alloc group so that the caller can cache it. */
 761        if (last_alloc_group)
 762                *last_alloc_group = ac->ac_last_group;
 763
 764bail:
 765        if (handle)
 766                ocfs2_commit_trans(osb, handle);
 767
 768        if (ac)
 769                ocfs2_free_alloc_context(ac);
 770
 771        brelse(bg_bh);
 772
 773        if (status)
 774                mlog_errno(status);
 775        return status;
 776}
 777
 778static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 779                                       struct ocfs2_alloc_context *ac,
 780                                       int type,
 781                                       u32 slot,
 782                                       u64 *last_alloc_group,
 783                                       int flags)
 784{
 785        int status;
 786        u32 bits_wanted = ac->ac_bits_wanted;
 787        struct inode *alloc_inode;
 788        struct buffer_head *bh = NULL;
 789        struct ocfs2_dinode *fe;
 790        u32 free_bits;
 791
 792        alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
 793        if (!alloc_inode) {
 794                mlog_errno(-EINVAL);
 795                return -EINVAL;
 796        }
 797
 798        inode_lock(alloc_inode);
 799
 800        status = ocfs2_inode_lock(alloc_inode, &bh, 1);
 801        if (status < 0) {
 802                inode_unlock(alloc_inode);
 803                iput(alloc_inode);
 804
 805                mlog_errno(status);
 806                return status;
 807        }
 808
 809        ac->ac_inode = alloc_inode;
 810        ac->ac_alloc_slot = slot;
 811
 812        fe = (struct ocfs2_dinode *) bh->b_data;
 813
 814        /* The bh was validated by the inode read inside
 815         * ocfs2_inode_lock().  Any corruption is a code bug. */
 816        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 817
 818        if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 819                status = ocfs2_error(alloc_inode->i_sb,
 820                                     "Invalid chain allocator %llu\n",
 821                                     (unsigned long long)le64_to_cpu(fe->i_blkno));
 822                goto bail;
 823        }
 824
 825        free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
 826                le32_to_cpu(fe->id1.bitmap1.i_used);
 827
 828        if (bits_wanted > free_bits) {
 829                /* cluster bitmap never grows */
 830                if (ocfs2_is_cluster_bitmap(alloc_inode)) {
 831                        trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
 832                                                                free_bits);
 833                        status = -ENOSPC;
 834                        goto bail;
 835                }
 836
 837                if (!(flags & ALLOC_NEW_GROUP)) {
 838                        trace_ocfs2_reserve_suballoc_bits_no_new_group(
 839                                                slot, bits_wanted, free_bits);
 840                        status = -ENOSPC;
 841                        goto bail;
 842                }
 843
 844                status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
 845                                                 ac->ac_max_block,
 846                                                 last_alloc_group, flags);
 847                if (status < 0) {
 848                        if (status != -ENOSPC)
 849                                mlog_errno(status);
 850                        goto bail;
 851                }
 852                atomic_inc(&osb->alloc_stats.bg_extends);
 853
 854                /* You should never ask for this much metadata */
 855                BUG_ON(bits_wanted >
 856                       (le32_to_cpu(fe->id1.bitmap1.i_total)
 857                        - le32_to_cpu(fe->id1.bitmap1.i_used)));
 858        }
 859
 860        get_bh(bh);
 861        ac->ac_bh = bh;
 862bail:
 863        brelse(bh);
 864
 865        if (status)
 866                mlog_errno(status);
 867        return status;
 868}
 869
 870static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
 871{
 872        spin_lock(&osb->osb_lock);
 873        osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
 874        spin_unlock(&osb->osb_lock);
 875        atomic_set(&osb->s_num_inodes_stolen, 0);
 876}
 877
 878static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
 879{
 880        spin_lock(&osb->osb_lock);
 881        osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
 882        spin_unlock(&osb->osb_lock);
 883        atomic_set(&osb->s_num_meta_stolen, 0);
 884}
 885
 886void ocfs2_init_steal_slots(struct ocfs2_super *osb)
 887{
 888        ocfs2_init_inode_steal_slot(osb);
 889        ocfs2_init_meta_steal_slot(osb);
 890}
 891
 892static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
 893{
 894        spin_lock(&osb->osb_lock);
 895        if (type == INODE_ALLOC_SYSTEM_INODE)
 896                osb->s_inode_steal_slot = slot;
 897        else if (type == EXTENT_ALLOC_SYSTEM_INODE)
 898                osb->s_meta_steal_slot = slot;
 899        spin_unlock(&osb->osb_lock);
 900}
 901
 902static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
 903{
 904        int slot = OCFS2_INVALID_SLOT;
 905
 906        spin_lock(&osb->osb_lock);
 907        if (type == INODE_ALLOC_SYSTEM_INODE)
 908                slot = osb->s_inode_steal_slot;
 909        else if (type == EXTENT_ALLOC_SYSTEM_INODE)
 910                slot = osb->s_meta_steal_slot;
 911        spin_unlock(&osb->osb_lock);
 912
 913        return slot;
 914}
 915
 916static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 917{
 918        return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
 919}
 920
 921static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
 922{
 923        return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
 924}
 925
 926static int ocfs2_steal_resource(struct ocfs2_super *osb,
 927                                struct ocfs2_alloc_context *ac,
 928                                int type)
 929{
 930        int i, status = -ENOSPC;
 931        int slot = __ocfs2_get_steal_slot(osb, type);
 932
 933        /* Start to steal resource from the first slot after ours. */
 934        if (slot == OCFS2_INVALID_SLOT)
 935                slot = osb->slot_num + 1;
 936
 937        for (i = 0; i < osb->max_slots; i++, slot++) {
 938                if (slot == osb->max_slots)
 939                        slot = 0;
 940
 941                if (slot == osb->slot_num)
 942                        continue;
 943
 944                status = ocfs2_reserve_suballoc_bits(osb, ac,
 945                                                     type,
 946                                                     (u32)slot, NULL,
 947                                                     NOT_ALLOC_NEW_GROUP);
 948                if (status >= 0) {
 949                        __ocfs2_set_steal_slot(osb, slot, type);
 950                        break;
 951                }
 952
 953                ocfs2_free_ac_resource(ac);
 954        }
 955
 956        return status;
 957}
 958
 959static int ocfs2_steal_inode(struct ocfs2_super *osb,
 960                             struct ocfs2_alloc_context *ac)
 961{
 962        return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
 963}
 964
 965static int ocfs2_steal_meta(struct ocfs2_super *osb,
 966                            struct ocfs2_alloc_context *ac)
 967{
 968        return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
 969}
 970
 971int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 972                                      int blocks,
 973                                      struct ocfs2_alloc_context **ac)
 974{
 975        int status;
 976        int slot = ocfs2_get_meta_steal_slot(osb);
 977
 978        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 979        if (!(*ac)) {
 980                status = -ENOMEM;
 981                mlog_errno(status);
 982                goto bail;
 983        }
 984
 985        (*ac)->ac_bits_wanted = blocks;
 986        (*ac)->ac_which = OCFS2_AC_USE_META;
 987        (*ac)->ac_group_search = ocfs2_block_group_search;
 988
 989        if (slot != OCFS2_INVALID_SLOT &&
 990                atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
 991                goto extent_steal;
 992
 993        atomic_set(&osb->s_num_meta_stolen, 0);
 994        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
 995                                             EXTENT_ALLOC_SYSTEM_INODE,
 996                                             (u32)osb->slot_num, NULL,
 997                                             ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
 998
 999
1000        if (status >= 0) {
1001                status = 0;
1002                if (slot != OCFS2_INVALID_SLOT)
1003                        ocfs2_init_meta_steal_slot(osb);
1004                goto bail;
1005        } else if (status < 0 && status != -ENOSPC) {
1006                mlog_errno(status);
1007                goto bail;
1008        }
1009
1010        ocfs2_free_ac_resource(*ac);
1011
1012extent_steal:
1013        status = ocfs2_steal_meta(osb, *ac);
1014        atomic_inc(&osb->s_num_meta_stolen);
1015        if (status < 0) {
1016                if (status != -ENOSPC)
1017                        mlog_errno(status);
1018                goto bail;
1019        }
1020
1021        status = 0;
1022bail:
1023        if ((status < 0) && *ac) {
1024                ocfs2_free_alloc_context(*ac);
1025                *ac = NULL;
1026        }
1027
1028        if (status)
1029                mlog_errno(status);
1030        return status;
1031}
1032
1033int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
1034                               struct ocfs2_extent_list *root_el,
1035                               struct ocfs2_alloc_context **ac)
1036{
1037        return ocfs2_reserve_new_metadata_blocks(osb,
1038                                        ocfs2_extend_meta_needed(root_el),
1039                                        ac);
1040}
1041
1042int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1043                            struct ocfs2_alloc_context **ac)
1044{
1045        int status;
1046        int slot = ocfs2_get_inode_steal_slot(osb);
1047        u64 alloc_group;
1048
1049        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1050        if (!(*ac)) {
1051                status = -ENOMEM;
1052                mlog_errno(status);
1053                goto bail;
1054        }
1055
1056        (*ac)->ac_bits_wanted = 1;
1057        (*ac)->ac_which = OCFS2_AC_USE_INODE;
1058
1059        (*ac)->ac_group_search = ocfs2_block_group_search;
1060
1061        /*
1062         * stat(2) can't handle i_ino > 32bits, so we tell the
1063         * lower levels not to allocate us a block group past that
1064         * limit.  The 'inode64' mount option avoids this behavior.
1065         */
1066        if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1067                (*ac)->ac_max_block = (u32)~0U;
1068
1069        /*
1070         * slot is set when we successfully steal inode from other nodes.
1071         * It is reset in 3 places:
1072         * 1. when we flush the truncate log
1073         * 2. when we complete local alloc recovery.
1074         * 3. when we successfully allocate from our own slot.
1075         * After it is set, we will go on stealing inodes until we find the
1076         * need to check our slots to see whether there is some space for us.
1077         */
1078        if (slot != OCFS2_INVALID_SLOT &&
1079            atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1080                goto inode_steal;
1081
1082        atomic_set(&osb->s_num_inodes_stolen, 0);
1083        alloc_group = osb->osb_inode_alloc_group;
1084        status = ocfs2_reserve_suballoc_bits(osb, *ac,
1085                                             INODE_ALLOC_SYSTEM_INODE,
1086                                             (u32)osb->slot_num,
1087                                             &alloc_group,
1088                                             ALLOC_NEW_GROUP |
1089                                             ALLOC_GROUPS_FROM_GLOBAL);
1090        if (status >= 0) {
1091                status = 0;
1092
1093                spin_lock(&osb->osb_lock);
1094                osb->osb_inode_alloc_group = alloc_group;
1095                spin_unlock(&osb->osb_lock);
1096                trace_ocfs2_reserve_new_inode_new_group(
1097                        (unsigned long long)alloc_group);
1098
1099                /*
1100                 * Some inodes must be freed by us, so try to allocate
1101                 * from our own next time.
1102                 */
1103                if (slot != OCFS2_INVALID_SLOT)
1104                        ocfs2_init_inode_steal_slot(osb);
1105                goto bail;
1106        } else if (status < 0 && status != -ENOSPC) {
1107                mlog_errno(status);
1108                goto bail;
1109        }
1110
1111        ocfs2_free_ac_resource(*ac);
1112
1113inode_steal:
1114        status = ocfs2_steal_inode(osb, *ac);
1115        atomic_inc(&osb->s_num_inodes_stolen);
1116        if (status < 0) {
1117                if (status != -ENOSPC)
1118                        mlog_errno(status);
1119                goto bail;
1120        }
1121
1122        status = 0;
1123bail:
1124        if ((status < 0) && *ac) {
1125                ocfs2_free_alloc_context(*ac);
1126                *ac = NULL;
1127        }
1128
1129        if (status)
1130                mlog_errno(status);
1131        return status;
1132}
1133
1134/* local alloc code has to do the same thing, so rather than do this
1135 * twice.. */
1136int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1137                                      struct ocfs2_alloc_context *ac)
1138{
1139        int status;
1140
1141        ac->ac_which = OCFS2_AC_USE_MAIN;
1142        ac->ac_group_search = ocfs2_cluster_group_search;
1143
1144        status = ocfs2_reserve_suballoc_bits(osb, ac,
1145                                             GLOBAL_BITMAP_SYSTEM_INODE,
1146                                             OCFS2_INVALID_SLOT, NULL,
1147                                             ALLOC_NEW_GROUP);
1148        if (status < 0 && status != -ENOSPC)
1149                mlog_errno(status);
1150
1151        return status;
1152}
1153
1154/* Callers don't need to care which bitmap (local alloc or main) to
1155 * use so we figure it out for them, but unfortunately this clutters
1156 * things a bit. */
1157static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1158                                             u32 bits_wanted, u64 max_block,
1159                                             int flags,
1160                                             struct ocfs2_alloc_context **ac)
1161{
1162        int status, ret = 0;
1163        int retried = 0;
1164
1165        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1166        if (!(*ac)) {
1167                status = -ENOMEM;
1168                mlog_errno(status);
1169                goto bail;
1170        }
1171
1172        (*ac)->ac_bits_wanted = bits_wanted;
1173        (*ac)->ac_max_block = max_block;
1174
1175        status = -ENOSPC;
1176        if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1177            ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1178                status = ocfs2_reserve_local_alloc_bits(osb,
1179                                                        bits_wanted,
1180                                                        *ac);
1181                if ((status < 0) && (status != -ENOSPC)) {
1182                        mlog_errno(status);
1183                        goto bail;
1184                }
1185        }
1186
1187        if (status == -ENOSPC) {
1188retry:
1189                status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1190                /* Retry if there is sufficient space cached in truncate log */
1191                if (status == -ENOSPC && !retried) {
1192                        retried = 1;
1193                        ocfs2_inode_unlock((*ac)->ac_inode, 1);
1194                        inode_unlock((*ac)->ac_inode);
1195
1196                        ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
1197                        if (ret == 1) {
1198                                iput((*ac)->ac_inode);
1199                                (*ac)->ac_inode = NULL;
1200                                goto retry;
1201                        }
1202
1203                        if (ret < 0)
1204                                mlog_errno(ret);
1205
1206                        inode_lock((*ac)->ac_inode);
1207                        ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
1208                        if (ret < 0) {
1209                                mlog_errno(ret);
1210                                inode_unlock((*ac)->ac_inode);
1211                                iput((*ac)->ac_inode);
1212                                (*ac)->ac_inode = NULL;
1213                                goto bail;
1214                        }
1215                }
1216                if (status < 0) {
1217                        if (status != -ENOSPC)
1218                                mlog_errno(status);
1219                        goto bail;
1220                }
1221        }
1222
1223        status = 0;
1224bail:
1225        if ((status < 0) && *ac) {
1226                ocfs2_free_alloc_context(*ac);
1227                *ac = NULL;
1228        }
1229
1230        if (status)
1231                mlog_errno(status);
1232        return status;
1233}
1234
1235int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1236                           u32 bits_wanted,
1237                           struct ocfs2_alloc_context **ac)
1238{
1239        return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1240                                                 ALLOC_NEW_GROUP, ac);
1241}
1242
1243/*
1244 * More or less lifted from ext3. I'll leave their description below:
1245 *
1246 * "For ext3 allocations, we must not reuse any blocks which are
1247 * allocated in the bitmap buffer's "last committed data" copy.  This
1248 * prevents deletes from freeing up the page for reuse until we have
1249 * committed the delete transaction.
1250 *
1251 * If we didn't do this, then deleting something and reallocating it as
1252 * data would allow the old block to be overwritten before the
1253 * transaction committed (because we force data to disk before commit).
1254 * This would lead to corruption if we crashed between overwriting the
1255 * data and committing the delete.
1256 *
1257 * @@@ We may want to make this allocation behaviour conditional on
1258 * data-writes at some point, and disable it for metadata allocations or
1259 * sync-data inodes."
1260 *
1261 * Note: OCFS2 already does this differently for metadata vs data
1262 * allocations, as those bitmaps are separate and undo access is never
1263 * called on a metadata group descriptor.
1264 */
1265static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1266                                         int nr)
1267{
1268        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1269        int ret;
1270
1271        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1272                return 0;
1273
1274        if (!buffer_jbd(bg_bh))
1275                return 1;
1276
1277        jbd_lock_bh_state(bg_bh);
1278        bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1279        if (bg)
1280                ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1281        else
1282                ret = 1;
1283        jbd_unlock_bh_state(bg_bh);
1284
1285        return ret;
1286}
1287
1288static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1289                                             struct buffer_head *bg_bh,
1290                                             unsigned int bits_wanted,
1291                                             unsigned int total_bits,
1292                                             struct ocfs2_suballoc_result *res)
1293{
1294        void *bitmap;
1295        u16 best_offset, best_size;
1296        int offset, start, found, status = 0;
1297        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1298
1299        /* Callers got this descriptor from
1300         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1301        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1302
1303        found = start = best_offset = best_size = 0;
1304        bitmap = bg->bg_bitmap;
1305
1306        while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1307                if (offset == total_bits)
1308                        break;
1309
1310                if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1311                        /* We found a zero, but we can't use it as it
1312                         * hasn't been put to disk yet! */
1313                        found = 0;
1314                        start = offset + 1;
1315                } else if (offset == start) {
1316                        /* we found a zero */
1317                        found++;
1318                        /* move start to the next bit to test */
1319                        start++;
1320                } else {
1321                        /* got a zero after some ones */
1322                        found = 1;
1323                        start = offset + 1;
1324                }
1325                if (found > best_size) {
1326                        best_size = found;
1327                        best_offset = start - found;
1328                }
1329                /* we got everything we needed */
1330                if (found == bits_wanted) {
1331                        /* mlog(0, "Found it all!\n"); */
1332                        break;
1333                }
1334        }
1335
1336        if (best_size) {
1337                res->sr_bit_offset = best_offset;
1338                res->sr_bits = best_size;
1339        } else {
1340                status = -ENOSPC;
1341                /* No error log here -- see the comment above
1342                 * ocfs2_test_bg_bit_allocatable */
1343        }
1344
1345        return status;
1346}
1347
1348int ocfs2_block_group_set_bits(handle_t *handle,
1349                                             struct inode *alloc_inode,
1350                                             struct ocfs2_group_desc *bg,
1351                                             struct buffer_head *group_bh,
1352                                             unsigned int bit_off,
1353                                             unsigned int num_bits)
1354{
1355        int status;
1356        void *bitmap = bg->bg_bitmap;
1357        int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1358
1359        /* All callers get the descriptor via
1360         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1361        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1362        BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1363
1364        trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1365
1366        if (ocfs2_is_cluster_bitmap(alloc_inode))
1367                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1368
1369        status = ocfs2_journal_access_gd(handle,
1370                                         INODE_CACHE(alloc_inode),
1371                                         group_bh,
1372                                         journal_type);
1373        if (status < 0) {
1374                mlog_errno(status);
1375                goto bail;
1376        }
1377
1378        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1379        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1380                return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
1381                                   (unsigned long long)le64_to_cpu(bg->bg_blkno),
1382                                   le16_to_cpu(bg->bg_bits),
1383                                   le16_to_cpu(bg->bg_free_bits_count),
1384                                   num_bits);
1385        }
1386        while(num_bits--)
1387                ocfs2_set_bit(bit_off++, bitmap);
1388
1389        ocfs2_journal_dirty(handle, group_bh);
1390
1391bail:
1392        return status;
1393}
1394
1395/* find the one with the most empty bits */
1396static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1397{
1398        u16 curr, best;
1399
1400        BUG_ON(!cl->cl_next_free_rec);
1401
1402        best = curr = 0;
1403        while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1404                if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1405                    le32_to_cpu(cl->cl_recs[best].c_free))
1406                        best = curr;
1407                curr++;
1408        }
1409
1410        BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1411        return best;
1412}
1413
1414static int ocfs2_relink_block_group(handle_t *handle,
1415                                    struct inode *alloc_inode,
1416                                    struct buffer_head *fe_bh,
1417                                    struct buffer_head *bg_bh,
1418                                    struct buffer_head *prev_bg_bh,
1419                                    u16 chain)
1420{
1421        int status;
1422        /* there is a really tiny chance the journal calls could fail,
1423         * but we wouldn't want inconsistent blocks in *any* case. */
1424        u64 bg_ptr, prev_bg_ptr;
1425        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1426        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1427        struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1428
1429        /* The caller got these descriptors from
1430         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1431        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1432        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1433
1434        trace_ocfs2_relink_block_group(
1435                (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1436                (unsigned long long)le64_to_cpu(bg->bg_blkno),
1437                (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1438
1439        bg_ptr = le64_to_cpu(bg->bg_next_group);
1440        prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1441
1442        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1443                                         prev_bg_bh,
1444                                         OCFS2_JOURNAL_ACCESS_WRITE);
1445        if (status < 0)
1446                goto out;
1447
1448        prev_bg->bg_next_group = bg->bg_next_group;
1449        ocfs2_journal_dirty(handle, prev_bg_bh);
1450
1451        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1452                                         bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1453        if (status < 0)
1454                goto out_rollback_prev_bg;
1455
1456        bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1457        ocfs2_journal_dirty(handle, bg_bh);
1458
1459        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1460                                         fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1461        if (status < 0)
1462                goto out_rollback_bg;
1463
1464        fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1465        ocfs2_journal_dirty(handle, fe_bh);
1466
1467out:
1468        if (status < 0)
1469                mlog_errno(status);
1470        return status;
1471
1472out_rollback_bg:
1473        bg->bg_next_group = cpu_to_le64(bg_ptr);
1474out_rollback_prev_bg:
1475        prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1476        goto out;
1477}
1478
1479static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1480                                                     u32 wanted)
1481{
1482        return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1483}
1484
1485/* return 0 on success, -ENOSPC to keep searching and any other < 0
1486 * value on error. */
1487static int ocfs2_cluster_group_search(struct inode *inode,
1488                                      struct buffer_head *group_bh,
1489                                      u32 bits_wanted, u32 min_bits,
1490                                      u64 max_block,
1491                                      struct ocfs2_suballoc_result *res)
1492{
1493        int search = -ENOSPC;
1494        int ret;
1495        u64 blkoff;
1496        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1497        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1498        unsigned int max_bits, gd_cluster_off;
1499
1500        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1501
1502        if (gd->bg_free_bits_count) {
1503                max_bits = le16_to_cpu(gd->bg_bits);
1504
1505                /* Tail groups in cluster bitmaps which aren't cpg
1506                 * aligned are prone to partial extension by a failed
1507                 * fs resize. If the file system resize never got to
1508                 * update the dinode cluster count, then we don't want
1509                 * to trust any clusters past it, regardless of what
1510                 * the group descriptor says. */
1511                gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1512                                                          le64_to_cpu(gd->bg_blkno));
1513                if ((gd_cluster_off + max_bits) >
1514                    OCFS2_I(inode)->ip_clusters) {
1515                        max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1516                        trace_ocfs2_cluster_group_search_wrong_max_bits(
1517                                (unsigned long long)le64_to_cpu(gd->bg_blkno),
1518                                le16_to_cpu(gd->bg_bits),
1519                                OCFS2_I(inode)->ip_clusters, max_bits);
1520                }
1521
1522                ret = ocfs2_block_group_find_clear_bits(osb,
1523                                                        group_bh, bits_wanted,
1524                                                        max_bits, res);
1525                if (ret)
1526                        return ret;
1527
1528                if (max_block) {
1529                        blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1530                                                          gd_cluster_off +
1531                                                          res->sr_bit_offset +
1532                                                          res->sr_bits);
1533                        trace_ocfs2_cluster_group_search_max_block(
1534                                (unsigned long long)blkoff,
1535                                (unsigned long long)max_block);
1536                        if (blkoff > max_block)
1537                                return -ENOSPC;
1538                }
1539
1540                /* ocfs2_block_group_find_clear_bits() might
1541                 * return success, but we still want to return
1542                 * -ENOSPC unless it found the minimum number
1543                 * of bits. */
1544                if (min_bits <= res->sr_bits)
1545                        search = 0; /* success */
1546                else if (res->sr_bits) {
1547                        /*
1548                         * Don't show bits which we'll be returning
1549                         * for allocation to the local alloc bitmap.
1550                         */
1551                        ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1552                }
1553        }
1554
1555        return search;
1556}
1557
1558static int ocfs2_block_group_search(struct inode *inode,
1559                                    struct buffer_head *group_bh,
1560                                    u32 bits_wanted, u32 min_bits,
1561                                    u64 max_block,
1562                                    struct ocfs2_suballoc_result *res)
1563{
1564        int ret = -ENOSPC;
1565        u64 blkoff;
1566        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1567
1568        BUG_ON(min_bits != 1);
1569        BUG_ON(ocfs2_is_cluster_bitmap(inode));
1570
1571        if (bg->bg_free_bits_count) {
1572                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1573                                                        group_bh, bits_wanted,
1574                                                        le16_to_cpu(bg->bg_bits),
1575                                                        res);
1576                if (!ret && max_block) {
1577                        blkoff = le64_to_cpu(bg->bg_blkno) +
1578                                res->sr_bit_offset + res->sr_bits;
1579                        trace_ocfs2_block_group_search_max_block(
1580                                (unsigned long long)blkoff,
1581                                (unsigned long long)max_block);
1582                        if (blkoff > max_block)
1583                                ret = -ENOSPC;
1584                }
1585        }
1586
1587        return ret;
1588}
1589
1590int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1591                                       handle_t *handle,
1592                                       struct buffer_head *di_bh,
1593                                       u32 num_bits,
1594                                       u16 chain)
1595{
1596        int ret;
1597        u32 tmp_used;
1598        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1599        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1600
1601        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1602                                      OCFS2_JOURNAL_ACCESS_WRITE);
1603        if (ret < 0) {
1604                mlog_errno(ret);
1605                goto out;
1606        }
1607
1608        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1609        di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1610        le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1611        ocfs2_journal_dirty(handle, di_bh);
1612
1613out:
1614        return ret;
1615}
1616
1617void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1618                                       struct buffer_head *di_bh,
1619                                       u32 num_bits,
1620                                       u16 chain)
1621{
1622        u32 tmp_used;
1623        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1624        struct ocfs2_chain_list *cl;
1625
1626        cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1627        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1628        di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1629        le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1630}
1631
1632static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1633                                         struct ocfs2_extent_rec *rec,
1634                                         struct ocfs2_chain_list *cl)
1635{
1636        unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1637        unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1638        unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1639
1640        if (res->sr_bit_offset < bitoff)
1641                return 0;
1642        if (res->sr_bit_offset >= (bitoff + bitcount))
1643                return 0;
1644        res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1645                (res->sr_bit_offset - bitoff);
1646        if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1647                res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1648        return 1;
1649}
1650
1651static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1652                                          struct ocfs2_group_desc *bg,
1653                                          struct ocfs2_suballoc_result *res)
1654{
1655        int i;
1656        u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1657        struct ocfs2_extent_rec *rec;
1658        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1659        struct ocfs2_chain_list *cl = &di->id2.i_chain;
1660
1661        if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1662                res->sr_blkno = 0;
1663                return;
1664        }
1665
1666        res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1667        res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1668        if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1669            !bg->bg_list.l_next_free_rec)
1670                return;
1671
1672        for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1673                rec = &bg->bg_list.l_recs[i];
1674                if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1675                        res->sr_bg_blkno = bg_blkno;  /* Restore */
1676                        break;
1677                }
1678        }
1679}
1680
1681static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1682                                  handle_t *handle,
1683                                  u32 bits_wanted,
1684                                  u32 min_bits,
1685                                  struct ocfs2_suballoc_result *res,
1686                                  u16 *bits_left)
1687{
1688        int ret;
1689        struct buffer_head *group_bh = NULL;
1690        struct ocfs2_group_desc *gd;
1691        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1692        struct inode *alloc_inode = ac->ac_inode;
1693
1694        ret = ocfs2_read_group_descriptor(alloc_inode, di,
1695                                          res->sr_bg_blkno, &group_bh);
1696        if (ret < 0) {
1697                mlog_errno(ret);
1698                return ret;
1699        }
1700
1701        gd = (struct ocfs2_group_desc *) group_bh->b_data;
1702        ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1703                                  ac->ac_max_block, res);
1704        if (ret < 0) {
1705                if (ret != -ENOSPC)
1706                        mlog_errno(ret);
1707                goto out;
1708        }
1709
1710        if (!ret)
1711                ocfs2_bg_discontig_fix_result(ac, gd, res);
1712
1713        /*
1714         * sr_bg_blkno might have been changed by
1715         * ocfs2_bg_discontig_fix_result
1716         */
1717        res->sr_bg_stable_blkno = group_bh->b_blocknr;
1718
1719        if (ac->ac_find_loc_only)
1720                goto out_loc_only;
1721
1722        ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1723                                               res->sr_bits,
1724                                               le16_to_cpu(gd->bg_chain));
1725        if (ret < 0) {
1726                mlog_errno(ret);
1727                goto out;
1728        }
1729
1730        ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1731                                         res->sr_bit_offset, res->sr_bits);
1732        if (ret < 0) {
1733                ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1734                                               res->sr_bits,
1735                                               le16_to_cpu(gd->bg_chain));
1736                mlog_errno(ret);
1737        }
1738
1739out_loc_only:
1740        *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1741
1742out:
1743        brelse(group_bh);
1744
1745        return ret;
1746}
1747
1748static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1749                              handle_t *handle,
1750                              u32 bits_wanted,
1751                              u32 min_bits,
1752                              struct ocfs2_suballoc_result *res,
1753                              u16 *bits_left)
1754{
1755        int status;
1756        u16 chain;
1757        u64 next_group;
1758        struct inode *alloc_inode = ac->ac_inode;
1759        struct buffer_head *group_bh = NULL;
1760        struct buffer_head *prev_group_bh = NULL;
1761        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1762        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1763        struct ocfs2_group_desc *bg;
1764
1765        chain = ac->ac_chain;
1766        trace_ocfs2_search_chain_begin(
1767                (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1768                bits_wanted, chain);
1769
1770        status = ocfs2_read_group_descriptor(alloc_inode, fe,
1771                                             le64_to_cpu(cl->cl_recs[chain].c_blkno),
1772                                             &group_bh);
1773        if (status < 0) {
1774                mlog_errno(status);
1775                goto bail;
1776        }
1777        bg = (struct ocfs2_group_desc *) group_bh->b_data;
1778
1779        status = -ENOSPC;
1780        /* for now, the chain search is a bit simplistic. We just use
1781         * the 1st group with any empty bits. */
1782        while ((status = ac->ac_group_search(alloc_inode, group_bh,
1783                                             bits_wanted, min_bits,
1784                                             ac->ac_max_block,
1785                                             res)) == -ENOSPC) {
1786                if (!bg->bg_next_group)
1787                        break;
1788
1789                brelse(prev_group_bh);
1790                prev_group_bh = NULL;
1791
1792                next_group = le64_to_cpu(bg->bg_next_group);
1793                prev_group_bh = group_bh;
1794                group_bh = NULL;
1795                status = ocfs2_read_group_descriptor(alloc_inode, fe,
1796                                                     next_group, &group_bh);
1797                if (status < 0) {
1798                        mlog_errno(status);
1799                        goto bail;
1800                }
1801                bg = (struct ocfs2_group_desc *) group_bh->b_data;
1802        }
1803        if (status < 0) {
1804                if (status != -ENOSPC)
1805                        mlog_errno(status);
1806                goto bail;
1807        }
1808
1809        trace_ocfs2_search_chain_succ(
1810                (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1811
1812        res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1813
1814        BUG_ON(res->sr_bits == 0);
1815        if (!status)
1816                ocfs2_bg_discontig_fix_result(ac, bg, res);
1817
1818        /*
1819         * sr_bg_blkno might have been changed by
1820         * ocfs2_bg_discontig_fix_result
1821         */
1822        res->sr_bg_stable_blkno = group_bh->b_blocknr;
1823
1824        /*
1825         * Keep track of previous block descriptor read. When
1826         * we find a target, if we have read more than X
1827         * number of descriptors, and the target is reasonably
1828         * empty, relink him to top of his chain.
1829         *
1830         * We've read 0 extra blocks and only send one more to
1831         * the transaction, yet the next guy to search has a
1832         * much easier time.
1833         *
1834         * Do this *after* figuring out how many bits we're taking out
1835         * of our target group.
1836         */
1837        if (!ac->ac_disable_chain_relink &&
1838            (prev_group_bh) &&
1839            (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1840                status = ocfs2_relink_block_group(handle, alloc_inode,
1841                                                  ac->ac_bh, group_bh,
1842                                                  prev_group_bh, chain);
1843                if (status < 0) {
1844                        mlog_errno(status);
1845                        goto bail;
1846                }
1847        }
1848
1849        if (ac->ac_find_loc_only)
1850                goto out_loc_only;
1851
1852        status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1853                                                  ac->ac_bh, res->sr_bits,
1854                                                  chain);
1855        if (status) {
1856                mlog_errno(status);
1857                goto bail;
1858        }
1859
1860        status = ocfs2_block_group_set_bits(handle,
1861                                            alloc_inode,
1862                                            bg,
1863                                            group_bh,
1864                                            res->sr_bit_offset,
1865                                            res->sr_bits);
1866        if (status < 0) {
1867                ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1868                                        ac->ac_bh, res->sr_bits, chain);
1869                mlog_errno(status);
1870                goto bail;
1871        }
1872
1873        trace_ocfs2_search_chain_end(
1874                        (unsigned long long)le64_to_cpu(fe->i_blkno),
1875                        res->sr_bits);
1876
1877out_loc_only:
1878        *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1879bail:
1880        brelse(group_bh);
1881        brelse(prev_group_bh);
1882
1883        if (status)
1884                mlog_errno(status);
1885        return status;
1886}
1887
1888/* will give out up to bits_wanted contiguous bits. */
1889static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1890                                     handle_t *handle,
1891                                     u32 bits_wanted,
1892                                     u32 min_bits,
1893                                     struct ocfs2_suballoc_result *res)
1894{
1895        int status;
1896        u16 victim, i;
1897        u16 bits_left = 0;
1898        u64 hint = ac->ac_last_group;
1899        struct ocfs2_chain_list *cl;
1900        struct ocfs2_dinode *fe;
1901
1902        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1903        BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1904        BUG_ON(!ac->ac_bh);
1905
1906        fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1907
1908        /* The bh was validated by the inode read during
1909         * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1910        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1911
1912        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1913            le32_to_cpu(fe->id1.bitmap1.i_total)) {
1914                status = ocfs2_error(ac->ac_inode->i_sb,
1915                                     "Chain allocator dinode %llu has %u used bits but only %u total\n",
1916                                     (unsigned long long)le64_to_cpu(fe->i_blkno),
1917                                     le32_to_cpu(fe->id1.bitmap1.i_used),
1918                                     le32_to_cpu(fe->id1.bitmap1.i_total));
1919                goto bail;
1920        }
1921
1922        res->sr_bg_blkno = hint;
1923        if (res->sr_bg_blkno) {
1924                /* Attempt to short-circuit the usual search mechanism
1925                 * by jumping straight to the most recently used
1926                 * allocation group. This helps us maintain some
1927                 * contiguousness across allocations. */
1928                status = ocfs2_search_one_group(ac, handle, bits_wanted,
1929                                                min_bits, res, &bits_left);
1930                if (!status)
1931                        goto set_hint;
1932                if (status < 0 && status != -ENOSPC) {
1933                        mlog_errno(status);
1934                        goto bail;
1935                }
1936        }
1937
1938        cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1939
1940        victim = ocfs2_find_victim_chain(cl);
1941        ac->ac_chain = victim;
1942
1943        status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1944                                    res, &bits_left);
1945        if (!status) {
1946                if (ocfs2_is_cluster_bitmap(ac->ac_inode))
1947                        hint = res->sr_bg_blkno;
1948                else
1949                        hint = ocfs2_group_from_res(res);
1950                goto set_hint;
1951        }
1952        if (status < 0 && status != -ENOSPC) {
1953                mlog_errno(status);
1954                goto bail;
1955        }
1956
1957        trace_ocfs2_claim_suballoc_bits(victim);
1958
1959        /* If we didn't pick a good victim, then just default to
1960         * searching each chain in order. Don't allow chain relinking
1961         * because we only calculate enough journal credits for one
1962         * relink per alloc. */
1963        ac->ac_disable_chain_relink = 1;
1964        for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1965                if (i == victim)
1966                        continue;
1967                if (!cl->cl_recs[i].c_free)
1968                        continue;
1969
1970                ac->ac_chain = i;
1971                status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1972                                            res, &bits_left);
1973                if (!status) {
1974                        hint = ocfs2_group_from_res(res);
1975                        break;
1976                }
1977                if (status < 0 && status != -ENOSPC) {
1978                        mlog_errno(status);
1979                        goto bail;
1980                }
1981        }
1982
1983set_hint:
1984        if (status != -ENOSPC) {
1985                /* If the next search of this group is not likely to
1986                 * yield a suitable extent, then we reset the last
1987                 * group hint so as to not waste a disk read */
1988                if (bits_left < min_bits)
1989                        ac->ac_last_group = 0;
1990                else
1991                        ac->ac_last_group = hint;
1992        }
1993
1994bail:
1995        if (status)
1996                mlog_errno(status);
1997        return status;
1998}
1999
2000int ocfs2_claim_metadata(handle_t *handle,
2001                         struct ocfs2_alloc_context *ac,
2002                         u32 bits_wanted,
2003                         u64 *suballoc_loc,
2004                         u16 *suballoc_bit_start,
2005                         unsigned int *num_bits,
2006                         u64 *blkno_start)
2007{
2008        int status;
2009        struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2010
2011        BUG_ON(!ac);
2012        BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2013        BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2014
2015        status = ocfs2_claim_suballoc_bits(ac,
2016                                           handle,
2017                                           bits_wanted,
2018                                           1,
2019                                           &res);
2020        if (status < 0) {
2021                mlog_errno(status);
2022                goto bail;
2023        }
2024        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2025
2026        *suballoc_loc = res.sr_bg_blkno;
2027        *suballoc_bit_start = res.sr_bit_offset;
2028        *blkno_start = res.sr_blkno;
2029        ac->ac_bits_given += res.sr_bits;
2030        *num_bits = res.sr_bits;
2031        status = 0;
2032bail:
2033        if (status)
2034                mlog_errno(status);
2035        return status;
2036}
2037
2038static void ocfs2_init_inode_ac_group(struct inode *dir,
2039                                      struct buffer_head *parent_di_bh,
2040                                      struct ocfs2_alloc_context *ac)
2041{
2042        struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2043        /*
2044         * Try to allocate inodes from some specific group.
2045         *
2046         * If the parent dir has recorded the last group used in allocation,
2047         * cool, use it. Otherwise if we try to allocate new inode from the
2048         * same slot the parent dir belongs to, use the same chunk.
2049         *
2050         * We are very careful here to avoid the mistake of setting
2051         * ac_last_group to a group descriptor from a different (unlocked) slot.
2052         */
2053        if (OCFS2_I(dir)->ip_last_used_group &&
2054            OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2055                ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2056        else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2057                if (di->i_suballoc_loc)
2058                        ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2059                else
2060                        ac->ac_last_group = ocfs2_which_suballoc_group(
2061                                        le64_to_cpu(di->i_blkno),
2062                                        le16_to_cpu(di->i_suballoc_bit));
2063        }
2064}
2065
2066static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2067                                             struct ocfs2_alloc_context *ac)
2068{
2069        OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2070        OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2071}
2072
2073int ocfs2_find_new_inode_loc(struct inode *dir,
2074                             struct buffer_head *parent_fe_bh,
2075                             struct ocfs2_alloc_context *ac,
2076                             u64 *fe_blkno)
2077{
2078        int ret;
2079        handle_t *handle = NULL;
2080        struct ocfs2_suballoc_result *res;
2081
2082        BUG_ON(!ac);
2083        BUG_ON(ac->ac_bits_given != 0);
2084        BUG_ON(ac->ac_bits_wanted != 1);
2085        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2086
2087        res = kzalloc(sizeof(*res), GFP_NOFS);
2088        if (res == NULL) {
2089                ret = -ENOMEM;
2090                mlog_errno(ret);
2091                goto out;
2092        }
2093
2094        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2095
2096        /*
2097         * The handle started here is for chain relink. Alternatively,
2098         * we could just disable relink for these calls.
2099         */
2100        handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2101        if (IS_ERR(handle)) {
2102                ret = PTR_ERR(handle);
2103                handle = NULL;
2104                mlog_errno(ret);
2105                goto out;
2106        }
2107
2108        /*
2109         * This will instruct ocfs2_claim_suballoc_bits and
2110         * ocfs2_search_one_group to search but save actual allocation
2111         * for later.
2112         */
2113        ac->ac_find_loc_only = 1;
2114
2115        ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2116        if (ret < 0) {
2117                mlog_errno(ret);
2118                goto out;
2119        }
2120
2121        ac->ac_find_loc_priv = res;
2122        *fe_blkno = res->sr_blkno;
2123        ocfs2_update_inode_fsync_trans(handle, dir, 0);
2124out:
2125        if (handle)
2126                ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2127
2128        if (ret)
2129                kfree(res);
2130
2131        return ret;
2132}
2133
2134int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2135                                 struct inode *dir,
2136                                 struct ocfs2_alloc_context *ac,
2137                                 u64 *suballoc_loc,
2138                                 u16 *suballoc_bit,
2139                                 u64 di_blkno)
2140{
2141        int ret;
2142        u16 chain;
2143        struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2144        struct buffer_head *bg_bh = NULL;
2145        struct ocfs2_group_desc *bg;
2146        struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2147
2148        /*
2149         * Since di_blkno is being passed back in, we check for any
2150         * inconsistencies which may have happened between
2151         * calls. These are code bugs as di_blkno is not expected to
2152         * change once returned from ocfs2_find_new_inode_loc()
2153         */
2154        BUG_ON(res->sr_blkno != di_blkno);
2155
2156        ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2157                                          res->sr_bg_stable_blkno, &bg_bh);
2158        if (ret) {
2159                mlog_errno(ret);
2160                goto out;
2161        }
2162
2163        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2164        chain = le16_to_cpu(bg->bg_chain);
2165
2166        ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2167                                               ac->ac_bh, res->sr_bits,
2168                                               chain);
2169        if (ret) {
2170                mlog_errno(ret);
2171                goto out;
2172        }
2173
2174        ret = ocfs2_block_group_set_bits(handle,
2175                                         ac->ac_inode,
2176                                         bg,
2177                                         bg_bh,
2178                                         res->sr_bit_offset,
2179                                         res->sr_bits);
2180        if (ret < 0) {
2181                ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2182                                               ac->ac_bh, res->sr_bits, chain);
2183                mlog_errno(ret);
2184                goto out;
2185        }
2186
2187        trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2188                                           res->sr_bits);
2189
2190        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2191
2192        BUG_ON(res->sr_bits != 1);
2193
2194        *suballoc_loc = res->sr_bg_blkno;
2195        *suballoc_bit = res->sr_bit_offset;
2196        ac->ac_bits_given++;
2197        ocfs2_save_inode_ac_group(dir, ac);
2198
2199out:
2200        brelse(bg_bh);
2201
2202        return ret;
2203}
2204
2205int ocfs2_claim_new_inode(handle_t *handle,
2206                          struct inode *dir,
2207                          struct buffer_head *parent_fe_bh,
2208                          struct ocfs2_alloc_context *ac,
2209                          u64 *suballoc_loc,
2210                          u16 *suballoc_bit,
2211                          u64 *fe_blkno)
2212{
2213        int status;
2214        struct ocfs2_suballoc_result res;
2215
2216        BUG_ON(!ac);
2217        BUG_ON(ac->ac_bits_given != 0);
2218        BUG_ON(ac->ac_bits_wanted != 1);
2219        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2220
2221        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2222
2223        status = ocfs2_claim_suballoc_bits(ac,
2224                                           handle,
2225                                           1,
2226                                           1,
2227                                           &res);
2228        if (status < 0) {
2229                mlog_errno(status);
2230                goto bail;
2231        }
2232        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2233
2234        BUG_ON(res.sr_bits != 1);
2235
2236        *suballoc_loc = res.sr_bg_blkno;
2237        *suballoc_bit = res.sr_bit_offset;
2238        *fe_blkno = res.sr_blkno;
2239        ac->ac_bits_given++;
2240        ocfs2_save_inode_ac_group(dir, ac);
2241        status = 0;
2242bail:
2243        if (status)
2244                mlog_errno(status);
2245        return status;
2246}
2247
2248/* translate a group desc. blkno and it's bitmap offset into
2249 * disk cluster offset. */
2250static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2251                                                   u64 bg_blkno,
2252                                                   u16 bg_bit_off)
2253{
2254        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2255        u32 cluster = 0;
2256
2257        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2258
2259        if (bg_blkno != osb->first_cluster_group_blkno)
2260                cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2261        cluster += (u32) bg_bit_off;
2262        return cluster;
2263}
2264
2265/* given a cluster offset, calculate which block group it belongs to
2266 * and return that block offset. */
2267u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2268{
2269        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2270        u32 group_no;
2271
2272        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2273
2274        group_no = cluster / osb->bitmap_cpg;
2275        if (!group_no)
2276                return osb->first_cluster_group_blkno;
2277        return ocfs2_clusters_to_blocks(inode->i_sb,
2278                                        group_no * osb->bitmap_cpg);
2279}
2280
2281/* given the block number of a cluster start, calculate which cluster
2282 * group and descriptor bitmap offset that corresponds to. */
2283static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2284                                                u64 data_blkno,
2285                                                u64 *bg_blkno,
2286                                                u16 *bg_bit_off)
2287{
2288        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2289        u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2290
2291        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2292
2293        *bg_blkno = ocfs2_which_cluster_group(inode,
2294                                              data_cluster);
2295
2296        if (*bg_blkno == osb->first_cluster_group_blkno)
2297                *bg_bit_off = (u16) data_cluster;
2298        else
2299                *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2300                                                             data_blkno - *bg_blkno);
2301}
2302
2303/*
2304 * min_bits - minimum contiguous chunk from this total allocation we
2305 * can handle. set to what we asked for originally for a full
2306 * contig. allocation, set to '1' to indicate we can deal with extents
2307 * of any size.
2308 */
2309int __ocfs2_claim_clusters(handle_t *handle,
2310                           struct ocfs2_alloc_context *ac,
2311                           u32 min_clusters,
2312                           u32 max_clusters,
2313                           u32 *cluster_start,
2314                           u32 *num_clusters)
2315{
2316        int status;
2317        unsigned int bits_wanted = max_clusters;
2318        struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2319        struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2320
2321        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2322
2323        BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2324               && ac->ac_which != OCFS2_AC_USE_MAIN);
2325
2326        if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2327                WARN_ON(min_clusters > 1);
2328
2329                status = ocfs2_claim_local_alloc_bits(osb,
2330                                                      handle,
2331                                                      ac,
2332                                                      bits_wanted,
2333                                                      cluster_start,
2334                                                      num_clusters);
2335                if (!status)
2336                        atomic_inc(&osb->alloc_stats.local_data);
2337        } else {
2338                if (min_clusters > (osb->bitmap_cpg - 1)) {
2339                        /* The only paths asking for contiguousness
2340                         * should know about this already. */
2341                        mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2342                             "group bitmap size %u!\n", min_clusters,
2343                             osb->bitmap_cpg);
2344                        status = -ENOSPC;
2345                        goto bail;
2346                }
2347                /* clamp the current request down to a realistic size. */
2348                if (bits_wanted > (osb->bitmap_cpg - 1))
2349                        bits_wanted = osb->bitmap_cpg - 1;
2350
2351                status = ocfs2_claim_suballoc_bits(ac,
2352                                                   handle,
2353                                                   bits_wanted,
2354                                                   min_clusters,
2355                                                   &res);
2356                if (!status) {
2357                        BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2358                        *cluster_start =
2359                                ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2360                                                                 res.sr_bg_blkno,
2361                                                                 res.sr_bit_offset);
2362                        atomic_inc(&osb->alloc_stats.bitmap_data);
2363                        *num_clusters = res.sr_bits;
2364                }
2365        }
2366        if (status < 0) {
2367                if (status != -ENOSPC)
2368                        mlog_errno(status);
2369                goto bail;
2370        }
2371
2372        ac->ac_bits_given += *num_clusters;
2373
2374bail:
2375        if (status)
2376                mlog_errno(status);
2377        return status;
2378}
2379
2380int ocfs2_claim_clusters(handle_t *handle,
2381                         struct ocfs2_alloc_context *ac,
2382                         u32 min_clusters,
2383                         u32 *cluster_start,
2384                         u32 *num_clusters)
2385{
2386        unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2387
2388        return __ocfs2_claim_clusters(handle, ac, min_clusters,
2389                                      bits_wanted, cluster_start, num_clusters);
2390}
2391
2392static int ocfs2_block_group_clear_bits(handle_t *handle,
2393                                        struct inode *alloc_inode,
2394                                        struct ocfs2_group_desc *bg,
2395                                        struct buffer_head *group_bh,
2396                                        unsigned int bit_off,
2397                                        unsigned int num_bits,
2398                                        void (*undo_fn)(unsigned int bit,
2399                                                        unsigned long *bmap))
2400{
2401        int status;
2402        unsigned int tmp;
2403        struct ocfs2_group_desc *undo_bg = NULL;
2404
2405        /* The caller got this descriptor from
2406         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2407        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2408
2409        trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2410
2411        BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2412        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2413                                         group_bh,
2414                                         undo_fn ?
2415                                         OCFS2_JOURNAL_ACCESS_UNDO :
2416                                         OCFS2_JOURNAL_ACCESS_WRITE);
2417        if (status < 0) {
2418                mlog_errno(status);
2419                goto bail;
2420        }
2421
2422        if (undo_fn) {
2423                jbd_lock_bh_state(group_bh);
2424                undo_bg = (struct ocfs2_group_desc *)
2425                                        bh2jh(group_bh)->b_committed_data;
2426                BUG_ON(!undo_bg);
2427        }
2428
2429        tmp = num_bits;
2430        while(tmp--) {
2431                ocfs2_clear_bit((bit_off + tmp),
2432                                (unsigned long *) bg->bg_bitmap);
2433                if (undo_fn)
2434                        undo_fn(bit_off + tmp,
2435                                (unsigned long *) undo_bg->bg_bitmap);
2436        }
2437        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2438        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2439                if (undo_fn)
2440                        jbd_unlock_bh_state(group_bh);
2441                return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
2442                                   (unsigned long long)le64_to_cpu(bg->bg_blkno),
2443                                   le16_to_cpu(bg->bg_bits),
2444                                   le16_to_cpu(bg->bg_free_bits_count),
2445                                   num_bits);
2446        }
2447
2448        if (undo_fn)
2449                jbd_unlock_bh_state(group_bh);
2450
2451        ocfs2_journal_dirty(handle, group_bh);
2452bail:
2453        return status;
2454}
2455
2456/*
2457 * expects the suballoc inode to already be locked.
2458 */
2459static int _ocfs2_free_suballoc_bits(handle_t *handle,
2460                                     struct inode *alloc_inode,
2461                                     struct buffer_head *alloc_bh,
2462                                     unsigned int start_bit,
2463                                     u64 bg_blkno,
2464                                     unsigned int count,
2465                                     void (*undo_fn)(unsigned int bit,
2466                                                     unsigned long *bitmap))
2467{
2468        int status = 0;
2469        u32 tmp_used;
2470        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2471        struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2472        struct buffer_head *group_bh = NULL;
2473        struct ocfs2_group_desc *group;
2474
2475        /* The alloc_bh comes from ocfs2_free_dinode() or
2476         * ocfs2_free_clusters().  The callers have all locked the
2477         * allocator and gotten alloc_bh from the lock call.  This
2478         * validates the dinode buffer.  Any corruption that has happened
2479         * is a code bug. */
2480        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2481        BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2482
2483        trace_ocfs2_free_suballoc_bits(
2484                (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2485                (unsigned long long)bg_blkno,
2486                start_bit, count);
2487
2488        status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2489                                             &group_bh);
2490        if (status < 0) {
2491                mlog_errno(status);
2492                goto bail;
2493        }
2494        group = (struct ocfs2_group_desc *) group_bh->b_data;
2495
2496        BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2497
2498        status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2499                                              group, group_bh,
2500                                              start_bit, count, undo_fn);
2501        if (status < 0) {
2502                mlog_errno(status);
2503                goto bail;
2504        }
2505
2506        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2507                                         alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2508        if (status < 0) {
2509                mlog_errno(status);
2510                ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2511                                start_bit, count);
2512                goto bail;
2513        }
2514
2515        le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2516                     count);
2517        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2518        fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2519        ocfs2_journal_dirty(handle, alloc_bh);
2520
2521bail:
2522        brelse(group_bh);
2523
2524        if (status)
2525                mlog_errno(status);
2526        return status;
2527}
2528
2529int ocfs2_free_suballoc_bits(handle_t *handle,
2530                             struct inode *alloc_inode,
2531                             struct buffer_head *alloc_bh,
2532                             unsigned int start_bit,
2533                             u64 bg_blkno,
2534                             unsigned int count)
2535{
2536        return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2537                                         start_bit, bg_blkno, count, NULL);
2538}
2539
2540int ocfs2_free_dinode(handle_t *handle,
2541                      struct inode *inode_alloc_inode,
2542                      struct buffer_head *inode_alloc_bh,
2543                      struct ocfs2_dinode *di)
2544{
2545        u64 blk = le64_to_cpu(di->i_blkno);
2546        u16 bit = le16_to_cpu(di->i_suballoc_bit);
2547        u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2548
2549        if (di->i_suballoc_loc)
2550                bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2551        return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2552                                        inode_alloc_bh, bit, bg_blkno, 1);
2553}
2554
2555static int _ocfs2_free_clusters(handle_t *handle,
2556                                struct inode *bitmap_inode,
2557                                struct buffer_head *bitmap_bh,
2558                                u64 start_blk,
2559                                unsigned int num_clusters,
2560                                void (*undo_fn)(unsigned int bit,
2561                                                unsigned long *bitmap))
2562{
2563        int status;
2564        u16 bg_start_bit;
2565        u64 bg_blkno;
2566
2567        /* You can't ever have a contiguous set of clusters
2568         * bigger than a block group bitmap so we never have to worry
2569         * about looping on them.
2570         * This is expensive. We can safely remove once this stuff has
2571         * gotten tested really well. */
2572        BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
2573                                ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
2574                                                         start_blk)));
2575
2576
2577        ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2578                                     &bg_start_bit);
2579
2580        trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2581                        (unsigned long long)start_blk,
2582                        bg_start_bit, num_clusters);
2583
2584        status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2585                                           bg_start_bit, bg_blkno,
2586                                           num_clusters, undo_fn);
2587        if (status < 0) {
2588                mlog_errno(status);
2589                goto out;
2590        }
2591
2592        ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2593                                         num_clusters);
2594
2595out:
2596        if (status)
2597                mlog_errno(status);
2598        return status;
2599}
2600
2601int ocfs2_free_clusters(handle_t *handle,
2602                        struct inode *bitmap_inode,
2603                        struct buffer_head *bitmap_bh,
2604                        u64 start_blk,
2605                        unsigned int num_clusters)
2606{
2607        return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2608                                    start_blk, num_clusters,
2609                                    _ocfs2_set_bit);
2610}
2611
2612/*
2613 * Give never-used clusters back to the global bitmap.  We don't need
2614 * to protect these bits in the undo buffer.
2615 */
2616int ocfs2_release_clusters(handle_t *handle,
2617                           struct inode *bitmap_inode,
2618                           struct buffer_head *bitmap_bh,
2619                           u64 start_blk,
2620                           unsigned int num_clusters)
2621{
2622        return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2623                                    start_blk, num_clusters,
2624                                    _ocfs2_clear_bit);
2625}
2626
2627/*
2628 * For a given allocation, determine which allocators will need to be
2629 * accessed, and lock them, reserving the appropriate number of bits.
2630 *
2631 * Sparse file systems call this from ocfs2_write_begin_nolock()
2632 * and ocfs2_allocate_unwritten_extents().
2633 *
2634 * File systems which don't support holes call this from
2635 * ocfs2_extend_allocation().
2636 */
2637int ocfs2_lock_allocators(struct inode *inode,
2638                          struct ocfs2_extent_tree *et,
2639                          u32 clusters_to_add, u32 extents_to_split,
2640                          struct ocfs2_alloc_context **data_ac,
2641                          struct ocfs2_alloc_context **meta_ac)
2642{
2643        int ret = 0, num_free_extents;
2644        unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2645        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2646
2647        *meta_ac = NULL;
2648        if (data_ac)
2649                *data_ac = NULL;
2650
2651        BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2652
2653        num_free_extents = ocfs2_num_free_extents(et);
2654        if (num_free_extents < 0) {
2655                ret = num_free_extents;
2656                mlog_errno(ret);
2657                goto out;
2658        }
2659
2660        /*
2661         * Sparse allocation file systems need to be more conservative
2662         * with reserving room for expansion - the actual allocation
2663         * happens while we've got a journal handle open so re-taking
2664         * a cluster lock (because we ran out of room for another
2665         * extent) will violate ordering rules.
2666         *
2667         * Most of the time we'll only be seeing this 1 cluster at a time
2668         * anyway.
2669         *
2670         * Always lock for any unwritten extents - we might want to
2671         * add blocks during a split.
2672         */
2673        if (!num_free_extents ||
2674            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2675                ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2676                if (ret < 0) {
2677                        if (ret != -ENOSPC)
2678                                mlog_errno(ret);
2679                        goto out;
2680                }
2681        }
2682
2683        if (clusters_to_add == 0)
2684                goto out;
2685
2686        ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2687        if (ret < 0) {
2688                if (ret != -ENOSPC)
2689                        mlog_errno(ret);
2690                goto out;
2691        }
2692
2693out:
2694        if (ret) {
2695                if (*meta_ac) {
2696                        ocfs2_free_alloc_context(*meta_ac);
2697                        *meta_ac = NULL;
2698                }
2699
2700                /*
2701                 * We cannot have an error and a non null *data_ac.
2702                 */
2703        }
2704
2705        return ret;
2706}
2707
2708/*
2709 * Read the inode specified by blkno to get suballoc_slot and
2710 * suballoc_bit.
2711 */
2712static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2713                                       u16 *suballoc_slot, u64 *group_blkno,
2714                                       u16 *suballoc_bit)
2715{
2716        int status;
2717        struct buffer_head *inode_bh = NULL;
2718        struct ocfs2_dinode *inode_fe;
2719
2720        trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2721
2722        /* dirty read disk */
2723        status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2724        if (status < 0) {
2725                mlog(ML_ERROR, "read block %llu failed %d\n",
2726                     (unsigned long long)blkno, status);
2727                goto bail;
2728        }
2729
2730        inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2731        if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2732                mlog(ML_ERROR, "invalid inode %llu requested\n",
2733                     (unsigned long long)blkno);
2734                status = -EINVAL;
2735                goto bail;
2736        }
2737
2738        if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2739            (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2740                mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2741                     (unsigned long long)blkno,
2742                     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2743                status = -EINVAL;
2744                goto bail;
2745        }
2746
2747        if (suballoc_slot)
2748                *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2749        if (suballoc_bit)
2750                *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2751        if (group_blkno)
2752                *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2753
2754bail:
2755        brelse(inode_bh);
2756
2757        if (status)
2758                mlog_errno(status);
2759        return status;
2760}
2761
2762/*
2763 * test whether bit is SET in allocator bitmap or not.  on success, 0
2764 * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2765 * is returned and *res is meaningless.  Call this after you have
2766 * cluster locked against suballoc, or you may get a result based on
2767 * non-up2date contents
2768 */
2769static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2770                                   struct inode *suballoc,
2771                                   struct buffer_head *alloc_bh,
2772                                   u64 group_blkno, u64 blkno,
2773                                   u16 bit, int *res)
2774{
2775        struct ocfs2_dinode *alloc_di;
2776        struct ocfs2_group_desc *group;
2777        struct buffer_head *group_bh = NULL;
2778        u64 bg_blkno;
2779        int status;
2780
2781        trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2782                                      (unsigned int)bit);
2783
2784        alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2785        if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2786                mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2787                     (unsigned int)bit,
2788                     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2789                status = -EINVAL;
2790                goto bail;
2791        }
2792
2793        bg_blkno = group_blkno ? group_blkno :
2794                   ocfs2_which_suballoc_group(blkno, bit);
2795        status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2796                                             &group_bh);
2797        if (status < 0) {
2798                mlog(ML_ERROR, "read group %llu failed %d\n",
2799                     (unsigned long long)bg_blkno, status);
2800                goto bail;
2801        }
2802
2803        group = (struct ocfs2_group_desc *) group_bh->b_data;
2804        *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2805
2806bail:
2807        brelse(group_bh);
2808
2809        if (status)
2810                mlog_errno(status);
2811        return status;
2812}
2813
2814/*
2815 * Test if the bit representing this inode (blkno) is set in the
2816 * suballocator.
2817 *
2818 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2819 *
2820 * In the event of failure, a negative value is returned and *res is
2821 * meaningless.
2822 *
2823 * Callers must make sure to hold nfs_sync_lock to prevent
2824 * ocfs2_delete_inode() on another node from accessing the same
2825 * suballocator concurrently.
2826 */
2827int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2828{
2829        int status;
2830        u64 group_blkno = 0;
2831        u16 suballoc_bit = 0, suballoc_slot = 0;
2832        struct inode *inode_alloc_inode;
2833        struct buffer_head *alloc_bh = NULL;
2834
2835        trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2836
2837        status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2838                                             &group_blkno, &suballoc_bit);
2839        if (status < 0) {
2840                mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2841                goto bail;
2842        }
2843
2844        inode_alloc_inode =
2845                ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2846                                            suballoc_slot);
2847        if (!inode_alloc_inode) {
2848                /* the error code could be inaccurate, but we are not able to
2849                 * get the correct one. */
2850                status = -EINVAL;
2851                mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2852                     (u32)suballoc_slot);
2853                goto bail;
2854        }
2855
2856        inode_lock(inode_alloc_inode);
2857        status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2858        if (status < 0) {
2859                inode_unlock(inode_alloc_inode);
2860                iput(inode_alloc_inode);
2861                mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2862                     (u32)suballoc_slot, status);
2863                goto bail;
2864        }
2865
2866        status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2867                                         group_blkno, blkno, suballoc_bit, res);
2868        if (status < 0)
2869                mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2870
2871        ocfs2_inode_unlock(inode_alloc_inode, 0);
2872        inode_unlock(inode_alloc_inode);
2873
2874        iput(inode_alloc_inode);
2875        brelse(alloc_bh);
2876bail:
2877        if (status)
2878                mlog_errno(status);
2879        return status;
2880}
2881