linux/fs/ocfs2/suballoc.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* -*- mode: c; c-basic-offset: 8; -*-
   3 * vim: noexpandtab sw=8 ts=8 sts=0:
   4 *
   5 * suballoc.c
   6 *
   7 * metadata alloc and free
   8 * Inspired by ext3 block groups.
   9 *
  10 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  11 */
  12
  13#include <linux/fs.h>
  14#include <linux/types.h>
  15#include <linux/slab.h>
  16#include <linux/highmem.h>
  17
  18#include <cluster/masklog.h>
  19
  20#include "ocfs2.h"
  21
  22#include "alloc.h"
  23#include "blockcheck.h"
  24#include "dlmglue.h"
  25#include "inode.h"
  26#include "journal.h"
  27#include "localalloc.h"
  28#include "suballoc.h"
  29#include "super.h"
  30#include "sysfile.h"
  31#include "uptodate.h"
  32#include "ocfs2_trace.h"
  33
  34#include "buffer_head_io.h"
  35
  /*
 * Bit values for an allocation "flags" argument.
 * NOTE(review): presumably consumed by the reserve/alloc helpers in this
 * file that take "int flags" - confirm against the callers.
 */
#define NOT_ALLOC_NEW_GROUP		0
#define ALLOC_NEW_GROUP			0x1
#define ALLOC_GROUPS_FROM_GLOBAL	0x2

/* NOTE(review): the user of this limit is not visible in this part of the file. */
#define OCFS2_MAX_TO_STEAL		1024
  41
  42struct ocfs2_suballoc_result {
  43        u64             sr_bg_blkno;    /* The bg we allocated from.  Set
  44                                           to 0 when a block group is
  45                                           contiguous. */
  46        u64             sr_bg_stable_blkno; /*
  47                                             * Doesn't change, always
  48                                             * set to target block
  49                                             * group descriptor
  50                                             * block.
  51                                             */
  52        u64             sr_blkno;       /* The first allocated block */
  53        unsigned int    sr_bit_offset;  /* The bit in the bg */
  54        unsigned int    sr_bits;        /* How many bits we claimed */
  55};
  56
  57static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
  58{
  59        if (res->sr_blkno == 0)
  60                return 0;
  61
  62        if (res->sr_bg_blkno)
  63                return res->sr_bg_blkno;
  64
  65        return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
  66}
  67
  68static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
  69static int ocfs2_block_group_fill(handle_t *handle,
  70                                  struct inode *alloc_inode,
  71                                  struct buffer_head *bg_bh,
  72                                  u64 group_blkno,
  73                                  unsigned int group_clusters,
  74                                  u16 my_chain,
  75                                  struct ocfs2_chain_list *cl);
  76static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
  77                                   struct inode *alloc_inode,
  78                                   struct buffer_head *bh,
  79                                   u64 max_block,
  80                                   u64 *last_alloc_group,
  81                                   int flags);
  82
  83static int ocfs2_cluster_group_search(struct inode *inode,
  84                                      struct buffer_head *group_bh,
  85                                      u32 bits_wanted, u32 min_bits,
  86                                      u64 max_block,
  87                                      struct ocfs2_suballoc_result *res);
  88static int ocfs2_block_group_search(struct inode *inode,
  89                                    struct buffer_head *group_bh,
  90                                    u32 bits_wanted, u32 min_bits,
  91                                    u64 max_block,
  92                                    struct ocfs2_suballoc_result *res);
  93static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
  94                                     handle_t *handle,
  95                                     u32 bits_wanted,
  96                                     u32 min_bits,
  97                                     struct ocfs2_suballoc_result *res);
  98static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
  99                                         int nr);
 100static int ocfs2_relink_block_group(handle_t *handle,
 101                                    struct inode *alloc_inode,
 102                                    struct buffer_head *fe_bh,
 103                                    struct buffer_head *bg_bh,
 104                                    struct buffer_head *prev_bg_bh,
 105                                    u16 chain);
 106static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
 107                                                     u32 wanted);
 108static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
 109                                                   u64 bg_blkno,
 110                                                   u16 bg_bit_off);
 111static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 112                                                u64 data_blkno,
 113                                                u64 *bg_blkno,
 114                                                u16 *bg_bit_off);
 115static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 116                                             u32 bits_wanted, u64 max_block,
 117                                             int flags,
 118                                             struct ocfs2_alloc_context **ac);
 119
 120void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 121{
 122        struct inode *inode = ac->ac_inode;
 123
 124        if (inode) {
 125                if (ac->ac_which != OCFS2_AC_USE_LOCAL)
 126                        ocfs2_inode_unlock(inode, 1);
 127
 128                inode_unlock(inode);
 129
 130                iput(inode);
 131                ac->ac_inode = NULL;
 132        }
 133        brelse(ac->ac_bh);
 134        ac->ac_bh = NULL;
 135        ac->ac_resv = NULL;
 136        kfree(ac->ac_find_loc_priv);
 137        ac->ac_find_loc_priv = NULL;
 138}
 139
 /*
 * Tear down an allocation context: release the resources it holds via
 * ocfs2_free_ac_resource(), then free the context structure itself.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
	ocfs2_free_ac_resource(ac);

	kfree(ac);
}
 145
 146static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 147{
 148        return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 149}
 150
 /*
 * Report a group descriptor inconsistency and leave the enclosing
 * function.  Expects "sb" and "resize" to be in scope at the call site.
 *
 * resize != 0: only log through mlog() and return -EINVAL, so the
 *              caller learns the descriptor is bad without going
 *              through ocfs2_error().
 * resize == 0: return whatever ocfs2_error() reports.
 *
 * Both branches must return: if the resize branch merely logged, the
 * validators would fall through to their final "return 0" and a broken
 * descriptor would be reported as valid.
 */
#define do_error(fmt, ...)						\
do {									\
	if (resize) {							\
		mlog(ML_ERROR, fmt, ##__VA_ARGS__);			\
		return -EINVAL;						\
	}								\
	return ocfs2_error(sb, fmt, ##__VA_ARGS__);			\
} while (0)
 158
 159static int ocfs2_validate_gd_self(struct super_block *sb,
 160                                  struct buffer_head *bh,
 161                                  int resize)
 162{
 163        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 164
 165        if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
 166                do_error("Group descriptor #%llu has bad signature %.*s\n",
 167                         (unsigned long long)bh->b_blocknr, 7,
 168                         gd->bg_signature);
 169        }
 170
 171        if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
 172                do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
 173                         (unsigned long long)bh->b_blocknr,
 174                         (unsigned long long)le64_to_cpu(gd->bg_blkno));
 175        }
 176
 177        if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
 178                do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
 179                         (unsigned long long)bh->b_blocknr,
 180                         le32_to_cpu(gd->bg_generation));
 181        }
 182
 183        if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
 184                do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
 185                         (unsigned long long)bh->b_blocknr,
 186                         le16_to_cpu(gd->bg_bits),
 187                         le16_to_cpu(gd->bg_free_bits_count));
 188        }
 189
 190        if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
 191                do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
 192                         (unsigned long long)bh->b_blocknr,
 193                         le16_to_cpu(gd->bg_bits),
 194                         8 * le16_to_cpu(gd->bg_size));
 195        }
 196
 197        return 0;
 198}
 199
 200static int ocfs2_validate_gd_parent(struct super_block *sb,
 201                                    struct ocfs2_dinode *di,
 202                                    struct buffer_head *bh,
 203                                    int resize)
 204{
 205        unsigned int max_bits;
 206        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 207
 208        if (di->i_blkno != gd->bg_parent_dinode) {
 209                do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
 210                         (unsigned long long)bh->b_blocknr,
 211                         (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
 212                         (unsigned long long)le64_to_cpu(di->i_blkno));
 213        }
 214
 215        max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 216        if (le16_to_cpu(gd->bg_bits) > max_bits) {
 217                do_error("Group descriptor #%llu has bit count of %u\n",
 218                         (unsigned long long)bh->b_blocknr,
 219                         le16_to_cpu(gd->bg_bits));
 220        }
 221
 222        /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
 223        if ((le16_to_cpu(gd->bg_chain) >
 224             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
 225            ((le16_to_cpu(gd->bg_chain) ==
 226             le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
 227                do_error("Group descriptor #%llu has bad chain %u\n",
 228                         (unsigned long long)bh->b_blocknr,
 229                         le16_to_cpu(gd->bg_chain));
 230        }
 231
 232        return 0;
 233}
 234
 235#undef do_error
 236
 237/*
 238 * This version only prints errors.  It does not fail the filesystem, and
 239 * exists only for resize.
 240 */
 241int ocfs2_check_group_descriptor(struct super_block *sb,
 242                                 struct ocfs2_dinode *di,
 243                                 struct buffer_head *bh)
 244{
 245        int rc;
 246        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 247
 248        BUG_ON(!buffer_uptodate(bh));
 249
 250        /*
 251         * If the ecc fails, we return the error but otherwise
 252         * leave the filesystem running.  We know any error is
 253         * local to this block.
 254         */
 255        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 256        if (rc) {
 257                mlog(ML_ERROR,
 258                     "Checksum failed for group descriptor %llu\n",
 259                     (unsigned long long)bh->b_blocknr);
 260        } else
 261                rc = ocfs2_validate_gd_self(sb, bh, 1);
 262        if (!rc)
 263                rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
 264
 265        return rc;
 266}
 267
 268static int ocfs2_validate_group_descriptor(struct super_block *sb,
 269                                           struct buffer_head *bh)
 270{
 271        int rc;
 272        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 273
 274        trace_ocfs2_validate_group_descriptor(
 275                                        (unsigned long long)bh->b_blocknr);
 276
 277        BUG_ON(!buffer_uptodate(bh));
 278
 279        /*
 280         * If the ecc fails, we return the error but otherwise
 281         * leave the filesystem running.  We know any error is
 282         * local to this block.
 283         */
 284        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
 285        if (rc)
 286                return rc;
 287
 288        /*
 289         * Errors after here are fatal.
 290         */
 291
 292        return ocfs2_validate_gd_self(sb, bh, 0);
 293}
 294
 295int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
 296                                u64 gd_blkno, struct buffer_head **bh)
 297{
 298        int rc;
 299        struct buffer_head *tmp = *bh;
 300
 301        rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
 302                              ocfs2_validate_group_descriptor);
 303        if (rc)
 304                goto out;
 305
 306        rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
 307        if (rc) {
 308                brelse(tmp);
 309                goto out;
 310        }
 311
 312        /* If ocfs2_read_block() got us a new bh, pass it up. */
 313        if (!*bh)
 314                *bh = tmp;
 315
 316out:
 317        return rc;
 318}
 319
 320static void ocfs2_bg_discontig_add_extent(struct ocfs2_super *osb,
 321                                          struct ocfs2_group_desc *bg,
 322                                          struct ocfs2_chain_list *cl,
 323                                          u64 p_blkno, unsigned int clusters)
 324{
 325        struct ocfs2_extent_list *el = &bg->bg_list;
 326        struct ocfs2_extent_rec *rec;
 327
 328        BUG_ON(!ocfs2_supports_discontig_bg(osb));
 329        if (!el->l_next_free_rec)
 330                el->l_count = cpu_to_le16(ocfs2_extent_recs_per_gd(osb->sb));
 331        rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec)];
 332        rec->e_blkno = cpu_to_le64(p_blkno);
 333        rec->e_cpos = cpu_to_le32(le16_to_cpu(bg->bg_bits) /
 334                                  le16_to_cpu(cl->cl_bpc));
 335        rec->e_leaf_clusters = cpu_to_le16(clusters);
 336        le16_add_cpu(&bg->bg_bits, clusters * le16_to_cpu(cl->cl_bpc));
 337        le16_add_cpu(&bg->bg_free_bits_count,
 338                     clusters * le16_to_cpu(cl->cl_bpc));
 339        le16_add_cpu(&el->l_next_free_rec, 1);
 340}
 341
 342static int ocfs2_block_group_fill(handle_t *handle,
 343                                  struct inode *alloc_inode,
 344                                  struct buffer_head *bg_bh,
 345                                  u64 group_blkno,
 346                                  unsigned int group_clusters,
 347                                  u16 my_chain,
 348                                  struct ocfs2_chain_list *cl)
 349{
 350        int status = 0;
 351        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 352        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 353        struct super_block * sb = alloc_inode->i_sb;
 354
 355        if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
 356                status = ocfs2_error(alloc_inode->i_sb,
 357                                     "group block (%llu) != b_blocknr (%llu)\n",
 358                                     (unsigned long long)group_blkno,
 359                                     (unsigned long long) bg_bh->b_blocknr);
 360                goto bail;
 361        }
 362
 363        status = ocfs2_journal_access_gd(handle,
 364                                         INODE_CACHE(alloc_inode),
 365                                         bg_bh,
 366                                         OCFS2_JOURNAL_ACCESS_CREATE);
 367        if (status < 0) {
 368                mlog_errno(status);
 369                goto bail;
 370        }
 371
 372        memset(bg, 0, sb->s_blocksize);
 373        strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
 374        bg->bg_generation = cpu_to_le32(osb->fs_generation);
 375        bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
 376                                                osb->s_feature_incompat));
 377        bg->bg_chain = cpu_to_le16(my_chain);
 378        bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
 379        bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
 380        bg->bg_blkno = cpu_to_le64(group_blkno);
 381        if (group_clusters == le16_to_cpu(cl->cl_cpg))
 382                bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
 383        else
 384                ocfs2_bg_discontig_add_extent(osb, bg, cl, group_blkno,
 385                                              group_clusters);
 386
 387        /* set the 1st bit in the bitmap to account for the descriptor block */
 388        ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
 389        bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
 390
 391        ocfs2_journal_dirty(handle, bg_bh);
 392
 393        /* There is no need to zero out or otherwise initialize the
 394         * other blocks in a group - All valid FS metadata in a block
 395         * group stores the superblock fs_generation value at
 396         * allocation time. */
 397
 398bail:
 399        if (status)
 400                mlog_errno(status);
 401        return status;
 402}
 403
 404static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
 405{
 406        u16 curr, best;
 407
 408        best = curr = 0;
 409        while (curr < le16_to_cpu(cl->cl_count)) {
 410                if (le32_to_cpu(cl->cl_recs[best].c_total) >
 411                    le32_to_cpu(cl->cl_recs[curr].c_total))
 412                        best = curr;
 413                curr++;
 414        }
 415        return best;
 416}
 417
 418static struct buffer_head *
 419ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
 420                               struct inode *alloc_inode,
 421                               struct ocfs2_alloc_context *ac,
 422                               struct ocfs2_chain_list *cl)
 423{
 424        int status;
 425        u32 bit_off, num_bits;
 426        u64 bg_blkno;
 427        struct buffer_head *bg_bh;
 428        unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
 429
 430        status = ocfs2_claim_clusters(handle, ac,
 431                                      le16_to_cpu(cl->cl_cpg), &bit_off,
 432                                      &num_bits);
 433        if (status < 0) {
 434                if (status != -ENOSPC)
 435                        mlog_errno(status);
 436                goto bail;
 437        }
 438
 439        /* setup the group */
 440        bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 441        trace_ocfs2_block_group_alloc_contig(
 442             (unsigned long long)bg_blkno, alloc_rec);
 443
 444        bg_bh = sb_getblk(osb->sb, bg_blkno);
 445        if (!bg_bh) {
 446                status = -ENOMEM;
 447                mlog_errno(status);
 448                goto bail;
 449        }
 450        ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 451
 452        status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
 453                                        bg_blkno, num_bits, alloc_rec, cl);
 454        if (status < 0) {
 455                brelse(bg_bh);
 456                mlog_errno(status);
 457        }
 458
 459bail:
 460        return status ? ERR_PTR(status) : bg_bh;
 461}
 462
 463static int ocfs2_block_group_claim_bits(struct ocfs2_super *osb,
 464                                        handle_t *handle,
 465                                        struct ocfs2_alloc_context *ac,
 466                                        unsigned int min_bits,
 467                                        u32 *bit_off, u32 *num_bits)
 468{
 469        int status = 0;
 470
 471        while (min_bits) {
 472                status = ocfs2_claim_clusters(handle, ac, min_bits,
 473                                              bit_off, num_bits);
 474                if (status != -ENOSPC)
 475                        break;
 476
 477                min_bits >>= 1;
 478        }
 479
 480        return status;
 481}
 482
 483static int ocfs2_block_group_grow_discontig(handle_t *handle,
 484                                            struct inode *alloc_inode,
 485                                            struct buffer_head *bg_bh,
 486                                            struct ocfs2_alloc_context *ac,
 487                                            struct ocfs2_chain_list *cl,
 488                                            unsigned int min_bits)
 489{
 490        int status;
 491        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 492        struct ocfs2_group_desc *bg =
 493                (struct ocfs2_group_desc *)bg_bh->b_data;
 494        unsigned int needed = le16_to_cpu(cl->cl_cpg) -
 495                         le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
 496        u32 p_cpos, clusters;
 497        u64 p_blkno;
 498        struct ocfs2_extent_list *el = &bg->bg_list;
 499
 500        status = ocfs2_journal_access_gd(handle,
 501                                         INODE_CACHE(alloc_inode),
 502                                         bg_bh,
 503                                         OCFS2_JOURNAL_ACCESS_CREATE);
 504        if (status < 0) {
 505                mlog_errno(status);
 506                goto bail;
 507        }
 508
 509        while ((needed > 0) && (le16_to_cpu(el->l_next_free_rec) <
 510                                le16_to_cpu(el->l_count))) {
 511                if (min_bits > needed)
 512                        min_bits = needed;
 513                status = ocfs2_block_group_claim_bits(osb, handle, ac,
 514                                                      min_bits, &p_cpos,
 515                                                      &clusters);
 516                if (status < 0) {
 517                        if (status != -ENOSPC)
 518                                mlog_errno(status);
 519                        goto bail;
 520                }
 521                p_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cpos);
 522                ocfs2_bg_discontig_add_extent(osb, bg, cl, p_blkno,
 523                                              clusters);
 524
 525                min_bits = clusters;
 526                needed = le16_to_cpu(cl->cl_cpg) -
 527                         le16_to_cpu(bg->bg_bits) / le16_to_cpu(cl->cl_bpc);
 528        }
 529
 530        if (needed > 0) {
 531                /*
 532                 * We have used up all the extent rec but can't fill up
 533                 * the cpg. So bail out.
 534                 */
 535                status = -ENOSPC;
 536                goto bail;
 537        }
 538
 539        ocfs2_journal_dirty(handle, bg_bh);
 540
 541bail:
 542        return status;
 543}
 544
 545static void ocfs2_bg_alloc_cleanup(handle_t *handle,
 546                                   struct ocfs2_alloc_context *cluster_ac,
 547                                   struct inode *alloc_inode,
 548                                   struct buffer_head *bg_bh)
 549{
 550        int i, ret;
 551        struct ocfs2_group_desc *bg;
 552        struct ocfs2_extent_list *el;
 553        struct ocfs2_extent_rec *rec;
 554
 555        if (!bg_bh)
 556                return;
 557
 558        bg = (struct ocfs2_group_desc *)bg_bh->b_data;
 559        el = &bg->bg_list;
 560        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 561                rec = &el->l_recs[i];
 562                ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
 563                                          cluster_ac->ac_bh,
 564                                          le64_to_cpu(rec->e_blkno),
 565                                          le16_to_cpu(rec->e_leaf_clusters));
 566                if (ret)
 567                        mlog_errno(ret);
 568                /* Try all the clusters to free */
 569        }
 570
 571        ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), bg_bh);
 572        brelse(bg_bh);
 573}
 574
 575static struct buffer_head *
 576ocfs2_block_group_alloc_discontig(handle_t *handle,
 577                                  struct inode *alloc_inode,
 578                                  struct ocfs2_alloc_context *ac,
 579                                  struct ocfs2_chain_list *cl)
 580{
 581        int status;
 582        u32 bit_off, num_bits;
 583        u64 bg_blkno;
 584        unsigned int min_bits = le16_to_cpu(cl->cl_cpg) >> 1;
 585        struct buffer_head *bg_bh = NULL;
 586        unsigned int alloc_rec = ocfs2_find_smallest_chain(cl);
 587        struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 588
 589        if (!ocfs2_supports_discontig_bg(osb)) {
 590                status = -ENOSPC;
 591                goto bail;
 592        }
 593
 594        status = ocfs2_extend_trans(handle,
 595                                    ocfs2_calc_bg_discontig_credits(osb->sb));
 596        if (status) {
 597                mlog_errno(status);
 598                goto bail;
 599        }
 600
 601        /*
 602         * We're going to be grabbing from multiple cluster groups.
 603         * We don't have enough credits to relink them all, and the
 604         * cluster groups will be staying in cache for the duration of
 605         * this operation.
 606         */
 607        ac->ac_disable_chain_relink = 1;
 608
 609        /* Claim the first region */
 610        status = ocfs2_block_group_claim_bits(osb, handle, ac, min_bits,
 611                                              &bit_off, &num_bits);
 612        if (status < 0) {
 613                if (status != -ENOSPC)
 614                        mlog_errno(status);
 615                goto bail;
 616        }
 617        min_bits = num_bits;
 618
 619        /* setup the group */
 620        bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 621        trace_ocfs2_block_group_alloc_discontig(
 622                                (unsigned long long)bg_blkno, alloc_rec);
 623
 624        bg_bh = sb_getblk(osb->sb, bg_blkno);
 625        if (!bg_bh) {
 626                status = -ENOMEM;
 627                mlog_errno(status);
 628                goto bail;
 629        }
 630        ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 631
 632        status = ocfs2_block_group_fill(handle, alloc_inode, bg_bh,
 633                                        bg_blkno, num_bits, alloc_rec, cl);
 634        if (status < 0) {
 635                mlog_errno(status);
 636                goto bail;
 637        }
 638
 639        status = ocfs2_block_group_grow_discontig(handle, alloc_inode,
 640                                                  bg_bh, ac, cl, min_bits);
 641        if (status)
 642                mlog_errno(status);
 643
 644bail:
 645        if (status)
 646                ocfs2_bg_alloc_cleanup(handle, ac, alloc_inode, bg_bh);
 647        return status ? ERR_PTR(status) : bg_bh;
 648}
 649
 650/*
 651 * We expect the block group allocator to already be locked.
 652 */
 653static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 654                                   struct inode *alloc_inode,
 655                                   struct buffer_head *bh,
 656                                   u64 max_block,
 657                                   u64 *last_alloc_group,
 658                                   int flags)
 659{
 660        int status, credits;
 661        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
 662        struct ocfs2_chain_list *cl;
 663        struct ocfs2_alloc_context *ac = NULL;
 664        handle_t *handle = NULL;
 665        u16 alloc_rec;
 666        struct buffer_head *bg_bh = NULL;
 667        struct ocfs2_group_desc *bg;
 668
 669        BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
 670
 671        cl = &fe->id2.i_chain;
 672        status = ocfs2_reserve_clusters_with_limit(osb,
 673                                                   le16_to_cpu(cl->cl_cpg),
 674                                                   max_block, flags, &ac);
 675        if (status < 0) {
 676                if (status != -ENOSPC)
 677                        mlog_errno(status);
 678                goto bail;
 679        }
 680
 681        credits = ocfs2_calc_group_alloc_credits(osb->sb,
 682                                                 le16_to_cpu(cl->cl_cpg));
 683        handle = ocfs2_start_trans(osb, credits);
 684        if (IS_ERR(handle)) {
 685                status = PTR_ERR(handle);
 686                handle = NULL;
 687                mlog_errno(status);
 688                goto bail;
 689        }
 690
 691        if (last_alloc_group && *last_alloc_group != 0) {
 692                trace_ocfs2_block_group_alloc(
 693                                (unsigned long long)*last_alloc_group);
 694                ac->ac_last_group = *last_alloc_group;
 695        }
 696
 697        bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
 698                                               ac, cl);
 699        if (PTR_ERR(bg_bh) == -ENOSPC)
 700                bg_bh = ocfs2_block_group_alloc_discontig(handle,
 701                                                          alloc_inode,
 702                                                          ac, cl);
 703        if (IS_ERR(bg_bh)) {
 704                status = PTR_ERR(bg_bh);
 705                bg_bh = NULL;
 706                if (status != -ENOSPC)
 707                        mlog_errno(status);
 708                goto bail;
 709        }
 710        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 711
 712        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 713                                         bh, OCFS2_JOURNAL_ACCESS_WRITE);
 714        if (status < 0) {
 715                mlog_errno(status);
 716                goto bail;
 717        }
 718
 719        alloc_rec = le16_to_cpu(bg->bg_chain);
 720        le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
 721                     le16_to_cpu(bg->bg_free_bits_count));
 722        le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
 723                     le16_to_cpu(bg->bg_bits));
 724        cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
 725        if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
 726                le16_add_cpu(&cl->cl_next_free_rec, 1);
 727
 728        le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
 729                                        le16_to_cpu(bg->bg_free_bits_count));
 730        le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
 731        le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
 732
 733        ocfs2_journal_dirty(handle, bh);
 734
 735        spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
 736        OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 737        fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
 738                                             le32_to_cpu(fe->i_clusters)));
 739        spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
 740        i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
 741        alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
 742        ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
 743
 744        status = 0;
 745
 746        /* save the new last alloc group so that the caller can cache it. */
 747        if (last_alloc_group)
 748                *last_alloc_group = ac->ac_last_group;
 749
 750bail:
 751        if (handle)
 752                ocfs2_commit_trans(osb, handle);
 753
 754        if (ac)
 755                ocfs2_free_alloc_context(ac);
 756
 757        brelse(bg_bh);
 758
 759        if (status)
 760                mlog_errno(status);
 761        return status;
 762}
 763
 764static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 765                                       struct ocfs2_alloc_context *ac,
 766                                       int type,
 767                                       u32 slot,
 768                                       u64 *last_alloc_group,
 769                                       int flags)
 770{
 771        int status;
 772        u32 bits_wanted = ac->ac_bits_wanted;
 773        struct inode *alloc_inode;
 774        struct buffer_head *bh = NULL;
 775        struct ocfs2_dinode *fe;
 776        u32 free_bits;
 777
 778        alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
 779        if (!alloc_inode) {
 780                mlog_errno(-EINVAL);
 781                return -EINVAL;
 782        }
 783
 784        inode_lock(alloc_inode);
 785
 786        status = ocfs2_inode_lock(alloc_inode, &bh, 1);
 787        if (status < 0) {
 788                inode_unlock(alloc_inode);
 789                iput(alloc_inode);
 790
 791                mlog_errno(status);
 792                return status;
 793        }
 794
 795        ac->ac_inode = alloc_inode;
 796        ac->ac_alloc_slot = slot;
 797
 798        fe = (struct ocfs2_dinode *) bh->b_data;
 799
 800        /* The bh was validated by the inode read inside
 801         * ocfs2_inode_lock().  Any corruption is a code bug. */
 802        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 803
 804        if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 805                status = ocfs2_error(alloc_inode->i_sb,
 806                                     "Invalid chain allocator %llu\n",
 807                                     (unsigned long long)le64_to_cpu(fe->i_blkno));
 808                goto bail;
 809        }
 810
 811        free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
 812                le32_to_cpu(fe->id1.bitmap1.i_used);
 813
 814        if (bits_wanted > free_bits) {
 815                /* cluster bitmap never grows */
 816                if (ocfs2_is_cluster_bitmap(alloc_inode)) {
 817                        trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
 818                                                                free_bits);
 819                        status = -ENOSPC;
 820                        goto bail;
 821                }
 822
 823                if (!(flags & ALLOC_NEW_GROUP)) {
 824                        trace_ocfs2_reserve_suballoc_bits_no_new_group(
 825                                                slot, bits_wanted, free_bits);
 826                        status = -ENOSPC;
 827                        goto bail;
 828                }
 829
 830                status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
 831                                                 ac->ac_max_block,
 832                                                 last_alloc_group, flags);
 833                if (status < 0) {
 834                        if (status != -ENOSPC)
 835                                mlog_errno(status);
 836                        goto bail;
 837                }
 838                atomic_inc(&osb->alloc_stats.bg_extends);
 839
 840                /* You should never ask for this much metadata */
 841                BUG_ON(bits_wanted >
 842                       (le32_to_cpu(fe->id1.bitmap1.i_total)
 843                        - le32_to_cpu(fe->id1.bitmap1.i_used)));
 844        }
 845
 846        get_bh(bh);
 847        ac->ac_bh = bh;
 848bail:
 849        brelse(bh);
 850
 851        if (status)
 852                mlog_errno(status);
 853        return status;
 854}
 855
 856static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
 857{
 858        spin_lock(&osb->osb_lock);
 859        osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
 860        spin_unlock(&osb->osb_lock);
 861        atomic_set(&osb->s_num_inodes_stolen, 0);
 862}
 863
 864static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
 865{
 866        spin_lock(&osb->osb_lock);
 867        osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
 868        spin_unlock(&osb->osb_lock);
 869        atomic_set(&osb->s_num_meta_stolen, 0);
 870}
 871
 872void ocfs2_init_steal_slots(struct ocfs2_super *osb)
 873{
 874        ocfs2_init_inode_steal_slot(osb);
 875        ocfs2_init_meta_steal_slot(osb);
 876}
 877
 878static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
 879{
 880        spin_lock(&osb->osb_lock);
 881        if (type == INODE_ALLOC_SYSTEM_INODE)
 882                osb->s_inode_steal_slot = slot;
 883        else if (type == EXTENT_ALLOC_SYSTEM_INODE)
 884                osb->s_meta_steal_slot = slot;
 885        spin_unlock(&osb->osb_lock);
 886}
 887
 888static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
 889{
 890        int slot = OCFS2_INVALID_SLOT;
 891
 892        spin_lock(&osb->osb_lock);
 893        if (type == INODE_ALLOC_SYSTEM_INODE)
 894                slot = osb->s_inode_steal_slot;
 895        else if (type == EXTENT_ALLOC_SYSTEM_INODE)
 896                slot = osb->s_meta_steal_slot;
 897        spin_unlock(&osb->osb_lock);
 898
 899        return slot;
 900}
 901
 902static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 903{
 904        return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
 905}
 906
 907static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
 908{
 909        return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
 910}
 911
 912static int ocfs2_steal_resource(struct ocfs2_super *osb,
 913                                struct ocfs2_alloc_context *ac,
 914                                int type)
 915{
 916        int i, status = -ENOSPC;
 917        int slot = __ocfs2_get_steal_slot(osb, type);
 918
 919        /* Start to steal resource from the first slot after ours. */
 920        if (slot == OCFS2_INVALID_SLOT)
 921                slot = osb->slot_num + 1;
 922
 923        for (i = 0; i < osb->max_slots; i++, slot++) {
 924                if (slot == osb->max_slots)
 925                        slot = 0;
 926
 927                if (slot == osb->slot_num)
 928                        continue;
 929
 930                status = ocfs2_reserve_suballoc_bits(osb, ac,
 931                                                     type,
 932                                                     (u32)slot, NULL,
 933                                                     NOT_ALLOC_NEW_GROUP);
 934                if (status >= 0) {
 935                        __ocfs2_set_steal_slot(osb, slot, type);
 936                        break;
 937                }
 938
 939                ocfs2_free_ac_resource(ac);
 940        }
 941
 942        return status;
 943}
 944
 945static int ocfs2_steal_inode(struct ocfs2_super *osb,
 946                             struct ocfs2_alloc_context *ac)
 947{
 948        return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
 949}
 950
 951static int ocfs2_steal_meta(struct ocfs2_super *osb,
 952                            struct ocfs2_alloc_context *ac)
 953{
 954        return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
 955}
 956
 957int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
 958                                      int blocks,
 959                                      struct ocfs2_alloc_context **ac)
 960{
 961        int status;
 962        int slot = ocfs2_get_meta_steal_slot(osb);
 963
 964        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 965        if (!(*ac)) {
 966                status = -ENOMEM;
 967                mlog_errno(status);
 968                goto bail;
 969        }
 970
 971        (*ac)->ac_bits_wanted = blocks;
 972        (*ac)->ac_which = OCFS2_AC_USE_META;
 973        (*ac)->ac_group_search = ocfs2_block_group_search;
 974
 975        if (slot != OCFS2_INVALID_SLOT &&
 976                atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
 977                goto extent_steal;
 978
 979        atomic_set(&osb->s_num_meta_stolen, 0);
 980        status = ocfs2_reserve_suballoc_bits(osb, (*ac),
 981                                             EXTENT_ALLOC_SYSTEM_INODE,
 982                                             (u32)osb->slot_num, NULL,
 983                                             ALLOC_GROUPS_FROM_GLOBAL|ALLOC_NEW_GROUP);
 984
 985
 986        if (status >= 0) {
 987                status = 0;
 988                if (slot != OCFS2_INVALID_SLOT)
 989                        ocfs2_init_meta_steal_slot(osb);
 990                goto bail;
 991        } else if (status < 0 && status != -ENOSPC) {
 992                mlog_errno(status);
 993                goto bail;
 994        }
 995
 996        ocfs2_free_ac_resource(*ac);
 997
 998extent_steal:
 999        status = ocfs2_steal_meta(osb, *ac);
1000        atomic_inc(&osb->s_num_meta_stolen);
1001        if (status < 0) {
1002                if (status != -ENOSPC)
1003                        mlog_errno(status);
1004                goto bail;
1005        }
1006
1007        status = 0;
1008bail:
1009        if ((status < 0) && *ac) {
1010                ocfs2_free_alloc_context(*ac);
1011                *ac = NULL;
1012        }
1013
1014        if (status)
1015                mlog_errno(status);
1016        return status;
1017}
1018
/*
 * Reserve the number of metadata blocks that
 * ocfs2_extend_meta_needed() computes for @root_el.  Return value and
 * *@ac follow ocfs2_reserve_new_metadata_blocks().
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
			       struct ocfs2_extent_list *root_el,
			       struct ocfs2_alloc_context **ac)
{
	int blocks = ocfs2_extend_meta_needed(root_el);

	return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
1027
1028int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1029                            struct ocfs2_alloc_context **ac)
1030{
1031        int status;
1032        int slot = ocfs2_get_inode_steal_slot(osb);
1033        u64 alloc_group;
1034
1035        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1036        if (!(*ac)) {
1037                status = -ENOMEM;
1038                mlog_errno(status);
1039                goto bail;
1040        }
1041
1042        (*ac)->ac_bits_wanted = 1;
1043        (*ac)->ac_which = OCFS2_AC_USE_INODE;
1044
1045        (*ac)->ac_group_search = ocfs2_block_group_search;
1046
1047        /*
1048         * stat(2) can't handle i_ino > 32bits, so we tell the
1049         * lower levels not to allocate us a block group past that
1050         * limit.  The 'inode64' mount option avoids this behavior.
1051         */
1052        if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
1053                (*ac)->ac_max_block = (u32)~0U;
1054
1055        /*
1056         * slot is set when we successfully steal inode from other nodes.
1057         * It is reset in 3 places:
1058         * 1. when we flush the truncate log
1059         * 2. when we complete local alloc recovery.
1060         * 3. when we successfully allocate from our own slot.
1061         * After it is set, we will go on stealing inodes until we find the
1062         * need to check our slots to see whether there is some space for us.
1063         */
1064        if (slot != OCFS2_INVALID_SLOT &&
1065            atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
1066                goto inode_steal;
1067
1068        atomic_set(&osb->s_num_inodes_stolen, 0);
1069        alloc_group = osb->osb_inode_alloc_group;
1070        status = ocfs2_reserve_suballoc_bits(osb, *ac,
1071                                             INODE_ALLOC_SYSTEM_INODE,
1072                                             (u32)osb->slot_num,
1073                                             &alloc_group,
1074                                             ALLOC_NEW_GROUP |
1075                                             ALLOC_GROUPS_FROM_GLOBAL);
1076        if (status >= 0) {
1077                status = 0;
1078
1079                spin_lock(&osb->osb_lock);
1080                osb->osb_inode_alloc_group = alloc_group;
1081                spin_unlock(&osb->osb_lock);
1082                trace_ocfs2_reserve_new_inode_new_group(
1083                        (unsigned long long)alloc_group);
1084
1085                /*
1086                 * Some inodes must be freed by us, so try to allocate
1087                 * from our own next time.
1088                 */
1089                if (slot != OCFS2_INVALID_SLOT)
1090                        ocfs2_init_inode_steal_slot(osb);
1091                goto bail;
1092        } else if (status < 0 && status != -ENOSPC) {
1093                mlog_errno(status);
1094                goto bail;
1095        }
1096
1097        ocfs2_free_ac_resource(*ac);
1098
1099inode_steal:
1100        status = ocfs2_steal_inode(osb, *ac);
1101        atomic_inc(&osb->s_num_inodes_stolen);
1102        if (status < 0) {
1103                if (status != -ENOSPC)
1104                        mlog_errno(status);
1105                goto bail;
1106        }
1107
1108        status = 0;
1109bail:
1110        if ((status < 0) && *ac) {
1111                ocfs2_free_alloc_context(*ac);
1112                *ac = NULL;
1113        }
1114
1115        if (status)
1116                mlog_errno(status);
1117        return status;
1118}
1119
1120/* local alloc code has to do the same thing, so rather than do this
1121 * twice.. */
1122int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
1123                                      struct ocfs2_alloc_context *ac)
1124{
1125        int status;
1126
1127        ac->ac_which = OCFS2_AC_USE_MAIN;
1128        ac->ac_group_search = ocfs2_cluster_group_search;
1129
1130        status = ocfs2_reserve_suballoc_bits(osb, ac,
1131                                             GLOBAL_BITMAP_SYSTEM_INODE,
1132                                             OCFS2_INVALID_SLOT, NULL,
1133                                             ALLOC_NEW_GROUP);
1134        if (status < 0 && status != -ENOSPC)
1135                mlog_errno(status);
1136
1137        return status;
1138}
1139
1140/* Callers don't need to care which bitmap (local alloc or main) to
1141 * use so we figure it out for them, but unfortunately this clutters
1142 * things a bit. */
1143static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1144                                             u32 bits_wanted, u64 max_block,
1145                                             int flags,
1146                                             struct ocfs2_alloc_context **ac)
1147{
1148        int status, ret = 0;
1149        int retried = 0;
1150
1151        *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1152        if (!(*ac)) {
1153                status = -ENOMEM;
1154                mlog_errno(status);
1155                goto bail;
1156        }
1157
1158        (*ac)->ac_bits_wanted = bits_wanted;
1159        (*ac)->ac_max_block = max_block;
1160
1161        status = -ENOSPC;
1162        if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
1163            ocfs2_alloc_should_use_local(osb, bits_wanted)) {
1164                status = ocfs2_reserve_local_alloc_bits(osb,
1165                                                        bits_wanted,
1166                                                        *ac);
1167                if ((status < 0) && (status != -ENOSPC)) {
1168                        mlog_errno(status);
1169                        goto bail;
1170                }
1171        }
1172
1173        if (status == -ENOSPC) {
1174retry:
1175                status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
1176                /* Retry if there is sufficient space cached in truncate log */
1177                if (status == -ENOSPC && !retried) {
1178                        retried = 1;
1179                        ocfs2_inode_unlock((*ac)->ac_inode, 1);
1180                        inode_unlock((*ac)->ac_inode);
1181
1182                        ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
1183                        if (ret == 1) {
1184                                iput((*ac)->ac_inode);
1185                                (*ac)->ac_inode = NULL;
1186                                goto retry;
1187                        }
1188
1189                        if (ret < 0)
1190                                mlog_errno(ret);
1191
1192                        inode_lock((*ac)->ac_inode);
1193                        ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
1194                        if (ret < 0) {
1195                                mlog_errno(ret);
1196                                inode_unlock((*ac)->ac_inode);
1197                                iput((*ac)->ac_inode);
1198                                (*ac)->ac_inode = NULL;
1199                                goto bail;
1200                        }
1201                }
1202                if (status < 0) {
1203                        if (status != -ENOSPC)
1204                                mlog_errno(status);
1205                        goto bail;
1206                }
1207        }
1208
1209        status = 0;
1210bail:
1211        if ((status < 0) && *ac) {
1212                ocfs2_free_alloc_context(*ac);
1213                *ac = NULL;
1214        }
1215
1216        if (status)
1217                mlog_errno(status);
1218        return status;
1219}
1220
1221int ocfs2_reserve_clusters(struct ocfs2_super *osb,
1222                           u32 bits_wanted,
1223                           struct ocfs2_alloc_context **ac)
1224{
1225        return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
1226                                                 ALLOC_NEW_GROUP, ac);
1227}
1228
1229/*
1230 * More or less lifted from ext3. I'll leave their description below:
1231 *
1232 * "For ext3 allocations, we must not reuse any blocks which are
1233 * allocated in the bitmap buffer's "last committed data" copy.  This
1234 * prevents deletes from freeing up the page for reuse until we have
1235 * committed the delete transaction.
1236 *
1237 * If we didn't do this, then deleting something and reallocating it as
1238 * data would allow the old block to be overwritten before the
1239 * transaction committed (because we force data to disk before commit).
1240 * This would lead to corruption if we crashed between overwriting the
1241 * data and committing the delete.
1242 *
1243 * @@@ We may want to make this allocation behaviour conditional on
1244 * data-writes at some point, and disable it for metadata allocations or
1245 * sync-data inodes."
1246 *
1247 * Note: OCFS2 already does this differently for metadata vs data
1248 * allocations, as those bitmaps are separate and undo access is never
1249 * called on a metadata group descriptor.
1250 */
1251static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1252                                         int nr)
1253{
1254        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1255        struct journal_head *jh;
1256        int ret;
1257
1258        if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1259                return 0;
1260
1261        if (!buffer_jbd(bg_bh))
1262                return 1;
1263
1264        jh = bh2jh(bg_bh);
1265        spin_lock(&jh->b_state_lock);
1266        bg = (struct ocfs2_group_desc *) jh->b_committed_data;
1267        if (bg)
1268                ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1269        else
1270                ret = 1;
1271        spin_unlock(&jh->b_state_lock);
1272
1273        return ret;
1274}
1275
1276static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1277                                             struct buffer_head *bg_bh,
1278                                             unsigned int bits_wanted,
1279                                             unsigned int total_bits,
1280                                             struct ocfs2_suballoc_result *res)
1281{
1282        void *bitmap;
1283        u16 best_offset, best_size;
1284        int offset, start, found, status = 0;
1285        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1286
1287        /* Callers got this descriptor from
1288         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1289        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1290
1291        found = start = best_offset = best_size = 0;
1292        bitmap = bg->bg_bitmap;
1293
1294        while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1295                if (offset == total_bits)
1296                        break;
1297
1298                if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1299                        /* We found a zero, but we can't use it as it
1300                         * hasn't been put to disk yet! */
1301                        found = 0;
1302                        start = offset + 1;
1303                } else if (offset == start) {
1304                        /* we found a zero */
1305                        found++;
1306                        /* move start to the next bit to test */
1307                        start++;
1308                } else {
1309                        /* got a zero after some ones */
1310                        found = 1;
1311                        start = offset + 1;
1312                }
1313                if (found > best_size) {
1314                        best_size = found;
1315                        best_offset = start - found;
1316                }
1317                /* we got everything we needed */
1318                if (found == bits_wanted) {
1319                        /* mlog(0, "Found it all!\n"); */
1320                        break;
1321                }
1322        }
1323
1324        if (best_size) {
1325                res->sr_bit_offset = best_offset;
1326                res->sr_bits = best_size;
1327        } else {
1328                status = -ENOSPC;
1329                /* No error log here -- see the comment above
1330                 * ocfs2_test_bg_bit_allocatable */
1331        }
1332
1333        return status;
1334}
1335
1336int ocfs2_block_group_set_bits(handle_t *handle,
1337                                             struct inode *alloc_inode,
1338                                             struct ocfs2_group_desc *bg,
1339                                             struct buffer_head *group_bh,
1340                                             unsigned int bit_off,
1341                                             unsigned int num_bits)
1342{
1343        int status;
1344        void *bitmap = bg->bg_bitmap;
1345        int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1346
1347        /* All callers get the descriptor via
1348         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1349        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1350        BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1351
1352        trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1353
1354        if (ocfs2_is_cluster_bitmap(alloc_inode))
1355                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1356
1357        status = ocfs2_journal_access_gd(handle,
1358                                         INODE_CACHE(alloc_inode),
1359                                         group_bh,
1360                                         journal_type);
1361        if (status < 0) {
1362                mlog_errno(status);
1363                goto bail;
1364        }
1365
1366        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1367        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
1368                return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
1369                                   (unsigned long long)le64_to_cpu(bg->bg_blkno),
1370                                   le16_to_cpu(bg->bg_bits),
1371                                   le16_to_cpu(bg->bg_free_bits_count),
1372                                   num_bits);
1373        }
1374        while(num_bits--)
1375                ocfs2_set_bit(bit_off++, bitmap);
1376
1377        ocfs2_journal_dirty(handle, group_bh);
1378
1379bail:
1380        return status;
1381}
1382
1383/* find the one with the most empty bits */
1384static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1385{
1386        u16 curr, best;
1387
1388        BUG_ON(!cl->cl_next_free_rec);
1389
1390        best = curr = 0;
1391        while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1392                if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1393                    le32_to_cpu(cl->cl_recs[best].c_free))
1394                        best = curr;
1395                curr++;
1396        }
1397
1398        BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1399        return best;
1400}
1401
1402static int ocfs2_relink_block_group(handle_t *handle,
1403                                    struct inode *alloc_inode,
1404                                    struct buffer_head *fe_bh,
1405                                    struct buffer_head *bg_bh,
1406                                    struct buffer_head *prev_bg_bh,
1407                                    u16 chain)
1408{
1409        int status;
1410        /* there is a really tiny chance the journal calls could fail,
1411         * but we wouldn't want inconsistent blocks in *any* case. */
1412        u64 bg_ptr, prev_bg_ptr;
1413        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1414        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1415        struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1416
1417        /* The caller got these descriptors from
1418         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1419        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1420        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1421
1422        trace_ocfs2_relink_block_group(
1423                (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1424                (unsigned long long)le64_to_cpu(bg->bg_blkno),
1425                (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1426
1427        bg_ptr = le64_to_cpu(bg->bg_next_group);
1428        prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1429
1430        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1431                                         prev_bg_bh,
1432                                         OCFS2_JOURNAL_ACCESS_WRITE);
1433        if (status < 0)
1434                goto out;
1435
1436        prev_bg->bg_next_group = bg->bg_next_group;
1437        ocfs2_journal_dirty(handle, prev_bg_bh);
1438
1439        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1440                                         bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1441        if (status < 0)
1442                goto out_rollback_prev_bg;
1443
1444        bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1445        ocfs2_journal_dirty(handle, bg_bh);
1446
1447        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1448                                         fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1449        if (status < 0)
1450                goto out_rollback_bg;
1451
1452        fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1453        ocfs2_journal_dirty(handle, fe_bh);
1454
1455out:
1456        if (status < 0)
1457                mlog_errno(status);
1458        return status;
1459
1460out_rollback_bg:
1461        bg->bg_next_group = cpu_to_le64(bg_ptr);
1462out_rollback_prev_bg:
1463        prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1464        goto out;
1465}
1466
1467static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1468                                                     u32 wanted)
1469{
1470        return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1471}
1472
1473/* return 0 on success, -ENOSPC to keep searching and any other < 0
1474 * value on error. */
1475static int ocfs2_cluster_group_search(struct inode *inode,
1476                                      struct buffer_head *group_bh,
1477                                      u32 bits_wanted, u32 min_bits,
1478                                      u64 max_block,
1479                                      struct ocfs2_suballoc_result *res)
1480{
1481        int search = -ENOSPC;
1482        int ret;
1483        u64 blkoff;
1484        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1485        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1486        unsigned int max_bits, gd_cluster_off;
1487
1488        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1489
1490        if (gd->bg_free_bits_count) {
1491                max_bits = le16_to_cpu(gd->bg_bits);
1492
1493                /* Tail groups in cluster bitmaps which aren't cpg
1494                 * aligned are prone to partial extension by a failed
1495                 * fs resize. If the file system resize never got to
1496                 * update the dinode cluster count, then we don't want
1497                 * to trust any clusters past it, regardless of what
1498                 * the group descriptor says. */
1499                gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1500                                                          le64_to_cpu(gd->bg_blkno));
1501                if ((gd_cluster_off + max_bits) >
1502                    OCFS2_I(inode)->ip_clusters) {
1503                        max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1504                        trace_ocfs2_cluster_group_search_wrong_max_bits(
1505                                (unsigned long long)le64_to_cpu(gd->bg_blkno),
1506                                le16_to_cpu(gd->bg_bits),
1507                                OCFS2_I(inode)->ip_clusters, max_bits);
1508                }
1509
1510                ret = ocfs2_block_group_find_clear_bits(osb,
1511                                                        group_bh, bits_wanted,
1512                                                        max_bits, res);
1513                if (ret)
1514                        return ret;
1515
1516                if (max_block) {
1517                        blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1518                                                          gd_cluster_off +
1519                                                          res->sr_bit_offset +
1520                                                          res->sr_bits);
1521                        trace_ocfs2_cluster_group_search_max_block(
1522                                (unsigned long long)blkoff,
1523                                (unsigned long long)max_block);
1524                        if (blkoff > max_block)
1525                                return -ENOSPC;
1526                }
1527
1528                /* ocfs2_block_group_find_clear_bits() might
1529                 * return success, but we still want to return
1530                 * -ENOSPC unless it found the minimum number
1531                 * of bits. */
1532                if (min_bits <= res->sr_bits)
1533                        search = 0; /* success */
1534                else if (res->sr_bits) {
1535                        /*
1536                         * Don't show bits which we'll be returning
1537                         * for allocation to the local alloc bitmap.
1538                         */
1539                        ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits);
1540                }
1541        }
1542
1543        return search;
1544}
1545
1546static int ocfs2_block_group_search(struct inode *inode,
1547                                    struct buffer_head *group_bh,
1548                                    u32 bits_wanted, u32 min_bits,
1549                                    u64 max_block,
1550                                    struct ocfs2_suballoc_result *res)
1551{
1552        int ret = -ENOSPC;
1553        u64 blkoff;
1554        struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1555
1556        BUG_ON(min_bits != 1);
1557        BUG_ON(ocfs2_is_cluster_bitmap(inode));
1558
1559        if (bg->bg_free_bits_count) {
1560                ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1561                                                        group_bh, bits_wanted,
1562                                                        le16_to_cpu(bg->bg_bits),
1563                                                        res);
1564                if (!ret && max_block) {
1565                        blkoff = le64_to_cpu(bg->bg_blkno) +
1566                                res->sr_bit_offset + res->sr_bits;
1567                        trace_ocfs2_block_group_search_max_block(
1568                                (unsigned long long)blkoff,
1569                                (unsigned long long)max_block);
1570                        if (blkoff > max_block)
1571                                ret = -ENOSPC;
1572                }
1573        }
1574
1575        return ret;
1576}
1577
1578int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1579                                       handle_t *handle,
1580                                       struct buffer_head *di_bh,
1581                                       u32 num_bits,
1582                                       u16 chain)
1583{
1584        int ret;
1585        u32 tmp_used;
1586        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1587        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1588
1589        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1590                                      OCFS2_JOURNAL_ACCESS_WRITE);
1591        if (ret < 0) {
1592                mlog_errno(ret);
1593                goto out;
1594        }
1595
1596        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1597        di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1598        le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1599        ocfs2_journal_dirty(handle, di_bh);
1600
1601out:
1602        return ret;
1603}
1604
1605void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1606                                       struct buffer_head *di_bh,
1607                                       u32 num_bits,
1608                                       u16 chain)
1609{
1610        u32 tmp_used;
1611        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1612        struct ocfs2_chain_list *cl;
1613
1614        cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1615        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1616        di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1617        le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1618}
1619
1620static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1621                                         struct ocfs2_extent_rec *rec,
1622                                         struct ocfs2_chain_list *cl)
1623{
1624        unsigned int bpc = le16_to_cpu(cl->cl_bpc);
1625        unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
1626        unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
1627
1628        if (res->sr_bit_offset < bitoff)
1629                return 0;
1630        if (res->sr_bit_offset >= (bitoff + bitcount))
1631                return 0;
1632        res->sr_blkno = le64_to_cpu(rec->e_blkno) +
1633                (res->sr_bit_offset - bitoff);
1634        if ((res->sr_bit_offset + res->sr_bits) > (bitoff + bitcount))
1635                res->sr_bits = (bitoff + bitcount) - res->sr_bit_offset;
1636        return 1;
1637}
1638
1639static void ocfs2_bg_discontig_fix_result(struct ocfs2_alloc_context *ac,
1640                                          struct ocfs2_group_desc *bg,
1641                                          struct ocfs2_suballoc_result *res)
1642{
1643        int i;
1644        u64 bg_blkno = res->sr_bg_blkno;  /* Save off */
1645        struct ocfs2_extent_rec *rec;
1646        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1647        struct ocfs2_chain_list *cl = &di->id2.i_chain;
1648
1649        if (ocfs2_is_cluster_bitmap(ac->ac_inode)) {
1650                res->sr_blkno = 0;
1651                return;
1652        }
1653
1654        res->sr_blkno = res->sr_bg_blkno + res->sr_bit_offset;
1655        res->sr_bg_blkno = 0;  /* Clear it for contig block groups */
1656        if (!ocfs2_supports_discontig_bg(OCFS2_SB(ac->ac_inode->i_sb)) ||
1657            !bg->bg_list.l_next_free_rec)
1658                return;
1659
1660        for (i = 0; i < le16_to_cpu(bg->bg_list.l_next_free_rec); i++) {
1661                rec = &bg->bg_list.l_recs[i];
1662                if (ocfs2_bg_discontig_fix_by_rec(res, rec, cl)) {
1663                        res->sr_bg_blkno = bg_blkno;  /* Restore */
1664                        break;
1665                }
1666        }
1667}
1668
1669static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1670                                  handle_t *handle,
1671                                  u32 bits_wanted,
1672                                  u32 min_bits,
1673                                  struct ocfs2_suballoc_result *res,
1674                                  u16 *bits_left)
1675{
1676        int ret;
1677        struct buffer_head *group_bh = NULL;
1678        struct ocfs2_group_desc *gd;
1679        struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1680        struct inode *alloc_inode = ac->ac_inode;
1681
1682        ret = ocfs2_read_group_descriptor(alloc_inode, di,
1683                                          res->sr_bg_blkno, &group_bh);
1684        if (ret < 0) {
1685                mlog_errno(ret);
1686                return ret;
1687        }
1688
1689        gd = (struct ocfs2_group_desc *) group_bh->b_data;
1690        ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1691                                  ac->ac_max_block, res);
1692        if (ret < 0) {
1693                if (ret != -ENOSPC)
1694                        mlog_errno(ret);
1695                goto out;
1696        }
1697
1698        if (!ret)
1699                ocfs2_bg_discontig_fix_result(ac, gd, res);
1700
1701        /*
1702         * sr_bg_blkno might have been changed by
1703         * ocfs2_bg_discontig_fix_result
1704         */
1705        res->sr_bg_stable_blkno = group_bh->b_blocknr;
1706
1707        if (ac->ac_find_loc_only)
1708                goto out_loc_only;
1709
1710        ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1711                                               res->sr_bits,
1712                                               le16_to_cpu(gd->bg_chain));
1713        if (ret < 0) {
1714                mlog_errno(ret);
1715                goto out;
1716        }
1717
1718        ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1719                                         res->sr_bit_offset, res->sr_bits);
1720        if (ret < 0) {
1721                ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1722                                               res->sr_bits,
1723                                               le16_to_cpu(gd->bg_chain));
1724                mlog_errno(ret);
1725        }
1726
1727out_loc_only:
1728        *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1729
1730out:
1731        brelse(group_bh);
1732
1733        return ret;
1734}
1735
1736static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1737                              handle_t *handle,
1738                              u32 bits_wanted,
1739                              u32 min_bits,
1740                              struct ocfs2_suballoc_result *res,
1741                              u16 *bits_left)
1742{
1743        int status;
1744        u16 chain;
1745        u64 next_group;
1746        struct inode *alloc_inode = ac->ac_inode;
1747        struct buffer_head *group_bh = NULL;
1748        struct buffer_head *prev_group_bh = NULL;
1749        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1750        struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1751        struct ocfs2_group_desc *bg;
1752
1753        chain = ac->ac_chain;
1754        trace_ocfs2_search_chain_begin(
1755                (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1756                bits_wanted, chain);
1757
1758        status = ocfs2_read_group_descriptor(alloc_inode, fe,
1759                                             le64_to_cpu(cl->cl_recs[chain].c_blkno),
1760                                             &group_bh);
1761        if (status < 0) {
1762                mlog_errno(status);
1763                goto bail;
1764        }
1765        bg = (struct ocfs2_group_desc *) group_bh->b_data;
1766
1767        status = -ENOSPC;
1768        /* for now, the chain search is a bit simplistic. We just use
1769         * the 1st group with any empty bits. */
1770        while ((status = ac->ac_group_search(alloc_inode, group_bh,
1771                                             bits_wanted, min_bits,
1772                                             ac->ac_max_block,
1773                                             res)) == -ENOSPC) {
1774                if (!bg->bg_next_group)
1775                        break;
1776
1777                brelse(prev_group_bh);
1778                prev_group_bh = NULL;
1779
1780                next_group = le64_to_cpu(bg->bg_next_group);
1781                prev_group_bh = group_bh;
1782                group_bh = NULL;
1783                status = ocfs2_read_group_descriptor(alloc_inode, fe,
1784                                                     next_group, &group_bh);
1785                if (status < 0) {
1786                        mlog_errno(status);
1787                        goto bail;
1788                }
1789                bg = (struct ocfs2_group_desc *) group_bh->b_data;
1790        }
1791        if (status < 0) {
1792                if (status != -ENOSPC)
1793                        mlog_errno(status);
1794                goto bail;
1795        }
1796
1797        trace_ocfs2_search_chain_succ(
1798                (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1799
1800        res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1801
1802        BUG_ON(res->sr_bits == 0);
1803        if (!status)
1804                ocfs2_bg_discontig_fix_result(ac, bg, res);
1805
1806        /*
1807         * sr_bg_blkno might have been changed by
1808         * ocfs2_bg_discontig_fix_result
1809         */
1810        res->sr_bg_stable_blkno = group_bh->b_blocknr;
1811
1812        /*
1813         * Keep track of previous block descriptor read. When
1814         * we find a target, if we have read more than X
1815         * number of descriptors, and the target is reasonably
1816         * empty, relink him to top of his chain.
1817         *
1818         * We've read 0 extra blocks and only send one more to
1819         * the transaction, yet the next guy to search has a
1820         * much easier time.
1821         *
1822         * Do this *after* figuring out how many bits we're taking out
1823         * of our target group.
1824         */
1825        if (!ac->ac_disable_chain_relink &&
1826            (prev_group_bh) &&
1827            (ocfs2_block_group_reasonably_empty(bg, res->sr_bits))) {
1828                status = ocfs2_relink_block_group(handle, alloc_inode,
1829                                                  ac->ac_bh, group_bh,
1830                                                  prev_group_bh, chain);
1831                if (status < 0) {
1832                        mlog_errno(status);
1833                        goto bail;
1834                }
1835        }
1836
1837        if (ac->ac_find_loc_only)
1838                goto out_loc_only;
1839
1840        status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle,
1841                                                  ac->ac_bh, res->sr_bits,
1842                                                  chain);
1843        if (status) {
1844                mlog_errno(status);
1845                goto bail;
1846        }
1847
1848        status = ocfs2_block_group_set_bits(handle,
1849                                            alloc_inode,
1850                                            bg,
1851                                            group_bh,
1852                                            res->sr_bit_offset,
1853                                            res->sr_bits);
1854        if (status < 0) {
1855                ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1856                                        ac->ac_bh, res->sr_bits, chain);
1857                mlog_errno(status);
1858                goto bail;
1859        }
1860
1861        trace_ocfs2_search_chain_end(
1862                        (unsigned long long)le64_to_cpu(fe->i_blkno),
1863                        res->sr_bits);
1864
1865out_loc_only:
1866        *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1867bail:
1868        brelse(group_bh);
1869        brelse(prev_group_bh);
1870
1871        if (status)
1872                mlog_errno(status);
1873        return status;
1874}
1875
1876/* will give out up to bits_wanted contiguous bits. */
1877static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1878                                     handle_t *handle,
1879                                     u32 bits_wanted,
1880                                     u32 min_bits,
1881                                     struct ocfs2_suballoc_result *res)
1882{
1883        int status;
1884        u16 victim, i;
1885        u16 bits_left = 0;
1886        u64 hint = ac->ac_last_group;
1887        struct ocfs2_chain_list *cl;
1888        struct ocfs2_dinode *fe;
1889
1890        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1891        BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1892        BUG_ON(!ac->ac_bh);
1893
1894        fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1895
1896        /* The bh was validated by the inode read during
1897         * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1898        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1899
1900        if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1901            le32_to_cpu(fe->id1.bitmap1.i_total)) {
1902                status = ocfs2_error(ac->ac_inode->i_sb,
1903                                     "Chain allocator dinode %llu has %u used bits but only %u total\n",
1904                                     (unsigned long long)le64_to_cpu(fe->i_blkno),
1905                                     le32_to_cpu(fe->id1.bitmap1.i_used),
1906                                     le32_to_cpu(fe->id1.bitmap1.i_total));
1907                goto bail;
1908        }
1909
1910        res->sr_bg_blkno = hint;
1911        if (res->sr_bg_blkno) {
1912                /* Attempt to short-circuit the usual search mechanism
1913                 * by jumping straight to the most recently used
1914                 * allocation group. This helps us maintain some
1915                 * contiguousness across allocations. */
1916                status = ocfs2_search_one_group(ac, handle, bits_wanted,
1917                                                min_bits, res, &bits_left);
1918                if (!status)
1919                        goto set_hint;
1920                if (status < 0 && status != -ENOSPC) {
1921                        mlog_errno(status);
1922                        goto bail;
1923                }
1924        }
1925
1926        cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1927
1928        victim = ocfs2_find_victim_chain(cl);
1929        ac->ac_chain = victim;
1930
1931        status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1932                                    res, &bits_left);
1933        if (!status) {
1934                if (ocfs2_is_cluster_bitmap(ac->ac_inode))
1935                        hint = res->sr_bg_blkno;
1936                else
1937                        hint = ocfs2_group_from_res(res);
1938                goto set_hint;
1939        }
1940        if (status < 0 && status != -ENOSPC) {
1941                mlog_errno(status);
1942                goto bail;
1943        }
1944
1945        trace_ocfs2_claim_suballoc_bits(victim);
1946
1947        /* If we didn't pick a good victim, then just default to
1948         * searching each chain in order. Don't allow chain relinking
1949         * because we only calculate enough journal credits for one
1950         * relink per alloc. */
1951        ac->ac_disable_chain_relink = 1;
1952        for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1953                if (i == victim)
1954                        continue;
1955                if (!cl->cl_recs[i].c_free)
1956                        continue;
1957
1958                ac->ac_chain = i;
1959                status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1960                                            res, &bits_left);
1961                if (!status) {
1962                        hint = ocfs2_group_from_res(res);
1963                        break;
1964                }
1965                if (status < 0 && status != -ENOSPC) {
1966                        mlog_errno(status);
1967                        goto bail;
1968                }
1969        }
1970
1971set_hint:
1972        if (status != -ENOSPC) {
1973                /* If the next search of this group is not likely to
1974                 * yield a suitable extent, then we reset the last
1975                 * group hint so as to not waste a disk read */
1976                if (bits_left < min_bits)
1977                        ac->ac_last_group = 0;
1978                else
1979                        ac->ac_last_group = hint;
1980        }
1981
1982bail:
1983        if (status)
1984                mlog_errno(status);
1985        return status;
1986}
1987
1988int ocfs2_claim_metadata(handle_t *handle,
1989                         struct ocfs2_alloc_context *ac,
1990                         u32 bits_wanted,
1991                         u64 *suballoc_loc,
1992                         u16 *suballoc_bit_start,
1993                         unsigned int *num_bits,
1994                         u64 *blkno_start)
1995{
1996        int status;
1997        struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
1998
1999        BUG_ON(!ac);
2000        BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
2001        BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
2002
2003        status = ocfs2_claim_suballoc_bits(ac,
2004                                           handle,
2005                                           bits_wanted,
2006                                           1,
2007                                           &res);
2008        if (status < 0) {
2009                mlog_errno(status);
2010                goto bail;
2011        }
2012        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2013
2014        *suballoc_loc = res.sr_bg_blkno;
2015        *suballoc_bit_start = res.sr_bit_offset;
2016        *blkno_start = res.sr_blkno;
2017        ac->ac_bits_given += res.sr_bits;
2018        *num_bits = res.sr_bits;
2019        status = 0;
2020bail:
2021        if (status)
2022                mlog_errno(status);
2023        return status;
2024}
2025
2026static void ocfs2_init_inode_ac_group(struct inode *dir,
2027                                      struct buffer_head *parent_di_bh,
2028                                      struct ocfs2_alloc_context *ac)
2029{
2030        struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_di_bh->b_data;
2031        /*
2032         * Try to allocate inodes from some specific group.
2033         *
2034         * If the parent dir has recorded the last group used in allocation,
2035         * cool, use it. Otherwise if we try to allocate new inode from the
2036         * same slot the parent dir belongs to, use the same chunk.
2037         *
2038         * We are very careful here to avoid the mistake of setting
2039         * ac_last_group to a group descriptor from a different (unlocked) slot.
2040         */
2041        if (OCFS2_I(dir)->ip_last_used_group &&
2042            OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
2043                ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
2044        else if (le16_to_cpu(di->i_suballoc_slot) == ac->ac_alloc_slot) {
2045                if (di->i_suballoc_loc)
2046                        ac->ac_last_group = le64_to_cpu(di->i_suballoc_loc);
2047                else
2048                        ac->ac_last_group = ocfs2_which_suballoc_group(
2049                                        le64_to_cpu(di->i_blkno),
2050                                        le16_to_cpu(di->i_suballoc_bit));
2051        }
2052}
2053
2054static inline void ocfs2_save_inode_ac_group(struct inode *dir,
2055                                             struct ocfs2_alloc_context *ac)
2056{
2057        OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
2058        OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
2059}
2060
2061int ocfs2_find_new_inode_loc(struct inode *dir,
2062                             struct buffer_head *parent_fe_bh,
2063                             struct ocfs2_alloc_context *ac,
2064                             u64 *fe_blkno)
2065{
2066        int ret;
2067        handle_t *handle = NULL;
2068        struct ocfs2_suballoc_result *res;
2069
2070        BUG_ON(!ac);
2071        BUG_ON(ac->ac_bits_given != 0);
2072        BUG_ON(ac->ac_bits_wanted != 1);
2073        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2074
2075        res = kzalloc(sizeof(*res), GFP_NOFS);
2076        if (res == NULL) {
2077                ret = -ENOMEM;
2078                mlog_errno(ret);
2079                goto out;
2080        }
2081
2082        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2083
2084        /*
2085         * The handle started here is for chain relink. Alternatively,
2086         * we could just disable relink for these calls.
2087         */
2088        handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC);
2089        if (IS_ERR(handle)) {
2090                ret = PTR_ERR(handle);
2091                handle = NULL;
2092                mlog_errno(ret);
2093                goto out;
2094        }
2095
2096        /*
2097         * This will instruct ocfs2_claim_suballoc_bits and
2098         * ocfs2_search_one_group to search but save actual allocation
2099         * for later.
2100         */
2101        ac->ac_find_loc_only = 1;
2102
2103        ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res);
2104        if (ret < 0) {
2105                mlog_errno(ret);
2106                goto out;
2107        }
2108
2109        ac->ac_find_loc_priv = res;
2110        *fe_blkno = res->sr_blkno;
2111        ocfs2_update_inode_fsync_trans(handle, dir, 0);
2112out:
2113        if (handle)
2114                ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
2115
2116        if (ret)
2117                kfree(res);
2118
2119        return ret;
2120}
2121
2122int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2123                                 struct inode *dir,
2124                                 struct ocfs2_alloc_context *ac,
2125                                 u64 *suballoc_loc,
2126                                 u16 *suballoc_bit,
2127                                 u64 di_blkno)
2128{
2129        int ret;
2130        u16 chain;
2131        struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv;
2132        struct buffer_head *bg_bh = NULL;
2133        struct ocfs2_group_desc *bg;
2134        struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data;
2135
2136        /*
2137         * Since di_blkno is being passed back in, we check for any
2138         * inconsistencies which may have happened between
2139         * calls. These are code bugs as di_blkno is not expected to
2140         * change once returned from ocfs2_find_new_inode_loc()
2141         */
2142        BUG_ON(res->sr_blkno != di_blkno);
2143
2144        ret = ocfs2_read_group_descriptor(ac->ac_inode, di,
2145                                          res->sr_bg_stable_blkno, &bg_bh);
2146        if (ret) {
2147                mlog_errno(ret);
2148                goto out;
2149        }
2150
2151        bg = (struct ocfs2_group_desc *) bg_bh->b_data;
2152        chain = le16_to_cpu(bg->bg_chain);
2153
2154        ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle,
2155                                               ac->ac_bh, res->sr_bits,
2156                                               chain);
2157        if (ret) {
2158                mlog_errno(ret);
2159                goto out;
2160        }
2161
2162        ret = ocfs2_block_group_set_bits(handle,
2163                                         ac->ac_inode,
2164                                         bg,
2165                                         bg_bh,
2166                                         res->sr_bit_offset,
2167                                         res->sr_bits);
2168        if (ret < 0) {
2169                ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2170                                               ac->ac_bh, res->sr_bits, chain);
2171                mlog_errno(ret);
2172                goto out;
2173        }
2174
2175        trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2176                                           res->sr_bits);
2177
2178        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2179
2180        BUG_ON(res->sr_bits != 1);
2181
2182        *suballoc_loc = res->sr_bg_blkno;
2183        *suballoc_bit = res->sr_bit_offset;
2184        ac->ac_bits_given++;
2185        ocfs2_save_inode_ac_group(dir, ac);
2186
2187out:
2188        brelse(bg_bh);
2189
2190        return ret;
2191}
2192
2193int ocfs2_claim_new_inode(handle_t *handle,
2194                          struct inode *dir,
2195                          struct buffer_head *parent_fe_bh,
2196                          struct ocfs2_alloc_context *ac,
2197                          u64 *suballoc_loc,
2198                          u16 *suballoc_bit,
2199                          u64 *fe_blkno)
2200{
2201        int status;
2202        struct ocfs2_suballoc_result res;
2203
2204        BUG_ON(!ac);
2205        BUG_ON(ac->ac_bits_given != 0);
2206        BUG_ON(ac->ac_bits_wanted != 1);
2207        BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
2208
2209        ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
2210
2211        status = ocfs2_claim_suballoc_bits(ac,
2212                                           handle,
2213                                           1,
2214                                           1,
2215                                           &res);
2216        if (status < 0) {
2217                mlog_errno(status);
2218                goto bail;
2219        }
2220        atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2221
2222        BUG_ON(res.sr_bits != 1);
2223
2224        *suballoc_loc = res.sr_bg_blkno;
2225        *suballoc_bit = res.sr_bit_offset;
2226        *fe_blkno = res.sr_blkno;
2227        ac->ac_bits_given++;
2228        ocfs2_save_inode_ac_group(dir, ac);
2229        status = 0;
2230bail:
2231        if (status)
2232                mlog_errno(status);
2233        return status;
2234}
2235
2236/* translate a group desc. blkno and it's bitmap offset into
2237 * disk cluster offset. */
2238static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
2239                                                   u64 bg_blkno,
2240                                                   u16 bg_bit_off)
2241{
2242        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2243        u32 cluster = 0;
2244
2245        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2246
2247        if (bg_blkno != osb->first_cluster_group_blkno)
2248                cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
2249        cluster += (u32) bg_bit_off;
2250        return cluster;
2251}
2252
2253/* given a cluster offset, calculate which block group it belongs to
2254 * and return that block offset. */
2255u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
2256{
2257        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2258        u32 group_no;
2259
2260        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2261
2262        group_no = cluster / osb->bitmap_cpg;
2263        if (!group_no)
2264                return osb->first_cluster_group_blkno;
2265        return ocfs2_clusters_to_blocks(inode->i_sb,
2266                                        group_no * osb->bitmap_cpg);
2267}
2268
2269/* given the block number of a cluster start, calculate which cluster
2270 * group and descriptor bitmap offset that corresponds to. */
2271static inline void ocfs2_block_to_cluster_group(struct inode *inode,
2272                                                u64 data_blkno,
2273                                                u64 *bg_blkno,
2274                                                u16 *bg_bit_off)
2275{
2276        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2277        u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
2278
2279        BUG_ON(!ocfs2_is_cluster_bitmap(inode));
2280
2281        *bg_blkno = ocfs2_which_cluster_group(inode,
2282                                              data_cluster);
2283
2284        if (*bg_blkno == osb->first_cluster_group_blkno)
2285                *bg_bit_off = (u16) data_cluster;
2286        else
2287                *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
2288                                                             data_blkno - *bg_blkno);
2289}
2290
2291/*
2292 * min_bits - minimum contiguous chunk from this total allocation we
2293 * can handle. set to what we asked for originally for a full
2294 * contig. allocation, set to '1' to indicate we can deal with extents
2295 * of any size.
2296 */
2297int __ocfs2_claim_clusters(handle_t *handle,
2298                           struct ocfs2_alloc_context *ac,
2299                           u32 min_clusters,
2300                           u32 max_clusters,
2301                           u32 *cluster_start,
2302                           u32 *num_clusters)
2303{
2304        int status;
2305        unsigned int bits_wanted = max_clusters;
2306        struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2307        struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2308
2309        BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2310
2311        BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2312               && ac->ac_which != OCFS2_AC_USE_MAIN);
2313
2314        if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2315                WARN_ON(min_clusters > 1);
2316
2317                status = ocfs2_claim_local_alloc_bits(osb,
2318                                                      handle,
2319                                                      ac,
2320                                                      bits_wanted,
2321                                                      cluster_start,
2322                                                      num_clusters);
2323                if (!status)
2324                        atomic_inc(&osb->alloc_stats.local_data);
2325        } else {
2326                if (min_clusters > (osb->bitmap_cpg - 1)) {
2327                        /* The only paths asking for contiguousness
2328                         * should know about this already. */
2329                        mlog(ML_ERROR, "minimum allocation requested %u exceeds "
2330                             "group bitmap size %u!\n", min_clusters,
2331                             osb->bitmap_cpg);
2332                        status = -ENOSPC;
2333                        goto bail;
2334                }
2335                /* clamp the current request down to a realistic size. */
2336                if (bits_wanted > (osb->bitmap_cpg - 1))
2337                        bits_wanted = osb->bitmap_cpg - 1;
2338
2339                status = ocfs2_claim_suballoc_bits(ac,
2340                                                   handle,
2341                                                   bits_wanted,
2342                                                   min_clusters,
2343                                                   &res);
2344                if (!status) {
2345                        BUG_ON(res.sr_blkno); /* cluster alloc can't set */
2346                        *cluster_start =
2347                                ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
2348                                                                 res.sr_bg_blkno,
2349                                                                 res.sr_bit_offset);
2350                        atomic_inc(&osb->alloc_stats.bitmap_data);
2351                        *num_clusters = res.sr_bits;
2352                }
2353        }
2354        if (status < 0) {
2355                if (status != -ENOSPC)
2356                        mlog_errno(status);
2357                goto bail;
2358        }
2359
2360        ac->ac_bits_given += *num_clusters;
2361
2362bail:
2363        if (status)
2364                mlog_errno(status);
2365        return status;
2366}
2367
2368int ocfs2_claim_clusters(handle_t *handle,
2369                         struct ocfs2_alloc_context *ac,
2370                         u32 min_clusters,
2371                         u32 *cluster_start,
2372                         u32 *num_clusters)
2373{
2374        unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
2375
2376        return __ocfs2_claim_clusters(handle, ac, min_clusters,
2377                                      bits_wanted, cluster_start, num_clusters);
2378}
2379
2380static int ocfs2_block_group_clear_bits(handle_t *handle,
2381                                        struct inode *alloc_inode,
2382                                        struct ocfs2_group_desc *bg,
2383                                        struct buffer_head *group_bh,
2384                                        unsigned int bit_off,
2385                                        unsigned int num_bits,
2386                                        void (*undo_fn)(unsigned int bit,
2387                                                        unsigned long *bmap))
2388{
2389        int status;
2390        unsigned int tmp;
2391        struct ocfs2_group_desc *undo_bg = NULL;
2392        struct journal_head *jh;
2393
2394        /* The caller got this descriptor from
2395         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
2396        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2397
2398        trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2399
2400        BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2401        status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
2402                                         group_bh,
2403                                         undo_fn ?
2404                                         OCFS2_JOURNAL_ACCESS_UNDO :
2405                                         OCFS2_JOURNAL_ACCESS_WRITE);
2406        if (status < 0) {
2407                mlog_errno(status);
2408                goto bail;
2409        }
2410
2411        jh = bh2jh(group_bh);
2412        if (undo_fn) {
2413                spin_lock(&jh->b_state_lock);
2414                undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
2415                BUG_ON(!undo_bg);
2416        }
2417
2418        tmp = num_bits;
2419        while(tmp--) {
2420                ocfs2_clear_bit((bit_off + tmp),
2421                                (unsigned long *) bg->bg_bitmap);
2422                if (undo_fn)
2423                        undo_fn(bit_off + tmp,
2424                                (unsigned long *) undo_bg->bg_bitmap);
2425        }
2426        le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2427        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
2428                if (undo_fn)
2429                        spin_unlock(&jh->b_state_lock);
2430                return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
2431                                   (unsigned long long)le64_to_cpu(bg->bg_blkno),
2432                                   le16_to_cpu(bg->bg_bits),
2433                                   le16_to_cpu(bg->bg_free_bits_count),
2434                                   num_bits);
2435        }
2436
2437        if (undo_fn)
2438                spin_unlock(&jh->b_state_lock);
2439
2440        ocfs2_journal_dirty(handle, group_bh);
2441bail:
2442        return status;
2443}
2444
2445/*
2446 * expects the suballoc inode to already be locked.
2447 */
2448static int _ocfs2_free_suballoc_bits(handle_t *handle,
2449                                     struct inode *alloc_inode,
2450                                     struct buffer_head *alloc_bh,
2451                                     unsigned int start_bit,
2452                                     u64 bg_blkno,
2453                                     unsigned int count,
2454                                     void (*undo_fn)(unsigned int bit,
2455                                                     unsigned long *bitmap))
2456{
2457        int status = 0;
2458        u32 tmp_used;
2459        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2460        struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2461        struct buffer_head *group_bh = NULL;
2462        struct ocfs2_group_desc *group;
2463
2464        /* The alloc_bh comes from ocfs2_free_dinode() or
2465         * ocfs2_free_clusters().  The callers have all locked the
2466         * allocator and gotten alloc_bh from the lock call.  This
2467         * validates the dinode buffer.  Any corruption that has happened
2468         * is a code bug. */
2469        BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2470        BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2471
2472        trace_ocfs2_free_suballoc_bits(
2473                (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2474                (unsigned long long)bg_blkno,
2475                start_bit, count);
2476
2477        status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2478                                             &group_bh);
2479        if (status < 0) {
2480                mlog_errno(status);
2481                goto bail;
2482        }
2483        group = (struct ocfs2_group_desc *) group_bh->b_data;
2484
2485        BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2486
2487        status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2488                                              group, group_bh,
2489                                              start_bit, count, undo_fn);
2490        if (status < 0) {
2491                mlog_errno(status);
2492                goto bail;
2493        }
2494
2495        status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2496                                         alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2497        if (status < 0) {
2498                mlog_errno(status);
2499                ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh,
2500                                start_bit, count);
2501                goto bail;
2502        }
2503
2504        le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2505                     count);
2506        tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2507        fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2508        ocfs2_journal_dirty(handle, alloc_bh);
2509
2510bail:
2511        brelse(group_bh);
2512
2513        if (status)
2514                mlog_errno(status);
2515        return status;
2516}
2517
2518int ocfs2_free_suballoc_bits(handle_t *handle,
2519                             struct inode *alloc_inode,
2520                             struct buffer_head *alloc_bh,
2521                             unsigned int start_bit,
2522                             u64 bg_blkno,
2523                             unsigned int count)
2524{
2525        return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2526                                         start_bit, bg_blkno, count, NULL);
2527}
2528
2529int ocfs2_free_dinode(handle_t *handle,
2530                      struct inode *inode_alloc_inode,
2531                      struct buffer_head *inode_alloc_bh,
2532                      struct ocfs2_dinode *di)
2533{
2534        u64 blk = le64_to_cpu(di->i_blkno);
2535        u16 bit = le16_to_cpu(di->i_suballoc_bit);
2536        u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2537
2538        if (di->i_suballoc_loc)
2539                bg_blkno = le64_to_cpu(di->i_suballoc_loc);
2540        return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2541                                        inode_alloc_bh, bit, bg_blkno, 1);
2542}
2543
2544static int _ocfs2_free_clusters(handle_t *handle,
2545                                struct inode *bitmap_inode,
2546                                struct buffer_head *bitmap_bh,
2547                                u64 start_blk,
2548                                unsigned int num_clusters,
2549                                void (*undo_fn)(unsigned int bit,
2550                                                unsigned long *bitmap))
2551{
2552        int status;
2553        u16 bg_start_bit;
2554        u64 bg_blkno;
2555
2556        /* You can't ever have a contiguous set of clusters
2557         * bigger than a block group bitmap so we never have to worry
2558         * about looping on them.
2559         * This is expensive. We can safely remove once this stuff has
2560         * gotten tested really well. */
2561        BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
2562                                ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
2563                                                         start_blk)));
2564
2565
2566        ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2567                                     &bg_start_bit);
2568
2569        trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2570                        (unsigned long long)start_blk,
2571                        bg_start_bit, num_clusters);
2572
2573        status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2574                                           bg_start_bit, bg_blkno,
2575                                           num_clusters, undo_fn);
2576        if (status < 0) {
2577                mlog_errno(status);
2578                goto out;
2579        }
2580
2581        ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2582                                         num_clusters);
2583
2584out:
2585        if (status)
2586                mlog_errno(status);
2587        return status;
2588}
2589
2590int ocfs2_free_clusters(handle_t *handle,
2591                        struct inode *bitmap_inode,
2592                        struct buffer_head *bitmap_bh,
2593                        u64 start_blk,
2594                        unsigned int num_clusters)
2595{
2596        return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2597                                    start_blk, num_clusters,
2598                                    _ocfs2_set_bit);
2599}
2600
2601/*
2602 * Give never-used clusters back to the global bitmap.  We don't need
2603 * to protect these bits in the undo buffer.
2604 */
2605int ocfs2_release_clusters(handle_t *handle,
2606                           struct inode *bitmap_inode,
2607                           struct buffer_head *bitmap_bh,
2608                           u64 start_blk,
2609                           unsigned int num_clusters)
2610{
2611        return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2612                                    start_blk, num_clusters,
2613                                    _ocfs2_clear_bit);
2614}
2615
2616/*
2617 * For a given allocation, determine which allocators will need to be
2618 * accessed, and lock them, reserving the appropriate number of bits.
2619 *
2620 * Sparse file systems call this from ocfs2_write_begin_nolock()
2621 * and ocfs2_allocate_unwritten_extents().
2622 *
2623 * File systems which don't support holes call this from
2624 * ocfs2_extend_allocation().
2625 */
2626int ocfs2_lock_allocators(struct inode *inode,
2627                          struct ocfs2_extent_tree *et,
2628                          u32 clusters_to_add, u32 extents_to_split,
2629                          struct ocfs2_alloc_context **data_ac,
2630                          struct ocfs2_alloc_context **meta_ac)
2631{
2632        int ret = 0, num_free_extents;
2633        unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2634        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2635
2636        *meta_ac = NULL;
2637        if (data_ac)
2638                *data_ac = NULL;
2639
2640        BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2641
2642        num_free_extents = ocfs2_num_free_extents(et);
2643        if (num_free_extents < 0) {
2644                ret = num_free_extents;
2645                mlog_errno(ret);
2646                goto out;
2647        }
2648
2649        /*
2650         * Sparse allocation file systems need to be more conservative
2651         * with reserving room for expansion - the actual allocation
2652         * happens while we've got a journal handle open so re-taking
2653         * a cluster lock (because we ran out of room for another
2654         * extent) will violate ordering rules.
2655         *
2656         * Most of the time we'll only be seeing this 1 cluster at a time
2657         * anyway.
2658         *
2659         * Always lock for any unwritten extents - we might want to
2660         * add blocks during a split.
2661         */
2662        if (!num_free_extents ||
2663            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2664                ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2665                if (ret < 0) {
2666                        if (ret != -ENOSPC)
2667                                mlog_errno(ret);
2668                        goto out;
2669                }
2670        }
2671
2672        if (clusters_to_add == 0)
2673                goto out;
2674
2675        ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2676        if (ret < 0) {
2677                if (ret != -ENOSPC)
2678                        mlog_errno(ret);
2679                goto out;
2680        }
2681
2682out:
2683        if (ret) {
2684                if (*meta_ac) {
2685                        ocfs2_free_alloc_context(*meta_ac);
2686                        *meta_ac = NULL;
2687                }
2688
2689                /*
2690                 * We cannot have an error and a non null *data_ac.
2691                 */
2692        }
2693
2694        return ret;
2695}
2696
2697/*
2698 * Read the inode specified by blkno to get suballoc_slot and
2699 * suballoc_bit.
2700 */
2701static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2702                                       u16 *suballoc_slot, u64 *group_blkno,
2703                                       u16 *suballoc_bit)
2704{
2705        int status;
2706        struct buffer_head *inode_bh = NULL;
2707        struct ocfs2_dinode *inode_fe;
2708
2709        trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2710
2711        /* dirty read disk */
2712        status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2713        if (status < 0) {
2714                mlog(ML_ERROR, "read block %llu failed %d\n",
2715                     (unsigned long long)blkno, status);
2716                goto bail;
2717        }
2718
2719        inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2720        if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2721                mlog(ML_ERROR, "invalid inode %llu requested\n",
2722                     (unsigned long long)blkno);
2723                status = -EINVAL;
2724                goto bail;
2725        }
2726
2727        if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2728            (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2729                mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2730                     (unsigned long long)blkno,
2731                     (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2732                status = -EINVAL;
2733                goto bail;
2734        }
2735
2736        if (suballoc_slot)
2737                *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2738        if (suballoc_bit)
2739                *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2740        if (group_blkno)
2741                *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc);
2742
2743bail:
2744        brelse(inode_bh);
2745
2746        if (status)
2747                mlog_errno(status);
2748        return status;
2749}
2750
2751/*
2752 * test whether bit is SET in allocator bitmap or not.  on success, 0
2753 * is returned and *res is 1 for SET; 0 otherwise.  when fails, errno
2754 * is returned and *res is meaningless.  Call this after you have
2755 * cluster locked against suballoc, or you may get a result based on
2756 * non-up2date contents
2757 */
2758static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2759                                   struct inode *suballoc,
2760                                   struct buffer_head *alloc_bh,
2761                                   u64 group_blkno, u64 blkno,
2762                                   u16 bit, int *res)
2763{
2764        struct ocfs2_dinode *alloc_di;
2765        struct ocfs2_group_desc *group;
2766        struct buffer_head *group_bh = NULL;
2767        u64 bg_blkno;
2768        int status;
2769
2770        trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2771                                      (unsigned int)bit);
2772
2773        alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2774        if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
2775                mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2776                     (unsigned int)bit,
2777                     ocfs2_bits_per_group(&alloc_di->id2.i_chain));
2778                status = -EINVAL;
2779                goto bail;
2780        }
2781
2782        bg_blkno = group_blkno ? group_blkno :
2783                   ocfs2_which_suballoc_group(blkno, bit);
2784        status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno,
2785                                             &group_bh);
2786        if (status < 0) {
2787                mlog(ML_ERROR, "read group %llu failed %d\n",
2788                     (unsigned long long)bg_blkno, status);
2789                goto bail;
2790        }
2791
2792        group = (struct ocfs2_group_desc *) group_bh->b_data;
2793        *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2794
2795bail:
2796        brelse(group_bh);
2797
2798        if (status)
2799                mlog_errno(status);
2800        return status;
2801}
2802
2803/*
2804 * Test if the bit representing this inode (blkno) is set in the
2805 * suballocator.
2806 *
2807 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2808 *
2809 * In the event of failure, a negative value is returned and *res is
2810 * meaningless.
2811 *
2812 * Callers must make sure to hold nfs_sync_lock to prevent
2813 * ocfs2_delete_inode() on another node from accessing the same
2814 * suballocator concurrently.
2815 */
2816int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2817{
2818        int status;
2819        u64 group_blkno = 0;
2820        u16 suballoc_bit = 0, suballoc_slot = 0;
2821        struct inode *inode_alloc_inode;
2822        struct buffer_head *alloc_bh = NULL;
2823
2824        trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2825
2826        status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2827                                             &group_blkno, &suballoc_bit);
2828        if (status < 0) {
2829                mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2830                goto bail;
2831        }
2832
2833        inode_alloc_inode =
2834                ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2835                                            suballoc_slot);
2836        if (!inode_alloc_inode) {
2837                /* the error code could be inaccurate, but we are not able to
2838                 * get the correct one. */
2839                status = -EINVAL;
2840                mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2841                     (u32)suballoc_slot);
2842                goto bail;
2843        }
2844
2845        inode_lock(inode_alloc_inode);
2846        status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2847        if (status < 0) {
2848                inode_unlock(inode_alloc_inode);
2849                iput(inode_alloc_inode);
2850                mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2851                     (u32)suballoc_slot, status);
2852                goto bail;
2853        }
2854
2855        status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2856                                         group_blkno, blkno, suballoc_bit, res);
2857        if (status < 0)
2858                mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2859
2860        ocfs2_inode_unlock(inode_alloc_inode, 0);
2861        inode_unlock(inode_alloc_inode);
2862
2863        iput(inode_alloc_inode);
2864        brelse(alloc_bh);
2865bail:
2866        if (status)
2867                mlog_errno(status);
2868        return status;
2869}
2870