linux/fs/ocfs2/dir.c
<<
>>
Prefs
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * dir.c
   5 *
   6 * Creates, reads, walks and deletes directory-nodes
   7 *
   8 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   9 *
  10 *  Portions of this code from linux/fs/ext3/dir.c
  11 *
  12 *  Copyright (C) 1992, 1993, 1994, 1995
  13 *  Remy Card (card@masi.ibp.fr)
  14 *  Laboratoire MASI - Institut Blaise pascal
  15 *  Universite Pierre et Marie Curie (Paris VI)
  16 *
  17 *   from
  18 *
  19 *   linux/fs/minix/dir.c
  20 *
  21 *   Copyright (C) 1991, 1992 Linux Torvalds
  22 *
  23 * This program is free software; you can redistribute it and/or
  24 * modify it under the terms of the GNU General Public
  25 * License as published by the Free Software Foundation; either
  26 * version 2 of the License, or (at your option) any later version.
  27 *
  28 * This program is distributed in the hope that it will be useful,
  29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  31 * General Public License for more details.
  32 *
  33 * You should have received a copy of the GNU General Public
  34 * License along with this program; if not, write to the
  35 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  36 * Boston, MA 021110-1307, USA.
  37 */
  38
  39#include <linux/fs.h>
  40#include <linux/types.h>
  41#include <linux/slab.h>
  42#include <linux/highmem.h>
  43#include <linux/quotaops.h>
  44#include <linux/sort.h>
  45
  46#include <cluster/masklog.h>
  47
  48#include "ocfs2.h"
  49
  50#include "alloc.h"
  51#include "blockcheck.h"
  52#include "dir.h"
  53#include "dlmglue.h"
  54#include "extent_map.h"
  55#include "file.h"
  56#include "inode.h"
  57#include "journal.h"
  58#include "namei.h"
  59#include "suballoc.h"
  60#include "super.h"
  61#include "sysfile.h"
  62#include "uptodate.h"
  63#include "ocfs2_trace.h"
  64
  65#include "buffer_head_io.h"
  66
/*
 * Read-ahead sizing for the linear directory search: bh_use[] in
 * ocfs2_find_entry_el() holds NAMEI_RA_SIZE buffer heads.
 */
#define NAMEI_RA_CHUNKS  2
#define NAMEI_RA_BLOCKS  4
#define NAMEI_RA_SIZE	     (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
/* Flatten a (chunk, block) pair into a single index. */
#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
  71
/*
 * Table of DT_* directory entry type codes.
 * NOTE(review): presumably indexed by a dirent's on-disk file type; no
 * user of the table is visible in this part of the file - confirm.
 */
static unsigned char ocfs2_filetype_table[] = {
	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
  75
/*
 * Forward declarations. ocfs2_dir_indexed() is called by
 * ocfs2_supports_dir_trailer() ahead of its definition further down.
 */
static int ocfs2_do_extend_dir(struct super_block *sb,
			       handle_t *handle,
			       struct inode *dir,
			       struct buffer_head *parent_fe_bh,
			       struct ocfs2_alloc_context *data_ac,
			       struct ocfs2_alloc_context *meta_ac,
			       struct buffer_head **new_bh);
static int ocfs2_dir_indexed(struct inode *inode);
  84
  85/*
  86 * These are distinct checks because future versions of the file system will
  87 * want to have a trailing dirent structure independent of indexing.
  88 */
  89static int ocfs2_supports_dir_trailer(struct inode *dir)
  90{
  91        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
  92
  93        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
  94                return 0;
  95
  96        return ocfs2_meta_ecc(osb) || ocfs2_dir_indexed(dir);
  97}
  98
  99/*
 100 * "new' here refers to the point at which we're creating a new
 101 * directory via "mkdir()", but also when we're expanding an inline
 102 * directory. In either case, we don't yet have the indexing bit set
 103 * on the directory, so the standard checks will fail in when metaecc
 104 * is turned off. Only directory-initialization type functions should
 105 * use this then. Everything else wants ocfs2_supports_dir_trailer()
 106 */
 107static int ocfs2_new_dir_wants_trailer(struct inode *dir)
 108{
 109        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 110
 111        return ocfs2_meta_ecc(osb) ||
 112                ocfs2_supports_indexed_dirs(osb);
 113}
 114
 115static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
 116{
 117        return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
 118}
 119
/*
 * Locate the directory block trailer inside buffer head _bh: it sits
 * ocfs2_dir_trailer_blk_off(_sb) bytes into the buffer's b_data.
 */
#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
 121
 122/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
 123 * them more consistent? */
 124struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
 125                                                            void *data)
 126{
 127        char *p = data;
 128
 129        p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
 130        return (struct ocfs2_dir_block_trailer *)p;
 131}
 132
 133/*
 134 * XXX: This is executed once on every dirent. We should consider optimizing
 135 * it.
 136 */
 137static int ocfs2_skip_dir_trailer(struct inode *dir,
 138                                  struct ocfs2_dir_entry *de,
 139                                  unsigned long offset,
 140                                  unsigned long blklen)
 141{
 142        unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
 143
 144        if (!ocfs2_supports_dir_trailer(dir))
 145                return 0;
 146
 147        if (offset != toff)
 148                return 0;
 149
 150        return 1;
 151}
 152
 153static void ocfs2_init_dir_trailer(struct inode *inode,
 154                                   struct buffer_head *bh, u16 rec_len)
 155{
 156        struct ocfs2_dir_block_trailer *trailer;
 157
 158        trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
 159        strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
 160        trailer->db_compat_rec_len =
 161                        cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
 162        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
 163        trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
 164        trailer->db_free_rec_len = cpu_to_le16(rec_len);
 165}
 166/*
 167 * Link an unindexed block with a dir trailer structure into the index free
 168 * list. This function will modify dirdata_bh, but assumes you've already
 169 * passed it to the journal.
 170 */
 171static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
 172                                     struct buffer_head *dx_root_bh,
 173                                     struct buffer_head *dirdata_bh)
 174{
 175        int ret;
 176        struct ocfs2_dx_root_block *dx_root;
 177        struct ocfs2_dir_block_trailer *trailer;
 178
 179        ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
 180                                      OCFS2_JOURNAL_ACCESS_WRITE);
 181        if (ret) {
 182                mlog_errno(ret);
 183                goto out;
 184        }
 185        trailer = ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
 186        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
 187
 188        trailer->db_free_next = dx_root->dr_free_blk;
 189        dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
 190
 191        ocfs2_journal_dirty(handle, dx_root_bh);
 192
 193out:
 194        return ret;
 195}
 196
 197static int ocfs2_free_list_at_root(struct ocfs2_dir_lookup_result *res)
 198{
 199        return res->dl_prev_leaf_bh == NULL;
 200}
 201
 202void ocfs2_free_dir_lookup_result(struct ocfs2_dir_lookup_result *res)
 203{
 204        brelse(res->dl_dx_root_bh);
 205        brelse(res->dl_leaf_bh);
 206        brelse(res->dl_dx_leaf_bh);
 207        brelse(res->dl_prev_leaf_bh);
 208}
 209
 210static int ocfs2_dir_indexed(struct inode *inode)
 211{
 212        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INDEXED_DIR_FL)
 213                return 1;
 214        return 0;
 215}
 216
 217static inline int ocfs2_dx_root_inline(struct ocfs2_dx_root_block *dx_root)
 218{
 219        return dx_root->dr_flags & OCFS2_DX_FLAG_INLINE;
 220}
 221
 222/*
 223 * Hashing code adapted from ext3
 224 */
 225#define DELTA 0x9E3779B9
 226
 227static void TEA_transform(__u32 buf[4], __u32 const in[])
 228{
 229        __u32   sum = 0;
 230        __u32   b0 = buf[0], b1 = buf[1];
 231        __u32   a = in[0], b = in[1], c = in[2], d = in[3];
 232        int     n = 16;
 233
 234        do {
 235                sum += DELTA;
 236                b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
 237                b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
 238        } while (--n);
 239
 240        buf[0] += b0;
 241        buf[1] += b1;
 242}
 243
 244static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 245{
 246        __u32   pad, val;
 247        int     i;
 248
 249        pad = (__u32)len | ((__u32)len << 8);
 250        pad |= pad << 16;
 251
 252        val = pad;
 253        if (len > num*4)
 254                len = num * 4;
 255        for (i = 0; i < len; i++) {
 256                if ((i % 4) == 0)
 257                        val = pad;
 258                val = msg[i] + (val << 8);
 259                if ((i % 4) == 3) {
 260                        *buf++ = val;
 261                        val = pad;
 262                        num--;
 263                }
 264        }
 265        if (--num >= 0)
 266                *buf++ = val;
 267        while (--num >= 0)
 268                *buf++ = pad;
 269}
 270
/*
 * Compute the index hash of @name (@len bytes) and store it in @hinfo.
 *
 * "." and ".." always hash to 0/0. Any other name starts from the
 * super block's osb_dx_seed and is mixed in 16 bytes at a time via
 * str2hashbuf() + TEA_transform(). The first two words of the state
 * become major_hash and minor_hash.
 */
static void ocfs2_dx_dir_name_hash(struct inode *dir, const char *name, int len,
				   struct ocfs2_dx_hinfo *hinfo)
{
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
	const char	*p;
	__u32		in[8], buf[4];

	/*
	 * XXX: Is this really necessary, if the index is never looked
	 * at by readdir? Is a hash value of '0' a bad idea?
	 */
	if ((len == 1 && !strncmp(".", name, 1)) ||
	    (len == 2 && !strncmp("..", name, 2))) {
		buf[0] = buf[1] = 0;
		goto out;
	}

#ifdef OCFS2_DEBUG_DX_DIRS
	/*
	 * This makes it very easy to debug indexing problems. We
	 * should never allow this to be selected without hand editing
	 * this file though.
	 */
	buf[0] = buf[1] = len;
	goto out;
#endif

	memcpy(buf, osb->osb_dx_seed, sizeof(buf));

	/* Only the first four words of in[] are filled per iteration. */
	p = name;
	while (len > 0) {
		str2hashbuf(p, len, in, 4);
		TEA_transform(buf, in);
		len -= 16;
		p += 16;
	}

out:
	hinfo->major_hash = buf[0];
	hinfo->minor_hash = buf[1];
}
 312
 313/*
 314 * bh passed here can be an inode block or a dir data block, depending
 315 * on the inode inline data flag.
 316 */
 317static int ocfs2_check_dir_entry(struct inode * dir,
 318                                 struct ocfs2_dir_entry * de,
 319                                 struct buffer_head * bh,
 320                                 unsigned long offset)
 321{
 322        const char *error_msg = NULL;
 323        const int rlen = le16_to_cpu(de->rec_len);
 324
 325        if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
 326                error_msg = "rec_len is smaller than minimal";
 327        else if (unlikely(rlen % 4 != 0))
 328                error_msg = "rec_len % 4 != 0";
 329        else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
 330                error_msg = "rec_len is too small for name_len";
 331        else if (unlikely(
 332                 ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
 333                error_msg = "directory entry across blocks";
 334
 335        if (unlikely(error_msg != NULL))
 336                mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
 337                     "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
 338                     (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
 339                     offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
 340                     de->name_len);
 341
 342        return error_msg == NULL ? 1 : 0;
 343}
 344
 345static inline int ocfs2_match(int len,
 346                              const char * const name,
 347                              struct ocfs2_dir_entry *de)
 348{
 349        if (len != de->name_len)
 350                return 0;
 351        if (!de->inode)
 352                return 0;
 353        return !memcmp(name, de->name, len);
 354}
 355
 356/*
 357 * Returns 0 if not found, -1 on failure, and 1 on success
 358 */
 359static inline int ocfs2_search_dirblock(struct buffer_head *bh,
 360                                        struct inode *dir,
 361                                        const char *name, int namelen,
 362                                        unsigned long offset,
 363                                        char *first_de,
 364                                        unsigned int bytes,
 365                                        struct ocfs2_dir_entry **res_dir)
 366{
 367        struct ocfs2_dir_entry *de;
 368        char *dlimit, *de_buf;
 369        int de_len;
 370        int ret = 0;
 371
 372        de_buf = first_de;
 373        dlimit = de_buf + bytes;
 374
 375        while (de_buf < dlimit) {
 376                /* this code is executed quadratically often */
 377                /* do minimal checking `by hand' */
 378
 379                de = (struct ocfs2_dir_entry *) de_buf;
 380
 381                if (de_buf + namelen <= dlimit &&
 382                    ocfs2_match(namelen, name, de)) {
 383                        /* found a match - just to be sure, do a full check */
 384                        if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
 385                                ret = -1;
 386                                goto bail;
 387                        }
 388                        *res_dir = de;
 389                        ret = 1;
 390                        goto bail;
 391                }
 392
 393                /* prevent looping on a bad block */
 394                de_len = le16_to_cpu(de->rec_len);
 395                if (de_len <= 0) {
 396                        ret = -1;
 397                        goto bail;
 398                }
 399
 400                de_buf += de_len;
 401                offset += de_len;
 402        }
 403
 404bail:
 405        trace_ocfs2_search_dirblock(ret);
 406        return ret;
 407}
 408
 409static struct buffer_head *ocfs2_find_entry_id(const char *name,
 410                                               int namelen,
 411                                               struct inode *dir,
 412                                               struct ocfs2_dir_entry **res_dir)
 413{
 414        int ret, found;
 415        struct buffer_head *di_bh = NULL;
 416        struct ocfs2_dinode *di;
 417        struct ocfs2_inline_data *data;
 418
 419        ret = ocfs2_read_inode_block(dir, &di_bh);
 420        if (ret) {
 421                mlog_errno(ret);
 422                goto out;
 423        }
 424
 425        di = (struct ocfs2_dinode *)di_bh->b_data;
 426        data = &di->id2.i_data;
 427
 428        found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
 429                                      data->id_data, i_size_read(dir), res_dir);
 430        if (found == 1)
 431                return di_bh;
 432
 433        brelse(di_bh);
 434out:
 435        return NULL;
 436}
 437
 438static int ocfs2_validate_dir_block(struct super_block *sb,
 439                                    struct buffer_head *bh)
 440{
 441        int rc;
 442        struct ocfs2_dir_block_trailer *trailer =
 443                ocfs2_trailer_from_bh(bh, sb);
 444
 445
 446        /*
 447         * We don't validate dirents here, that's handled
 448         * in-place when the code walks them.
 449         */
 450        trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
 451
 452        BUG_ON(!buffer_uptodate(bh));
 453
 454        /*
 455         * If the ecc fails, we return the error but otherwise
 456         * leave the filesystem running.  We know any error is
 457         * local to this block.
 458         *
 459         * Note that we are safe to call this even if the directory
 460         * doesn't have a trailer.  Filesystems without metaecc will do
 461         * nothing, and filesystems with it will have one.
 462         */
 463        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
 464        if (rc)
 465                mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
 466                     (unsigned long long)bh->b_blocknr);
 467
 468        return rc;
 469}
 470
 471/*
 472 * Validate a directory trailer.
 473 *
 474 * We check the trailer here rather than in ocfs2_validate_dir_block()
 475 * because that function doesn't have the inode to test.
 476 */
 477static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
 478{
 479        int rc = 0;
 480        struct ocfs2_dir_block_trailer *trailer;
 481
 482        trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
 483        if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
 484                rc = -EINVAL;
 485                ocfs2_error(dir->i_sb,
 486                            "Invalid dirblock #%llu: "
 487                            "signature = %.*s\n",
 488                            (unsigned long long)bh->b_blocknr, 7,
 489                            trailer->db_signature);
 490                goto out;
 491        }
 492        if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
 493                rc = -EINVAL;
 494                ocfs2_error(dir->i_sb,
 495                            "Directory block #%llu has an invalid "
 496                            "db_blkno of %llu",
 497                            (unsigned long long)bh->b_blocknr,
 498                            (unsigned long long)le64_to_cpu(trailer->db_blkno));
 499                goto out;
 500        }
 501        if (le64_to_cpu(trailer->db_parent_dinode) !=
 502            OCFS2_I(dir)->ip_blkno) {
 503                rc = -EINVAL;
 504                ocfs2_error(dir->i_sb,
 505                            "Directory block #%llu on dinode "
 506                            "#%llu has an invalid parent_dinode "
 507                            "of %llu",
 508                            (unsigned long long)bh->b_blocknr,
 509                            (unsigned long long)OCFS2_I(dir)->ip_blkno,
 510                            (unsigned long long)le64_to_cpu(trailer->db_blkno));
 511                goto out;
 512        }
 513out:
 514        return rc;
 515}
 516
 517/*
 518 * This function forces all errors to -EIO for consistency with its
 519 * predecessor, ocfs2_bread().  We haven't audited what returning the
 520 * real error codes would do to callers.  We log the real codes with
 521 * mlog_errno() before we squash them.
 522 */
 523static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 524                                struct buffer_head **bh, int flags)
 525{
 526        int rc = 0;
 527        struct buffer_head *tmp = *bh;
 528
 529        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
 530                                    ocfs2_validate_dir_block);
 531        if (rc) {
 532                mlog_errno(rc);
 533                goto out;
 534        }
 535
 536        if (!(flags & OCFS2_BH_READAHEAD) &&
 537            ocfs2_supports_dir_trailer(inode)) {
 538                rc = ocfs2_check_dir_trailer(inode, tmp);
 539                if (rc) {
 540                        if (!*bh)
 541                                brelse(tmp);
 542                        mlog_errno(rc);
 543                        goto out;
 544                }
 545        }
 546
 547        /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
 548        if (!*bh)
 549                *bh = tmp;
 550
 551out:
 552        return rc ? -EIO : 0;
 553}
 554
 555/*
 556 * Read the block at 'phys' which belongs to this directory
 557 * inode. This function does no virtual->physical block translation -
 558 * what's passed in is assumed to be a valid directory block.
 559 */
 560static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
 561                                       struct buffer_head **bh)
 562{
 563        int ret;
 564        struct buffer_head *tmp = *bh;
 565
 566        ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
 567                               ocfs2_validate_dir_block);
 568        if (ret) {
 569                mlog_errno(ret);
 570                goto out;
 571        }
 572
 573        if (ocfs2_supports_dir_trailer(dir)) {
 574                ret = ocfs2_check_dir_trailer(dir, tmp);
 575                if (ret) {
 576                        if (!*bh)
 577                                brelse(tmp);
 578                        mlog_errno(ret);
 579                        goto out;
 580                }
 581        }
 582
 583        if (!ret && !*bh)
 584                *bh = tmp;
 585out:
 586        return ret;
 587}
 588
 589static int ocfs2_validate_dx_root(struct super_block *sb,
 590                                  struct buffer_head *bh)
 591{
 592        int ret;
 593        struct ocfs2_dx_root_block *dx_root;
 594
 595        BUG_ON(!buffer_uptodate(bh));
 596
 597        dx_root = (struct ocfs2_dx_root_block *) bh->b_data;
 598
 599        ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_root->dr_check);
 600        if (ret) {
 601                mlog(ML_ERROR,
 602                     "Checksum failed for dir index root block %llu\n",
 603                     (unsigned long long)bh->b_blocknr);
 604                return ret;
 605        }
 606
 607        if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
 608                ocfs2_error(sb,
 609                            "Dir Index Root # %llu has bad signature %.*s",
 610                            (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
 611                            7, dx_root->dr_signature);
 612                return -EINVAL;
 613        }
 614
 615        return 0;
 616}
 617
 618static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
 619                              struct buffer_head **dx_root_bh)
 620{
 621        int ret;
 622        u64 blkno = le64_to_cpu(di->i_dx_root);
 623        struct buffer_head *tmp = *dx_root_bh;
 624
 625        ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
 626                               ocfs2_validate_dx_root);
 627
 628        /* If ocfs2_read_block() got us a new bh, pass it up. */
 629        if (!ret && !*dx_root_bh)
 630                *dx_root_bh = tmp;
 631
 632        return ret;
 633}
 634
 635static int ocfs2_validate_dx_leaf(struct super_block *sb,
 636                                  struct buffer_head *bh)
 637{
 638        int ret;
 639        struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)bh->b_data;
 640
 641        BUG_ON(!buffer_uptodate(bh));
 642
 643        ret = ocfs2_validate_meta_ecc(sb, bh->b_data, &dx_leaf->dl_check);
 644        if (ret) {
 645                mlog(ML_ERROR,
 646                     "Checksum failed for dir index leaf block %llu\n",
 647                     (unsigned long long)bh->b_blocknr);
 648                return ret;
 649        }
 650
 651        if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
 652                ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
 653                            7, dx_leaf->dl_signature);
 654                return -EROFS;
 655        }
 656
 657        return 0;
 658}
 659
 660static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
 661                              struct buffer_head **dx_leaf_bh)
 662{
 663        int ret;
 664        struct buffer_head *tmp = *dx_leaf_bh;
 665
 666        ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
 667                               ocfs2_validate_dx_leaf);
 668
 669        /* If ocfs2_read_block() got us a new bh, pass it up. */
 670        if (!ret && !*dx_leaf_bh)
 671                *dx_leaf_bh = tmp;
 672
 673        return ret;
 674}
 675
 676/*
 677 * Read a series of dx_leaf blocks. This expects all buffer_head
 678 * pointers to be NULL on function entry.
 679 */
 680static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
 681                                struct buffer_head **dx_leaf_bhs)
 682{
 683        int ret;
 684
 685        ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
 686                                ocfs2_validate_dx_leaf);
 687        if (ret)
 688                mlog_errno(ret);
 689
 690        return ret;
 691}
 692
/*
 * Look up @name by scanning the directory's blocks linearly.
 *
 * The scan begins at the block cached in ip_dir_start_lookup, wraps at
 * the end of the directory and stops on returning to the start. Blocks
 * are requested up to NAMEI_RA_SIZE at a time with OCFS2_BH_READAHEAD
 * and then read for real one by one. On a hit the matching block
 * number is cached for the next lookup.
 *
 * Returns the buffer head holding the entry (with *res_dir pointing
 * into it), or NULL if the name was not found or a block was bad.
 */
static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
					       struct inode *dir,
					       struct ocfs2_dir_entry **res_dir)
{
	struct super_block *sb;
	struct buffer_head *bh_use[NAMEI_RA_SIZE];
	struct buffer_head *bh, *ret = NULL;
	unsigned long start, block, b;
	int ra_max = 0;		/* Number of bh's in the readahead
				   buffer, bh_use[] */
	int ra_ptr = 0;		/* Current index into readahead
				   buffer */
	int num = 0;
	int nblocks, i, err;

	sb = dir->i_sb;

	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
	start = OCFS2_I(dir)->ip_dir_start_lookup;
	if (start >= nblocks)
		start = 0;
	block = start;

restart:
	do {
		/*
		 * We deal with the read-ahead logic here.
		 */
		if (ra_ptr >= ra_max) {
			/* Refill the readahead buffer */
			ra_ptr = 0;
			b = block;
			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
				/*
				 * Terminate if we reach the end of the
				 * directory and must wrap, or if our
				 * search has finished at this block.
				 */
				if (b >= nblocks || (num && block == start)) {
					bh_use[ra_max] = NULL;
					break;
				}
				num++;

				bh = NULL;
				/*
				 * NOTE(review): err is assigned here but never
				 * read in this function; a failed readahead
				 * just leaves whatever bh holds in the slot.
				 */
				err = ocfs2_read_dir_block(dir, b++, &bh,
							   OCFS2_BH_READAHEAD);
				bh_use[ra_max] = bh;
			}
		}
		/* An empty slot means there is nothing to search here. */
		if ((bh = bh_use[ra_ptr++]) == NULL)
			goto next;
		if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
			/* read error, skip block & hope for the best.
			 * ocfs2_read_dir_block() has released the bh. */
			/*
			 * NOTE(review): ocfs2_read_dir_block() only calls
			 * brelse() after a failed trailer check when *bh
			 * was NULL on entry, and bh is non-NULL here -
			 * confirm the bh really is released on every
			 * failure path.
			 */
			ocfs2_error(dir->i_sb, "reading directory %llu, "
				    "offset %lu\n",
				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
				    block);
			goto next;
		}
		i = ocfs2_search_dirblock(bh, dir, name, namelen,
					  block << sb->s_blocksize_bits,
					  bh->b_data, sb->s_blocksize,
					  res_dir);
		if (i == 1) {
			/* Found: remember this block for the next lookup. */
			OCFS2_I(dir)->ip_dir_start_lookup = block;
			ret = bh;
			goto cleanup_and_exit;
		} else {
			brelse(bh);
			/* A bad block aborts the whole search. */
			if (i < 0)
				goto cleanup_and_exit;
		}
	next:
		if (++block >= nblocks)
			block = 0;
	} while (block != start);

	/*
	 * If the directory has grown while we were searching, then
	 * search the last part of the directory before giving up.
	 */
	block = nblocks;
	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
	if (block < nblocks) {
		start = 0;
		goto restart;
	}

cleanup_and_exit:
	/* Clean up the read-ahead blocks */
	for (; ra_ptr < ra_max; ra_ptr++)
		brelse(bh_use[ra_ptr]);

	trace_ocfs2_find_entry_el(ret);
	return ret;
}
 791
 792static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
 793                                   struct ocfs2_extent_list *el,
 794                                   u32 major_hash,
 795                                   u32 *ret_cpos,
 796                                   u64 *ret_phys_blkno,
 797                                   unsigned int *ret_clen)
 798{
 799        int ret = 0, i, found;
 800        struct buffer_head *eb_bh = NULL;
 801        struct ocfs2_extent_block *eb;
 802        struct ocfs2_extent_rec *rec = NULL;
 803
 804        if (el->l_tree_depth) {
 805                ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
 806                                      &eb_bh);
 807                if (ret) {
 808                        mlog_errno(ret);
 809                        goto out;
 810                }
 811
 812                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 813                el = &eb->h_list;
 814
 815                if (el->l_tree_depth) {
 816                        ocfs2_error(inode->i_sb,
 817                                    "Inode %lu has non zero tree depth in "
 818                                    "btree tree block %llu\n", inode->i_ino,
 819                                    (unsigned long long)eb_bh->b_blocknr);
 820                        ret = -EROFS;
 821                        goto out;
 822                }
 823        }
 824
 825        found = 0;
 826        for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
 827                rec = &el->l_recs[i];
 828
 829                if (le32_to_cpu(rec->e_cpos) <= major_hash) {
 830                        found = 1;
 831                        break;
 832                }
 833        }
 834
 835        if (!found) {
 836                ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
 837                            "record (%u, %u, 0) in btree", inode->i_ino,
 838                            le32_to_cpu(rec->e_cpos),
 839                            ocfs2_rec_clusters(el, rec));
 840                ret = -EROFS;
 841                goto out;
 842        }
 843
 844        if (ret_phys_blkno)
 845                *ret_phys_blkno = le64_to_cpu(rec->e_blkno);
 846        if (ret_cpos)
 847                *ret_cpos = le32_to_cpu(rec->e_cpos);
 848        if (ret_clen)
 849                *ret_clen = le16_to_cpu(rec->e_leaf_clusters);
 850
 851out:
 852        brelse(eb_bh);
 853        return ret;
 854}
 855
 856/*
 857 * Returns the block index, from the start of the cluster which this
 858 * hash belongs too.
 859 */
 860static inline unsigned int __ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
 861                                                   u32 minor_hash)
 862{
 863        return minor_hash & osb->osb_dx_mask;
 864}
 865
 866static inline unsigned int ocfs2_dx_dir_hash_idx(struct ocfs2_super *osb,
 867                                          struct ocfs2_dx_hinfo *hinfo)
 868{
 869        return __ocfs2_dx_dir_hash_idx(osb, hinfo->minor_hash);
 870}
 871
 872static int ocfs2_dx_dir_lookup(struct inode *inode,
 873                               struct ocfs2_extent_list *el,
 874                               struct ocfs2_dx_hinfo *hinfo,
 875                               u32 *ret_cpos,
 876                               u64 *ret_phys_blkno)
 877{
 878        int ret = 0;
 879        unsigned int cend, uninitialized_var(clen);
 880        u32 uninitialized_var(cpos);
 881        u64 uninitialized_var(blkno);
 882        u32 name_hash = hinfo->major_hash;
 883
 884        ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
 885                                      &clen);
 886        if (ret) {
 887                mlog_errno(ret);
 888                goto out;
 889        }
 890
 891        cend = cpos + clen;
 892        if (name_hash >= cend) {
 893                /* We want the last cluster */
 894                blkno += ocfs2_clusters_to_blocks(inode->i_sb, clen - 1);
 895                cpos += clen - 1;
 896        } else {
 897                blkno += ocfs2_clusters_to_blocks(inode->i_sb,
 898                                                  name_hash - cpos);
 899                cpos = name_hash;
 900        }
 901
 902        /*
 903         * We now have the cluster which should hold our entry. To
 904         * find the exact block from the start of the cluster to
 905         * search, we take the lower bits of the hash.
 906         */
 907        blkno += ocfs2_dx_dir_hash_idx(OCFS2_SB(inode->i_sb), hinfo);
 908
 909        if (ret_phys_blkno)
 910                *ret_phys_blkno = blkno;
 911        if (ret_cpos)
 912                *ret_cpos = cpos;
 913
 914out:
 915
 916        return ret;
 917}
 918
 919static int ocfs2_dx_dir_search(const char *name, int namelen,
 920                               struct inode *dir,
 921                               struct ocfs2_dx_root_block *dx_root,
 922                               struct ocfs2_dir_lookup_result *res)
 923{
 924        int ret, i, found;
 925        u64 uninitialized_var(phys);
 926        struct buffer_head *dx_leaf_bh = NULL;
 927        struct ocfs2_dx_leaf *dx_leaf;
 928        struct ocfs2_dx_entry *dx_entry = NULL;
 929        struct buffer_head *dir_ent_bh = NULL;
 930        struct ocfs2_dir_entry *dir_ent = NULL;
 931        struct ocfs2_dx_hinfo *hinfo = &res->dl_hinfo;
 932        struct ocfs2_extent_list *dr_el;
 933        struct ocfs2_dx_entry_list *entry_list;
 934
 935        ocfs2_dx_dir_name_hash(dir, name, namelen, &res->dl_hinfo);
 936
 937        if (ocfs2_dx_root_inline(dx_root)) {
 938                entry_list = &dx_root->dr_entries;
 939                goto search;
 940        }
 941
 942        dr_el = &dx_root->dr_list;
 943
 944        ret = ocfs2_dx_dir_lookup(dir, dr_el, hinfo, NULL, &phys);
 945        if (ret) {
 946                mlog_errno(ret);
 947                goto out;
 948        }
 949
 950        trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
 951                                  namelen, name, hinfo->major_hash,
 952                                  hinfo->minor_hash, (unsigned long long)phys);
 953
 954        ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
 955        if (ret) {
 956                mlog_errno(ret);
 957                goto out;
 958        }
 959
 960        dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
 961
 962        trace_ocfs2_dx_dir_search_leaf_info(
 963                        le16_to_cpu(dx_leaf->dl_list.de_num_used),
 964                        le16_to_cpu(dx_leaf->dl_list.de_count));
 965
 966        entry_list = &dx_leaf->dl_list;
 967
 968search:
 969        /*
 970         * Empty leaf is legal, so no need to check for that.
 971         */
 972        found = 0;
 973        for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
 974                dx_entry = &entry_list->de_entries[i];
 975
 976                if (hinfo->major_hash != le32_to_cpu(dx_entry->dx_major_hash)
 977                    || hinfo->minor_hash != le32_to_cpu(dx_entry->dx_minor_hash))
 978                        continue;
 979
 980                /*
 981                 * Search unindexed leaf block now. We're not
 982                 * guaranteed to find anything.
 983                 */
 984                ret = ocfs2_read_dir_block_direct(dir,
 985                                          le64_to_cpu(dx_entry->dx_dirent_blk),
 986                                          &dir_ent_bh);
 987                if (ret) {
 988                        mlog_errno(ret);
 989                        goto out;
 990                }
 991
 992                /*
 993                 * XXX: We should check the unindexed block here,
 994                 * before using it.
 995                 */
 996
 997                found = ocfs2_search_dirblock(dir_ent_bh, dir, name, namelen,
 998                                              0, dir_ent_bh->b_data,
 999                                              dir->i_sb->s_blocksize, &dir_ent);
1000                if (found == 1)
1001                        break;
1002
1003                if (found == -1) {
1004                        /* This means we found a bad directory entry. */
1005                        ret = -EIO;
1006                        mlog_errno(ret);
1007                        goto out;
1008                }
1009
1010                brelse(dir_ent_bh);
1011                dir_ent_bh = NULL;
1012        }
1013
1014        if (found <= 0) {
1015                ret = -ENOENT;
1016                goto out;
1017        }
1018
1019        res->dl_leaf_bh = dir_ent_bh;
1020        res->dl_entry = dir_ent;
1021        res->dl_dx_leaf_bh = dx_leaf_bh;
1022        res->dl_dx_entry = dx_entry;
1023
1024        ret = 0;
1025out:
1026        if (ret) {
1027                brelse(dx_leaf_bh);
1028                brelse(dir_ent_bh);
1029        }
1030        return ret;
1031}
1032
1033static int ocfs2_find_entry_dx(const char *name, int namelen,
1034                               struct inode *dir,
1035                               struct ocfs2_dir_lookup_result *lookup)
1036{
1037        int ret;
1038        struct buffer_head *di_bh = NULL;
1039        struct ocfs2_dinode *di;
1040        struct buffer_head *dx_root_bh = NULL;
1041        struct ocfs2_dx_root_block *dx_root;
1042
1043        ret = ocfs2_read_inode_block(dir, &di_bh);
1044        if (ret) {
1045                mlog_errno(ret);
1046                goto out;
1047        }
1048
1049        di = (struct ocfs2_dinode *)di_bh->b_data;
1050
1051        ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
1052        if (ret) {
1053                mlog_errno(ret);
1054                goto out;
1055        }
1056        dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
1057
1058        ret = ocfs2_dx_dir_search(name, namelen, dir, dx_root, lookup);
1059        if (ret) {
1060                if (ret != -ENOENT)
1061                        mlog_errno(ret);
1062                goto out;
1063        }
1064
1065        lookup->dl_dx_root_bh = dx_root_bh;
1066        dx_root_bh = NULL;
1067out:
1068        brelse(di_bh);
1069        brelse(dx_root_bh);
1070        return ret;
1071}
1072
1073/*
1074 * Try to find an entry of the provided name within 'dir'.
1075 *
1076 * If nothing was found, -ENOENT is returned. Otherwise, zero is
1077 * returned and the struct 'res' will contain information useful to
1078 * other directory manipulation functions.
1079 *
1080 * Caller can NOT assume anything about the contents of the
1081 * buffer_heads - they are passed back only so that it can be passed
1082 * into any one of the manipulation functions (add entry, delete
1083 * entry, etc). As an example, bh in the extent directory case is a
1084 * data block, in the inline-data case it actually points to an inode,
1085 * in the indexed directory case, multiple buffers are involved.
1086 */
1087int ocfs2_find_entry(const char *name, int namelen,
1088                     struct inode *dir, struct ocfs2_dir_lookup_result *lookup)
1089{
1090        struct buffer_head *bh;
1091        struct ocfs2_dir_entry *res_dir = NULL;
1092
1093        if (ocfs2_dir_indexed(dir))
1094                return ocfs2_find_entry_dx(name, namelen, dir, lookup);
1095
1096        /*
1097         * The unindexed dir code only uses part of the lookup
1098         * structure, so there's no reason to push it down further
1099         * than this.
1100         */
1101        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1102                bh = ocfs2_find_entry_id(name, namelen, dir, &res_dir);
1103        else
1104                bh = ocfs2_find_entry_el(name, namelen, dir, &res_dir);
1105
1106        if (bh == NULL)
1107                return -ENOENT;
1108
1109        lookup->dl_leaf_bh = bh;
1110        lookup->dl_entry = res_dir;
1111        return 0;
1112}
1113
1114/*
1115 * Update inode number and type of a previously found directory entry.
1116 */
1117int ocfs2_update_entry(struct inode *dir, handle_t *handle,
1118                       struct ocfs2_dir_lookup_result *res,
1119                       struct inode *new_entry_inode)
1120{
1121        int ret;
1122        ocfs2_journal_access_func access = ocfs2_journal_access_db;
1123        struct ocfs2_dir_entry *de = res->dl_entry;
1124        struct buffer_head *de_bh = res->dl_leaf_bh;
1125
1126        /*
1127         * The same code works fine for both inline-data and extent
1128         * based directories, so no need to split this up.  The only
1129         * difference is the journal_access function.
1130         */
1131
1132        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1133                access = ocfs2_journal_access_di;
1134
1135        ret = access(handle, INODE_CACHE(dir), de_bh,
1136                     OCFS2_JOURNAL_ACCESS_WRITE);
1137        if (ret) {
1138                mlog_errno(ret);
1139                goto out;
1140        }
1141
1142        de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
1143        ocfs2_set_de_type(de, new_entry_inode->i_mode);
1144
1145        ocfs2_journal_dirty(handle, de_bh);
1146
1147out:
1148        return ret;
1149}
1150
1151/*
1152 * __ocfs2_delete_entry deletes a directory entry by merging it with the
1153 * previous entry
1154 */
1155static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1156                                struct ocfs2_dir_entry *de_del,
1157                                struct buffer_head *bh, char *first_de,
1158                                unsigned int bytes)
1159{
1160        struct ocfs2_dir_entry *de, *pde;
1161        int i, status = -ENOENT;
1162        ocfs2_journal_access_func access = ocfs2_journal_access_db;
1163
1164        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1165                access = ocfs2_journal_access_di;
1166
1167        i = 0;
1168        pde = NULL;
1169        de = (struct ocfs2_dir_entry *) first_de;
1170        while (i < bytes) {
1171                if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
1172                        status = -EIO;
1173                        mlog_errno(status);
1174                        goto bail;
1175                }
1176                if (de == de_del)  {
1177                        status = access(handle, INODE_CACHE(dir), bh,
1178                                        OCFS2_JOURNAL_ACCESS_WRITE);
1179                        if (status < 0) {
1180                                status = -EIO;
1181                                mlog_errno(status);
1182                                goto bail;
1183                        }
1184                        if (pde)
1185                                le16_add_cpu(&pde->rec_len,
1186                                                le16_to_cpu(de->rec_len));
1187                        else
1188                                de->inode = 0;
1189                        dir->i_version++;
1190                        ocfs2_journal_dirty(handle, bh);
1191                        goto bail;
1192                }
1193                i += le16_to_cpu(de->rec_len);
1194                pde = de;
1195                de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1196        }
1197bail:
1198        return status;
1199}
1200
1201static unsigned int ocfs2_figure_dirent_hole(struct ocfs2_dir_entry *de)
1202{
1203        unsigned int hole;
1204
1205        if (le64_to_cpu(de->inode) == 0)
1206                hole = le16_to_cpu(de->rec_len);
1207        else
1208                hole = le16_to_cpu(de->rec_len) -
1209                        OCFS2_DIR_REC_LEN(de->name_len);
1210
1211        return hole;
1212}
1213
1214static int ocfs2_find_max_rec_len(struct super_block *sb,
1215                                  struct buffer_head *dirblock_bh)
1216{
1217        int size, this_hole, largest_hole = 0;
1218        char *trailer, *de_buf, *limit, *start = dirblock_bh->b_data;
1219        struct ocfs2_dir_entry *de;
1220
1221        trailer = (char *)ocfs2_trailer_from_bh(dirblock_bh, sb);
1222        size = ocfs2_dir_trailer_blk_off(sb);
1223        limit = start + size;
1224        de_buf = start;
1225        de = (struct ocfs2_dir_entry *)de_buf;
1226        do {
1227                if (de_buf != trailer) {
1228                        this_hole = ocfs2_figure_dirent_hole(de);
1229                        if (this_hole > largest_hole)
1230                                largest_hole = this_hole;
1231                }
1232
1233                de_buf += le16_to_cpu(de->rec_len);
1234                de = (struct ocfs2_dir_entry *)de_buf;
1235        } while (de_buf < limit);
1236
1237        if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
1238                return largest_hole;
1239        return 0;
1240}
1241
1242static void ocfs2_dx_list_remove_entry(struct ocfs2_dx_entry_list *entry_list,
1243                                       int index)
1244{
1245        int num_used = le16_to_cpu(entry_list->de_num_used);
1246
1247        if (num_used == 1 || index == (num_used - 1))
1248                goto clear;
1249
1250        memmove(&entry_list->de_entries[index],
1251                &entry_list->de_entries[index + 1],
1252                (num_used - index - 1)*sizeof(struct ocfs2_dx_entry));
1253clear:
1254        num_used--;
1255        memset(&entry_list->de_entries[num_used], 0,
1256               sizeof(struct ocfs2_dx_entry));
1257        entry_list->de_num_used = cpu_to_le16(num_used);
1258}
1259
1260static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1261                                 struct ocfs2_dir_lookup_result *lookup)
1262{
1263        int ret, index, max_rec_len, add_to_free_list = 0;
1264        struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1265        struct buffer_head *leaf_bh = lookup->dl_leaf_bh;
1266        struct ocfs2_dx_leaf *dx_leaf;
1267        struct ocfs2_dx_entry *dx_entry = lookup->dl_dx_entry;
1268        struct ocfs2_dir_block_trailer *trailer;
1269        struct ocfs2_dx_root_block *dx_root;
1270        struct ocfs2_dx_entry_list *entry_list;
1271
1272        /*
1273         * This function gets a bit messy because we might have to
1274         * modify the root block, regardless of whether the indexed
1275         * entries are stored inline.
1276         */
1277
1278        /*
1279         * *Only* set 'entry_list' here, based on where we're looking
1280         * for the indexed entries. Later, we might still want to
1281         * journal both blocks, based on free list state.
1282         */
1283        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
1284        if (ocfs2_dx_root_inline(dx_root)) {
1285                entry_list = &dx_root->dr_entries;
1286        } else {
1287                dx_leaf = (struct ocfs2_dx_leaf *) lookup->dl_dx_leaf_bh->b_data;
1288                entry_list = &dx_leaf->dl_list;
1289        }
1290
1291        /* Neither of these are a disk corruption - that should have
1292         * been caught by lookup, before we got here. */
1293        BUG_ON(le16_to_cpu(entry_list->de_count) <= 0);
1294        BUG_ON(le16_to_cpu(entry_list->de_num_used) <= 0);
1295
1296        index = (char *)dx_entry - (char *)entry_list->de_entries;
1297        index /= sizeof(*dx_entry);
1298
1299        if (index >= le16_to_cpu(entry_list->de_num_used)) {
1300                mlog(ML_ERROR, "Dir %llu: Bad dx_entry ptr idx %d, (%p, %p)\n",
1301                     (unsigned long long)OCFS2_I(dir)->ip_blkno, index,
1302                     entry_list, dx_entry);
1303                return -EIO;
1304        }
1305
1306        /*
1307         * We know that removal of this dirent will leave enough room
1308         * for a new one, so add this block to the free list if it
1309         * isn't already there.
1310         */
1311        trailer = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
1312        if (trailer->db_free_rec_len == 0)
1313                add_to_free_list = 1;
1314
1315        /*
1316         * Add the block holding our index into the journal before
1317         * removing the unindexed entry. If we get an error return
1318         * from __ocfs2_delete_entry(), then it hasn't removed the
1319         * entry yet. Likewise, successful return means we *must*
1320         * remove the indexed entry.
1321         *
1322         * We're also careful to journal the root tree block here as
1323         * the entry count needs to be updated. Also, we might be
1324         * adding to the start of the free list.
1325         */
1326        ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1327                                      OCFS2_JOURNAL_ACCESS_WRITE);
1328        if (ret) {
1329                mlog_errno(ret);
1330                goto out;
1331        }
1332
1333        if (!ocfs2_dx_root_inline(dx_root)) {
1334                ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
1335                                              lookup->dl_dx_leaf_bh,
1336                                              OCFS2_JOURNAL_ACCESS_WRITE);
1337                if (ret) {
1338                        mlog_errno(ret);
1339                        goto out;
1340                }
1341        }
1342
1343        trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
1344                                    index);
1345
1346        ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1347                                   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
1348        if (ret) {
1349                mlog_errno(ret);
1350                goto out;
1351        }
1352
1353        max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, leaf_bh);
1354        trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1355        if (add_to_free_list) {
1356                trailer->db_free_next = dx_root->dr_free_blk;
1357                dx_root->dr_free_blk = cpu_to_le64(leaf_bh->b_blocknr);
1358                ocfs2_journal_dirty(handle, dx_root_bh);
1359        }
1360
1361        /* leaf_bh was journal_accessed for us in __ocfs2_delete_entry */
1362        ocfs2_journal_dirty(handle, leaf_bh);
1363
1364        le32_add_cpu(&dx_root->dr_num_entries, -1);
1365        ocfs2_journal_dirty(handle, dx_root_bh);
1366
1367        ocfs2_dx_list_remove_entry(entry_list, index);
1368
1369        if (!ocfs2_dx_root_inline(dx_root))
1370                ocfs2_journal_dirty(handle, lookup->dl_dx_leaf_bh);
1371
1372out:
1373        return ret;
1374}
1375
1376static inline int ocfs2_delete_entry_id(handle_t *handle,
1377                                        struct inode *dir,
1378                                        struct ocfs2_dir_entry *de_del,
1379                                        struct buffer_head *bh)
1380{
1381        int ret;
1382        struct buffer_head *di_bh = NULL;
1383        struct ocfs2_dinode *di;
1384        struct ocfs2_inline_data *data;
1385
1386        ret = ocfs2_read_inode_block(dir, &di_bh);
1387        if (ret) {
1388                mlog_errno(ret);
1389                goto out;
1390        }
1391
1392        di = (struct ocfs2_dinode *)di_bh->b_data;
1393        data = &di->id2.i_data;
1394
1395        ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data,
1396                                   i_size_read(dir));
1397
1398        brelse(di_bh);
1399out:
1400        return ret;
1401}
1402
1403static inline int ocfs2_delete_entry_el(handle_t *handle,
1404                                        struct inode *dir,
1405                                        struct ocfs2_dir_entry *de_del,
1406                                        struct buffer_head *bh)
1407{
1408        return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data,
1409                                    bh->b_size);
1410}
1411
1412/*
1413 * Delete a directory entry. Hide the details of directory
1414 * implementation from the caller.
1415 */
1416int ocfs2_delete_entry(handle_t *handle,
1417                       struct inode *dir,
1418                       struct ocfs2_dir_lookup_result *res)
1419{
1420        if (ocfs2_dir_indexed(dir))
1421                return ocfs2_delete_entry_dx(handle, dir, res);
1422
1423        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1424                return ocfs2_delete_entry_id(handle, dir, res->dl_entry,
1425                                             res->dl_leaf_bh);
1426
1427        return ocfs2_delete_entry_el(handle, dir, res->dl_entry,
1428                                     res->dl_leaf_bh);
1429}
1430
1431/*
1432 * Check whether 'de' has enough room to hold an entry of
1433 * 'new_rec_len' bytes.
1434 */
1435static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
1436                                         unsigned int new_rec_len)
1437{
1438        unsigned int de_really_used;
1439
1440        /* Check whether this is an empty record with enough space */
1441        if (le64_to_cpu(de->inode) == 0 &&
1442            le16_to_cpu(de->rec_len) >= new_rec_len)
1443                return 1;
1444
1445        /*
1446         * Record might have free space at the end which we can
1447         * use.
1448         */
1449        de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
1450        if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
1451            return 1;
1452
1453        return 0;
1454}
1455
1456static void ocfs2_dx_dir_leaf_insert_tail(struct ocfs2_dx_leaf *dx_leaf,
1457                                          struct ocfs2_dx_entry *dx_new_entry)
1458{
1459        int i;
1460
1461        i = le16_to_cpu(dx_leaf->dl_list.de_num_used);
1462        dx_leaf->dl_list.de_entries[i] = *dx_new_entry;
1463
1464        le16_add_cpu(&dx_leaf->dl_list.de_num_used, 1);
1465}
1466
1467static void ocfs2_dx_entry_list_insert(struct ocfs2_dx_entry_list *entry_list,
1468                                       struct ocfs2_dx_hinfo *hinfo,
1469                                       u64 dirent_blk)
1470{
1471        int i;
1472        struct ocfs2_dx_entry *dx_entry;
1473
1474        i = le16_to_cpu(entry_list->de_num_used);
1475        dx_entry = &entry_list->de_entries[i];
1476
1477        memset(dx_entry, 0, sizeof(*dx_entry));
1478        dx_entry->dx_major_hash = cpu_to_le32(hinfo->major_hash);
1479        dx_entry->dx_minor_hash = cpu_to_le32(hinfo->minor_hash);
1480        dx_entry->dx_dirent_blk = cpu_to_le64(dirent_blk);
1481
1482        le16_add_cpu(&entry_list->de_num_used, 1);
1483}
1484
1485static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1486                                      struct ocfs2_dx_hinfo *hinfo,
1487                                      u64 dirent_blk,
1488                                      struct buffer_head *dx_leaf_bh)
1489{
1490        int ret;
1491        struct ocfs2_dx_leaf *dx_leaf;
1492
1493        ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
1494                                      OCFS2_JOURNAL_ACCESS_WRITE);
1495        if (ret) {
1496                mlog_errno(ret);
1497                goto out;
1498        }
1499
1500        dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
1501        ocfs2_dx_entry_list_insert(&dx_leaf->dl_list, hinfo, dirent_blk);
1502        ocfs2_journal_dirty(handle, dx_leaf_bh);
1503
1504out:
1505        return ret;
1506}
1507
1508static void ocfs2_dx_inline_root_insert(struct inode *dir, handle_t *handle,
1509                                        struct ocfs2_dx_hinfo *hinfo,
1510                                        u64 dirent_blk,
1511                                        struct ocfs2_dx_root_block *dx_root)
1512{
1513        ocfs2_dx_entry_list_insert(&dx_root->dr_entries, hinfo, dirent_blk);
1514}
1515
1516static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1517                               struct ocfs2_dir_lookup_result *lookup)
1518{
1519        int ret = 0;
1520        struct ocfs2_dx_root_block *dx_root;
1521        struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1522
1523        ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1524                                      OCFS2_JOURNAL_ACCESS_WRITE);
1525        if (ret) {
1526                mlog_errno(ret);
1527                goto out;
1528        }
1529
1530        dx_root = (struct ocfs2_dx_root_block *)lookup->dl_dx_root_bh->b_data;
1531        if (ocfs2_dx_root_inline(dx_root)) {
1532                ocfs2_dx_inline_root_insert(dir, handle,
1533                                            &lookup->dl_hinfo,
1534                                            lookup->dl_leaf_bh->b_blocknr,
1535                                            dx_root);
1536        } else {
1537                ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &lookup->dl_hinfo,
1538                                                 lookup->dl_leaf_bh->b_blocknr,
1539                                                 lookup->dl_dx_leaf_bh);
1540                if (ret)
1541                        goto out;
1542        }
1543
1544        le32_add_cpu(&dx_root->dr_num_entries, 1);
1545        ocfs2_journal_dirty(handle, dx_root_bh);
1546
1547out:
1548        return ret;
1549}
1550
1551static void ocfs2_remove_block_from_free_list(struct inode *dir,
1552                                       handle_t *handle,
1553                                       struct ocfs2_dir_lookup_result *lookup)
1554{
1555        struct ocfs2_dir_block_trailer *trailer, *prev;
1556        struct ocfs2_dx_root_block *dx_root;
1557        struct buffer_head *bh;
1558
1559        trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1560
1561        if (ocfs2_free_list_at_root(lookup)) {
1562                bh = lookup->dl_dx_root_bh;
1563                dx_root = (struct ocfs2_dx_root_block *)bh->b_data;
1564                dx_root->dr_free_blk = trailer->db_free_next;
1565        } else {
1566                bh = lookup->dl_prev_leaf_bh;
1567                prev = ocfs2_trailer_from_bh(bh, dir->i_sb);
1568                prev->db_free_next = trailer->db_free_next;
1569        }
1570
1571        trailer->db_free_rec_len = cpu_to_le16(0);
1572        trailer->db_free_next = cpu_to_le64(0);
1573
1574        ocfs2_journal_dirty(handle, bh);
1575        ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1576}
1577
1578/*
1579 * This expects that a journal write has been reserved on
1580 * lookup->dl_prev_leaf_bh or lookup->dl_dx_root_bh
1581 */
1582static void ocfs2_recalc_free_list(struct inode *dir, handle_t *handle,
1583                                   struct ocfs2_dir_lookup_result *lookup)
1584{
1585        int max_rec_len;
1586        struct ocfs2_dir_block_trailer *trailer;
1587
1588        /* Walk dl_leaf_bh to figure out what the new free rec_len is. */
1589        max_rec_len = ocfs2_find_max_rec_len(dir->i_sb, lookup->dl_leaf_bh);
1590        if (max_rec_len) {
1591                /*
1592                 * There's still room in this block, so no need to remove it
1593                 * from the free list. In this case, we just want to update
1594                 * the rec len accounting.
1595                 */
1596                trailer = ocfs2_trailer_from_bh(lookup->dl_leaf_bh, dir->i_sb);
1597                trailer->db_free_rec_len = cpu_to_le16(max_rec_len);
1598                ocfs2_journal_dirty(handle, lookup->dl_leaf_bh);
1599        } else {
1600                ocfs2_remove_block_from_free_list(dir, handle, lookup);
1601        }
1602}
1603
/*
 * We don't always have a dentry for what we want to add, so callers
 * such as the orphan dir code can call this instead.
 *
 * The lookup context must have been filled in by
 * ocfs2_prepare_dir_for_insert.
 */
1610int __ocfs2_add_entry(handle_t *handle,
1611                      struct inode *dir,
1612                      const char *name, int namelen,
1613                      struct inode *inode, u64 blkno,
1614                      struct buffer_head *parent_fe_bh,
1615                      struct ocfs2_dir_lookup_result *lookup)
1616{
1617        unsigned long offset;
1618        unsigned short rec_len;
1619        struct ocfs2_dir_entry *de, *de1;
1620        struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1621        struct super_block *sb = dir->i_sb;
1622        int retval, status;
1623        unsigned int size = sb->s_blocksize;
1624        struct buffer_head *insert_bh = lookup->dl_leaf_bh;
1625        char *data_start = insert_bh->b_data;
1626
1627        if (!namelen)
1628                return -EINVAL;
1629
1630        if (ocfs2_dir_indexed(dir)) {
1631                struct buffer_head *bh;
1632
1633                /*
1634                 * An indexed dir may require that we update the free space
1635                 * list. Reserve a write to the previous node in the list so
1636                 * that we don't fail later.
1637                 *
1638                 * XXX: This can be either a dx_root_block, or an unindexed
1639                 * directory tree leaf block.
1640                 */
1641                if (ocfs2_free_list_at_root(lookup)) {
1642                        bh = lookup->dl_dx_root_bh;
1643                        retval = ocfs2_journal_access_dr(handle,
1644                                                 INODE_CACHE(dir), bh,
1645                                                 OCFS2_JOURNAL_ACCESS_WRITE);
1646                } else {
1647                        bh = lookup->dl_prev_leaf_bh;
1648                        retval = ocfs2_journal_access_db(handle,
1649                                                 INODE_CACHE(dir), bh,
1650                                                 OCFS2_JOURNAL_ACCESS_WRITE);
1651                }
1652                if (retval) {
1653                        mlog_errno(retval);
1654                        return retval;
1655                }
1656        } else if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1657                data_start = di->id2.i_data.id_data;
1658                size = i_size_read(dir);
1659
1660                BUG_ON(insert_bh != parent_fe_bh);
1661        }
1662
1663        rec_len = OCFS2_DIR_REC_LEN(namelen);
1664        offset = 0;
1665        de = (struct ocfs2_dir_entry *) data_start;
1666        while (1) {
1667                BUG_ON((char *)de >= (size + data_start));
1668
1669                /* These checks should've already been passed by the
1670                 * prepare function, but I guess we can leave them
1671                 * here anyway. */
1672                if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
1673                        retval = -ENOENT;
1674                        goto bail;
1675                }
1676                if (ocfs2_match(namelen, name, de)) {
1677                        retval = -EEXIST;
1678                        goto bail;
1679                }
1680
1681                /* We're guaranteed that we should have space, so we
1682                 * can't possibly have hit the trailer...right? */
1683                mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
1684                                "Hit dir trailer trying to insert %.*s "
1685                                "(namelen %d) into directory %llu.  "
1686                                "offset is %lu, trailer offset is %d\n",
1687                                namelen, name, namelen,
1688                                (unsigned long long)parent_fe_bh->b_blocknr,
1689                                offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
1690
1691                if (ocfs2_dirent_would_fit(de, rec_len)) {
1692                        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
1693                        retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
1694                        if (retval < 0) {
1695                                mlog_errno(retval);
1696                                goto bail;
1697                        }
1698
1699                        if (insert_bh == parent_fe_bh)
1700                                status = ocfs2_journal_access_di(handle,
1701                                                                 INODE_CACHE(dir),
1702                                                                 insert_bh,
1703                                                                 OCFS2_JOURNAL_ACCESS_WRITE);
1704                        else {
1705                                status = ocfs2_journal_access_db(handle,
1706                                                                 INODE_CACHE(dir),
1707                                                                 insert_bh,
1708                                              OCFS2_JOURNAL_ACCESS_WRITE);
1709
1710                                if (ocfs2_dir_indexed(dir)) {
1711                                        status = ocfs2_dx_dir_insert(dir,
1712                                                                handle,
1713                                                                lookup);
1714                                        if (status) {
1715                                                mlog_errno(status);
1716                                                goto bail;
1717                                        }
1718                                }
1719                        }
1720
1721                        /* By now the buffer is marked for journaling */
1722                        offset += le16_to_cpu(de->rec_len);
1723                        if (le64_to_cpu(de->inode)) {
1724                                de1 = (struct ocfs2_dir_entry *)((char *) de +
1725                                        OCFS2_DIR_REC_LEN(de->name_len));
1726                                de1->rec_len =
1727                                        cpu_to_le16(le16_to_cpu(de->rec_len) -
1728                                        OCFS2_DIR_REC_LEN(de->name_len));
1729                                de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
1730                                de = de1;
1731                        }
1732                        de->file_type = OCFS2_FT_UNKNOWN;
1733                        if (blkno) {
1734                                de->inode = cpu_to_le64(blkno);
1735                                ocfs2_set_de_type(de, inode->i_mode);
1736                        } else
1737                                de->inode = 0;
1738                        de->name_len = namelen;
1739                        memcpy(de->name, name, namelen);
1740
1741                        if (ocfs2_dir_indexed(dir))
1742                                ocfs2_recalc_free_list(dir, handle, lookup);
1743
1744                        dir->i_version++;
1745                        ocfs2_journal_dirty(handle, insert_bh);
1746                        retval = 0;
1747                        goto bail;
1748                }
1749
1750                offset += le16_to_cpu(de->rec_len);
1751                de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
1752        }
1753
1754        /* when you think about it, the assert above should prevent us
1755         * from ever getting here. */
1756        retval = -ENOSPC;
1757bail:
1758        if (retval)
1759                mlog_errno(retval);
1760
1761        return retval;
1762}
1763
1764static int ocfs2_dir_foreach_blk_id(struct inode *inode,
1765                                    u64 *f_version,
1766                                    loff_t *f_pos, void *priv,
1767                                    filldir_t filldir, int *filldir_err)
1768{
1769        int ret, i, filldir_ret;
1770        unsigned long offset = *f_pos;
1771        struct buffer_head *di_bh = NULL;
1772        struct ocfs2_dinode *di;
1773        struct ocfs2_inline_data *data;
1774        struct ocfs2_dir_entry *de;
1775
1776        ret = ocfs2_read_inode_block(inode, &di_bh);
1777        if (ret) {
1778                mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
1779                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1780                goto out;
1781        }
1782
1783        di = (struct ocfs2_dinode *)di_bh->b_data;
1784        data = &di->id2.i_data;
1785
1786        while (*f_pos < i_size_read(inode)) {
1787revalidate:
1788                /* If the dir block has changed since the last call to
1789                 * readdir(2), then we might be pointing to an invalid
1790                 * dirent right now.  Scan from the start of the block
1791                 * to make sure. */
1792                if (*f_version != inode->i_version) {
1793                        for (i = 0; i < i_size_read(inode) && i < offset; ) {
1794                                de = (struct ocfs2_dir_entry *)
1795                                        (data->id_data + i);
1796                                /* It's too expensive to do a full
1797                                 * dirent test each time round this
1798                                 * loop, but we do have to test at
1799                                 * least that it is non-zero.  A
1800                                 * failure will be detected in the
1801                                 * dirent test below. */
1802                                if (le16_to_cpu(de->rec_len) <
1803                                    OCFS2_DIR_REC_LEN(1))
1804                                        break;
1805                                i += le16_to_cpu(de->rec_len);
1806                        }
1807                        *f_pos = offset = i;
1808                        *f_version = inode->i_version;
1809                }
1810
1811                de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
1812                if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
1813                        /* On error, skip the f_pos to the end. */
1814                        *f_pos = i_size_read(inode);
1815                        goto out;
1816                }
1817                offset += le16_to_cpu(de->rec_len);
1818                if (le64_to_cpu(de->inode)) {
1819                        /* We might block in the next section
1820                         * if the data destination is
1821                         * currently swapped out.  So, use a
1822                         * version stamp to detect whether or
1823                         * not the directory has been modified
1824                         * during the copy operation.
1825                         */
1826                        u64 version = *f_version;
1827                        unsigned char d_type = DT_UNKNOWN;
1828
1829                        if (de->file_type < OCFS2_FT_MAX)
1830                                d_type = ocfs2_filetype_table[de->file_type];
1831
1832                        filldir_ret = filldir(priv, de->name,
1833                                              de->name_len,
1834                                              *f_pos,
1835                                              le64_to_cpu(de->inode),
1836                                              d_type);
1837                        if (filldir_ret) {
1838                                if (filldir_err)
1839                                        *filldir_err = filldir_ret;
1840                                break;
1841                        }
1842                        if (version != *f_version)
1843                                goto revalidate;
1844                }
1845                *f_pos += le16_to_cpu(de->rec_len);
1846        }
1847
1848out:
1849        brelse(di_bh);
1850
1851        return 0;
1852}
1853
1854/*
1855 * NOTE: This function can be called against unindexed directories,
1856 * and indexed ones.
1857 */
1858static int ocfs2_dir_foreach_blk_el(struct inode *inode,
1859                                    u64 *f_version,
1860                                    loff_t *f_pos, void *priv,
1861                                    filldir_t filldir, int *filldir_err)
1862{
1863        int error = 0;
1864        unsigned long offset, blk, last_ra_blk = 0;
1865        int i, stored;
1866        struct buffer_head * bh, * tmp;
1867        struct ocfs2_dir_entry * de;
1868        struct super_block * sb = inode->i_sb;
1869        unsigned int ra_sectors = 16;
1870
1871        stored = 0;
1872        bh = NULL;
1873
1874        offset = (*f_pos) & (sb->s_blocksize - 1);
1875
1876        while (!error && !stored && *f_pos < i_size_read(inode)) {
1877                blk = (*f_pos) >> sb->s_blocksize_bits;
1878                if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
1879                        /* Skip the corrupt dirblock and keep trying */
1880                        *f_pos += sb->s_blocksize - offset;
1881                        continue;
1882                }
1883
1884                /* The idea here is to begin with 8k read-ahead and to stay
1885                 * 4k ahead of our current position.
1886                 *
1887                 * TODO: Use the pagecache for this. We just need to
1888                 * make sure it's cluster-safe... */
1889                if (!last_ra_blk
1890                    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
1891                        for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
1892                             i > 0; i--) {
1893                                tmp = NULL;
1894                                if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
1895                                                          OCFS2_BH_READAHEAD))
1896                                        brelse(tmp);
1897                        }
1898                        last_ra_blk = blk;
1899                        ra_sectors = 8;
1900                }
1901
1902revalidate:
1903                /* If the dir block has changed since the last call to
1904                 * readdir(2), then we might be pointing to an invalid
1905                 * dirent right now.  Scan from the start of the block
1906                 * to make sure. */
1907                if (*f_version != inode->i_version) {
1908                        for (i = 0; i < sb->s_blocksize && i < offset; ) {
1909                                de = (struct ocfs2_dir_entry *) (bh->b_data + i);
1910                                /* It's too expensive to do a full
1911                                 * dirent test each time round this
1912                                 * loop, but we do have to test at
1913                                 * least that it is non-zero.  A
1914                                 * failure will be detected in the
1915                                 * dirent test below. */
1916                                if (le16_to_cpu(de->rec_len) <
1917                                    OCFS2_DIR_REC_LEN(1))
1918                                        break;
1919                                i += le16_to_cpu(de->rec_len);
1920                        }
1921                        offset = i;
1922                        *f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
1923                                | offset;
1924                        *f_version = inode->i_version;
1925                }
1926
1927                while (!error && *f_pos < i_size_read(inode)
1928                       && offset < sb->s_blocksize) {
1929                        de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
1930                        if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
1931                                /* On error, skip the f_pos to the
1932                                   next block. */
1933                                *f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
1934                                brelse(bh);
1935                                goto out;
1936                        }
1937                        offset += le16_to_cpu(de->rec_len);
1938                        if (le64_to_cpu(de->inode)) {
1939                                /* We might block in the next section
1940                                 * if the data destination is
1941                                 * currently swapped out.  So, use a
1942                                 * version stamp to detect whether or
1943                                 * not the directory has been modified
1944                                 * during the copy operation.
1945                                 */
1946                                unsigned long version = *f_version;
1947                                unsigned char d_type = DT_UNKNOWN;
1948
1949                                if (de->file_type < OCFS2_FT_MAX)
1950                                        d_type = ocfs2_filetype_table[de->file_type];
1951                                error = filldir(priv, de->name,
1952                                                de->name_len,
1953                                                *f_pos,
1954                                                le64_to_cpu(de->inode),
1955                                                d_type);
1956                                if (error) {
1957                                        if (filldir_err)
1958                                                *filldir_err = error;
1959                                        break;
1960                                }
1961                                if (version != *f_version)
1962                                        goto revalidate;
1963                                stored ++;
1964                        }
1965                        *f_pos += le16_to_cpu(de->rec_len);
1966                }
1967                offset = 0;
1968                brelse(bh);
1969                bh = NULL;
1970        }
1971
1972        stored = 0;
1973out:
1974        return stored;
1975}
1976
1977static int ocfs2_dir_foreach_blk(struct inode *inode, u64 *f_version,
1978                                 loff_t *f_pos, void *priv, filldir_t filldir,
1979                                 int *filldir_err)
1980{
1981        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1982                return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
1983                                                filldir, filldir_err);
1984
1985        return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
1986                                        filldir_err);
1987}
1988
1989/*
1990 * This is intended to be called from inside other kernel functions,
1991 * so we fake some arguments.
1992 */
1993int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
1994                      filldir_t filldir)
1995{
1996        int ret = 0, filldir_err = 0;
1997        u64 version = inode->i_version;
1998
1999        while (*f_pos < i_size_read(inode)) {
2000                ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
2001                                            filldir, &filldir_err);
2002                if (ret || filldir_err)
2003                        break;
2004        }
2005
2006        if (ret > 0)
2007                ret = -EIO;
2008
2009        return 0;
2010}
2011
2012/*
2013 * ocfs2_readdir()
2014 *
2015 */
2016int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2017{
2018        int error = 0;
2019        struct inode *inode = filp->f_path.dentry->d_inode;
2020        int lock_level = 0;
2021
2022        trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
2023
2024        error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2025        if (lock_level && error >= 0) {
2026                /* We release EX lock which used to update atime
2027                 * and get PR lock again to reduce contention
2028                 * on commonly accessed directories. */
2029                ocfs2_inode_unlock(inode, 1);
2030                lock_level = 0;
2031                error = ocfs2_inode_lock(inode, NULL, 0);
2032        }
2033        if (error < 0) {
2034                if (error != -ENOENT)
2035                        mlog_errno(error);
2036                /* we haven't got any yet, so propagate the error. */
2037                goto bail_nolock;
2038        }
2039
2040        error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
2041                                      dirent, filldir, NULL);
2042
2043        ocfs2_inode_unlock(inode, lock_level);
2044        if (error)
2045                mlog_errno(error);
2046
2047bail_nolock:
2048
2049        return error;
2050}
2051
2052/*
2053 * NOTE: this should always be called with parent dir i_mutex taken.
2054 */
2055int ocfs2_find_files_on_disk(const char *name,
2056                             int namelen,
2057                             u64 *blkno,
2058                             struct inode *inode,
2059                             struct ocfs2_dir_lookup_result *lookup)
2060{
2061        int status = -ENOENT;
2062
2063        trace_ocfs2_find_files_on_disk(namelen, name, blkno,
2064                                (unsigned long long)OCFS2_I(inode)->ip_blkno);
2065
2066        status = ocfs2_find_entry(name, namelen, inode, lookup);
2067        if (status)
2068                goto leave;
2069
2070        *blkno = le64_to_cpu(lookup->dl_entry->inode);
2071
2072        status = 0;
2073leave:
2074
2075        return status;
2076}
2077
2078/*
2079 * Convenience function for callers which just want the block number
2080 * mapped to a name and don't require the full dirent info, etc.
2081 */
2082int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
2083                               int namelen, u64 *blkno)
2084{
2085        int ret;
2086        struct ocfs2_dir_lookup_result lookup = { NULL, };
2087
2088        ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &lookup);
2089        ocfs2_free_dir_lookup_result(&lookup);
2090
2091        return ret;
2092}
2093
2094/* Check for a name within a directory.
2095 *
2096 * Return 0 if the name does not exist
2097 * Return -EEXIST if the directory contains the name
2098 *
2099 * Callers should have i_mutex + a cluster lock on dir
2100 */
2101int ocfs2_check_dir_for_entry(struct inode *dir,
2102                              const char *name,
2103                              int namelen)
2104{
2105        int ret;
2106        struct ocfs2_dir_lookup_result lookup = { NULL, };
2107
2108        trace_ocfs2_check_dir_for_entry(
2109                (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
2110
2111        ret = -EEXIST;
2112        if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
2113                goto bail;
2114
2115        ret = 0;
2116bail:
2117        ocfs2_free_dir_lookup_result(&lookup);
2118
2119        if (ret)
2120                mlog_errno(ret);
2121        return ret;
2122}
2123
/*
 * Bookkeeping shared between ocfs2_empty_dir(), ocfs2_empty_dir_dx() and
 * the ocfs2_empty_dir_filldir() callback.  All fields are used as flags.
 */
struct ocfs2_empty_dir_priv {
        /* "." was found at position 0 */
        unsigned int seen_dot;
        /* ".." was found at position OCFS2_DIR_REC_LEN(1) */
        unsigned int seen_dot_dot;
        /* something other than a well-placed "." / ".." was found */
        unsigned int seen_other;
        /* set by ocfs2_empty_dir_dx() for indexed directories */
        unsigned int dx_dir;
};
2130static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
2131                                   loff_t pos, u64 ino, unsigned type)
2132{
2133        struct ocfs2_empty_dir_priv *p = priv;
2134
2135        /*
2136         * Check the positions of "." and ".." records to be sure
2137         * they're in the correct place.
2138         *
2139         * Indexed directories don't need to proceed past the first
2140         * two entries, so we end the scan after seeing '..'. Despite
2141         * that, we allow the scan to proceed In the event that we
2142         * have a corrupted indexed directory (no dot or dot dot
2143         * entries). This allows us to double check for existing
2144         * entries which might not have been found in the index.
2145         */
2146        if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
2147                p->seen_dot = 1;
2148                return 0;
2149        }
2150
2151        if (name_len == 2 && !strncmp("..", name, 2) &&
2152            pos == OCFS2_DIR_REC_LEN(1)) {
2153                p->seen_dot_dot = 1;
2154
2155                if (p->dx_dir && p->seen_dot)
2156                        return 1;
2157
2158                return 0;
2159        }
2160
2161        p->seen_other = 1;
2162        return 1;
2163}
2164
2165static int ocfs2_empty_dir_dx(struct inode *inode,
2166                              struct ocfs2_empty_dir_priv *priv)
2167{
2168        int ret;
2169        struct buffer_head *di_bh = NULL;
2170        struct buffer_head *dx_root_bh = NULL;
2171        struct ocfs2_dinode *di;
2172        struct ocfs2_dx_root_block *dx_root;
2173
2174        priv->dx_dir = 1;
2175
2176        ret = ocfs2_read_inode_block(inode, &di_bh);
2177        if (ret) {
2178                mlog_errno(ret);
2179                goto out;
2180        }
2181        di = (struct ocfs2_dinode *)di_bh->b_data;
2182
2183        ret = ocfs2_read_dx_root(inode, di, &dx_root_bh);
2184        if (ret) {
2185                mlog_errno(ret);
2186                goto out;
2187        }
2188        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2189
2190        if (le32_to_cpu(dx_root->dr_num_entries) != 2)
2191                priv->seen_other = 1;
2192
2193out:
2194        brelse(di_bh);
2195        brelse(dx_root_bh);
2196        return ret;
2197}
2198
2199/*
2200 * routine to check that the specified directory is empty (for rmdir)
2201 *
2202 * Returns 1 if dir is empty, zero otherwise.
2203 *
2204 * XXX: This is a performance problem for unindexed directories.
2205 */
2206int ocfs2_empty_dir(struct inode *inode)
2207{
2208        int ret;
2209        loff_t start = 0;
2210        struct ocfs2_empty_dir_priv priv;
2211
2212        memset(&priv, 0, sizeof(priv));
2213
2214        if (ocfs2_dir_indexed(inode)) {
2215                ret = ocfs2_empty_dir_dx(inode, &priv);
2216                if (ret)
2217                        mlog_errno(ret);
2218                /*
2219                 * We still run ocfs2_dir_foreach to get the checks
2220                 * for "." and "..".
2221                 */
2222        }
2223
2224        ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
2225        if (ret)
2226                mlog_errno(ret);
2227
2228        if (!priv.seen_dot || !priv.seen_dot_dot) {
2229                mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
2230                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2231                /*
2232                 * XXX: Is it really safe to allow an unlink to continue?
2233                 */
2234                return 1;
2235        }
2236
2237        return !priv.seen_other;
2238}
2239
2240/*
2241 * Fills "." and ".." dirents in a new directory block. Returns dirent for
2242 * "..", which might be used during creation of a directory with a trailing
2243 * header. It is otherwise safe to ignore the return code.
2244 */
2245static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
2246                                                          struct inode *parent,
2247                                                          char *start,
2248                                                          unsigned int size)
2249{
2250        struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
2251
2252        de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
2253        de->name_len = 1;
2254        de->rec_len =
2255                cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
2256        strcpy(de->name, ".");
2257        ocfs2_set_de_type(de, S_IFDIR);
2258
2259        de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
2260        de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
2261        de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
2262        de->name_len = 2;
2263        strcpy(de->name, "..");
2264        ocfs2_set_de_type(de, S_IFDIR);
2265
2266        return de;
2267}
2268
2269/*
2270 * This works together with code in ocfs2_mknod_locked() which sets
2271 * the inline-data flag and initializes the inline-data section.
2272 */
2273static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2274                                 handle_t *handle,
2275                                 struct inode *parent,
2276                                 struct inode *inode,
2277                                 struct buffer_head *di_bh)
2278{
2279        int ret;
2280        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2281        struct ocfs2_inline_data *data = &di->id2.i_data;
2282        unsigned int size = le16_to_cpu(data->id_count);
2283
2284        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2285                                      OCFS2_JOURNAL_ACCESS_WRITE);
2286        if (ret) {
2287                mlog_errno(ret);
2288                goto out;
2289        }
2290
2291        ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
2292        ocfs2_journal_dirty(handle, di_bh);
2293
2294        i_size_write(inode, size);
2295        inode->i_nlink = 2;
2296        inode->i_blocks = ocfs2_inode_sector_count(inode);
2297
2298        ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
2299        if (ret < 0)
2300                mlog_errno(ret);
2301
2302out:
2303        return ret;
2304}
2305
2306static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2307                                 handle_t *handle,
2308                                 struct inode *parent,
2309                                 struct inode *inode,
2310                                 struct buffer_head *fe_bh,
2311                                 struct ocfs2_alloc_context *data_ac,
2312                                 struct buffer_head **ret_new_bh)
2313{
2314        int status;
2315        unsigned int size = osb->sb->s_blocksize;
2316        struct buffer_head *new_bh = NULL;
2317        struct ocfs2_dir_entry *de;
2318
2319        if (ocfs2_new_dir_wants_trailer(inode))
2320                size = ocfs2_dir_trailer_blk_off(parent->i_sb);
2321
2322        status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
2323                                     data_ac, NULL, &new_bh);
2324        if (status < 0) {
2325                mlog_errno(status);
2326                goto bail;
2327        }
2328
2329        ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2330
2331        status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
2332                                         OCFS2_JOURNAL_ACCESS_CREATE);
2333        if (status < 0) {
2334                mlog_errno(status);
2335                goto bail;
2336        }
2337        memset(new_bh->b_data, 0, osb->sb->s_blocksize);
2338
2339        de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
2340        if (ocfs2_new_dir_wants_trailer(inode)) {
2341                int size = le16_to_cpu(de->rec_len);
2342
2343                /*
2344                 * Figure out the size of the hole left over after
2345                 * insertion of '.' and '..'. The trailer wants this
2346                 * information.
2347                 */
2348                size -= OCFS2_DIR_REC_LEN(2);
2349                size -= sizeof(struct ocfs2_dir_block_trailer);
2350
2351                ocfs2_init_dir_trailer(inode, new_bh, size);
2352        }
2353
2354        ocfs2_journal_dirty(handle, new_bh);
2355
2356        i_size_write(inode, inode->i_sb->s_blocksize);
2357        inode->i_nlink = 2;
2358        inode->i_blocks = ocfs2_inode_sector_count(inode);
2359        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
2360        if (status < 0) {
2361                mlog_errno(status);
2362                goto bail;
2363        }
2364
2365        status = 0;
2366        if (ret_new_bh) {
2367                *ret_new_bh = new_bh;
2368                new_bh = NULL;
2369        }
2370bail:
2371        brelse(new_bh);
2372
2373        return status;
2374}
2375
2376static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2377                                     handle_t *handle, struct inode *dir,
2378                                     struct buffer_head *di_bh,
2379                                     struct buffer_head *dirdata_bh,
2380                                     struct ocfs2_alloc_context *meta_ac,
2381                                     int dx_inline, u32 num_entries,
2382                                     struct buffer_head **ret_dx_root_bh)
2383{
2384        int ret;
2385        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2386        u16 dr_suballoc_bit;
2387        u64 suballoc_loc, dr_blkno;
2388        unsigned int num_bits;
2389        struct buffer_head *dx_root_bh = NULL;
2390        struct ocfs2_dx_root_block *dx_root;
2391        struct ocfs2_dir_block_trailer *trailer =
2392                ocfs2_trailer_from_bh(dirdata_bh, dir->i_sb);
2393
2394        ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
2395                                   &dr_suballoc_bit, &num_bits, &dr_blkno);
2396        if (ret) {
2397                mlog_errno(ret);
2398                goto out;
2399        }
2400
2401        trace_ocfs2_dx_dir_attach_index(
2402                                (unsigned long long)OCFS2_I(dir)->ip_blkno,
2403                                (unsigned long long)dr_blkno);
2404
2405        dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2406        if (dx_root_bh == NULL) {
2407                ret = -EIO;
2408                goto out;
2409        }
2410        ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
2411
2412        ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
2413                                      OCFS2_JOURNAL_ACCESS_CREATE);
2414        if (ret < 0) {
2415                mlog_errno(ret);
2416                goto out;
2417        }
2418
2419        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2420        memset(dx_root, 0, osb->sb->s_blocksize);
2421        strcpy(dx_root->dr_signature, OCFS2_DX_ROOT_SIGNATURE);
2422        dx_root->dr_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
2423        dx_root->dr_suballoc_loc = cpu_to_le64(suballoc_loc);
2424        dx_root->dr_suballoc_bit = cpu_to_le16(dr_suballoc_bit);
2425        dx_root->dr_fs_generation = cpu_to_le32(osb->fs_generation);
2426        dx_root->dr_blkno = cpu_to_le64(dr_blkno);
2427        dx_root->dr_dir_blkno = cpu_to_le64(OCFS2_I(dir)->ip_blkno);
2428        dx_root->dr_num_entries = cpu_to_le32(num_entries);
2429        if (le16_to_cpu(trailer->db_free_rec_len))
2430                dx_root->dr_free_blk = cpu_to_le64(dirdata_bh->b_blocknr);
2431        else
2432                dx_root->dr_free_blk = cpu_to_le64(0);
2433
2434        if (dx_inline) {
2435                dx_root->dr_flags |= OCFS2_DX_FLAG_INLINE;
2436                dx_root->dr_entries.de_count =
2437                        cpu_to_le16(ocfs2_dx_entries_per_root(osb->sb));
2438        } else {
2439                dx_root->dr_list.l_count =
2440                        cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
2441        }
2442        ocfs2_journal_dirty(handle, dx_root_bh);
2443
2444        ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2445                                      OCFS2_JOURNAL_ACCESS_CREATE);
2446        if (ret) {
2447                mlog_errno(ret);
2448                goto out;
2449        }
2450
2451        di->i_dx_root = cpu_to_le64(dr_blkno);
2452
2453        spin_lock(&OCFS2_I(dir)->ip_lock);
2454        OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL;
2455        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
2456        spin_unlock(&OCFS2_I(dir)->ip_lock);
2457
2458        ocfs2_journal_dirty(handle, di_bh);
2459
2460        *ret_dx_root_bh = dx_root_bh;
2461        dx_root_bh = NULL;
2462
2463out:
2464        brelse(dx_root_bh);
2465        return ret;
2466}
2467
2468static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2469                                       handle_t *handle, struct inode *dir,
2470                                       struct buffer_head **dx_leaves,
2471                                       int num_dx_leaves, u64 start_blk)
2472{
2473        int ret, i;
2474        struct ocfs2_dx_leaf *dx_leaf;
2475        struct buffer_head *bh;
2476
2477        for (i = 0; i < num_dx_leaves; i++) {
2478                bh = sb_getblk(osb->sb, start_blk + i);
2479                if (bh == NULL) {
2480                        ret = -EIO;
2481                        goto out;
2482                }
2483                dx_leaves[i] = bh;
2484
2485                ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
2486
2487                ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
2488                                              OCFS2_JOURNAL_ACCESS_CREATE);
2489                if (ret < 0) {
2490                        mlog_errno(ret);
2491                        goto out;
2492                }
2493
2494                dx_leaf = (struct ocfs2_dx_leaf *) bh->b_data;
2495
2496                memset(dx_leaf, 0, osb->sb->s_blocksize);
2497                strcpy(dx_leaf->dl_signature, OCFS2_DX_LEAF_SIGNATURE);
2498                dx_leaf->dl_fs_generation = cpu_to_le32(osb->fs_generation);
2499                dx_leaf->dl_blkno = cpu_to_le64(bh->b_blocknr);
2500                dx_leaf->dl_list.de_count =
2501                        cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2502
2503                trace_ocfs2_dx_dir_format_cluster(
2504                                (unsigned long long)OCFS2_I(dir)->ip_blkno,
2505                                (unsigned long long)bh->b_blocknr,
2506                                le16_to_cpu(dx_leaf->dl_list.de_count));
2507
2508                ocfs2_journal_dirty(handle, bh);
2509        }
2510
2511        ret = 0;
2512out:
2513        return ret;
2514}
2515
2516/*
2517 * Allocates and formats a new cluster for use in an indexed dir
2518 * leaf. This version will not do the extent insert, so that it can be
2519 * used by operations which need careful ordering.
2520 */
2521static int __ocfs2_dx_dir_new_cluster(struct inode *dir,
2522                                      u32 cpos, handle_t *handle,
2523                                      struct ocfs2_alloc_context *data_ac,
2524                                      struct buffer_head **dx_leaves,
2525                                      int num_dx_leaves, u64 *ret_phys_blkno)
2526{
2527        int ret;
2528        u32 phys, num;
2529        u64 phys_blkno;
2530        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2531
2532        /*
2533         * XXX: For create, this should claim cluster for the index
2534         * *before* the unindexed insert so that we have a better
2535         * chance of contiguousness as the directory grows in number
2536         * of entries.
2537         */
2538        ret = __ocfs2_claim_clusters(handle, data_ac, 1, 1, &phys, &num);
2539        if (ret) {
2540                mlog_errno(ret);
2541                goto out;
2542        }
2543
2544        /*
2545         * Format the new cluster first. That way, we're inserting
2546         * valid data.
2547         */
2548        phys_blkno = ocfs2_clusters_to_blocks(osb->sb, phys);
2549        ret = ocfs2_dx_dir_format_cluster(osb, handle, dir, dx_leaves,
2550                                          num_dx_leaves, phys_blkno);
2551        if (ret) {
2552                mlog_errno(ret);
2553                goto out;
2554        }
2555
2556        *ret_phys_blkno = phys_blkno;
2557out:
2558        return ret;
2559}
2560
2561static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2562                                    struct ocfs2_extent_tree *et,
2563                                    u32 cpos, handle_t *handle,
2564                                    struct ocfs2_alloc_context *data_ac,
2565                                    struct ocfs2_alloc_context *meta_ac,
2566                                    struct buffer_head **dx_leaves,
2567                                    int num_dx_leaves)
2568{
2569        int ret;
2570        u64 phys_blkno;
2571
2572        ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2573                                         num_dx_leaves, &phys_blkno);
2574        if (ret) {
2575                mlog_errno(ret);
2576                goto out;
2577        }
2578
2579        ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
2580                                  meta_ac);
2581        if (ret)
2582                mlog_errno(ret);
2583out:
2584        return ret;
2585}
2586
2587static struct buffer_head **ocfs2_dx_dir_kmalloc_leaves(struct super_block *sb,
2588                                                        int *ret_num_leaves)
2589{
2590        int num_dx_leaves = ocfs2_clusters_to_blocks(sb, 1);
2591        struct buffer_head **dx_leaves;
2592
2593        dx_leaves = kcalloc(num_dx_leaves, sizeof(struct buffer_head *),
2594                            GFP_NOFS);
2595        if (dx_leaves && ret_num_leaves)
2596                *ret_num_leaves = num_dx_leaves;
2597
2598        return dx_leaves;
2599}
2600
2601static int ocfs2_fill_new_dir_dx(struct ocfs2_super *osb,
2602                                 handle_t *handle,
2603                                 struct inode *parent,
2604                                 struct inode *inode,
2605                                 struct buffer_head *di_bh,
2606                                 struct ocfs2_alloc_context *data_ac,
2607                                 struct ocfs2_alloc_context *meta_ac)
2608{
2609        int ret;
2610        struct buffer_head *leaf_bh = NULL;
2611        struct buffer_head *dx_root_bh = NULL;
2612        struct ocfs2_dx_hinfo hinfo;
2613        struct ocfs2_dx_root_block *dx_root;
2614        struct ocfs2_dx_entry_list *entry_list;
2615
2616        /*
2617         * Our strategy is to create the directory as though it were
2618         * unindexed, then add the index block. This works with very
2619         * little complication since the state of a new directory is a
2620         * very well known quantity.
2621         *
2622         * Essentially, we have two dirents ("." and ".."), in the 1st
2623         * block which need indexing. These are easily inserted into
2624         * the index block.
2625         */
2626
2627        ret = ocfs2_fill_new_dir_el(osb, handle, parent, inode, di_bh,
2628                                    data_ac, &leaf_bh);
2629        if (ret) {
2630                mlog_errno(ret);
2631                goto out;
2632        }
2633
2634        ret = ocfs2_dx_dir_attach_index(osb, handle, inode, di_bh, leaf_bh,
2635                                        meta_ac, 1, 2, &dx_root_bh);
2636        if (ret) {
2637                mlog_errno(ret);
2638                goto out;
2639        }
2640        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2641        entry_list = &dx_root->dr_entries;
2642
2643        /* Buffer has been journaled for us by ocfs2_dx_dir_attach_index */
2644        ocfs2_dx_dir_name_hash(inode, ".", 1, &hinfo);
2645        ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2646
2647        ocfs2_dx_dir_name_hash(inode, "..", 2, &hinfo);
2648        ocfs2_dx_entry_list_insert(entry_list, &hinfo, leaf_bh->b_blocknr);
2649
2650out:
2651        brelse(dx_root_bh);
2652        brelse(leaf_bh);
2653        return ret;
2654}
2655
2656int ocfs2_fill_new_dir(struct ocfs2_super *osb,
2657                       handle_t *handle,
2658                       struct inode *parent,
2659                       struct inode *inode,
2660                       struct buffer_head *fe_bh,
2661                       struct ocfs2_alloc_context *data_ac,
2662                       struct ocfs2_alloc_context *meta_ac)
2663
2664{
2665        BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
2666
2667        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2668                return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
2669
2670        if (ocfs2_supports_indexed_dirs(osb))
2671                return ocfs2_fill_new_dir_dx(osb, handle, parent, inode, fe_bh,
2672                                             data_ac, meta_ac);
2673
2674        return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
2675                                     data_ac, NULL);
2676}
2677
2678static int ocfs2_dx_dir_index_block(struct inode *dir,
2679                                    handle_t *handle,
2680                                    struct buffer_head **dx_leaves,
2681                                    int num_dx_leaves,
2682                                    u32 *num_dx_entries,
2683                                    struct buffer_head *dirent_bh)
2684{
2685        int ret = 0, namelen, i;
2686        char *de_buf, *limit;
2687        struct ocfs2_dir_entry *de;
2688        struct buffer_head *dx_leaf_bh;
2689        struct ocfs2_dx_hinfo hinfo;
2690        u64 dirent_blk = dirent_bh->b_blocknr;
2691
2692        de_buf = dirent_bh->b_data;
2693        limit = de_buf + dir->i_sb->s_blocksize;
2694
2695        while (de_buf < limit) {
2696                de = (struct ocfs2_dir_entry *)de_buf;
2697
2698                namelen = de->name_len;
2699                if (!namelen || !de->inode)
2700                        goto inc;
2701
2702                ocfs2_dx_dir_name_hash(dir, de->name, namelen, &hinfo);
2703
2704                i = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), &hinfo);
2705                dx_leaf_bh = dx_leaves[i];
2706
2707                ret = __ocfs2_dx_dir_leaf_insert(dir, handle, &hinfo,
2708                                                 dirent_blk, dx_leaf_bh);
2709                if (ret) {
2710                        mlog_errno(ret);
2711                        goto out;
2712                }
2713
2714                *num_dx_entries = *num_dx_entries + 1;
2715
2716inc:
2717                de_buf += le16_to_cpu(de->rec_len);
2718        }
2719
2720out:
2721        return ret;
2722}
2723
2724/*
2725 * XXX: This expects dx_root_bh to already be part of the transaction.
2726 */
2727static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2728                                         struct buffer_head *dx_root_bh,
2729                                         struct buffer_head *dirent_bh)
2730{
2731        char *de_buf, *limit;
2732        struct ocfs2_dx_root_block *dx_root;
2733        struct ocfs2_dir_entry *de;
2734        struct ocfs2_dx_hinfo hinfo;
2735        u64 dirent_blk = dirent_bh->b_blocknr;
2736
2737        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
2738
2739        de_buf = dirent_bh->b_data;
2740        limit = de_buf + dir->i_sb->s_blocksize;
2741
2742        while (de_buf < limit) {
2743                de = (struct ocfs2_dir_entry *)de_buf;
2744
2745                if (!de->name_len || !de->inode)
2746                        goto inc;
2747
2748                ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2749
2750                trace_ocfs2_dx_dir_index_root_block(
2751                                (unsigned long long)dir->i_ino,
2752                                hinfo.major_hash, hinfo.minor_hash,
2753                                de->name_len, de->name,
2754                                le16_to_cpu(dx_root->dr_entries.de_num_used));
2755
2756                ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2757                                           dirent_blk);
2758
2759                le32_add_cpu(&dx_root->dr_num_entries, 1);
2760inc:
2761                de_buf += le16_to_cpu(de->rec_len);
2762        }
2763}
2764
2765/*
2766 * Count the number of inline directory entries in di_bh and compare
2767 * them against the number of entries we can hold in an inline dx root
2768 * block.
2769 */
2770static int ocfs2_new_dx_should_be_inline(struct inode *dir,
2771                                         struct buffer_head *di_bh)
2772{
2773        int dirent_count = 0;
2774        char *de_buf, *limit;
2775        struct ocfs2_dir_entry *de;
2776        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2777
2778        de_buf = di->id2.i_data.id_data;
2779        limit = de_buf + i_size_read(dir);
2780
2781        while (de_buf < limit) {
2782                de = (struct ocfs2_dir_entry *)de_buf;
2783
2784                if (de->name_len && de->inode)
2785                        dirent_count++;
2786
2787                de_buf += le16_to_cpu(de->rec_len);
2788        }
2789
2790        /* We are careful to leave room for one extra record. */
2791        return dirent_count < ocfs2_dx_entries_per_root(dir->i_sb);
2792}
2793
2794/*
2795 * Expand rec_len of the rightmost dirent in a directory block so that it
2796 * contains the end of our valid space for dirents. We do this during
2797 * expansion from an inline directory to one with extents. The first dir block
2798 * in that case is taken from the inline data portion of the inode block.
2799 *
2800 * This will also return the largest amount of contiguous space for a dirent
2801 * in the block. That value is *not* necessarily the last dirent, even after
2802 * expansion. The directory indexing code wants this value for free space
2803 * accounting. We do this here since we're already walking the entire dir
2804 * block.
2805 *
2806 * We add the dir trailer if this filesystem wants it.
2807 */
2808static unsigned int ocfs2_expand_last_dirent(char *start, unsigned int old_size,
2809                                             struct inode *dir)
2810{
2811        struct super_block *sb = dir->i_sb;
2812        struct ocfs2_dir_entry *de;
2813        struct ocfs2_dir_entry *prev_de;
2814        char *de_buf, *limit;
2815        unsigned int new_size = sb->s_blocksize;
2816        unsigned int bytes, this_hole;
2817        unsigned int largest_hole = 0;
2818
2819        if (ocfs2_new_dir_wants_trailer(dir))
2820                new_size = ocfs2_dir_trailer_blk_off(sb);
2821
2822        bytes = new_size - old_size;
2823
2824        limit = start + old_size;
2825        de_buf = start;
2826        de = (struct ocfs2_dir_entry *)de_buf;
2827        do {
2828                this_hole = ocfs2_figure_dirent_hole(de);
2829                if (this_hole > largest_hole)
2830                        largest_hole = this_hole;
2831
2832                prev_de = de;
2833                de_buf += le16_to_cpu(de->rec_len);
2834                de = (struct ocfs2_dir_entry *)de_buf;
2835        } while (de_buf < limit);
2836
2837        le16_add_cpu(&prev_de->rec_len, bytes);
2838
2839        /* We need to double check this after modification of the final
2840         * dirent. */
2841        this_hole = ocfs2_figure_dirent_hole(prev_de);
2842        if (this_hole > largest_hole)
2843                largest_hole = this_hole;
2844
2845        if (largest_hole >= OCFS2_DIR_MIN_REC_LEN)
2846                return largest_hole;
2847        return 0;
2848}
2849
2850/*
2851 * We allocate enough clusters to fulfill "blocks_wanted", but set
2852 * i_size to exactly one block. Ocfs2_extend_dir() will handle the
2853 * rest automatically for us.
2854 *
2855 * *first_block_bh is a pointer to the 1st data block allocated to the
2856 *  directory.
2857 */
2858static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2859                                   unsigned int blocks_wanted,
2860                                   struct ocfs2_dir_lookup_result *lookup,
2861                                   struct buffer_head **first_block_bh)
2862{
2863        u32 alloc, dx_alloc, bit_off, len, num_dx_entries = 0;
2864        struct super_block *sb = dir->i_sb;
2865        int ret, i, num_dx_leaves = 0, dx_inline = 0,
2866                credits = ocfs2_inline_to_extents_credits(sb);
2867        u64 dx_insert_blkno, blkno,
2868                bytes = blocks_wanted << sb->s_blocksize_bits;
2869        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2870        struct ocfs2_inode_info *oi = OCFS2_I(dir);
2871        struct ocfs2_alloc_context *data_ac = NULL;
2872        struct ocfs2_alloc_context *meta_ac = NULL;
2873        struct buffer_head *dirdata_bh = NULL;
2874        struct buffer_head *dx_root_bh = NULL;
2875        struct buffer_head **dx_leaves = NULL;
2876        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2877        handle_t *handle;
2878        struct ocfs2_extent_tree et;
2879        struct ocfs2_extent_tree dx_et;
2880        int did_quota = 0, bytes_allocated = 0;
2881
2882        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
2883
2884        alloc = ocfs2_clusters_for_bytes(sb, bytes);
2885        dx_alloc = 0;
2886
2887        down_write(&oi->ip_alloc_sem);
2888
2889        if (ocfs2_supports_indexed_dirs(osb)) {
2890                credits += ocfs2_add_dir_index_credits(sb);
2891
2892                dx_inline = ocfs2_new_dx_should_be_inline(dir, di_bh);
2893                if (!dx_inline) {
2894                        /* Add one more cluster for an index leaf */
2895                        dx_alloc++;
2896                        dx_leaves = ocfs2_dx_dir_kmalloc_leaves(sb,
2897                                                                &num_dx_leaves);
2898                        if (!dx_leaves) {
2899                                ret = -ENOMEM;
2900                                mlog_errno(ret);
2901                                goto out;
2902                        }
2903                }
2904
2905                /* This gets us the dx_root */
2906                ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
2907                if (ret) {
2908                        mlog_errno(ret);
2909                        goto out;
2910                }
2911        }
2912
2913        /*
2914         * We should never need more than 2 clusters for the unindexed
2915         * tree - maximum dirent size is far less than one block. In
2916         * fact, the only time we'd need more than one cluster is if
2917         * blocksize == clustersize and the dirent won't fit in the
2918         * extra space that the expansion to a single block gives. As
2919         * of today, that only happens on 4k/4k file systems.
2920         */
2921        BUG_ON(alloc > 2);
2922
2923        ret = ocfs2_reserve_clusters(osb, alloc + dx_alloc, &data_ac);
2924        if (ret) {
2925                mlog_errno(ret);
2926                goto out;
2927        }
2928
2929        /*
2930         * Prepare for worst case allocation scenario of two separate
2931         * extents in the unindexed tree.
2932         */
2933        if (alloc == 2)
2934                credits += OCFS2_SUBALLOC_ALLOC;
2935
2936        handle = ocfs2_start_trans(osb, credits);
2937        if (IS_ERR(handle)) {
2938                ret = PTR_ERR(handle);
2939                mlog_errno(ret);
2940                goto out;
2941        }
2942
2943        ret = dquot_alloc_space_nodirty(dir,
2944                ocfs2_clusters_to_bytes(osb->sb, alloc + dx_alloc));
2945        if (ret)
2946                goto out_commit;
2947        did_quota = 1;
2948
2949        if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
2950                /*
2951                 * Allocate our index cluster first, to maximize the
2952                 * possibility that unindexed leaves grow
2953                 * contiguously.
2954                 */
2955                ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac,
2956                                                 dx_leaves, num_dx_leaves,
2957                                                 &dx_insert_blkno);
2958                if (ret) {
2959                        mlog_errno(ret);
2960                        goto out_commit;
2961                }
2962                bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2963        }
2964
2965        /*
2966         * Try to claim as many clusters as the bitmap can give though
2967         * if we only get one now, that's enough to continue. The rest
2968         * will be claimed after the conversion to extents.
2969         */
2970        if (ocfs2_dir_resv_allowed(osb))
2971                data_ac->ac_resv = &oi->ip_la_data_resv;
2972        ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, &len);
2973        if (ret) {
2974                mlog_errno(ret);
2975                goto out_commit;
2976        }
2977        bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
2978
2979        /*
2980         * Operations are carefully ordered so that we set up the new
2981         * data block first. The conversion from inline data to
2982         * extents follows.
2983         */
2984        blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
2985        dirdata_bh = sb_getblk(sb, blkno);
2986        if (!dirdata_bh) {
2987                ret = -EIO;
2988                mlog_errno(ret);
2989                goto out_commit;
2990        }
2991
2992        ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
2993
2994        ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
2995                                      OCFS2_JOURNAL_ACCESS_CREATE);
2996        if (ret) {
2997                mlog_errno(ret);
2998                goto out_commit;
2999        }
3000
3001        memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
3002        memset(dirdata_bh->b_data + i_size_read(dir), 0,
3003               sb->s_blocksize - i_size_read(dir));
3004        i = ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), dir);
3005        if (ocfs2_new_dir_wants_trailer(dir)) {
3006                /*
3007                 * Prepare the dir trailer up front. It will otherwise look
3008                 * like a valid dirent. Even if inserting the index fails
3009                 * (unlikely), then all we'll have done is given first dir
3010                 * block a small amount of fragmentation.
3011                 */
3012                ocfs2_init_dir_trailer(dir, dirdata_bh, i);
3013        }
3014
3015        ocfs2_journal_dirty(handle, dirdata_bh);
3016
3017        if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
3018                /*
3019                 * Dx dirs with an external cluster need to do this up
3020                 * front. Inline dx root's get handled later, after
3021                 * we've allocated our root block. We get passed back
3022                 * a total number of items so that dr_num_entries can
3023                 * be correctly set once the dx_root has been
3024                 * allocated.
3025                 */
3026                ret = ocfs2_dx_dir_index_block(dir, handle, dx_leaves,
3027                                               num_dx_leaves, &num_dx_entries,
3028                                               dirdata_bh);
3029                if (ret) {
3030                        mlog_errno(ret);
3031                        goto out_commit;
3032                }
3033        }
3034
3035        /*
3036         * Set extent, i_size, etc on the directory. After this, the
3037         * inode should contain the same exact dirents as before and
3038         * be fully accessible from system calls.
3039         *
3040         * We let the later dirent insert modify c/mtime - to the user
3041         * the data hasn't changed.
3042         */
3043        ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
3044                                      OCFS2_JOURNAL_ACCESS_CREATE);
3045        if (ret) {
3046                mlog_errno(ret);
3047                goto out_commit;
3048        }
3049
3050        spin_lock(&oi->ip_lock);
3051        oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
3052        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
3053        spin_unlock(&oi->ip_lock);
3054
3055        ocfs2_dinode_new_extent_list(dir, di);
3056
3057        i_size_write(dir, sb->s_blocksize);
3058        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3059
3060        di->i_size = cpu_to_le64(sb->s_blocksize);
3061        di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
3062        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
3063
3064        /*
3065         * This should never fail as our extent list is empty and all
3066         * related blocks have been journaled already.
3067         */
3068        ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
3069                                  0, NULL);
3070        if (ret) {
3071                mlog_errno(ret);
3072                goto out_commit;
3073        }
3074
3075        /*
3076         * Set i_blocks after the extent insert for the most up to
3077         * date ip_clusters value.
3078         */
3079        dir->i_blocks = ocfs2_inode_sector_count(dir);
3080
3081        ocfs2_journal_dirty(handle, di_bh);
3082
3083        if (ocfs2_supports_indexed_dirs(osb)) {
3084                ret = ocfs2_dx_dir_attach_index(osb, handle, dir, di_bh,
3085                                                dirdata_bh, meta_ac, dx_inline,
3086                                                num_dx_entries, &dx_root_bh);
3087                if (ret) {
3088                        mlog_errno(ret);
3089                        goto out_commit;
3090                }
3091
3092                if (dx_inline) {
3093                        ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3094                                                      dirdata_bh);
3095                } else {
3096                        ocfs2_init_dx_root_extent_tree(&dx_et,
3097                                                       INODE_CACHE(dir),
3098                                                       dx_root_bh);
3099                        ret = ocfs2_insert_extent(handle, &dx_et, 0,
3100                                                  dx_insert_blkno, 1, 0, NULL);
3101                        if (ret)
3102                                mlog_errno(ret);
3103                }
3104        }
3105
3106        /*
3107         * We asked for two clusters, but only got one in the 1st
3108         * pass. Claim the 2nd cluster as a separate extent.
3109         */
3110        if (alloc > len) {
3111                ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
3112                                           &len);
3113                if (ret) {
3114                        mlog_errno(ret);
3115                        goto out_commit;
3116                }
3117                blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
3118
3119                ret = ocfs2_insert_extent(handle, &et, 1,
3120                                          blkno, len, 0, NULL);
3121                if (ret) {
3122                        mlog_errno(ret);
3123                        goto out_commit;
3124                }
3125                bytes_allocated += ocfs2_clusters_to_bytes(dir->i_sb, 1);
3126        }
3127
3128        *first_block_bh = dirdata_bh;
3129        dirdata_bh = NULL;
3130        if (ocfs2_supports_indexed_dirs(osb)) {
3131                unsigned int off;
3132
3133                if (!dx_inline) {
3134                        /*
3135                         * We need to return the correct block within the
3136                         * cluster which should hold our entry.
3137                         */
3138                        off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb),
3139                                                    &lookup->dl_hinfo);
3140                        get_bh(dx_leaves[off]);
3141                        lookup->dl_dx_leaf_bh = dx_leaves[off];
3142                }
3143                lookup->dl_dx_root_bh = dx_root_bh;
3144                dx_root_bh = NULL;
3145        }
3146
3147out_commit:
3148        if (ret < 0 && did_quota)
3149                dquot_free_space_nodirty(dir, bytes_allocated);
3150
3151        ocfs2_commit_trans(osb, handle);
3152
3153out:
3154        up_write(&oi->ip_alloc_sem);
3155        if (data_ac)
3156                ocfs2_free_alloc_context(data_ac);
3157        if (meta_ac)
3158                ocfs2_free_alloc_context(meta_ac);
3159
3160        if (dx_leaves) {
3161                for (i = 0; i < num_dx_leaves; i++)
3162                        brelse(dx_leaves[i]);
3163                kfree(dx_leaves);
3164        }
3165
3166        brelse(dirdata_bh);
3167        brelse(dx_root_bh);
3168
3169        return ret;
3170}
3171
3172/* returns a bh of the 1st new block in the allocation. */
3173static int ocfs2_do_extend_dir(struct super_block *sb,
3174                               handle_t *handle,
3175                               struct inode *dir,
3176                               struct buffer_head *parent_fe_bh,
3177                               struct ocfs2_alloc_context *data_ac,
3178                               struct ocfs2_alloc_context *meta_ac,
3179                               struct buffer_head **new_bh)
3180{
3181        int status;
3182        int extend, did_quota = 0;
3183        u64 p_blkno, v_blkno;
3184
3185        spin_lock(&OCFS2_I(dir)->ip_lock);
3186        extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
3187        spin_unlock(&OCFS2_I(dir)->ip_lock);
3188
3189        if (extend) {
3190                u32 offset = OCFS2_I(dir)->ip_clusters;
3191
3192                status = dquot_alloc_space_nodirty(dir,
3193                                        ocfs2_clusters_to_bytes(sb, 1));
3194                if (status)
3195                        goto bail;
3196                did_quota = 1;
3197
3198                status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
3199                                              1, 0, parent_fe_bh, handle,
3200                                              data_ac, meta_ac, NULL);
3201                BUG_ON(status == -EAGAIN);
3202                if (status < 0) {
3203                        mlog_errno(status);
3204                        goto bail;
3205                }
3206        }
3207
3208        v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
3209        status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
3210        if (status < 0) {
3211                mlog_errno(status);
3212                goto bail;
3213        }
3214
3215        *new_bh = sb_getblk(sb, p_blkno);
3216        if (!*new_bh) {
3217                status = -EIO;
3218                mlog_errno(status);
3219                goto bail;
3220        }
3221        status = 0;
3222bail:
3223        if (did_quota && status < 0)
3224                dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3225        return status;
3226}
3227
3228/*
3229 * Assumes you already have a cluster lock on the directory.
3230 *
3231 * 'blocks_wanted' is only used if we have an inline directory which
3232 * is to be turned into an extent based one. The size of the dirent to
3233 * insert might be larger than the space gained by growing to just one
3234 * block, so we may have to grow the inode by two blocks in that case.
3235 *
3236 * If the directory is already indexed, dx_root_bh must be provided.
3237 */
3238static int ocfs2_extend_dir(struct ocfs2_super *osb,
3239                            struct inode *dir,
3240                            struct buffer_head *parent_fe_bh,
3241                            unsigned int blocks_wanted,
3242                            struct ocfs2_dir_lookup_result *lookup,
3243                            struct buffer_head **new_de_bh)
3244{
3245        int status = 0;
3246        int credits, num_free_extents, drop_alloc_sem = 0;
3247        loff_t dir_i_size;
3248        struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
3249        struct ocfs2_extent_list *el = &fe->id2.i_list;
3250        struct ocfs2_alloc_context *data_ac = NULL;
3251        struct ocfs2_alloc_context *meta_ac = NULL;
3252        handle_t *handle = NULL;
3253        struct buffer_head *new_bh = NULL;
3254        struct ocfs2_dir_entry * de;
3255        struct super_block *sb = osb->sb;
3256        struct ocfs2_extent_tree et;
3257        struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
3258
3259        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3260                /*
3261                 * This would be a code error as an inline directory should
3262                 * never have an index root.
3263                 */
3264                BUG_ON(dx_root_bh);
3265
3266                status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
3267                                                 blocks_wanted, lookup,
3268                                                 &new_bh);
3269                if (status) {
3270                        mlog_errno(status);
3271                        goto bail;
3272                }
3273
3274                /* Expansion from inline to an indexed directory will
3275                 * have given us this. */
3276                dx_root_bh = lookup->dl_dx_root_bh;
3277
3278                if (blocks_wanted == 1) {
3279                        /*
3280                         * If the new dirent will fit inside the space
3281                         * created by pushing out to one block, then
3282                         * we can complete the operation
3283                         * here. Otherwise we have to expand i_size
3284                         * and format the 2nd block below.
3285                         */
3286                        BUG_ON(new_bh == NULL);
3287                        goto bail_bh;
3288                }
3289
3290                /*
3291                 * Get rid of 'new_bh' - we want to format the 2nd
3292                 * data block and return that instead.
3293                 */
3294                brelse(new_bh);
3295                new_bh = NULL;
3296
3297                down_write(&OCFS2_I(dir)->ip_alloc_sem);
3298                drop_alloc_sem = 1;
3299                dir_i_size = i_size_read(dir);
3300                credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
3301                goto do_extend;
3302        }
3303
3304        down_write(&OCFS2_I(dir)->ip_alloc_sem);
3305        drop_alloc_sem = 1;
3306        dir_i_size = i_size_read(dir);
3307        trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
3308                               dir_i_size);
3309
3310        /* dir->i_size is always block aligned. */
3311        spin_lock(&OCFS2_I(dir)->ip_lock);
3312        if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
3313                spin_unlock(&OCFS2_I(dir)->ip_lock);
3314                ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
3315                                              parent_fe_bh);
3316                num_free_extents = ocfs2_num_free_extents(osb, &et);
3317                if (num_free_extents < 0) {
3318                        status = num_free_extents;
3319                        mlog_errno(status);
3320                        goto bail;
3321                }
3322
3323                if (!num_free_extents) {
3324                        status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
3325                        if (status < 0) {
3326                                if (status != -ENOSPC)
3327                                        mlog_errno(status);
3328                                goto bail;
3329                        }
3330                }
3331
3332                status = ocfs2_reserve_clusters(osb, 1, &data_ac);
3333                if (status < 0) {
3334                        if (status != -ENOSPC)
3335                                mlog_errno(status);
3336                        goto bail;
3337                }
3338
3339                if (ocfs2_dir_resv_allowed(osb))
3340                        data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
3341
3342                credits = ocfs2_calc_extend_credits(sb, el, 1);
3343        } else {
3344                spin_unlock(&OCFS2_I(dir)->ip_lock);
3345                credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
3346        }
3347
3348do_extend:
3349        if (ocfs2_dir_indexed(dir))
3350                credits++; /* For attaching the new dirent block to the
3351                            * dx_root */
3352
3353        handle = ocfs2_start_trans(osb, credits);
3354        if (IS_ERR(handle)) {
3355                status = PTR_ERR(handle);
3356                handle = NULL;
3357                mlog_errno(status);
3358                goto bail;
3359        }
3360
3361        status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
3362                                     data_ac, meta_ac, &new_bh);
3363        if (status < 0) {
3364                mlog_errno(status);
3365                goto bail;
3366        }
3367
3368        ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
3369
3370        status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
3371                                         OCFS2_JOURNAL_ACCESS_CREATE);
3372        if (status < 0) {
3373                mlog_errno(status);
3374                goto bail;
3375        }
3376        memset(new_bh->b_data, 0, sb->s_blocksize);
3377
3378        de = (struct ocfs2_dir_entry *) new_bh->b_data;
3379        de->inode = 0;
3380        if (ocfs2_supports_dir_trailer(dir)) {
3381                de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
3382
3383                ocfs2_init_dir_trailer(dir, new_bh, le16_to_cpu(de->rec_len));
3384
3385                if (ocfs2_dir_indexed(dir)) {
3386                        status = ocfs2_dx_dir_link_trailer(dir, handle,
3387                                                           dx_root_bh, new_bh);
3388                        if (status) {
3389                                mlog_errno(status);
3390                                goto bail;
3391                        }
3392                }
3393        } else {
3394                de->rec_len = cpu_to_le16(sb->s_blocksize);
3395        }
3396        ocfs2_journal_dirty(handle, new_bh);
3397
3398        dir_i_size += dir->i_sb->s_blocksize;
3399        i_size_write(dir, dir_i_size);
3400        dir->i_blocks = ocfs2_inode_sector_count(dir);
3401        status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
3402        if (status < 0) {
3403                mlog_errno(status);
3404                goto bail;
3405        }
3406
3407bail_bh:
3408        *new_de_bh = new_bh;
3409        get_bh(*new_de_bh);
3410bail:
3411        if (handle)
3412                ocfs2_commit_trans(osb, handle);
3413        if (drop_alloc_sem)
3414                up_write(&OCFS2_I(dir)->ip_alloc_sem);
3415
3416        if (data_ac)
3417                ocfs2_free_alloc_context(data_ac);
3418        if (meta_ac)
3419                ocfs2_free_alloc_context(meta_ac);
3420
3421        brelse(new_bh);
3422
3423        return status;
3424}
3425
3426static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
3427                                   const char *name, int namelen,
3428                                   struct buffer_head **ret_de_bh,
3429                                   unsigned int *blocks_wanted)
3430{
3431        int ret;
3432        struct super_block *sb = dir->i_sb;
3433        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3434        struct ocfs2_dir_entry *de, *last_de = NULL;
3435        char *de_buf, *limit;
3436        unsigned long offset = 0;
3437        unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
3438
3439        /*
3440         * This calculates how many free bytes we'd have in block zero, should
3441         * this function force expansion to an extent tree.
3442         */
3443        if (ocfs2_new_dir_wants_trailer(dir))
3444                free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
3445        else
3446                free_space = dir->i_sb->s_blocksize - i_size_read(dir);
3447
3448        de_buf = di->id2.i_data.id_data;
3449        limit = de_buf + i_size_read(dir);
3450        rec_len = OCFS2_DIR_REC_LEN(namelen);
3451
3452        while (de_buf < limit) {
3453                de = (struct ocfs2_dir_entry *)de_buf;
3454
3455                if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
3456                        ret = -ENOENT;
3457                        goto out;
3458                }
3459                if (ocfs2_match(namelen, name, de)) {
3460                        ret = -EEXIST;
3461                        goto out;
3462                }
3463                /*
3464                 * No need to check for a trailing dirent record here as
3465                 * they're not used for inline dirs.
3466                 */
3467
3468                if (ocfs2_dirent_would_fit(de, rec_len)) {
3469                        /* Ok, we found a spot. Return this bh and let
3470                         * the caller actually fill it in. */
3471                        *ret_de_bh = di_bh;
3472                        get_bh(*ret_de_bh);
3473                        ret = 0;
3474                        goto out;
3475                }
3476
3477                last_de = de;
3478                de_buf += le16_to_cpu(de->rec_len);
3479                offset += le16_to_cpu(de->rec_len);
3480        }
3481
3482        /*
3483         * We're going to require expansion of the directory - figure
3484         * out how many blocks we'll need so that a place for the
3485         * dirent can be found.
3486         */
3487        *blocks_wanted = 1;
3488        new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
3489        if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
3490                *blocks_wanted = 2;
3491
3492        ret = -ENOSPC;
3493out:
3494        return ret;
3495}
3496
3497static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
3498                                   int namelen, struct buffer_head **ret_de_bh)
3499{
3500        unsigned long offset;
3501        struct buffer_head *bh = NULL;
3502        unsigned short rec_len;
3503        struct ocfs2_dir_entry *de;
3504        struct super_block *sb = dir->i_sb;
3505        int status;
3506        int blocksize = dir->i_sb->s_blocksize;
3507
3508        status = ocfs2_read_dir_block(dir, 0, &bh, 0);
3509        if (status) {
3510                mlog_errno(status);
3511                goto bail;
3512        }
3513
3514        rec_len = OCFS2_DIR_REC_LEN(namelen);
3515        offset = 0;
3516        de = (struct ocfs2_dir_entry *) bh->b_data;
3517        while (1) {
3518                if ((char *)de >= sb->s_blocksize + bh->b_data) {
3519                        brelse(bh);
3520                        bh = NULL;
3521
3522                        if (i_size_read(dir) <= offset) {
3523                                /*
3524                                 * Caller will have to expand this
3525                                 * directory.
3526                                 */
3527                                status = -ENOSPC;
3528                                goto bail;
3529                        }
3530                        status = ocfs2_read_dir_block(dir,
3531                                             offset >> sb->s_blocksize_bits,
3532                                             &bh, 0);
3533                        if (status) {
3534                                mlog_errno(status);
3535                                goto bail;
3536                        }
3537                        /* move to next block */
3538                        de = (struct ocfs2_dir_entry *) bh->b_data;
3539                }
3540                if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
3541                        status = -ENOENT;
3542                        goto bail;
3543                }
3544                if (ocfs2_match(namelen, name, de)) {
3545                        status = -EEXIST;
3546                        goto bail;
3547                }
3548
3549                if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
3550                                           blocksize))
3551                        goto next;
3552
3553                if (ocfs2_dirent_would_fit(de, rec_len)) {
3554                        /* Ok, we found a spot. Return this bh and let
3555                         * the caller actually fill it in. */
3556                        *ret_de_bh = bh;
3557                        get_bh(*ret_de_bh);
3558                        status = 0;
3559                        goto bail;
3560                }
3561next:
3562                offset += le16_to_cpu(de->rec_len);
3563                de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
3564        }
3565
3566        status = 0;
3567bail:
3568        brelse(bh);
3569        if (status)
3570                mlog_errno(status);
3571
3572        return status;
3573}
3574
3575static int dx_leaf_sort_cmp(const void *a, const void *b)
3576{
3577        const struct ocfs2_dx_entry *entry1 = a;
3578        const struct ocfs2_dx_entry *entry2 = b;
3579        u32 major_hash1 = le32_to_cpu(entry1->dx_major_hash);
3580        u32 major_hash2 = le32_to_cpu(entry2->dx_major_hash);
3581        u32 minor_hash1 = le32_to_cpu(entry1->dx_minor_hash);
3582        u32 minor_hash2 = le32_to_cpu(entry2->dx_minor_hash);
3583
3584        if (major_hash1 > major_hash2)
3585                return 1;
3586        if (major_hash1 < major_hash2)
3587                return -1;
3588
3589        /*
3590         * It is not strictly necessary to sort by minor
3591         */
3592        if (minor_hash1 > minor_hash2)
3593                return 1;
3594        if (minor_hash1 < minor_hash2)
3595                return -1;
3596        return 0;
3597}
3598
3599static void dx_leaf_sort_swap(void *a, void *b, int size)
3600{
3601        struct ocfs2_dx_entry *entry1 = a;
3602        struct ocfs2_dx_entry *entry2 = b;
3603        struct ocfs2_dx_entry tmp;
3604
3605        BUG_ON(size != sizeof(*entry1));
3606
3607        tmp = *entry1;
3608        *entry1 = *entry2;
3609        *entry2 = tmp;
3610}
3611
3612static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf)
3613{
3614        struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3615        int i, num = le16_to_cpu(dl_list->de_num_used);
3616
3617        for (i = 0; i < (num - 1); i++) {
3618                if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) !=
3619                    le32_to_cpu(dl_list->de_entries[i + 1].dx_major_hash))
3620                        return 0;
3621        }
3622
3623        return 1;
3624}
3625
3626/*
3627 * Find the optimal value to split this leaf on. This expects the leaf
3628 * entries to be in sorted order.
3629 *
3630 * leaf_cpos is the cpos of the leaf we're splitting. insert_hash is
3631 * the hash we want to insert.
3632 *
3633 * This function is only concerned with the major hash - that which
3634 * determines which cluster an item belongs to.
3635 */
3636static int ocfs2_dx_dir_find_leaf_split(struct ocfs2_dx_leaf *dx_leaf,
3637                                        u32 leaf_cpos, u32 insert_hash,
3638                                        u32 *split_hash)
3639{
3640        struct ocfs2_dx_entry_list *dl_list = &dx_leaf->dl_list;
3641        int i, num_used = le16_to_cpu(dl_list->de_num_used);
3642        int allsame;
3643
3644        /*
3645         * There's a couple rare, but nasty corner cases we have to
3646         * check for here. All of them involve a leaf where all value
3647         * have the same hash, which is what we look for first.
3648         *
3649         * Most of the time, all of the above is false, and we simply
3650         * pick the median value for a split.
3651         */
3652        allsame = ocfs2_dx_leaf_same_major(dx_leaf);
3653        if (allsame) {
3654                u32 val = le32_to_cpu(dl_list->de_entries[0].dx_major_hash);
3655
3656                if (val == insert_hash) {
3657                        /*
3658                         * No matter where we would choose to split,
3659                         * the new entry would want to occupy the same
3660                         * block as these. Since there's no space left
3661                         * in their existing block, we know there
3662                         * won't be space after the split.
3663                         */
3664                        return -ENOSPC;
3665                }
3666
3667                if (val == leaf_cpos) {
3668                        /*
3669                         * Because val is the same as leaf_cpos (which
3670                         * is the smallest value this leaf can have),
3671                         * yet is not equal to insert_hash, then we
3672                         * know that insert_hash *must* be larger than
3673                         * val (and leaf_cpos). At least cpos+1 in value.
3674                         *
3675                         * We also know then, that there cannot be an
3676                         * adjacent extent (otherwise we'd be looking
3677                         * at it). Choosing this value gives us a
3678                         * chance to get some contiguousness.
3679                         */
3680                        *split_hash = leaf_cpos + 1;
3681                        return 0;
3682                }
3683
3684                if (val > insert_hash) {
3685                        /*
3686                         * val can not be the same as insert hash, and
3687                         * also must be larger than leaf_cpos. Also,
3688                         * we know that there can't be a leaf between
3689                         * cpos and val, otherwise the entries with
3690                         * hash 'val' would be there.
3691                         */
3692                        *split_hash = val;
3693                        return 0;
3694                }
3695
3696                *split_hash = insert_hash;
3697                return 0;
3698        }
3699
3700        /*
3701         * Since the records are sorted and the checks above
3702         * guaranteed that not all records in this block are the same,
3703         * we simple travel forward, from the median, and pick the 1st
3704         * record whose value is larger than leaf_cpos.
3705         */
3706        for (i = (num_used / 2); i < num_used; i++)
3707                if (le32_to_cpu(dl_list->de_entries[i].dx_major_hash) >
3708                    leaf_cpos)
3709                        break;
3710
3711        BUG_ON(i == num_used); /* Should be impossible */
3712        *split_hash = le32_to_cpu(dl_list->de_entries[i].dx_major_hash);
3713        return 0;
3714}
3715
3716/*
3717 * Transfer all entries in orig_dx_leaves whose major hash is equal to or
3718 * larger than split_hash into new_dx_leaves. We use a temporary
3719 * buffer (tmp_dx_leaf) to make the changes to the original leaf blocks.
3720 *
3721 * Since the block offset inside a leaf (cluster) is a constant mask
3722 * of minor_hash, we can optimize - an item at block offset X within
3723 * the original cluster, will be at offset X within the new cluster.
3724 */
3725static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3726                                       handle_t *handle,
3727                                       struct ocfs2_dx_leaf *tmp_dx_leaf,
3728                                       struct buffer_head **orig_dx_leaves,
3729                                       struct buffer_head **new_dx_leaves,
3730                                       int num_dx_leaves)
3731{
3732        int i, j, num_used;
3733        u32 major_hash;
3734        struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3735        struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list;
3736        struct ocfs2_dx_entry *dx_entry;
3737
3738        tmp_list = &tmp_dx_leaf->dl_list;
3739
3740        for (i = 0; i < num_dx_leaves; i++) {
3741                orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3742                orig_list = &orig_dx_leaf->dl_list;
3743                new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3744                new_list = &new_dx_leaf->dl_list;
3745
3746                num_used = le16_to_cpu(orig_list->de_num_used);
3747
3748                memcpy(tmp_dx_leaf, orig_dx_leaf, dir->i_sb->s_blocksize);
3749                tmp_list->de_num_used = cpu_to_le16(0);
3750                memset(&tmp_list->de_entries, 0, sizeof(*dx_entry)*num_used);
3751
3752                for (j = 0; j < num_used; j++) {
3753                        dx_entry = &orig_list->de_entries[j];
3754                        major_hash = le32_to_cpu(dx_entry->dx_major_hash);
3755                        if (major_hash >= split_hash)
3756                                ocfs2_dx_dir_leaf_insert_tail(new_dx_leaf,
3757                                                              dx_entry);
3758                        else
3759                                ocfs2_dx_dir_leaf_insert_tail(tmp_dx_leaf,
3760                                                              dx_entry);
3761                }
3762                memcpy(orig_dx_leaf, tmp_dx_leaf, dir->i_sb->s_blocksize);
3763
3764                ocfs2_journal_dirty(handle, orig_dx_leaves[i]);
3765                ocfs2_journal_dirty(handle, new_dx_leaves[i]);
3766        }
3767}
3768
3769static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
3770                                          struct ocfs2_dx_root_block *dx_root)
3771{
3772        int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
3773
3774        credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
3775        credits += ocfs2_quota_trans_credits(osb->sb);
3776        return credits;
3777}
3778
3779/*
3780 * Find the median value in dx_leaf_bh and allocate a new leaf to move
3781 * half our entries into.
3782 */
3783static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3784                                  struct buffer_head *dx_root_bh,
3785                                  struct buffer_head *dx_leaf_bh,
3786                                  struct ocfs2_dx_hinfo *hinfo, u32 leaf_cpos,
3787                                  u64 leaf_blkno)
3788{
3789        struct ocfs2_dx_leaf *dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
3790        int credits, ret, i, num_used, did_quota = 0;
3791        u32 cpos, split_hash, insert_hash = hinfo->major_hash;
3792        u64 orig_leaves_start;
3793        int num_dx_leaves;
3794        struct buffer_head **orig_dx_leaves = NULL;
3795        struct buffer_head **new_dx_leaves = NULL;
3796        struct ocfs2_alloc_context *data_ac = NULL, *meta_ac = NULL;
3797        struct ocfs2_extent_tree et;
3798        handle_t *handle = NULL;
3799        struct ocfs2_dx_root_block *dx_root;
3800        struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3801
3802        trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
3803                                     (unsigned long long)leaf_blkno,
3804                                     insert_hash);
3805
3806        ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
3807
3808        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3809        /*
3810         * XXX: This is a rather large limit. We should use a more
3811         * realistic value.
3812         */
3813        if (le32_to_cpu(dx_root->dr_clusters) == UINT_MAX)
3814                return -ENOSPC;
3815
3816        num_used = le16_to_cpu(dx_leaf->dl_list.de_num_used);
3817        if (num_used < le16_to_cpu(dx_leaf->dl_list.de_count)) {
3818                mlog(ML_ERROR, "DX Dir: %llu, Asked to rebalance empty leaf: "
3819                     "%llu, %d\n", (unsigned long long)OCFS2_I(dir)->ip_blkno,
3820                     (unsigned long long)leaf_blkno, num_used);
3821                ret = -EIO;
3822                goto out;
3823        }
3824
3825        orig_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
3826        if (!orig_dx_leaves) {
3827                ret = -ENOMEM;
3828                mlog_errno(ret);
3829                goto out;
3830        }
3831
3832        new_dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, NULL);
3833        if (!new_dx_leaves) {
3834                ret = -ENOMEM;
3835                mlog_errno(ret);
3836                goto out;
3837        }
3838
3839        ret = ocfs2_lock_allocators(dir, &et, 1, 0, &data_ac, &meta_ac);
3840        if (ret) {
3841                if (ret != -ENOSPC)
3842                        mlog_errno(ret);
3843                goto out;
3844        }
3845
3846        credits = ocfs2_dx_dir_rebalance_credits(osb, dx_root);
3847        handle = ocfs2_start_trans(osb, credits);
3848        if (IS_ERR(handle)) {
3849                ret = PTR_ERR(handle);
3850                handle = NULL;
3851                mlog_errno(ret);
3852                goto out;
3853        }
3854
3855        ret = dquot_alloc_space_nodirty(dir,
3856                                       ocfs2_clusters_to_bytes(dir->i_sb, 1));
3857        if (ret)
3858                goto out_commit;
3859        did_quota = 1;
3860
3861        ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
3862                                      OCFS2_JOURNAL_ACCESS_WRITE);
3863        if (ret) {
3864                mlog_errno(ret);
3865                goto out_commit;
3866        }
3867
3868        /*
3869         * This block is changing anyway, so we can sort it in place.
3870         */
3871        sort(dx_leaf->dl_list.de_entries, num_used,
3872             sizeof(struct ocfs2_dx_entry), dx_leaf_sort_cmp,
3873             dx_leaf_sort_swap);
3874
3875        ocfs2_journal_dirty(handle, dx_leaf_bh);
3876
3877        ret = ocfs2_dx_dir_find_leaf_split(dx_leaf, leaf_cpos, insert_hash,
3878                                           &split_hash);
3879        if (ret) {
3880                mlog_errno(ret);
3881                goto  out_commit;
3882        }
3883
3884        trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
3885
3886        /*
3887         * We have to carefully order operations here. There are items
3888         * which want to be in the new cluster before insert, but in
3889         * order to put those items in the new cluster, we alter the
3890         * old cluster. A failure to insert gets nasty.
3891         *
3892         * So, start by reserving writes to the old
3893         * cluster. ocfs2_dx_dir_new_cluster will reserve writes on
3894         * the new cluster for us, before inserting it. The insert
3895         * won't happen if there's an error before that. Once the
3896         * insert is done then, we can transfer from one leaf into the
3897         * other without fear of hitting any error.
3898         */
3899
3900        /*
3901         * The leaf transfer wants some scratch space so that we don't
3902         * wind up doing a bunch of expensive memmove().
3903         */
3904        tmp_dx_leaf = kmalloc(osb->sb->s_blocksize, GFP_NOFS);
3905        if (!tmp_dx_leaf) {
3906                ret = -ENOMEM;
3907                mlog_errno(ret);
3908                goto out_commit;
3909        }
3910
3911        orig_leaves_start = ocfs2_block_to_cluster_start(dir->i_sb, leaf_blkno);
3912        ret = ocfs2_read_dx_leaves(dir, orig_leaves_start, num_dx_leaves,
3913                                   orig_dx_leaves);
3914        if (ret) {
3915                mlog_errno(ret);
3916                goto out_commit;
3917        }
3918
3919        cpos = split_hash;
3920        ret = ocfs2_dx_dir_new_cluster(dir, &et, cpos, handle,
3921                                       data_ac, meta_ac, new_dx_leaves,
3922                                       num_dx_leaves);
3923        if (ret) {
3924                mlog_errno(ret);
3925                goto out_commit;
3926        }
3927
3928        for (i = 0; i < num_dx_leaves; i++) {
3929                ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3930                                              orig_dx_leaves[i],
3931                                              OCFS2_JOURNAL_ACCESS_WRITE);
3932                if (ret) {
3933                        mlog_errno(ret);
3934                        goto out_commit;
3935                }
3936
3937                ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3938                                              new_dx_leaves[i],
3939                                              OCFS2_JOURNAL_ACCESS_WRITE);
3940                if (ret) {
3941                        mlog_errno(ret);
3942                        goto out_commit;
3943                }
3944        }
3945
3946        ocfs2_dx_dir_transfer_leaf(dir, split_hash, handle, tmp_dx_leaf,
3947                                   orig_dx_leaves, new_dx_leaves, num_dx_leaves);
3948
3949out_commit:
3950        if (ret < 0 && did_quota)
3951                dquot_free_space_nodirty(dir,
3952                                ocfs2_clusters_to_bytes(dir->i_sb, 1));
3953
3954        ocfs2_commit_trans(osb, handle);
3955
3956out:
3957        if (orig_dx_leaves || new_dx_leaves) {
3958                for (i = 0; i < num_dx_leaves; i++) {
3959                        if (orig_dx_leaves)
3960                                brelse(orig_dx_leaves[i]);
3961                        if (new_dx_leaves)
3962                                brelse(new_dx_leaves[i]);
3963                }
3964                kfree(orig_dx_leaves);
3965                kfree(new_dx_leaves);
3966        }
3967
3968        if (meta_ac)
3969                ocfs2_free_alloc_context(meta_ac);
3970        if (data_ac)
3971                ocfs2_free_alloc_context(data_ac);
3972
3973        kfree(tmp_dx_leaf);
3974        return ret;
3975}
3976
3977static int ocfs2_find_dir_space_dx(struct ocfs2_super *osb, struct inode *dir,
3978                                   struct buffer_head *di_bh,
3979                                   struct buffer_head *dx_root_bh,
3980                                   const char *name, int namelen,
3981                                   struct ocfs2_dir_lookup_result *lookup)
3982{
3983        int ret, rebalanced = 0;
3984        struct ocfs2_dx_root_block *dx_root;
3985        struct buffer_head *dx_leaf_bh = NULL;
3986        struct ocfs2_dx_leaf *dx_leaf;
3987        u64 blkno;
3988        u32 leaf_cpos;
3989
3990        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3991
3992restart_search:
3993        ret = ocfs2_dx_dir_lookup(dir, &dx_root->dr_list, &lookup->dl_hinfo,
3994                                  &leaf_cpos, &blkno);
3995        if (ret) {
3996                mlog_errno(ret);
3997                goto out;
3998        }
3999
4000        ret = ocfs2_read_dx_leaf(dir, blkno, &dx_leaf_bh);
4001        if (ret) {
4002                mlog_errno(ret);
4003                goto out;
4004        }
4005
4006        dx_leaf = (struct ocfs2_dx_leaf *)dx_leaf_bh->b_data;
4007
4008        if (le16_to_cpu(dx_leaf->dl_list.de_num_used) >=
4009            le16_to_cpu(dx_leaf->dl_list.de_count)) {
4010                if (rebalanced) {
4011                        /*
4012                         * Rebalancing should have provided us with
4013                         * space in an appropriate leaf.
4014                         *
4015                         * XXX: Is this an abnormal condition then?
4016                         * Should we print a message here?
4017                         */
4018                        ret = -ENOSPC;
4019                        goto out;
4020                }
4021
4022                ret = ocfs2_dx_dir_rebalance(osb, dir, dx_root_bh, dx_leaf_bh,
4023                                             &lookup->dl_hinfo, leaf_cpos,
4024                                             blkno);
4025                if (ret) {
4026                        if (ret != -ENOSPC)
4027                                mlog_errno(ret);
4028                        goto out;
4029                }
4030
4031                /*
4032                 * Restart the lookup. The rebalance might have
4033                 * changed which block our item fits into. Mark our
4034                 * progress, so we only execute this once.
4035                 */
4036                brelse(dx_leaf_bh);
4037                dx_leaf_bh = NULL;
4038                rebalanced = 1;
4039                goto restart_search;
4040        }
4041
4042        lookup->dl_dx_leaf_bh = dx_leaf_bh;
4043        dx_leaf_bh = NULL;
4044
4045out:
4046        brelse(dx_leaf_bh);
4047        return ret;
4048}
4049
4050static int ocfs2_search_dx_free_list(struct inode *dir,
4051                                     struct buffer_head *dx_root_bh,
4052                                     int namelen,
4053                                     struct ocfs2_dir_lookup_result *lookup)
4054{
4055        int ret = -ENOSPC;
4056        struct buffer_head *leaf_bh = NULL, *prev_leaf_bh = NULL;
4057        struct ocfs2_dir_block_trailer *db;
4058        u64 next_block;
4059        int rec_len = OCFS2_DIR_REC_LEN(namelen);
4060        struct ocfs2_dx_root_block *dx_root;
4061
4062        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4063        next_block = le64_to_cpu(dx_root->dr_free_blk);
4064
4065        while (next_block) {
4066                brelse(prev_leaf_bh);
4067                prev_leaf_bh = leaf_bh;
4068                leaf_bh = NULL;
4069
4070                ret = ocfs2_read_dir_block_direct(dir, next_block, &leaf_bh);
4071                if (ret) {
4072                        mlog_errno(ret);
4073                        goto out;
4074                }
4075
4076                db = ocfs2_trailer_from_bh(leaf_bh, dir->i_sb);
4077                if (rec_len <= le16_to_cpu(db->db_free_rec_len)) {
4078                        lookup->dl_leaf_bh = leaf_bh;
4079                        lookup->dl_prev_leaf_bh = prev_leaf_bh;
4080                        leaf_bh = NULL;
4081                        prev_leaf_bh = NULL;
4082                        break;
4083                }
4084
4085                next_block = le64_to_cpu(db->db_free_next);
4086        }
4087
4088        if (!next_block)
4089                ret = -ENOSPC;
4090
4091out:
4092
4093        brelse(leaf_bh);
4094        brelse(prev_leaf_bh);
4095        return ret;
4096}
4097
4098static int ocfs2_expand_inline_dx_root(struct inode *dir,
4099                                       struct buffer_head *dx_root_bh)
4100{
4101        int ret, num_dx_leaves, i, j, did_quota = 0;
4102        struct buffer_head **dx_leaves = NULL;
4103        struct ocfs2_extent_tree et;
4104        u64 insert_blkno;
4105        struct ocfs2_alloc_context *data_ac = NULL;
4106        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4107        handle_t *handle = NULL;
4108        struct ocfs2_dx_root_block *dx_root;
4109        struct ocfs2_dx_entry_list *entry_list;
4110        struct ocfs2_dx_entry *dx_entry;
4111        struct ocfs2_dx_leaf *target_leaf;
4112
4113        ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
4114        if (ret) {
4115                mlog_errno(ret);
4116                goto out;
4117        }
4118
4119        dx_leaves = ocfs2_dx_dir_kmalloc_leaves(osb->sb, &num_dx_leaves);
4120        if (!dx_leaves) {
4121                ret = -ENOMEM;
4122                mlog_errno(ret);
4123                goto out;
4124        }
4125
4126        handle = ocfs2_start_trans(osb, ocfs2_calc_dxi_expand_credits(osb->sb));
4127        if (IS_ERR(handle)) {
4128                ret = PTR_ERR(handle);
4129                mlog_errno(ret);
4130                goto out;
4131        }
4132
4133        ret = dquot_alloc_space_nodirty(dir,
4134                                       ocfs2_clusters_to_bytes(osb->sb, 1));
4135        if (ret)
4136                goto out_commit;
4137        did_quota = 1;
4138
4139        /*
4140         * We do this up front, before the allocation, so that a
4141         * failure to add the dx_root_bh to the journal won't result
4142         * us losing clusters.
4143         */
4144        ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
4145                                      OCFS2_JOURNAL_ACCESS_WRITE);
4146        if (ret) {
4147                mlog_errno(ret);
4148                goto out_commit;
4149        }
4150
4151        ret = __ocfs2_dx_dir_new_cluster(dir, 0, handle, data_ac, dx_leaves,
4152                                         num_dx_leaves, &insert_blkno);
4153        if (ret) {
4154                mlog_errno(ret);
4155                goto out_commit;
4156        }
4157
4158        /*
4159         * Transfer the entries from our dx_root into the appropriate
4160         * block
4161         */
4162        dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4163        entry_list = &dx_root->dr_entries;
4164
4165        for (i = 0; i < le16_to_cpu(entry_list->de_num_used); i++) {
4166                dx_entry = &entry_list->de_entries[i];
4167
4168                j = __ocfs2_dx_dir_hash_idx(osb,
4169                                            le32_to_cpu(dx_entry->dx_minor_hash));
4170                target_leaf = (struct ocfs2_dx_leaf *)dx_leaves[j]->b_data;
4171
4172                ocfs2_dx_dir_leaf_insert_tail(target_leaf, dx_entry);
4173
4174                /* Each leaf has been passed to the journal already
4175                 * via __ocfs2_dx_dir_new_cluster() */
4176        }
4177
4178        dx_root->dr_flags &= ~OCFS2_DX_FLAG_INLINE;
4179        memset(&dx_root->dr_list, 0, osb->sb->s_blocksize -
4180               offsetof(struct ocfs2_dx_root_block, dr_list));
4181        dx_root->dr_list.l_count =
4182                cpu_to_le16(ocfs2_extent_recs_per_dx_root(osb->sb));
4183
4184        /* This should never fail considering we start with an empty
4185         * dx_root. */
4186        ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4187        ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
4188        if (ret)
4189                mlog_errno(ret);
4190        did_quota = 0;
4191
4192        ocfs2_journal_dirty(handle, dx_root_bh);
4193
4194out_commit:
4195        if (ret < 0 && did_quota)
4196                dquot_free_space_nodirty(dir,
4197                                          ocfs2_clusters_to_bytes(dir->i_sb, 1));
4198
4199        ocfs2_commit_trans(osb, handle);
4200
4201out:
4202        if (data_ac)
4203                ocfs2_free_alloc_context(data_ac);
4204
4205        if (dx_leaves) {
4206                for (i = 0; i < num_dx_leaves; i++)
4207                        brelse(dx_leaves[i]);
4208                kfree(dx_leaves);
4209        }
4210        return ret;
4211}
4212
4213static int ocfs2_inline_dx_has_space(struct buffer_head *dx_root_bh)
4214{
4215        struct ocfs2_dx_root_block *dx_root;
4216        struct ocfs2_dx_entry_list *entry_list;
4217
4218        dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4219        entry_list = &dx_root->dr_entries;
4220
4221        if (le16_to_cpu(entry_list->de_num_used) >=
4222            le16_to_cpu(entry_list->de_count))
4223                return -ENOSPC;
4224
4225        return 0;
4226}
4227
4228static int ocfs2_prepare_dx_dir_for_insert(struct inode *dir,
4229                                           struct buffer_head *di_bh,
4230                                           const char *name,
4231                                           int namelen,
4232                                           struct ocfs2_dir_lookup_result *lookup)
4233{
4234        int ret, free_dx_root = 1;
4235        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4236        struct buffer_head *dx_root_bh = NULL;
4237        struct buffer_head *leaf_bh = NULL;
4238        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4239        struct ocfs2_dx_root_block *dx_root;
4240
4241        ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4242        if (ret) {
4243                mlog_errno(ret);
4244                goto out;
4245        }
4246
4247        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4248        if (le32_to_cpu(dx_root->dr_num_entries) == OCFS2_DX_ENTRIES_MAX) {
4249                ret = -ENOSPC;
4250                mlog_errno(ret);
4251                goto out;
4252        }
4253
4254        if (ocfs2_dx_root_inline(dx_root)) {
4255                ret = ocfs2_inline_dx_has_space(dx_root_bh);
4256
4257                if (ret == 0)
4258                        goto search_el;
4259
4260                /*
4261                 * We ran out of room in the root block. Expand it to
4262                 * an extent, then allow ocfs2_find_dir_space_dx to do
4263                 * the rest.
4264                 */
4265                ret = ocfs2_expand_inline_dx_root(dir, dx_root_bh);
4266                if (ret) {
4267                        mlog_errno(ret);
4268                        goto out;
4269                }
4270        }
4271
4272        /*
4273         * Insert preparation for an indexed directory is split into two
4274         * steps. The call to find_dir_space_dx reserves room in the index for
4275         * an additional item. If we run out of space there, it's a real error
4276         * we can't continue on.
4277         */
4278        ret = ocfs2_find_dir_space_dx(osb, dir, di_bh, dx_root_bh, name,
4279                                      namelen, lookup);
4280        if (ret) {
4281                mlog_errno(ret);
4282                goto out;
4283        }
4284
4285search_el:
4286        /*
4287         * Next, we need to find space in the unindexed tree. This call
4288         * searches using the free space linked list. If the unindexed tree
4289         * lacks sufficient space, we'll expand it below. The expansion code
4290         * is smart enough to add any new blocks to the free space list.
4291         */
4292        ret = ocfs2_search_dx_free_list(dir, dx_root_bh, namelen, lookup);
4293        if (ret && ret != -ENOSPC) {
4294                mlog_errno(ret);
4295                goto out;
4296        }
4297
4298        /* Do this up here - ocfs2_extend_dir might need the dx_root */
4299        lookup->dl_dx_root_bh = dx_root_bh;
4300        free_dx_root = 0;
4301
4302        if (ret == -ENOSPC) {
4303                ret = ocfs2_extend_dir(osb, dir, di_bh, 1, lookup, &leaf_bh);
4304
4305                if (ret) {
4306                        mlog_errno(ret);
4307                        goto out;
4308                }
4309
4310                /*
4311                 * We make the assumption here that new leaf blocks are added
4312                 * to the front of our free list.
4313                 */
4314                lookup->dl_prev_leaf_bh = NULL;
4315                lookup->dl_leaf_bh = leaf_bh;
4316        }
4317
4318out:
4319        if (free_dx_root)
4320                brelse(dx_root_bh);
4321        return ret;
4322}
4323
4324/*
4325 * Get a directory ready for insert. Any directory allocation required
4326 * happens here. Success returns zero, and enough context in the dir
4327 * lookup result that ocfs2_add_entry() will be able complete the task
4328 * with minimal performance impact.
4329 */
4330int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
4331                                 struct inode *dir,
4332                                 struct buffer_head *parent_fe_bh,
4333                                 const char *name,
4334                                 int namelen,
4335                                 struct ocfs2_dir_lookup_result *lookup)
4336{
4337        int ret;
4338        unsigned int blocks_wanted = 1;
4339        struct buffer_head *bh = NULL;
4340
4341        trace_ocfs2_prepare_dir_for_insert(
4342                (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
4343
4344        if (!namelen) {
4345                ret = -EINVAL;
4346                mlog_errno(ret);
4347                goto out;
4348        }
4349
4350        /*
4351         * Do this up front to reduce confusion.
4352         *
4353         * The directory might start inline, then be turned into an
4354         * indexed one, in which case we'd need to hash deep inside
4355         * ocfs2_find_dir_space_id(). Since
4356         * ocfs2_prepare_dx_dir_for_insert() also needs this hash
4357         * done, there seems no point in spreading out the calls. We
4358         * can optimize away the case where the file system doesn't
4359         * support indexing.
4360         */
4361        if (ocfs2_supports_indexed_dirs(osb))
4362                ocfs2_dx_dir_name_hash(dir, name, namelen, &lookup->dl_hinfo);
4363
4364        if (ocfs2_dir_indexed(dir)) {
4365                ret = ocfs2_prepare_dx_dir_for_insert(dir, parent_fe_bh,
4366                                                      name, namelen, lookup);
4367                if (ret)
4368                        mlog_errno(ret);
4369                goto out;
4370        }
4371
4372        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4373                ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
4374                                              namelen, &bh, &blocks_wanted);
4375        } else
4376                ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
4377
4378        if (ret && ret != -ENOSPC) {
4379                mlog_errno(ret);
4380                goto out;
4381        }
4382
4383        if (ret == -ENOSPC) {
4384                /*
4385                 * We have to expand the directory to add this name.
4386                 */
4387                BUG_ON(bh);
4388
4389                ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
4390                                       lookup, &bh);
4391                if (ret) {
4392                        if (ret != -ENOSPC)
4393                                mlog_errno(ret);
4394                        goto out;
4395                }
4396
4397                BUG_ON(!bh);
4398        }
4399
4400        lookup->dl_leaf_bh = bh;
4401        bh = NULL;
4402out:
4403        brelse(bh);
4404        return ret;
4405}
4406
4407static int ocfs2_dx_dir_remove_index(struct inode *dir,
4408                                     struct buffer_head *di_bh,
4409                                     struct buffer_head *dx_root_bh)
4410{
4411        int ret;
4412        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4413        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4414        struct ocfs2_dx_root_block *dx_root;
4415        struct inode *dx_alloc_inode = NULL;
4416        struct buffer_head *dx_alloc_bh = NULL;
4417        handle_t *handle;
4418        u64 blk;
4419        u16 bit;
4420        u64 bg_blkno;
4421
4422        dx_root = (struct ocfs2_dx_root_block *) dx_root_bh->b_data;
4423
4424        dx_alloc_inode = ocfs2_get_system_file_inode(osb,
4425                                        EXTENT_ALLOC_SYSTEM_INODE,
4426                                        le16_to_cpu(dx_root->dr_suballoc_slot));
4427        if (!dx_alloc_inode) {
4428                ret = -ENOMEM;
4429                mlog_errno(ret);
4430                goto out;
4431        }
4432        mutex_lock(&dx_alloc_inode->i_mutex);
4433
4434        ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
4435        if (ret) {
4436                mlog_errno(ret);
4437                goto out_mutex;
4438        }
4439
4440        handle = ocfs2_start_trans(osb, OCFS2_DX_ROOT_REMOVE_CREDITS);
4441        if (IS_ERR(handle)) {
4442                ret = PTR_ERR(handle);
4443                mlog_errno(ret);
4444                goto out_unlock;
4445        }
4446
4447        ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
4448                                      OCFS2_JOURNAL_ACCESS_WRITE);
4449        if (ret) {
4450                mlog_errno(ret);
4451                goto out_commit;
4452        }
4453
4454        spin_lock(&OCFS2_I(dir)->ip_lock);
4455        OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL;
4456        di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
4457        spin_unlock(&OCFS2_I(dir)->ip_lock);
4458        di->i_dx_root = cpu_to_le64(0ULL);
4459
4460        ocfs2_journal_dirty(handle, di_bh);
4461
4462        blk = le64_to_cpu(dx_root->dr_blkno);
4463        bit = le16_to_cpu(dx_root->dr_suballoc_bit);
4464        if (dx_root->dr_suballoc_loc)
4465                bg_blkno = le64_to_cpu(dx_root->dr_suballoc_loc);
4466        else
4467                bg_blkno = ocfs2_which_suballoc_group(blk, bit);
4468        ret = ocfs2_free_suballoc_bits(handle, dx_alloc_inode, dx_alloc_bh,
4469                                       bit, bg_blkno, 1);
4470        if (ret)
4471                mlog_errno(ret);
4472
4473out_commit:
4474        ocfs2_commit_trans(osb, handle);
4475
4476out_unlock:
4477        ocfs2_inode_unlock(dx_alloc_inode, 1);
4478
4479out_mutex:
4480        mutex_unlock(&dx_alloc_inode->i_mutex);
4481        brelse(dx_alloc_bh);
4482out:
4483        iput(dx_alloc_inode);
4484        return ret;
4485}
4486
4487int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4488{
4489        int ret;
4490        unsigned int uninitialized_var(clen);
4491        u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
4492        u64 uninitialized_var(blkno);
4493        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
4494        struct buffer_head *dx_root_bh = NULL;
4495        struct ocfs2_dx_root_block *dx_root;
4496        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4497        struct ocfs2_cached_dealloc_ctxt dealloc;
4498        struct ocfs2_extent_tree et;
4499
4500        ocfs2_init_dealloc_ctxt(&dealloc);
4501
4502        if (!ocfs2_dir_indexed(dir))
4503                return 0;
4504
4505        ret = ocfs2_read_dx_root(dir, di, &dx_root_bh);
4506        if (ret) {
4507                mlog_errno(ret);
4508                goto out;
4509        }
4510        dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
4511
4512        if (ocfs2_dx_root_inline(dx_root))
4513                goto remove_index;
4514
4515        ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4516
4517        /* XXX: What if dr_clusters is too large? */
4518        while (le32_to_cpu(dx_root->dr_clusters)) {
4519                ret = ocfs2_dx_dir_lookup_rec(dir, &dx_root->dr_list,
4520                                              major_hash, &cpos, &blkno, &clen);
4521                if (ret) {
4522                        mlog_errno(ret);
4523                        goto out;
4524                }
4525
4526                p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno);
4527
4528                ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0,
4529                                               &dealloc, 0);
4530                if (ret) {
4531                        mlog_errno(ret);
4532                        goto out;
4533                }
4534
4535                if (cpos == 0)
4536                        break;
4537
4538                major_hash = cpos - 1;
4539        }
4540
4541remove_index:
4542        ret = ocfs2_dx_dir_remove_index(dir, di_bh, dx_root_bh);
4543        if (ret) {
4544                mlog_errno(ret);
4545                goto out;
4546        }
4547
4548        ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
4549out:
4550        ocfs2_schedule_truncate_log_flush(osb, 1);
4551        ocfs2_run_deallocs(osb, &dealloc);
4552
4553        brelse(dx_root_bh);
4554        return ret;
4555}
4556