linux/fs/ocfs2/alloc.c
<<
>>
Prefs
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * alloc.c
   5 *
   6 * Extent allocs and frees
   7 *
   8 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   9 *
  10 * This program is free software; you can redistribute it and/or
  11 * modify it under the terms of the GNU General Public
  12 * License as published by the Free Software Foundation; either
  13 * version 2 of the License, or (at your option) any later version.
  14 *
  15 * This program is distributed in the hope that it will be useful,
  16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 * General Public License for more details.
  19 *
  20 * You should have received a copy of the GNU General Public
  21 * License along with this program; if not, write to the
  22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23 * Boston, MA 02111-1307, USA.
  24 */
  25
  26#include <linux/fs.h>
  27#include <linux/types.h>
  28#include <linux/slab.h>
  29#include <linux/highmem.h>
  30#include <linux/swap.h>
  31#include <linux/quotaops.h>
  32#include <linux/blkdev.h>
  33#include <linux/sched/signal.h>
  34
  35#include <cluster/masklog.h>
  36
  37#include "ocfs2.h"
  38
  39#include "alloc.h"
  40#include "aops.h"
  41#include "blockcheck.h"
  42#include "dlmglue.h"
  43#include "extent_map.h"
  44#include "inode.h"
  45#include "journal.h"
  46#include "localalloc.h"
  47#include "suballoc.h"
  48#include "sysfile.h"
  49#include "file.h"
  50#include "super.h"
  51#include "uptodate.h"
  52#include "xattr.h"
  53#include "refcounttree.h"
  54#include "ocfs2_trace.h"
  55
  56#include "buffer_head_io.h"
  57
  58enum ocfs2_contig_type {
  59        CONTIG_NONE = 0,
  60        CONTIG_LEFT,
  61        CONTIG_RIGHT,
  62        CONTIG_LEFTRIGHT,
  63};
  64
/*
 * Forward declaration: ocfs2_et_extent_contig() falls back to this
 * function before its definition appears further down in the file.
 */
static enum ocfs2_contig_type
        ocfs2_extent_rec_contig(struct super_block *sb,
                                struct ocfs2_extent_rec *ext,
                                struct ocfs2_extent_rec *insert_rec);
/*
 * Operations for a specific extent tree type.
 *
 * To implement an on-disk btree (extent tree) type in ocfs2, add
 * an ocfs2_extent_tree_operations structure and the matching
 * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
 * for the allocation portion of the extent tree.
 *
 * Operations marked "Required" are called unconditionally by the
 * ocfs2_et_*() wrappers and by __ocfs2_init_extent_tree(); the optional
 * ones are skipped when left NULL.
 */
struct ocfs2_extent_tree_operations {
        /*
         * last_eb_blk is the block number of the rightmost leaf extent
         * block.  Most on-disk structures containing an extent tree store
         * this value for fast access.  The ->eo_set_last_eb_blk() and
         * ->eo_get_last_eb_blk() operations access this value.  They are
         * both required.
         */
        void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
                                   u64 blkno);
        u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);

        /*
         * The on-disk structure usually keeps track of how many total
         * clusters are stored in this extent tree.  This function updates
         * that value.  new_clusters is the delta, and must be
         * added to the total.  Required.
         */
        void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
                                   u32 new_clusters);

        /*
         * If this extent tree is supported by an extent map, insert
         * a record into the map.  Optional.
         */
        void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
                                     struct ocfs2_extent_rec *rec);

        /*
         * If this extent tree is supported by an extent map, truncate the
         * map to clusters.  Optional.
         */
        void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
                                       u32 clusters);

        /*
         * If ->eo_insert_check() exists, it is called before rec is
         * inserted into the extent tree.  It is optional.
         */
        int (*eo_insert_check)(struct ocfs2_extent_tree *et,
                               struct ocfs2_extent_rec *rec);

        /*
         * If ->eo_sanity_check() exists, ocfs2_et_sanity_check() calls it
         * and returns its result; when it is absent the wrapper returns 0.
         * Optional.
         */
        int (*eo_sanity_check)(struct ocfs2_extent_tree *et);

        /*
         * --------------------------------------------------------------
         * The remaining are internal to ocfs2_extent_tree and don't have
         * accessor functions
         */

        /*
         * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
         * It is required.
         */
        void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);

        /*
         * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
         * it exists.  If it does not, et->et_max_leaf_clusters is set
         * to 0 (unlimited).  Optional.
         */
        void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);

        /*
         * ->eo_extent_contig tests whether the 2 ocfs2_extent_rec
         * are contiguous or not.  Optional.  There is no need to set it
         * if the tree uses ocfs2_extent_rec as its leaf; in that case
         * ocfs2_extent_rec_contig() is used instead.
         */
        enum ocfs2_contig_type
                (*eo_extent_contig)(struct ocfs2_extent_tree *et,
                                    struct ocfs2_extent_rec *ext,
                                    struct ocfs2_extent_rec *insert_rec);
};
 149
 150
/*
 * Pre-declare the ocfs2_dinode_*() methods so that ocfs2_dinode_et_ops
 * can be defined ahead of them.  Several of the methods compare
 * et->et_ops against that table as a sanity check.
 */
static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
                                         u64 blkno);
static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
                                         u32 clusters);
static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
                                           struct ocfs2_extent_rec *rec);
static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
                                             u32 clusters);
static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
                                     struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);

/*
 * Forward declarations of file-local helpers that are used before
 * their definitions.
 */
static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
                                        struct ocfs2_extent_tree *et,
                                        struct buffer_head **new_eb_bh,
                                        int blk_wanted, int *blk_given);
static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
 174
 175static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
 176        .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
 177        .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
 178        .eo_update_clusters     = ocfs2_dinode_update_clusters,
 179        .eo_extent_map_insert   = ocfs2_dinode_extent_map_insert,
 180        .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
 181        .eo_insert_check        = ocfs2_dinode_insert_check,
 182        .eo_sanity_check        = ocfs2_dinode_sanity_check,
 183        .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
 184};
 185
 186static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
 187                                         u64 blkno)
 188{
 189        struct ocfs2_dinode *di = et->et_object;
 190
 191        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 192        di->i_last_eb_blk = cpu_to_le64(blkno);
 193}
 194
 195static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
 196{
 197        struct ocfs2_dinode *di = et->et_object;
 198
 199        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 200        return le64_to_cpu(di->i_last_eb_blk);
 201}
 202
 203static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
 204                                         u32 clusters)
 205{
 206        struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
 207        struct ocfs2_dinode *di = et->et_object;
 208
 209        le32_add_cpu(&di->i_clusters, clusters);
 210        spin_lock(&oi->ip_lock);
 211        oi->ip_clusters = le32_to_cpu(di->i_clusters);
 212        spin_unlock(&oi->ip_lock);
 213}
 214
 215static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
 216                                           struct ocfs2_extent_rec *rec)
 217{
 218        struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
 219
 220        ocfs2_extent_map_insert_rec(inode, rec);
 221}
 222
 223static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
 224                                             u32 clusters)
 225{
 226        struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
 227
 228        ocfs2_extent_map_trunc(inode, clusters);
 229}
 230
 231static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
 232                                     struct ocfs2_extent_rec *rec)
 233{
 234        struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
 235        struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
 236
 237        BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 238        mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
 239                        (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
 240                        "Device %s, asking for sparse allocation: inode %llu, "
 241                        "cpos %u, clusters %u\n",
 242                        osb->dev_str,
 243                        (unsigned long long)oi->ip_blkno,
 244                        rec->e_cpos, oi->ip_clusters);
 245
 246        return 0;
 247}
 248
 249static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
 250{
 251        struct ocfs2_dinode *di = et->et_object;
 252
 253        BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
 254        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
 255
 256        return 0;
 257}
 258
 259static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 260{
 261        struct ocfs2_dinode *di = et->et_object;
 262
 263        et->et_root_el = &di->id2.i_list;
 264}
 265
 266
 267static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 268{
 269        struct ocfs2_xattr_value_buf *vb = et->et_object;
 270
 271        et->et_root_el = &vb->vb_xv->xr_list;
 272}
 273
 274static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 275                                              u64 blkno)
 276{
 277        struct ocfs2_xattr_value_buf *vb = et->et_object;
 278
 279        vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 280}
 281
 282static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 283{
 284        struct ocfs2_xattr_value_buf *vb = et->et_object;
 285
 286        return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 287}
 288
 289static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
 290                                              u32 clusters)
 291{
 292        struct ocfs2_xattr_value_buf *vb = et->et_object;
 293
 294        le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 295}
 296
 297static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
 298        .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
 299        .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
 300        .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
 301        .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
 302};
 303
 304static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
 305{
 306        struct ocfs2_xattr_block *xb = et->et_object;
 307
 308        et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
 309}
 310
 311static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
 312{
 313        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
 314        et->et_max_leaf_clusters =
 315                ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
 316}
 317
 318static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 319                                             u64 blkno)
 320{
 321        struct ocfs2_xattr_block *xb = et->et_object;
 322        struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 323
 324        xt->xt_last_eb_blk = cpu_to_le64(blkno);
 325}
 326
 327static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
 328{
 329        struct ocfs2_xattr_block *xb = et->et_object;
 330        struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
 331
 332        return le64_to_cpu(xt->xt_last_eb_blk);
 333}
 334
 335static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
 336                                             u32 clusters)
 337{
 338        struct ocfs2_xattr_block *xb = et->et_object;
 339
 340        le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
 341}
 342
 343static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 344        .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
 345        .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
 346        .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
 347        .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
 348        .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
 349};
 350
 351static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
 352                                          u64 blkno)
 353{
 354        struct ocfs2_dx_root_block *dx_root = et->et_object;
 355
 356        dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
 357}
 358
 359static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
 360{
 361        struct ocfs2_dx_root_block *dx_root = et->et_object;
 362
 363        return le64_to_cpu(dx_root->dr_last_eb_blk);
 364}
 365
 366static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
 367                                          u32 clusters)
 368{
 369        struct ocfs2_dx_root_block *dx_root = et->et_object;
 370
 371        le32_add_cpu(&dx_root->dr_clusters, clusters);
 372}
 373
 374static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
 375{
 376        struct ocfs2_dx_root_block *dx_root = et->et_object;
 377
 378        BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
 379
 380        return 0;
 381}
 382
 383static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
 384{
 385        struct ocfs2_dx_root_block *dx_root = et->et_object;
 386
 387        et->et_root_el = &dx_root->dr_list;
 388}
 389
 390static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
 391        .eo_set_last_eb_blk     = ocfs2_dx_root_set_last_eb_blk,
 392        .eo_get_last_eb_blk     = ocfs2_dx_root_get_last_eb_blk,
 393        .eo_update_clusters     = ocfs2_dx_root_update_clusters,
 394        .eo_sanity_check        = ocfs2_dx_root_sanity_check,
 395        .eo_fill_root_el        = ocfs2_dx_root_fill_root_el,
 396};
 397
 398static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
 399{
 400        struct ocfs2_refcount_block *rb = et->et_object;
 401
 402        et->et_root_el = &rb->rf_list;
 403}
 404
 405static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
 406                                                u64 blkno)
 407{
 408        struct ocfs2_refcount_block *rb = et->et_object;
 409
 410        rb->rf_last_eb_blk = cpu_to_le64(blkno);
 411}
 412
 413static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
 414{
 415        struct ocfs2_refcount_block *rb = et->et_object;
 416
 417        return le64_to_cpu(rb->rf_last_eb_blk);
 418}
 419
 420static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
 421                                                u32 clusters)
 422{
 423        struct ocfs2_refcount_block *rb = et->et_object;
 424
 425        le32_add_cpu(&rb->rf_clusters, clusters);
 426}
 427
 428static enum ocfs2_contig_type
 429ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
 430                                  struct ocfs2_extent_rec *ext,
 431                                  struct ocfs2_extent_rec *insert_rec)
 432{
 433        return CONTIG_NONE;
 434}
 435
 436static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
 437        .eo_set_last_eb_blk     = ocfs2_refcount_tree_set_last_eb_blk,
 438        .eo_get_last_eb_blk     = ocfs2_refcount_tree_get_last_eb_blk,
 439        .eo_update_clusters     = ocfs2_refcount_tree_update_clusters,
 440        .eo_fill_root_el        = ocfs2_refcount_tree_fill_root_el,
 441        .eo_extent_contig       = ocfs2_refcount_tree_extent_contig,
 442};
 443
 444static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 445                                     struct ocfs2_caching_info *ci,
 446                                     struct buffer_head *bh,
 447                                     ocfs2_journal_access_func access,
 448                                     void *obj,
 449                                     const struct ocfs2_extent_tree_operations *ops)
 450{
 451        et->et_ops = ops;
 452        et->et_root_bh = bh;
 453        et->et_ci = ci;
 454        et->et_root_journal_access = access;
 455        if (!obj)
 456                obj = (void *)bh->b_data;
 457        et->et_object = obj;
 458        et->et_dealloc = NULL;
 459
 460        et->et_ops->eo_fill_root_el(et);
 461        if (!et->et_ops->eo_fill_max_leaf_clusters)
 462                et->et_max_leaf_clusters = 0;
 463        else
 464                et->et_ops->eo_fill_max_leaf_clusters(et);
 465}
 466
 467void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 468                                   struct ocfs2_caching_info *ci,
 469                                   struct buffer_head *bh)
 470{
 471        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
 472                                 NULL, &ocfs2_dinode_et_ops);
 473}
 474
 475void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 476                                       struct ocfs2_caching_info *ci,
 477                                       struct buffer_head *bh)
 478{
 479        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
 480                                 NULL, &ocfs2_xattr_tree_et_ops);
 481}
 482
 483void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 484                                        struct ocfs2_caching_info *ci,
 485                                        struct ocfs2_xattr_value_buf *vb)
 486{
 487        __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
 488                                 &ocfs2_xattr_value_et_ops);
 489}
 490
 491void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
 492                                    struct ocfs2_caching_info *ci,
 493                                    struct buffer_head *bh)
 494{
 495        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
 496                                 NULL, &ocfs2_dx_root_et_ops);
 497}
 498
 499void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
 500                                     struct ocfs2_caching_info *ci,
 501                                     struct buffer_head *bh)
 502{
 503        __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
 504                                 NULL, &ocfs2_refcount_tree_et_ops);
 505}
 506
 507static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
 508                                            u64 new_last_eb_blk)
 509{
 510        et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
 511}
 512
 513static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
 514{
 515        return et->et_ops->eo_get_last_eb_blk(et);
 516}
 517
 518static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
 519                                            u32 clusters)
 520{
 521        et->et_ops->eo_update_clusters(et, clusters);
 522}
 523
 524static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
 525                                              struct ocfs2_extent_rec *rec)
 526{
 527        if (et->et_ops->eo_extent_map_insert)
 528                et->et_ops->eo_extent_map_insert(et, rec);
 529}
 530
 531static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
 532                                                u32 clusters)
 533{
 534        if (et->et_ops->eo_extent_map_truncate)
 535                et->et_ops->eo_extent_map_truncate(et, clusters);
 536}
 537
 538static inline int ocfs2_et_root_journal_access(handle_t *handle,
 539                                               struct ocfs2_extent_tree *et,
 540                                               int type)
 541{
 542        return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
 543                                          type);
 544}
 545
 546static inline enum ocfs2_contig_type
 547        ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
 548                               struct ocfs2_extent_rec *rec,
 549                               struct ocfs2_extent_rec *insert_rec)
 550{
 551        if (et->et_ops->eo_extent_contig)
 552                return et->et_ops->eo_extent_contig(et, rec, insert_rec);
 553
 554        return ocfs2_extent_rec_contig(
 555                                ocfs2_metadata_cache_get_super(et->et_ci),
 556                                rec, insert_rec);
 557}
 558
 559static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
 560                                        struct ocfs2_extent_rec *rec)
 561{
 562        int ret = 0;
 563
 564        if (et->et_ops->eo_insert_check)
 565                ret = et->et_ops->eo_insert_check(et, rec);
 566        return ret;
 567}
 568
 569static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
 570{
 571        int ret = 0;
 572
 573        if (et->et_ops->eo_sanity_check)
 574                ret = et->et_ops->eo_sanity_check(et);
 575        return ret;
 576}
 577
/*
 * Forward declarations of file-local helpers that are used before
 * their definitions.
 */
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
                                         struct ocfs2_extent_block *eb);
static void ocfs2_adjust_rightmost_records(handle_t *handle,
                                           struct ocfs2_extent_tree *et,
                                           struct ocfs2_path *path,
                                           struct ocfs2_extent_rec *insert_rec);
 584/*
 585 * Reset the actual path elements so that we can re-use the structure
 586 * to build another path. Generally, this involves freeing the buffer
 587 * heads.
 588 */
 589void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
 590{
 591        int i, start = 0, depth = 0;
 592        struct ocfs2_path_item *node;
 593
 594        if (keep_root)
 595                start = 1;
 596
 597        for(i = start; i < path_num_items(path); i++) {
 598                node = &path->p_node[i];
 599
 600                brelse(node->bh);
 601                node->bh = NULL;
 602                node->el = NULL;
 603        }
 604
 605        /*
 606         * Tree depth may change during truncate, or insert. If we're
 607         * keeping the root extent list, then make sure that our path
 608         * structure reflects the proper depth.
 609         */
 610        if (keep_root)
 611                depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
 612        else
 613                path_root_access(path) = NULL;
 614
 615        path->p_tree_depth = depth;
 616}
 617
 618void ocfs2_free_path(struct ocfs2_path *path)
 619{
 620        if (path) {
 621                ocfs2_reinit_path(path, 0);
 622                kfree(path);
 623        }
 624}
 625
 626/*
 627 * All the elements of src into dest. After this call, src could be freed
 628 * without affecting dest.
 629 *
 630 * Both paths should have the same root. Any non-root elements of dest
 631 * will be freed.
 632 */
 633static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 634{
 635        int i;
 636
 637        BUG_ON(path_root_bh(dest) != path_root_bh(src));
 638        BUG_ON(path_root_el(dest) != path_root_el(src));
 639        BUG_ON(path_root_access(dest) != path_root_access(src));
 640
 641        ocfs2_reinit_path(dest, 1);
 642
 643        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
 644                dest->p_node[i].bh = src->p_node[i].bh;
 645                dest->p_node[i].el = src->p_node[i].el;
 646
 647                if (dest->p_node[i].bh)
 648                        get_bh(dest->p_node[i].bh);
 649        }
 650}
 651
 652/*
 653 * Make the *dest path the same as src and re-initialize src path to
 654 * have a root only.
 655 */
 656static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 657{
 658        int i;
 659
 660        BUG_ON(path_root_bh(dest) != path_root_bh(src));
 661        BUG_ON(path_root_access(dest) != path_root_access(src));
 662
 663        for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
 664                brelse(dest->p_node[i].bh);
 665
 666                dest->p_node[i].bh = src->p_node[i].bh;
 667                dest->p_node[i].el = src->p_node[i].el;
 668
 669                src->p_node[i].bh = NULL;
 670                src->p_node[i].el = NULL;
 671        }
 672}
 673
 674/*
 675 * Insert an extent block at given index.
 676 *
 677 * This will not take an additional reference on eb_bh.
 678 */
 679static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
 680                                        struct buffer_head *eb_bh)
 681{
 682        struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
 683
 684        /*
 685         * Right now, no root bh is an extent block, so this helps
 686         * catch code errors with dinode trees. The assertion can be
 687         * safely removed if we ever need to insert extent block
 688         * structures at the root.
 689         */
 690        BUG_ON(index == 0);
 691
 692        path->p_node[index].bh = eb_bh;
 693        path->p_node[index].el = &eb->h_list;
 694}
 695
 696static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 697                                         struct ocfs2_extent_list *root_el,
 698                                         ocfs2_journal_access_func access)
 699{
 700        struct ocfs2_path *path;
 701
 702        BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
 703
 704        path = kzalloc(sizeof(*path), GFP_NOFS);
 705        if (path) {
 706                path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
 707                get_bh(root_bh);
 708                path_root_bh(path) = root_bh;
 709                path_root_el(path) = root_el;
 710                path_root_access(path) = access;
 711        }
 712
 713        return path;
 714}
 715
 716struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
 717{
 718        return ocfs2_new_path(path_root_bh(path), path_root_el(path),
 719                              path_root_access(path));
 720}
 721
 722struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
 723{
 724        return ocfs2_new_path(et->et_root_bh, et->et_root_el,
 725                              et->et_root_journal_access);
 726}
 727
 728/*
 729 * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
 730 * otherwise it's the root_access function.
 731 *
 732 * I don't like the way this function's name looks next to
 733 * ocfs2_journal_access_path(), but I don't have a better one.
 734 */
 735int ocfs2_path_bh_journal_access(handle_t *handle,
 736                                 struct ocfs2_caching_info *ci,
 737                                 struct ocfs2_path *path,
 738                                 int idx)
 739{
 740        ocfs2_journal_access_func access = path_root_access(path);
 741
 742        if (!access)
 743                access = ocfs2_journal_access;
 744
 745        if (idx)
 746                access = ocfs2_journal_access_eb;
 747
 748        return access(handle, ci, path->p_node[idx].bh,
 749                      OCFS2_JOURNAL_ACCESS_WRITE);
 750}
 751
 752/*
 753 * Convenience function to journal all components in a path.
 754 */
 755int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
 756                              handle_t *handle,
 757                              struct ocfs2_path *path)
 758{
 759        int i, ret = 0;
 760
 761        if (!path)
 762                goto out;
 763
 764        for(i = 0; i < path_num_items(path); i++) {
 765                ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
 766                if (ret < 0) {
 767                        mlog_errno(ret);
 768                        goto out;
 769                }
 770        }
 771
 772out:
 773        return ret;
 774}
 775
 776/*
 777 * Return the index of the extent record which contains cluster #v_cluster.
 778 * -1 is returned if it was not found.
 779 *
 780 * Should work fine on interior and exterior nodes.
 781 */
 782int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
 783{
 784        int ret = -1;
 785        int i;
 786        struct ocfs2_extent_rec *rec;
 787        u32 rec_end, rec_start, clusters;
 788
 789        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 790                rec = &el->l_recs[i];
 791
 792                rec_start = le32_to_cpu(rec->e_cpos);
 793                clusters = ocfs2_rec_clusters(el, rec);
 794
 795                rec_end = rec_start + clusters;
 796
 797                if (v_cluster >= rec_start && v_cluster < rec_end) {
 798                        ret = i;
 799                        break;
 800                }
 801        }
 802
 803        return ret;
 804}
 805
 806/*
 807 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 808 * ocfs2_extent_rec_contig only work properly against leaf nodes!
 809 */
 810static int ocfs2_block_extent_contig(struct super_block *sb,
 811                                     struct ocfs2_extent_rec *ext,
 812                                     u64 blkno)
 813{
 814        u64 blk_end = le64_to_cpu(ext->e_blkno);
 815
 816        blk_end += ocfs2_clusters_to_blocks(sb,
 817                                    le16_to_cpu(ext->e_leaf_clusters));
 818
 819        return blkno == blk_end;
 820}
 821
 822static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
 823                                  struct ocfs2_extent_rec *right)
 824{
 825        u32 left_range;
 826
 827        left_range = le32_to_cpu(left->e_cpos) +
 828                le16_to_cpu(left->e_leaf_clusters);
 829
 830        return (left_range == le32_to_cpu(right->e_cpos));
 831}
 832
 833static enum ocfs2_contig_type
 834        ocfs2_extent_rec_contig(struct super_block *sb,
 835                                struct ocfs2_extent_rec *ext,
 836                                struct ocfs2_extent_rec *insert_rec)
 837{
 838        u64 blkno = le64_to_cpu(insert_rec->e_blkno);
 839
 840        /*
 841         * Refuse to coalesce extent records with different flag
 842         * fields - we don't want to mix unwritten extents with user
 843         * data.
 844         */
 845        if (ext->e_flags != insert_rec->e_flags)
 846                return CONTIG_NONE;
 847
 848        if (ocfs2_extents_adjacent(ext, insert_rec) &&
 849            ocfs2_block_extent_contig(sb, ext, blkno))
 850                        return CONTIG_RIGHT;
 851
 852        blkno = le64_to_cpu(ext->e_blkno);
 853        if (ocfs2_extents_adjacent(insert_rec, ext) &&
 854            ocfs2_block_extent_contig(sb, insert_rec, blkno))
 855                return CONTIG_LEFT;
 856
 857        return CONTIG_NONE;
 858}
 859
 860/*
 861 * NOTE: We can have pretty much any combination of contiguousness and
 862 * appending.
 863 *
 864 * The usefulness of APPEND_TAIL is more in that it lets us know that
 865 * we'll have to update the path to that leaf.
 866 */
 867enum ocfs2_append_type {
 868        APPEND_NONE = 0,
 869        APPEND_TAIL,
 870};
 871
 872enum ocfs2_split_type {
 873        SPLIT_NONE = 0,
 874        SPLIT_LEFT,
 875        SPLIT_RIGHT,
 876};
 877
 878struct ocfs2_insert_type {
 879        enum ocfs2_split_type   ins_split;
 880        enum ocfs2_append_type  ins_appending;
 881        enum ocfs2_contig_type  ins_contig;
 882        int                     ins_contig_index;
 883        int                     ins_tree_depth;
 884};
 885
 886struct ocfs2_merge_ctxt {
 887        enum ocfs2_contig_type  c_contig_type;
 888        int                     c_has_empty_extent;
 889        int                     c_split_covers_rec;
 890};
 891
 892static int ocfs2_validate_extent_block(struct super_block *sb,
 893                                       struct buffer_head *bh)
 894{
 895        int rc;
 896        struct ocfs2_extent_block *eb =
 897                (struct ocfs2_extent_block *)bh->b_data;
 898
 899        trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
 900
 901        BUG_ON(!buffer_uptodate(bh));
 902
 903        /*
 904         * If the ecc fails, we return the error but otherwise
 905         * leave the filesystem running.  We know any error is
 906         * local to this block.
 907         */
 908        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
 909        if (rc) {
 910                mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
 911                     (unsigned long long)bh->b_blocknr);
 912                return rc;
 913        }
 914
 915        /*
 916         * Errors after here are fatal.
 917         */
 918
 919        if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 920                rc = ocfs2_error(sb,
 921                                 "Extent block #%llu has bad signature %.*s\n",
 922                                 (unsigned long long)bh->b_blocknr, 7,
 923                                 eb->h_signature);
 924                goto bail;
 925        }
 926
 927        if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
 928                rc = ocfs2_error(sb,
 929                                 "Extent block #%llu has an invalid h_blkno of %llu\n",
 930                                 (unsigned long long)bh->b_blocknr,
 931                                 (unsigned long long)le64_to_cpu(eb->h_blkno));
 932                goto bail;
 933        }
 934
 935        if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
 936                rc = ocfs2_error(sb,
 937                                 "Extent block #%llu has an invalid h_fs_generation of #%u\n",
 938                                 (unsigned long long)bh->b_blocknr,
 939                                 le32_to_cpu(eb->h_fs_generation));
 940                goto bail;
 941        }
 942bail:
 943        return rc;
 944}
 945
 946int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
 947                            struct buffer_head **bh)
 948{
 949        int rc;
 950        struct buffer_head *tmp = *bh;
 951
 952        rc = ocfs2_read_block(ci, eb_blkno, &tmp,
 953                              ocfs2_validate_extent_block);
 954
 955        /* If ocfs2_read_block() got us a new bh, pass it up. */
 956        if (!rc && !*bh)
 957                *bh = tmp;
 958
 959        return rc;
 960}
 961
 962
 963/*
 964 * How many free extents have we got before we need more meta data?
 965 */
 966int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
 967{
 968        int retval;
 969        struct ocfs2_extent_list *el = NULL;
 970        struct ocfs2_extent_block *eb;
 971        struct buffer_head *eb_bh = NULL;
 972        u64 last_eb_blk = 0;
 973
 974        el = et->et_root_el;
 975        last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 976
 977        if (last_eb_blk) {
 978                retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
 979                                                 &eb_bh);
 980                if (retval < 0) {
 981                        mlog_errno(retval);
 982                        goto bail;
 983                }
 984                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 985                el = &eb->h_list;
 986        }
 987
 988        BUG_ON(el->l_tree_depth != 0);
 989
 990        retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
 991bail:
 992        brelse(eb_bh);
 993
 994        trace_ocfs2_num_free_extents(retval);
 995        return retval;
 996}
 997
 998/* expects array to already be allocated
 999 *
1000 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
1001 * l_count for you
1002 */
1003static int ocfs2_create_new_meta_bhs(handle_t *handle,
1004                                     struct ocfs2_extent_tree *et,
1005                                     int wanted,
1006                                     struct ocfs2_alloc_context *meta_ac,
1007                                     struct buffer_head *bhs[])
1008{
1009        int count, status, i;
1010        u16 suballoc_bit_start;
1011        u32 num_got;
1012        u64 suballoc_loc, first_blkno;
1013        struct ocfs2_super *osb =
1014                OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1015        struct ocfs2_extent_block *eb;
1016
1017        count = 0;
1018        while (count < wanted) {
1019                status = ocfs2_claim_metadata(handle,
1020                                              meta_ac,
1021                                              wanted - count,
1022                                              &suballoc_loc,
1023                                              &suballoc_bit_start,
1024                                              &num_got,
1025                                              &first_blkno);
1026                if (status < 0) {
1027                        mlog_errno(status);
1028                        goto bail;
1029                }
1030
1031                for(i = count;  i < (num_got + count); i++) {
1032                        bhs[i] = sb_getblk(osb->sb, first_blkno);
1033                        if (bhs[i] == NULL) {
1034                                status = -ENOMEM;
1035                                mlog_errno(status);
1036                                goto bail;
1037                        }
1038                        ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1039
1040                        status = ocfs2_journal_access_eb(handle, et->et_ci,
1041                                                         bhs[i],
1042                                                         OCFS2_JOURNAL_ACCESS_CREATE);
1043                        if (status < 0) {
1044                                mlog_errno(status);
1045                                goto bail;
1046                        }
1047
1048                        memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1049                        eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1050                        /* Ok, setup the minimal stuff here. */
1051                        strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1052                        eb->h_blkno = cpu_to_le64(first_blkno);
1053                        eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1054                        eb->h_suballoc_slot =
1055                                cpu_to_le16(meta_ac->ac_alloc_slot);
1056                        eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1057                        eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1058                        eb->h_list.l_count =
1059                                cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1060
1061                        suballoc_bit_start++;
1062                        first_blkno++;
1063
1064                        /* We'll also be dirtied by the caller, so
1065                         * this isn't absolutely necessary. */
1066                        ocfs2_journal_dirty(handle, bhs[i]);
1067                }
1068
1069                count += num_got;
1070        }
1071
1072        status = 0;
1073bail:
1074        if (status < 0) {
1075                for(i = 0; i < wanted; i++) {
1076                        brelse(bhs[i]);
1077                        bhs[i] = NULL;
1078                }
1079                mlog_errno(status);
1080        }
1081        return status;
1082}
1083
1084/*
1085 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1086 *
1087 * Returns the sum of the rightmost extent rec logical offset and
1088 * cluster count.
1089 *
1090 * ocfs2_add_branch() uses this to determine what logical cluster
1091 * value should be populated into the leftmost new branch records.
1092 *
1093 * ocfs2_shift_tree_depth() uses this to determine the # clusters
1094 * value for the new topmost tree record.
1095 */
1096static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1097{
1098        int i;
1099
1100        i = le16_to_cpu(el->l_next_free_rec) - 1;
1101
1102        return le32_to_cpu(el->l_recs[i].e_cpos) +
1103                ocfs2_rec_clusters(el, &el->l_recs[i]);
1104}
1105
1106/*
1107 * Change range of the branches in the right most path according to the leaf
1108 * extent block's rightmost record.
1109 */
1110static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1111                                         struct ocfs2_extent_tree *et)
1112{
1113        int status;
1114        struct ocfs2_path *path = NULL;
1115        struct ocfs2_extent_list *el;
1116        struct ocfs2_extent_rec *rec;
1117
1118        path = ocfs2_new_path_from_et(et);
1119        if (!path) {
1120                status = -ENOMEM;
1121                return status;
1122        }
1123
1124        status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1125        if (status < 0) {
1126                mlog_errno(status);
1127                goto out;
1128        }
1129
1130        status = ocfs2_extend_trans(handle, path_num_items(path));
1131        if (status < 0) {
1132                mlog_errno(status);
1133                goto out;
1134        }
1135
1136        status = ocfs2_journal_access_path(et->et_ci, handle, path);
1137        if (status < 0) {
1138                mlog_errno(status);
1139                goto out;
1140        }
1141
1142        el = path_leaf_el(path);
1143        rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
1144
1145        ocfs2_adjust_rightmost_records(handle, et, path, rec);
1146
1147out:
1148        ocfs2_free_path(path);
1149        return status;
1150}
1151
1152/*
1153 * Add an entire tree branch to our inode. eb_bh is the extent block
1154 * to start at, if we don't want to start the branch at the root
1155 * structure.
1156 *
1157 * last_eb_bh is required as we have to update it's next_leaf pointer
1158 * for the new last extent block.
1159 *
1160 * the new branch will be 'empty' in the sense that every block will
1161 * contain a single record with cluster count == 0.
1162 */
1163static int ocfs2_add_branch(handle_t *handle,
1164                            struct ocfs2_extent_tree *et,
1165                            struct buffer_head *eb_bh,
1166                            struct buffer_head **last_eb_bh,
1167                            struct ocfs2_alloc_context *meta_ac)
1168{
1169        int status, new_blocks, i, block_given = 0;
1170        u64 next_blkno, new_last_eb_blk;
1171        struct buffer_head *bh;
1172        struct buffer_head **new_eb_bhs = NULL;
1173        struct ocfs2_extent_block *eb;
1174        struct ocfs2_extent_list  *eb_el;
1175        struct ocfs2_extent_list  *el;
1176        u32 new_cpos, root_end;
1177
1178        BUG_ON(!last_eb_bh || !*last_eb_bh);
1179
1180        if (eb_bh) {
1181                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1182                el = &eb->h_list;
1183        } else
1184                el = et->et_root_el;
1185
1186        /* we never add a branch to a leaf. */
1187        BUG_ON(!el->l_tree_depth);
1188
1189        new_blocks = le16_to_cpu(el->l_tree_depth);
1190
1191        eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1192        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1193        root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
1194
1195        /*
1196         * If there is a gap before the root end and the real end
1197         * of the righmost leaf block, we need to remove the gap
1198         * between new_cpos and root_end first so that the tree
1199         * is consistent after we add a new branch(it will start
1200         * from new_cpos).
1201         */
1202        if (root_end > new_cpos) {
1203                trace_ocfs2_adjust_rightmost_branch(
1204                        (unsigned long long)
1205                        ocfs2_metadata_cache_owner(et->et_ci),
1206                        root_end, new_cpos);
1207
1208                status = ocfs2_adjust_rightmost_branch(handle, et);
1209                if (status) {
1210                        mlog_errno(status);
1211                        goto bail;
1212                }
1213        }
1214
1215        /* allocate the number of new eb blocks we need */
1216        new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1217                             GFP_KERNEL);
1218        if (!new_eb_bhs) {
1219                status = -ENOMEM;
1220                mlog_errno(status);
1221                goto bail;
1222        }
1223
1224        /* Firstyly, try to reuse dealloc since we have already estimated how
1225         * many extent blocks we may use.
1226         */
1227        if (!ocfs2_is_dealloc_empty(et)) {
1228                status = ocfs2_reuse_blk_from_dealloc(handle, et,
1229                                                      new_eb_bhs, new_blocks,
1230                                                      &block_given);
1231                if (status < 0) {
1232                        mlog_errno(status);
1233                        goto bail;
1234                }
1235        }
1236
1237        BUG_ON(block_given > new_blocks);
1238
1239        if (block_given < new_blocks) {
1240                BUG_ON(!meta_ac);
1241                status = ocfs2_create_new_meta_bhs(handle, et,
1242                                                   new_blocks - block_given,
1243                                                   meta_ac,
1244                                                   &new_eb_bhs[block_given]);
1245                if (status < 0) {
1246                        mlog_errno(status);
1247                        goto bail;
1248                }
1249        }
1250
1251        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1252         * linked with the rest of the tree.
1253         * conversly, new_eb_bhs[0] is the new bottommost leaf.
1254         *
1255         * when we leave the loop, new_last_eb_blk will point to the
1256         * newest leaf, and next_blkno will point to the topmost extent
1257         * block. */
1258        next_blkno = new_last_eb_blk = 0;
1259        for(i = 0; i < new_blocks; i++) {
1260                bh = new_eb_bhs[i];
1261                eb = (struct ocfs2_extent_block *) bh->b_data;
1262                /* ocfs2_create_new_meta_bhs() should create it right! */
1263                BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1264                eb_el = &eb->h_list;
1265
1266                status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1267                                                 OCFS2_JOURNAL_ACCESS_CREATE);
1268                if (status < 0) {
1269                        mlog_errno(status);
1270                        goto bail;
1271                }
1272
1273                eb->h_next_leaf_blk = 0;
1274                eb_el->l_tree_depth = cpu_to_le16(i);
1275                eb_el->l_next_free_rec = cpu_to_le16(1);
1276                /*
1277                 * This actually counts as an empty extent as
1278                 * c_clusters == 0
1279                 */
1280                eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1281                eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1282                /*
1283                 * eb_el isn't always an interior node, but even leaf
1284                 * nodes want a zero'd flags and reserved field so
1285                 * this gets the whole 32 bits regardless of use.
1286                 */
1287                eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1288                if (!eb_el->l_tree_depth)
1289                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1290
1291                ocfs2_journal_dirty(handle, bh);
1292                next_blkno = le64_to_cpu(eb->h_blkno);
1293        }
1294
1295        /* This is a bit hairy. We want to update up to three blocks
1296         * here without leaving any of them in an inconsistent state
1297         * in case of error. We don't have to worry about
1298         * journal_dirty erroring as it won't unless we've aborted the
1299         * handle (in which case we would never be here) so reserving
1300         * the write with journal_access is all we need to do. */
1301        status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1302                                         OCFS2_JOURNAL_ACCESS_WRITE);
1303        if (status < 0) {
1304                mlog_errno(status);
1305                goto bail;
1306        }
1307        status = ocfs2_et_root_journal_access(handle, et,
1308                                              OCFS2_JOURNAL_ACCESS_WRITE);
1309        if (status < 0) {
1310                mlog_errno(status);
1311                goto bail;
1312        }
1313        if (eb_bh) {
1314                status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1315                                                 OCFS2_JOURNAL_ACCESS_WRITE);
1316                if (status < 0) {
1317                        mlog_errno(status);
1318                        goto bail;
1319                }
1320        }
1321
1322        /* Link the new branch into the rest of the tree (el will
1323         * either be on the root_bh, or the extent block passed in. */
1324        i = le16_to_cpu(el->l_next_free_rec);
1325        el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1326        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1327        el->l_recs[i].e_int_clusters = 0;
1328        le16_add_cpu(&el->l_next_free_rec, 1);
1329
1330        /* fe needs a new last extent block pointer, as does the
1331         * next_leaf on the previously last-extent-block. */
1332        ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1333
1334        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1335        eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1336
1337        ocfs2_journal_dirty(handle, *last_eb_bh);
1338        ocfs2_journal_dirty(handle, et->et_root_bh);
1339        if (eb_bh)
1340                ocfs2_journal_dirty(handle, eb_bh);
1341
1342        /*
1343         * Some callers want to track the rightmost leaf so pass it
1344         * back here.
1345         */
1346        brelse(*last_eb_bh);
1347        get_bh(new_eb_bhs[0]);
1348        *last_eb_bh = new_eb_bhs[0];
1349
1350        status = 0;
1351bail:
1352        if (new_eb_bhs) {
1353                for (i = 0; i < new_blocks; i++)
1354                        brelse(new_eb_bhs[i]);
1355                kfree(new_eb_bhs);
1356        }
1357
1358        return status;
1359}
1360
1361/*
1362 * adds another level to the allocation tree.
1363 * returns back the new extent block so you can add a branch to it
1364 * after this call.
1365 */
1366static int ocfs2_shift_tree_depth(handle_t *handle,
1367                                  struct ocfs2_extent_tree *et,
1368                                  struct ocfs2_alloc_context *meta_ac,
1369                                  struct buffer_head **ret_new_eb_bh)
1370{
1371        int status, i, block_given = 0;
1372        u32 new_clusters;
1373        struct buffer_head *new_eb_bh = NULL;
1374        struct ocfs2_extent_block *eb;
1375        struct ocfs2_extent_list  *root_el;
1376        struct ocfs2_extent_list  *eb_el;
1377
1378        if (!ocfs2_is_dealloc_empty(et)) {
1379                status = ocfs2_reuse_blk_from_dealloc(handle, et,
1380                                                      &new_eb_bh, 1,
1381                                                      &block_given);
1382        } else if (meta_ac) {
1383                status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1384                                                   &new_eb_bh);
1385
1386        } else {
1387                BUG();
1388        }
1389
1390        if (status < 0) {
1391                mlog_errno(status);
1392                goto bail;
1393        }
1394
1395        eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1396        /* ocfs2_create_new_meta_bhs() should create it right! */
1397        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1398
1399        eb_el = &eb->h_list;
1400        root_el = et->et_root_el;
1401
1402        status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1403                                         OCFS2_JOURNAL_ACCESS_CREATE);
1404        if (status < 0) {
1405                mlog_errno(status);
1406                goto bail;
1407        }
1408
1409        /* copy the root extent list data into the new extent block */
1410        eb_el->l_tree_depth = root_el->l_tree_depth;
1411        eb_el->l_next_free_rec = root_el->l_next_free_rec;
1412        for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1413                eb_el->l_recs[i] = root_el->l_recs[i];
1414
1415        ocfs2_journal_dirty(handle, new_eb_bh);
1416
1417        status = ocfs2_et_root_journal_access(handle, et,
1418                                              OCFS2_JOURNAL_ACCESS_WRITE);
1419        if (status < 0) {
1420                mlog_errno(status);
1421                goto bail;
1422        }
1423
1424        new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1425
1426        /* update root_bh now */
1427        le16_add_cpu(&root_el->l_tree_depth, 1);
1428        root_el->l_recs[0].e_cpos = 0;
1429        root_el->l_recs[0].e_blkno = eb->h_blkno;
1430        root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1431        for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1432                memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1433        root_el->l_next_free_rec = cpu_to_le16(1);
1434
1435        /* If this is our 1st tree depth shift, then last_eb_blk
1436         * becomes the allocated extent block */
1437        if (root_el->l_tree_depth == cpu_to_le16(1))
1438                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1439
1440        ocfs2_journal_dirty(handle, et->et_root_bh);
1441
1442        *ret_new_eb_bh = new_eb_bh;
1443        new_eb_bh = NULL;
1444        status = 0;
1445bail:
1446        brelse(new_eb_bh);
1447
1448        return status;
1449}
1450
1451/*
1452 * Should only be called when there is no space left in any of the
1453 * leaf nodes. What we want to do is find the lowest tree depth
1454 * non-leaf extent block with room for new records. There are three
1455 * valid results of this search:
1456 *
1457 * 1) a lowest extent block is found, then we pass it back in
1458 *    *lowest_eb_bh and return '0'
1459 *
1460 * 2) the search fails to find anything, but the root_el has room. We
1461 *    pass NULL back in *lowest_eb_bh, but still return '0'
1462 *
1463 * 3) the search fails to find anything AND the root_el is full, in
1464 *    which case we return > 0
1465 *
1466 * return status < 0 indicates an error.
1467 */
1468static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1469                                    struct buffer_head **target_bh)
1470{
1471        int status = 0, i;
1472        u64 blkno;
1473        struct ocfs2_extent_block *eb;
1474        struct ocfs2_extent_list  *el;
1475        struct buffer_head *bh = NULL;
1476        struct buffer_head *lowest_bh = NULL;
1477
1478        *target_bh = NULL;
1479
1480        el = et->et_root_el;
1481
1482        while(le16_to_cpu(el->l_tree_depth) > 1) {
1483                if (le16_to_cpu(el->l_next_free_rec) == 0) {
1484                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1485                                    "Owner %llu has empty extent list (next_free_rec == 0)\n",
1486                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1487                        status = -EIO;
1488                        goto bail;
1489                }
1490                i = le16_to_cpu(el->l_next_free_rec) - 1;
1491                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1492                if (!blkno) {
1493                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1494                                    "Owner %llu has extent list where extent # %d has no physical block start\n",
1495                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1496                        status = -EIO;
1497                        goto bail;
1498                }
1499
1500                brelse(bh);
1501                bh = NULL;
1502
1503                status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1504                if (status < 0) {
1505                        mlog_errno(status);
1506                        goto bail;
1507                }
1508
1509                eb = (struct ocfs2_extent_block *) bh->b_data;
1510                el = &eb->h_list;
1511
1512                if (le16_to_cpu(el->l_next_free_rec) <
1513                    le16_to_cpu(el->l_count)) {
1514                        brelse(lowest_bh);
1515                        lowest_bh = bh;
1516                        get_bh(lowest_bh);
1517                }
1518        }
1519
1520        /* If we didn't find one and the fe doesn't have any room,
1521         * then return '1' */
1522        el = et->et_root_el;
1523        if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1524                status = 1;
1525
1526        *target_bh = lowest_bh;
1527bail:
1528        brelse(bh);
1529
1530        return status;
1531}
1532
1533/*
1534 * Grow a b-tree so that it has more records.
1535 *
1536 * We might shift the tree depth in which case existing paths should
1537 * be considered invalid.
1538 *
1539 * Tree depth after the grow is returned via *final_depth.
1540 *
1541 * *last_eb_bh will be updated by ocfs2_add_branch().
1542 */
1543static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1544                           int *final_depth, struct buffer_head **last_eb_bh,
1545                           struct ocfs2_alloc_context *meta_ac)
1546{
1547        int ret, shift;
1548        struct ocfs2_extent_list *el = et->et_root_el;
1549        int depth = le16_to_cpu(el->l_tree_depth);
1550        struct buffer_head *bh = NULL;
1551
1552        BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
1553
1554        shift = ocfs2_find_branch_target(et, &bh);
1555        if (shift < 0) {
1556                ret = shift;
1557                mlog_errno(ret);
1558                goto out;
1559        }
1560
1561        /* We traveled all the way to the bottom of the allocation tree
1562         * and didn't find room for any more extents - we need to add
1563         * another tree level */
1564        if (shift) {
1565                BUG_ON(bh);
1566                trace_ocfs2_grow_tree(
1567                        (unsigned long long)
1568                        ocfs2_metadata_cache_owner(et->et_ci),
1569                        depth);
1570
1571                /* ocfs2_shift_tree_depth will return us a buffer with
1572                 * the new extent block (so we can pass that to
1573                 * ocfs2_add_branch). */
1574                ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1575                if (ret < 0) {
1576                        mlog_errno(ret);
1577                        goto out;
1578                }
1579                depth++;
1580                if (depth == 1) {
1581                        /*
1582                         * Special case: we have room now if we shifted from
1583                         * tree_depth 0, so no more work needs to be done.
1584                         *
1585                         * We won't be calling add_branch, so pass
1586                         * back *last_eb_bh as the new leaf. At depth
1587                         * zero, it should always be null so there's
1588                         * no reason to brelse.
1589                         */
1590                        BUG_ON(*last_eb_bh);
1591                        get_bh(bh);
1592                        *last_eb_bh = bh;
1593                        goto out;
1594                }
1595        }
1596
1597        /* call ocfs2_add_branch to add the final part of the tree with
1598         * the new data. */
1599        ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1600                               meta_ac);
1601        if (ret < 0) {
1602                mlog_errno(ret);
1603                goto out;
1604        }
1605
1606out:
1607        if (final_depth)
1608                *final_depth = depth;
1609        brelse(bh);
1610        return ret;
1611}
1612
1613/*
1614 * This function will discard the rightmost extent record.
1615 */
1616static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1617{
1618        int next_free = le16_to_cpu(el->l_next_free_rec);
1619        int count = le16_to_cpu(el->l_count);
1620        unsigned int num_bytes;
1621
1622        BUG_ON(!next_free);
1623        /* This will cause us to go off the end of our extent list. */
1624        BUG_ON(next_free >= count);
1625
1626        num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1627
1628        memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1629}
1630
1631static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1632                              struct ocfs2_extent_rec *insert_rec)
1633{
1634        int i, insert_index, next_free, has_empty, num_bytes;
1635        u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1636        struct ocfs2_extent_rec *rec;
1637
1638        next_free = le16_to_cpu(el->l_next_free_rec);
1639        has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1640
1641        BUG_ON(!next_free);
1642
1643        /* The tree code before us didn't allow enough room in the leaf. */
1644        BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1645
1646        /*
1647         * The easiest way to approach this is to just remove the
1648         * empty extent and temporarily decrement next_free.
1649         */
1650        if (has_empty) {
1651                /*
1652                 * If next_free was 1 (only an empty extent), this
1653                 * loop won't execute, which is fine. We still want
1654                 * the decrement above to happen.
1655                 */
1656                for(i = 0; i < (next_free - 1); i++)
1657                        el->l_recs[i] = el->l_recs[i+1];
1658
1659                next_free--;
1660        }
1661
1662        /*
1663         * Figure out what the new record index should be.
1664         */
1665        for(i = 0; i < next_free; i++) {
1666                rec = &el->l_recs[i];
1667
1668                if (insert_cpos < le32_to_cpu(rec->e_cpos))
1669                        break;
1670        }
1671        insert_index = i;
1672
1673        trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
1674                                has_empty, next_free,
1675                                le16_to_cpu(el->l_count));
1676
1677        BUG_ON(insert_index < 0);
1678        BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1679        BUG_ON(insert_index > next_free);
1680
1681        /*
1682         * No need to memmove if we're just adding to the tail.
1683         */
1684        if (insert_index != next_free) {
1685                BUG_ON(next_free >= le16_to_cpu(el->l_count));
1686
1687                num_bytes = next_free - insert_index;
1688                num_bytes *= sizeof(struct ocfs2_extent_rec);
1689                memmove(&el->l_recs[insert_index + 1],
1690                        &el->l_recs[insert_index],
1691                        num_bytes);
1692        }
1693
1694        /*
1695         * Either we had an empty extent, and need to re-increment or
1696         * there was no empty extent on a non full rightmost leaf node,
1697         * in which case we still need to increment.
1698         */
1699        next_free++;
1700        el->l_next_free_rec = cpu_to_le16(next_free);
1701        /*
1702         * Make sure none of the math above just messed up our tree.
1703         */
1704        BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1705
1706        el->l_recs[insert_index] = *insert_rec;
1707
1708}
1709
1710static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1711{
1712        int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1713
1714        BUG_ON(num_recs == 0);
1715
1716        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1717                num_recs--;
1718                size = num_recs * sizeof(struct ocfs2_extent_rec);
1719                memmove(&el->l_recs[0], &el->l_recs[1], size);
1720                memset(&el->l_recs[num_recs], 0,
1721                       sizeof(struct ocfs2_extent_rec));
1722                el->l_next_free_rec = cpu_to_le16(num_recs);
1723        }
1724}
1725
1726/*
1727 * Create an empty extent record .
1728 *
1729 * l_next_free_rec may be updated.
1730 *
1731 * If an empty extent already exists do nothing.
1732 */
1733static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1734{
1735        int next_free = le16_to_cpu(el->l_next_free_rec);
1736
1737        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1738
1739        if (next_free == 0)
1740                goto set_and_inc;
1741
1742        if (ocfs2_is_empty_extent(&el->l_recs[0]))
1743                return;
1744
1745        mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1746                        "Asked to create an empty extent in a full list:\n"
1747                        "count = %u, tree depth = %u",
1748                        le16_to_cpu(el->l_count),
1749                        le16_to_cpu(el->l_tree_depth));
1750
1751        ocfs2_shift_records_right(el);
1752
1753set_and_inc:
1754        le16_add_cpu(&el->l_next_free_rec, 1);
1755        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1756}
1757
1758/*
1759 * For a rotation which involves two leaf nodes, the "root node" is
1760 * the lowest level tree node which contains a path to both leafs. This
1761 * resulting set of information can be used to form a complete "subtree"
1762 *
1763 * This function is passed two full paths from the dinode down to a
1764 * pair of adjacent leaves. It's task is to figure out which path
1765 * index contains the subtree root - this can be the root index itself
1766 * in a worst-case rotation.
1767 *
1768 * The array index of the subtree root is passed back.
1769 */
1770int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1771                            struct ocfs2_path *left,
1772                            struct ocfs2_path *right)
1773{
1774        int i = 0;
1775
1776        /*
1777         * Check that the caller passed in two paths from the same tree.
1778         */
1779        BUG_ON(path_root_bh(left) != path_root_bh(right));
1780
1781        do {
1782                i++;
1783
1784                /*
1785                 * The caller didn't pass two adjacent paths.
1786                 */
1787                mlog_bug_on_msg(i > left->p_tree_depth,
1788                                "Owner %llu, left depth %u, right depth %u\n"
1789                                "left leaf blk %llu, right leaf blk %llu\n",
1790                                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1791                                left->p_tree_depth, right->p_tree_depth,
1792                                (unsigned long long)path_leaf_bh(left)->b_blocknr,
1793                                (unsigned long long)path_leaf_bh(right)->b_blocknr);
1794        } while (left->p_node[i].bh->b_blocknr ==
1795                 right->p_node[i].bh->b_blocknr);
1796
1797        return i - 1;
1798}
1799
1800typedef void (path_insert_t)(void *, struct buffer_head *);
1801
1802/*
1803 * Traverse a btree path in search of cpos, starting at root_el.
1804 *
1805 * This code can be called with a cpos larger than the tree, in which
1806 * case it will return the rightmost path.
1807 */
1808static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1809                             struct ocfs2_extent_list *root_el, u32 cpos,
1810                             path_insert_t *func, void *data)
1811{
1812        int i, ret = 0;
1813        u32 range;
1814        u64 blkno;
1815        struct buffer_head *bh = NULL;
1816        struct ocfs2_extent_block *eb;
1817        struct ocfs2_extent_list *el;
1818        struct ocfs2_extent_rec *rec;
1819
1820        el = root_el;
1821        while (el->l_tree_depth) {
1822                if (le16_to_cpu(el->l_next_free_rec) == 0) {
1823                        ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1824                                    "Owner %llu has empty extent list at depth %u\n",
1825                                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1826                                    le16_to_cpu(el->l_tree_depth));
1827                        ret = -EROFS;
1828                        goto out;
1829
1830                }
1831
1832                for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1833                        rec = &el->l_recs[i];
1834
1835                        /*
1836                         * In the case that cpos is off the allocation
1837                         * tree, this should just wind up returning the
1838                         * rightmost record.
1839                         */
1840                        range = le32_to_cpu(rec->e_cpos) +
1841                                ocfs2_rec_clusters(el, rec);
1842                        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1843                            break;
1844                }
1845
1846                blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1847                if (blkno == 0) {
1848                        ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1849                                    "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
1850                                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1851                                    le16_to_cpu(el->l_tree_depth), i);
1852                        ret = -EROFS;
1853                        goto out;
1854                }
1855
1856                brelse(bh);
1857                bh = NULL;
1858                ret = ocfs2_read_extent_block(ci, blkno, &bh);
1859                if (ret) {
1860                        mlog_errno(ret);
1861                        goto out;
1862                }
1863
1864                eb = (struct ocfs2_extent_block *) bh->b_data;
1865                el = &eb->h_list;
1866
1867                if (le16_to_cpu(el->l_next_free_rec) >
1868                    le16_to_cpu(el->l_count)) {
1869                        ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1870                                    "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
1871                                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1872                                    (unsigned long long)bh->b_blocknr,
1873                                    le16_to_cpu(el->l_next_free_rec),
1874                                    le16_to_cpu(el->l_count));
1875                        ret = -EROFS;
1876                        goto out;
1877                }
1878
1879                if (func)
1880                        func(data, bh);
1881        }
1882
1883out:
1884        /*
1885         * Catch any trailing bh that the loop didn't handle.
1886         */
1887        brelse(bh);
1888
1889        return ret;
1890}
1891
1892/*
1893 * Given an initialized path (that is, it has a valid root extent
1894 * list), this function will traverse the btree in search of the path
1895 * which would contain cpos.
1896 *
1897 * The path traveled is recorded in the path structure.
1898 *
1899 * Note that this will not do any comparisons on leaf node extent
1900 * records, so it will work fine in the case that we just added a tree
1901 * branch.
1902 */
1903struct find_path_data {
1904        int index;
1905        struct ocfs2_path *path;
1906};
1907static void find_path_ins(void *data, struct buffer_head *bh)
1908{
1909        struct find_path_data *fp = data;
1910
1911        get_bh(bh);
1912        ocfs2_path_insert_eb(fp->path, fp->index, bh);
1913        fp->index++;
1914}
1915int ocfs2_find_path(struct ocfs2_caching_info *ci,
1916                    struct ocfs2_path *path, u32 cpos)
1917{
1918        struct find_path_data data;
1919
1920        data.index = 1;
1921        data.path = path;
1922        return __ocfs2_find_path(ci, path_root_el(path), cpos,
1923                                 find_path_ins, &data);
1924}
1925
1926static void find_leaf_ins(void *data, struct buffer_head *bh)
1927{
1928        struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1929        struct ocfs2_extent_list *el = &eb->h_list;
1930        struct buffer_head **ret = data;
1931
1932        /* We want to retain only the leaf block. */
1933        if (le16_to_cpu(el->l_tree_depth) == 0) {
1934                get_bh(bh);
1935                *ret = bh;
1936        }
1937}
1938/*
1939 * Find the leaf block in the tree which would contain cpos. No
1940 * checking of the actual leaf is done.
1941 *
1942 * Some paths want to call this instead of allocating a path structure
1943 * and calling ocfs2_find_path().
1944 *
1945 * This function doesn't handle non btree extent lists.
1946 */
1947int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1948                    struct ocfs2_extent_list *root_el, u32 cpos,
1949                    struct buffer_head **leaf_bh)
1950{
1951        int ret;
1952        struct buffer_head *bh = NULL;
1953
1954        ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1955        if (ret) {
1956                mlog_errno(ret);
1957                goto out;
1958        }
1959
1960        *leaf_bh = bh;
1961out:
1962        return ret;
1963}
1964
1965/*
1966 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1967 *
1968 * Basically, we've moved stuff around at the bottom of the tree and
1969 * we need to fix up the extent records above the changes to reflect
1970 * the new changes.
1971 *
1972 * left_rec: the record on the left.
1973 * right_rec: the record to the right of left_rec
1974 * right_child_el: is the child list pointed to by right_rec
1975 *
1976 * By definition, this only works on interior nodes.
1977 */
1978static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1979                                  struct ocfs2_extent_rec *right_rec,
1980                                  struct ocfs2_extent_list *right_child_el)
1981{
1982        u32 left_clusters, right_end;
1983
1984        /*
1985         * Interior nodes never have holes. Their cpos is the cpos of
1986         * the leftmost record in their child list. Their cluster
1987         * count covers the full theoretical range of their child list
1988         * - the range between their cpos and the cpos of the record
1989         * immediately to their right.
1990         */
1991        left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1992        if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1993                BUG_ON(right_child_el->l_tree_depth);
1994                BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1995                left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1996        }
1997        left_clusters -= le32_to_cpu(left_rec->e_cpos);
1998        left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1999
2000        /*
2001         * Calculate the rightmost cluster count boundary before
2002         * moving cpos - we will need to adjust clusters after
2003         * updating e_cpos to keep the same highest cluster count.
2004         */
2005        right_end = le32_to_cpu(right_rec->e_cpos);
2006        right_end += le32_to_cpu(right_rec->e_int_clusters);
2007
2008        right_rec->e_cpos = left_rec->e_cpos;
2009        le32_add_cpu(&right_rec->e_cpos, left_clusters);
2010
2011        right_end -= le32_to_cpu(right_rec->e_cpos);
2012        right_rec->e_int_clusters = cpu_to_le32(right_end);
2013}
2014
2015/*
2016 * Adjust the adjacent root node records involved in a
2017 * rotation. left_el_blkno is passed in as a key so that we can easily
2018 * find it's index in the root list.
2019 */
2020static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
2021                                      struct ocfs2_extent_list *left_el,
2022                                      struct ocfs2_extent_list *right_el,
2023                                      u64 left_el_blkno)
2024{
2025        int i;
2026
2027        BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
2028               le16_to_cpu(left_el->l_tree_depth));
2029
2030        for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2031                if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2032                        break;
2033        }
2034
2035        /*
2036         * The path walking code should have never returned a root and
2037         * two paths which are not adjacent.
2038         */
2039        BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2040
2041        ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
2042                                      &root_el->l_recs[i + 1], right_el);
2043}
2044
2045/*
2046 * We've changed a leaf block (in right_path) and need to reflect that
2047 * change back up the subtree.
2048 *
2049 * This happens in multiple places:
2050 *   - When we've moved an extent record from the left path leaf to the right
2051 *     path leaf to make room for an empty extent in the left path leaf.
2052 *   - When our insert into the right path leaf is at the leftmost edge
2053 *     and requires an update of the path immediately to it's left. This
2054 *     can occur at the end of some types of rotation and appending inserts.
2055 *   - When we've adjusted the last extent record in the left path leaf and the
2056 *     1st extent record in the right path leaf during cross extent block merge.
2057 */
2058static void ocfs2_complete_edge_insert(handle_t *handle,
2059                                       struct ocfs2_path *left_path,
2060                                       struct ocfs2_path *right_path,
2061                                       int subtree_index)
2062{
2063        int i, idx;
2064        struct ocfs2_extent_list *el, *left_el, *right_el;
2065        struct ocfs2_extent_rec *left_rec, *right_rec;
2066        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2067
2068        /*
2069         * Update the counts and position values within all the
2070         * interior nodes to reflect the leaf rotation we just did.
2071         *
2072         * The root node is handled below the loop.
2073         *
2074         * We begin the loop with right_el and left_el pointing to the
2075         * leaf lists and work our way up.
2076         *
2077         * NOTE: within this loop, left_el and right_el always refer
2078         * to the *child* lists.
2079         */
2080        left_el = path_leaf_el(left_path);
2081        right_el = path_leaf_el(right_path);
2082        for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2083                trace_ocfs2_complete_edge_insert(i);
2084
2085                /*
2086                 * One nice property of knowing that all of these
2087                 * nodes are below the root is that we only deal with
2088                 * the leftmost right node record and the rightmost
2089                 * left node record.
2090                 */
2091                el = left_path->p_node[i].el;
2092                idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2093                left_rec = &el->l_recs[idx];
2094
2095                el = right_path->p_node[i].el;
2096                right_rec = &el->l_recs[0];
2097
2098                ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
2099
2100                ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2101                ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2102
2103                /*
2104                 * Setup our list pointers now so that the current
2105                 * parents become children in the next iteration.
2106                 */
2107                left_el = left_path->p_node[i].el;
2108                right_el = right_path->p_node[i].el;
2109        }
2110
2111        /*
2112         * At the root node, adjust the two adjacent records which
2113         * begin our path to the leaves.
2114         */
2115
2116        el = left_path->p_node[subtree_index].el;
2117        left_el = left_path->p_node[subtree_index + 1].el;
2118        right_el = right_path->p_node[subtree_index + 1].el;
2119
2120        ocfs2_adjust_root_records(el, left_el, right_el,
2121                                  left_path->p_node[subtree_index + 1].bh->b_blocknr);
2122
2123        root_bh = left_path->p_node[subtree_index].bh;
2124
2125        ocfs2_journal_dirty(handle, root_bh);
2126}
2127
2128static int ocfs2_rotate_subtree_right(handle_t *handle,
2129                                      struct ocfs2_extent_tree *et,
2130                                      struct ocfs2_path *left_path,
2131                                      struct ocfs2_path *right_path,
2132                                      int subtree_index)
2133{
2134        int ret, i;
2135        struct buffer_head *right_leaf_bh;
2136        struct buffer_head *left_leaf_bh = NULL;
2137        struct buffer_head *root_bh;
2138        struct ocfs2_extent_list *right_el, *left_el;
2139        struct ocfs2_extent_rec move_rec;
2140
2141        left_leaf_bh = path_leaf_bh(left_path);
2142        left_el = path_leaf_el(left_path);
2143
2144        if (left_el->l_next_free_rec != left_el->l_count) {
2145                ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2146                            "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
2147                            (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2148                            (unsigned long long)left_leaf_bh->b_blocknr,
2149                            le16_to_cpu(left_el->l_next_free_rec));
2150                return -EROFS;
2151        }
2152
2153        /*
2154         * This extent block may already have an empty record, so we
2155         * return early if so.
2156         */
2157        if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2158                return 0;
2159
2160        root_bh = left_path->p_node[subtree_index].bh;
2161        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2162
2163        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2164                                           subtree_index);
2165        if (ret) {
2166                mlog_errno(ret);
2167                goto out;
2168        }
2169
2170        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2171                ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2172                                                   right_path, i);
2173                if (ret) {
2174                        mlog_errno(ret);
2175                        goto out;
2176                }
2177
2178                ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2179                                                   left_path, i);
2180                if (ret) {
2181                        mlog_errno(ret);
2182                        goto out;
2183                }
2184        }
2185
2186        right_leaf_bh = path_leaf_bh(right_path);
2187        right_el = path_leaf_el(right_path);
2188
2189        /* This is a code error, not a disk corruption. */
2190        mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2191                        "because rightmost leaf block %llu is empty\n",
2192                        (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2193                        (unsigned long long)right_leaf_bh->b_blocknr);
2194
2195        ocfs2_create_empty_extent(right_el);
2196
2197        ocfs2_journal_dirty(handle, right_leaf_bh);
2198
2199        /* Do the copy now. */
2200        i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2201        move_rec = left_el->l_recs[i];
2202        right_el->l_recs[0] = move_rec;
2203
2204        /*
2205         * Clear out the record we just copied and shift everything
2206         * over, leaving an empty extent in the left leaf.
2207         *
2208         * We temporarily subtract from next_free_rec so that the
2209         * shift will lose the tail record (which is now defunct).
2210         */
2211        le16_add_cpu(&left_el->l_next_free_rec, -1);
2212        ocfs2_shift_records_right(left_el);
2213        memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2214        le16_add_cpu(&left_el->l_next_free_rec, 1);
2215
2216        ocfs2_journal_dirty(handle, left_leaf_bh);
2217
2218        ocfs2_complete_edge_insert(handle, left_path, right_path,
2219                                   subtree_index);
2220
2221out:
2222        return ret;
2223}
2224
2225/*
2226 * Given a full path, determine what cpos value would return us a path
2227 * containing the leaf immediately to the left of the current one.
2228 *
2229 * Will return zero if the path passed in is already the leftmost path.
2230 */
2231int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2232                                  struct ocfs2_path *path, u32 *cpos)
2233{
2234        int i, j, ret = 0;
2235        u64 blkno;
2236        struct ocfs2_extent_list *el;
2237
2238        BUG_ON(path->p_tree_depth == 0);
2239
2240        *cpos = 0;
2241
2242        blkno = path_leaf_bh(path)->b_blocknr;
2243
2244        /* Start at the tree node just above the leaf and work our way up. */
2245        i = path->p_tree_depth - 1;
2246        while (i >= 0) {
2247                el = path->p_node[i].el;
2248
2249                /*
2250                 * Find the extent record just before the one in our
2251                 * path.
2252                 */
2253                for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2254                        if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2255                                if (j == 0) {
2256                                        if (i == 0) {
2257                                                /*
2258                                                 * We've determined that the
2259                                                 * path specified is already
2260                                                 * the leftmost one - return a
2261                                                 * cpos of zero.
2262                                                 */
2263                                                goto out;
2264                                        }
2265                                        /*
2266                                         * The leftmost record points to our
2267                                         * leaf - we need to travel up the
2268                                         * tree one level.
2269                                         */
2270                                        goto next_node;
2271                                }
2272
2273                                *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2274                                *cpos = *cpos + ocfs2_rec_clusters(el,
2275                                                           &el->l_recs[j - 1]);
2276                                *cpos = *cpos - 1;
2277                                goto out;
2278                        }
2279                }
2280
2281                /*
2282                 * If we got here, we never found a valid node where
2283                 * the tree indicated one should be.
2284                 */
2285                ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2286                            (unsigned long long)blkno);
2287                ret = -EROFS;
2288                goto out;
2289
2290next_node:
2291                blkno = path->p_node[i].bh->b_blocknr;
2292                i--;
2293        }
2294
2295out:
2296        return ret;
2297}
2298
2299/*
2300 * Extend the transaction by enough credits to complete the rotation,
2301 * and still leave at least the original number of credits allocated
2302 * to this transaction.
2303 */
2304static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2305                                           int op_credits,
2306                                           struct ocfs2_path *path)
2307{
2308        int ret = 0;
2309        int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2310
2311        if (handle->h_buffer_credits < credits)
2312                ret = ocfs2_extend_trans(handle,
2313                                         credits - handle->h_buffer_credits);
2314
2315        return ret;
2316}
2317
2318/*
2319 * Trap the case where we're inserting into the theoretical range past
2320 * the _actual_ left leaf range. Otherwise, we'll rotate a record
2321 * whose cpos is less than ours into the right leaf.
2322 *
2323 * It's only necessary to look at the rightmost record of the left
2324 * leaf because the logic that calls us should ensure that the
2325 * theoretical ranges in the path components above the leaves are
2326 * correct.
2327 */
2328static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2329                                                 u32 insert_cpos)
2330{
2331        struct ocfs2_extent_list *left_el;
2332        struct ocfs2_extent_rec *rec;
2333        int next_free;
2334
2335        left_el = path_leaf_el(left_path);
2336        next_free = le16_to_cpu(left_el->l_next_free_rec);
2337        rec = &left_el->l_recs[next_free - 1];
2338
2339        if (insert_cpos > le32_to_cpu(rec->e_cpos))
2340                return 1;
2341        return 0;
2342}
2343
2344static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2345{
2346        int next_free = le16_to_cpu(el->l_next_free_rec);
2347        unsigned int range;
2348        struct ocfs2_extent_rec *rec;
2349
2350        if (next_free == 0)
2351                return 0;
2352
2353        rec = &el->l_recs[0];
2354        if (ocfs2_is_empty_extent(rec)) {
2355                /* Empty list. */
2356                if (next_free == 1)
2357                        return 0;
2358                rec = &el->l_recs[1];
2359        }
2360
2361        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2362        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2363                return 1;
2364        return 0;
2365}
2366
2367/*
2368 * Rotate all the records in a btree right one record, starting at insert_cpos.
2369 *
2370 * The path to the rightmost leaf should be passed in.
2371 *
2372 * The array is assumed to be large enough to hold an entire path (tree depth).
2373 *
2374 * Upon successful return from this function:
2375 *
2376 * - The 'right_path' array will contain a path to the leaf block
2377 *   whose range contains e_cpos.
2378 * - That leaf block will have a single empty extent in list index 0.
2379 * - In the case that the rotation requires a post-insert update,
2380 *   *ret_left_path will contain a valid path which can be passed to
2381 *   ocfs2_insert_path().
2382 */
2383static int ocfs2_rotate_tree_right(handle_t *handle,
2384                                   struct ocfs2_extent_tree *et,
2385                                   enum ocfs2_split_type split,
2386                                   u32 insert_cpos,
2387                                   struct ocfs2_path *right_path,
2388                                   struct ocfs2_path **ret_left_path)
2389{
2390        int ret, start, orig_credits = handle->h_buffer_credits;
2391        u32 cpos;
2392        struct ocfs2_path *left_path = NULL;
2393        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2394
2395        *ret_left_path = NULL;
2396
2397        left_path = ocfs2_new_path_from_path(right_path);
2398        if (!left_path) {
2399                ret = -ENOMEM;
2400                mlog_errno(ret);
2401                goto out;
2402        }
2403
2404        ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2405        if (ret) {
2406                mlog_errno(ret);
2407                goto out;
2408        }
2409
2410        trace_ocfs2_rotate_tree_right(
2411                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2412                insert_cpos, cpos);
2413
2414        /*
2415         * What we want to do here is:
2416         *
2417         * 1) Start with the rightmost path.
2418         *
2419         * 2) Determine a path to the leaf block directly to the left
2420         *    of that leaf.
2421         *
2422         * 3) Determine the 'subtree root' - the lowest level tree node
2423         *    which contains a path to both leaves.
2424         *
2425         * 4) Rotate the subtree.
2426         *
2427         * 5) Find the next subtree by considering the left path to be
2428         *    the new right path.
2429         *
2430         * The check at the top of this while loop also accepts
2431         * insert_cpos == cpos because cpos is only a _theoretical_
2432         * value to get us the left path - insert_cpos might very well
2433         * be filling that hole.
2434         *
2435         * Stop at a cpos of '0' because we either started at the
2436         * leftmost branch (i.e., a tree with one branch and a
2437         * rotation inside of it), or we've gone as far as we can in
2438         * rotating subtrees.
2439         */
2440        while (cpos && insert_cpos <= cpos) {
2441                trace_ocfs2_rotate_tree_right(
2442                        (unsigned long long)
2443                        ocfs2_metadata_cache_owner(et->et_ci),
2444                        insert_cpos, cpos);
2445
2446                ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2447                if (ret) {
2448                        mlog_errno(ret);
2449                        goto out;
2450                }
2451
2452                mlog_bug_on_msg(path_leaf_bh(left_path) ==
2453                                path_leaf_bh(right_path),
2454                                "Owner %llu: error during insert of %u "
2455                                "(left path cpos %u) results in two identical "
2456                                "paths ending at %llu\n",
2457                                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2458                                insert_cpos, cpos,
2459                                (unsigned long long)
2460                                path_leaf_bh(left_path)->b_blocknr);
2461
2462                if (split == SPLIT_NONE &&
2463                    ocfs2_rotate_requires_path_adjustment(left_path,
2464                                                          insert_cpos)) {
2465
2466                        /*
2467                         * We've rotated the tree as much as we
2468                         * should. The rest is up to
2469                         * ocfs2_insert_path() to complete, after the
2470                         * record insertion. We indicate this
2471                         * situation by returning the left path.
2472                         *
2473                         * The reason we don't adjust the records here
2474                         * before the record insert is that an error
2475                         * later might break the rule where a parent
2476                         * record e_cpos will reflect the actual
2477                         * e_cpos of the 1st nonempty record of the
2478                         * child list.
2479                         */
2480                        *ret_left_path = left_path;
2481                        goto out_ret_path;
2482                }
2483
2484                start = ocfs2_find_subtree_root(et, left_path, right_path);
2485
2486                trace_ocfs2_rotate_subtree(start,
2487                        (unsigned long long)
2488                        right_path->p_node[start].bh->b_blocknr,
2489                        right_path->p_tree_depth);
2490
2491                ret = ocfs2_extend_rotate_transaction(handle, start,
2492                                                      orig_credits, right_path);
2493                if (ret) {
2494                        mlog_errno(ret);
2495                        goto out;
2496                }
2497
2498                ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2499                                                 right_path, start);
2500                if (ret) {
2501                        mlog_errno(ret);
2502                        goto out;
2503                }
2504
2505                if (split != SPLIT_NONE &&
2506                    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2507                                                insert_cpos)) {
2508                        /*
2509                         * A rotate moves the rightmost left leaf
2510                         * record over to the leftmost right leaf
2511                         * slot. If we're doing an extent split
2512                         * instead of a real insert, then we have to
2513                         * check that the extent to be split wasn't
2514                         * just moved over. If it was, then we can
2515                         * exit here, passing left_path back -
2516                         * ocfs2_split_extent() is smart enough to
2517                         * search both leaves.
2518                         */
2519                        *ret_left_path = left_path;
2520                        goto out_ret_path;
2521                }
2522
2523                /*
2524                 * There is no need to re-read the next right path
2525                 * as we know that it'll be our current left
2526                 * path. Optimize by copying values instead.
2527                 */
2528                ocfs2_mv_path(right_path, left_path);
2529
2530                ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2531                if (ret) {
2532                        mlog_errno(ret);
2533                        goto out;
2534                }
2535        }
2536
2537out:
2538        ocfs2_free_path(left_path);
2539
2540out_ret_path:
2541        return ret;
2542}
2543
2544static int ocfs2_update_edge_lengths(handle_t *handle,
2545                                     struct ocfs2_extent_tree *et,
2546                                     struct ocfs2_path *path)
2547{
2548        int i, idx, ret;
2549        struct ocfs2_extent_rec *rec;
2550        struct ocfs2_extent_list *el;
2551        struct ocfs2_extent_block *eb;
2552        u32 range;
2553
2554        ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2555        if (ret) {
2556                mlog_errno(ret);
2557                goto out;
2558        }
2559
2560        /* Path should always be rightmost. */
2561        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2562        BUG_ON(eb->h_next_leaf_blk != 0ULL);
2563
2564        el = &eb->h_list;
2565        BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2566        idx = le16_to_cpu(el->l_next_free_rec) - 1;
2567        rec = &el->l_recs[idx];
2568        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2569
2570        for (i = 0; i < path->p_tree_depth; i++) {
2571                el = path->p_node[i].el;
2572                idx = le16_to_cpu(el->l_next_free_rec) - 1;
2573                rec = &el->l_recs[idx];
2574
2575                rec->e_int_clusters = cpu_to_le32(range);
2576                le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2577
2578                ocfs2_journal_dirty(handle, path->p_node[i].bh);
2579        }
2580out:
2581        return ret;
2582}
2583
2584static void ocfs2_unlink_path(handle_t *handle,
2585                              struct ocfs2_extent_tree *et,
2586                              struct ocfs2_cached_dealloc_ctxt *dealloc,
2587                              struct ocfs2_path *path, int unlink_start)
2588{
2589        int ret, i;
2590        struct ocfs2_extent_block *eb;
2591        struct ocfs2_extent_list *el;
2592        struct buffer_head *bh;
2593
2594        for(i = unlink_start; i < path_num_items(path); i++) {
2595                bh = path->p_node[i].bh;
2596
2597                eb = (struct ocfs2_extent_block *)bh->b_data;
2598                /*
2599                 * Not all nodes might have had their final count
2600                 * decremented by the caller - handle this here.
2601                 */
2602                el = &eb->h_list;
2603                if (le16_to_cpu(el->l_next_free_rec) > 1) {
2604                        mlog(ML_ERROR,
2605                             "Inode %llu, attempted to remove extent block "
2606                             "%llu with %u records\n",
2607                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2608                             (unsigned long long)le64_to_cpu(eb->h_blkno),
2609                             le16_to_cpu(el->l_next_free_rec));
2610
2611                        ocfs2_journal_dirty(handle, bh);
2612                        ocfs2_remove_from_cache(et->et_ci, bh);
2613                        continue;
2614                }
2615
2616                el->l_next_free_rec = 0;
2617                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2618
2619                ocfs2_journal_dirty(handle, bh);
2620
2621                ret = ocfs2_cache_extent_block_free(dealloc, eb);
2622                if (ret)
2623                        mlog_errno(ret);
2624
2625                ocfs2_remove_from_cache(et->et_ci, bh);
2626        }
2627}
2628
2629static void ocfs2_unlink_subtree(handle_t *handle,
2630                                 struct ocfs2_extent_tree *et,
2631                                 struct ocfs2_path *left_path,
2632                                 struct ocfs2_path *right_path,
2633                                 int subtree_index,
2634                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2635{
2636        int i;
2637        struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2638        struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2639        struct ocfs2_extent_block *eb;
2640
2641        eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2642
2643        for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2644                if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2645                        break;
2646
2647        BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2648
2649        memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2650        le16_add_cpu(&root_el->l_next_free_rec, -1);
2651
2652        eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2653        eb->h_next_leaf_blk = 0;
2654
2655        ocfs2_journal_dirty(handle, root_bh);
2656        ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2657
2658        ocfs2_unlink_path(handle, et, dealloc, right_path,
2659                          subtree_index + 1);
2660}
2661
2662static int ocfs2_rotate_subtree_left(handle_t *handle,
2663                                     struct ocfs2_extent_tree *et,
2664                                     struct ocfs2_path *left_path,
2665                                     struct ocfs2_path *right_path,
2666                                     int subtree_index,
2667                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2668                                     int *deleted)
2669{
2670        int ret, i, del_right_subtree = 0, right_has_empty = 0;
2671        struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2672        struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2673        struct ocfs2_extent_block *eb;
2674
2675        *deleted = 0;
2676
2677        right_leaf_el = path_leaf_el(right_path);
2678        left_leaf_el = path_leaf_el(left_path);
2679        root_bh = left_path->p_node[subtree_index].bh;
2680        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2681
2682        if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2683                return 0;
2684
2685        eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2686        if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2687                /*
2688                 * It's legal for us to proceed if the right leaf is
2689                 * the rightmost one and it has an empty extent. There
2690                 * are two cases to handle - whether the leaf will be
2691                 * empty after removal or not. If the leaf isn't empty
2692                 * then just remove the empty extent up front. The
2693                 * next block will handle empty leaves by flagging
2694                 * them for unlink.
2695                 *
2696                 * Non rightmost leaves will throw -EAGAIN and the
2697                 * caller can manually move the subtree and retry.
2698                 */
2699
2700                if (eb->h_next_leaf_blk != 0ULL)
2701                        return -EAGAIN;
2702
2703                if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2704                        ret = ocfs2_journal_access_eb(handle, et->et_ci,
2705                                                      path_leaf_bh(right_path),
2706                                                      OCFS2_JOURNAL_ACCESS_WRITE);
2707                        if (ret) {
2708                                mlog_errno(ret);
2709                                goto out;
2710                        }
2711
2712                        ocfs2_remove_empty_extent(right_leaf_el);
2713                } else
2714                        right_has_empty = 1;
2715        }
2716
2717        if (eb->h_next_leaf_blk == 0ULL &&
2718            le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2719                /*
2720                 * We have to update i_last_eb_blk during the meta
2721                 * data delete.
2722                 */
2723                ret = ocfs2_et_root_journal_access(handle, et,
2724                                                   OCFS2_JOURNAL_ACCESS_WRITE);
2725                if (ret) {
2726                        mlog_errno(ret);
2727                        goto out;
2728                }
2729
2730                del_right_subtree = 1;
2731        }
2732
2733        /*
2734         * Getting here with an empty extent in the right path implies
2735         * that it's the rightmost path and will be deleted.
2736         */
2737        BUG_ON(right_has_empty && !del_right_subtree);
2738
2739        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2740                                           subtree_index);
2741        if (ret) {
2742                mlog_errno(ret);
2743                goto out;
2744        }
2745
2746        for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2747                ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2748                                                   right_path, i);
2749                if (ret) {
2750                        mlog_errno(ret);
2751                        goto out;
2752                }
2753
2754                ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2755                                                   left_path, i);
2756                if (ret) {
2757                        mlog_errno(ret);
2758                        goto out;
2759                }
2760        }
2761
2762        if (!right_has_empty) {
2763                /*
2764                 * Only do this if we're moving a real
2765                 * record. Otherwise, the action is delayed until
2766                 * after removal of the right path in which case we
2767                 * can do a simple shift to remove the empty extent.
2768                 */
2769                ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2770                memset(&right_leaf_el->l_recs[0], 0,
2771                       sizeof(struct ocfs2_extent_rec));
2772        }
2773        if (eb->h_next_leaf_blk == 0ULL) {
2774                /*
2775                 * Move recs over to get rid of empty extent, decrease
2776                 * next_free. This is allowed to remove the last
2777                 * extent in our leaf (setting l_next_free_rec to
2778                 * zero) - the delete code below won't care.
2779                 */
2780                ocfs2_remove_empty_extent(right_leaf_el);
2781        }
2782
2783        ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2784        ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2785
2786        if (del_right_subtree) {
2787                ocfs2_unlink_subtree(handle, et, left_path, right_path,
2788                                     subtree_index, dealloc);
2789                ret = ocfs2_update_edge_lengths(handle, et, left_path);
2790                if (ret) {
2791                        mlog_errno(ret);
2792                        goto out;
2793                }
2794
2795                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2796                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2797
2798                /*
2799                 * Removal of the extent in the left leaf was skipped
2800                 * above so we could delete the right path
2801                 * 1st.
2802                 */
2803                if (right_has_empty)
2804                        ocfs2_remove_empty_extent(left_leaf_el);
2805
2806                ocfs2_journal_dirty(handle, et_root_bh);
2807
2808                *deleted = 1;
2809        } else
2810                ocfs2_complete_edge_insert(handle, left_path, right_path,
2811                                           subtree_index);
2812
2813out:
2814        return ret;
2815}
2816
2817/*
2818 * Given a full path, determine what cpos value would return us a path
2819 * containing the leaf immediately to the right of the current one.
2820 *
2821 * Will return zero if the path passed in is already the rightmost path.
2822 *
2823 * This looks similar, but is subtly different to
2824 * ocfs2_find_cpos_for_left_leaf().
2825 */
2826int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2827                                   struct ocfs2_path *path, u32 *cpos)
2828{
2829        int i, j, ret = 0;
2830        u64 blkno;
2831        struct ocfs2_extent_list *el;
2832
2833        *cpos = 0;
2834
2835        if (path->p_tree_depth == 0)
2836                return 0;
2837
2838        blkno = path_leaf_bh(path)->b_blocknr;
2839
2840        /* Start at the tree node just above the leaf and work our way up. */
2841        i = path->p_tree_depth - 1;
2842        while (i >= 0) {
2843                int next_free;
2844
2845                el = path->p_node[i].el;
2846
2847                /*
2848                 * Find the extent record just after the one in our
2849                 * path.
2850                 */
2851                next_free = le16_to_cpu(el->l_next_free_rec);
2852                for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2853                        if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2854                                if (j == (next_free - 1)) {
2855                                        if (i == 0) {
2856                                                /*
2857                                                 * We've determined that the
2858                                                 * path specified is already
2859                                                 * the rightmost one - return a
2860                                                 * cpos of zero.
2861                                                 */
2862                                                goto out;
2863                                        }
2864                                        /*
2865                                         * The rightmost record points to our
2866                                         * leaf - we need to travel up the
2867                                         * tree one level.
2868                                         */
2869                                        goto next_node;
2870                                }
2871
2872                                *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2873                                goto out;
2874                        }
2875                }
2876
2877                /*
2878                 * If we got here, we never found a valid node where
2879                 * the tree indicated one should be.
2880                 */
2881                ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
2882                            (unsigned long long)blkno);
2883                ret = -EROFS;
2884                goto out;
2885
2886next_node:
2887                blkno = path->p_node[i].bh->b_blocknr;
2888                i--;
2889        }
2890
2891out:
2892        return ret;
2893}
2894
2895static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2896                                            struct ocfs2_extent_tree *et,
2897                                            struct ocfs2_path *path)
2898{
2899        int ret;
2900        struct buffer_head *bh = path_leaf_bh(path);
2901        struct ocfs2_extent_list *el = path_leaf_el(path);
2902
2903        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2904                return 0;
2905
2906        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2907                                           path_num_items(path) - 1);
2908        if (ret) {
2909                mlog_errno(ret);
2910                goto out;
2911        }
2912
2913        ocfs2_remove_empty_extent(el);
2914        ocfs2_journal_dirty(handle, bh);
2915
2916out:
2917        return ret;
2918}
2919
2920static int __ocfs2_rotate_tree_left(handle_t *handle,
2921                                    struct ocfs2_extent_tree *et,
2922                                    int orig_credits,
2923                                    struct ocfs2_path *path,
2924                                    struct ocfs2_cached_dealloc_ctxt *dealloc,
2925                                    struct ocfs2_path **empty_extent_path)
2926{
2927        int ret, subtree_root, deleted;
2928        u32 right_cpos;
2929        struct ocfs2_path *left_path = NULL;
2930        struct ocfs2_path *right_path = NULL;
2931        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2932
2933        if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])))
2934                return 0;
2935
2936        *empty_extent_path = NULL;
2937
2938        ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2939        if (ret) {
2940                mlog_errno(ret);
2941                goto out;
2942        }
2943
2944        left_path = ocfs2_new_path_from_path(path);
2945        if (!left_path) {
2946                ret = -ENOMEM;
2947                mlog_errno(ret);
2948                goto out;
2949        }
2950
2951        ocfs2_cp_path(left_path, path);
2952
2953        right_path = ocfs2_new_path_from_path(path);
2954        if (!right_path) {
2955                ret = -ENOMEM;
2956                mlog_errno(ret);
2957                goto out;
2958        }
2959
2960        while (right_cpos) {
2961                ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2962                if (ret) {
2963                        mlog_errno(ret);
2964                        goto out;
2965                }
2966
2967                subtree_root = ocfs2_find_subtree_root(et, left_path,
2968                                                       right_path);
2969
2970                trace_ocfs2_rotate_subtree(subtree_root,
2971                     (unsigned long long)
2972                     right_path->p_node[subtree_root].bh->b_blocknr,
2973                     right_path->p_tree_depth);
2974
2975                ret = ocfs2_extend_rotate_transaction(handle, 0,
2976                                                      orig_credits, left_path);
2977                if (ret) {
2978                        mlog_errno(ret);
2979                        goto out;
2980                }
2981
2982                /*
2983                 * Caller might still want to make changes to the
2984                 * tree root, so re-add it to the journal here.
2985                 */
2986                ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2987                                                   left_path, 0);
2988                if (ret) {
2989                        mlog_errno(ret);
2990                        goto out;
2991                }
2992
2993                ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2994                                                right_path, subtree_root,
2995                                                dealloc, &deleted);
2996                if (ret == -EAGAIN) {
2997                        /*
2998                         * The rotation has to temporarily stop due to
2999                         * the right subtree having an empty
3000                         * extent. Pass it back to the caller for a
3001                         * fixup.
3002                         */
3003                        *empty_extent_path = right_path;
3004                        right_path = NULL;
3005                        goto out;
3006                }
3007                if (ret) {
3008                        mlog_errno(ret);
3009                        goto out;
3010                }
3011
3012                /*
3013                 * The subtree rotate might have removed records on
3014                 * the rightmost edge. If so, then rotation is
3015                 * complete.
3016                 */
3017                if (deleted)
3018                        break;
3019
3020                ocfs2_mv_path(left_path, right_path);
3021
3022                ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3023                                                     &right_cpos);
3024                if (ret) {
3025                        mlog_errno(ret);
3026                        goto out;
3027                }
3028        }
3029
3030out:
3031        ocfs2_free_path(right_path);
3032        ocfs2_free_path(left_path);
3033
3034        return ret;
3035}
3036
3037static int ocfs2_remove_rightmost_path(handle_t *handle,
3038                                struct ocfs2_extent_tree *et,
3039                                struct ocfs2_path *path,
3040                                struct ocfs2_cached_dealloc_ctxt *dealloc)
3041{
3042        int ret, subtree_index;
3043        u32 cpos;
3044        struct ocfs2_path *left_path = NULL;
3045        struct ocfs2_extent_block *eb;
3046        struct ocfs2_extent_list *el;
3047
3048        ret = ocfs2_et_sanity_check(et);
3049        if (ret)
3050                goto out;
3051
3052        ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3053        if (ret) {
3054                mlog_errno(ret);
3055                goto out;
3056        }
3057
3058        ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3059                                            path, &cpos);
3060        if (ret) {
3061                mlog_errno(ret);
3062                goto out;
3063        }
3064
3065        if (cpos) {
3066                /*
3067                 * We have a path to the left of this one - it needs
3068                 * an update too.
3069                 */
3070                left_path = ocfs2_new_path_from_path(path);
3071                if (!left_path) {
3072                        ret = -ENOMEM;
3073                        mlog_errno(ret);
3074                        goto out;
3075                }
3076
3077                ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3078                if (ret) {
3079                        mlog_errno(ret);
3080                        goto out;
3081                }
3082
3083                ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3084                if (ret) {
3085                        mlog_errno(ret);
3086                        goto out;
3087                }
3088
3089                subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3090
3091                ocfs2_unlink_subtree(handle, et, left_path, path,
3092                                     subtree_index, dealloc);
3093                ret = ocfs2_update_edge_lengths(handle, et, left_path);
3094                if (ret) {
3095                        mlog_errno(ret);
3096                        goto out;
3097                }
3098
3099                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3100                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3101        } else {
3102                /*
3103                 * 'path' is also the leftmost path which
3104                 * means it must be the only one. This gets
3105                 * handled differently because we want to
3106                 * revert the root back to having extents
3107                 * in-line.
3108                 */
3109                ocfs2_unlink_path(handle, et, dealloc, path, 1);
3110
3111                el = et->et_root_el;
3112                el->l_tree_depth = 0;
3113                el->l_next_free_rec = 0;
3114                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3115
3116                ocfs2_et_set_last_eb_blk(et, 0);
3117        }
3118
3119        ocfs2_journal_dirty(handle, path_root_bh(path));
3120
3121out:
3122        ocfs2_free_path(left_path);
3123        return ret;
3124}
3125
3126static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
3127                                struct ocfs2_extent_tree *et,
3128                                struct ocfs2_path *path,
3129                                struct ocfs2_cached_dealloc_ctxt *dealloc)
3130{
3131        handle_t *handle;
3132        int ret;
3133        int credits = path->p_tree_depth * 2 + 1;
3134
3135        handle = ocfs2_start_trans(osb, credits);
3136        if (IS_ERR(handle)) {
3137                ret = PTR_ERR(handle);
3138                mlog_errno(ret);
3139                return ret;
3140        }
3141
3142        ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
3143        if (ret)
3144                mlog_errno(ret);
3145
3146        ocfs2_commit_trans(osb, handle);
3147        return ret;
3148}
3149
3150/*
3151 * Left rotation of btree records.
3152 *
3153 * In many ways, this is (unsurprisingly) the opposite of right
3154 * rotation. We start at some non-rightmost path containing an empty
3155 * extent in the leaf block. The code works its way to the rightmost
3156 * path by rotating records to the left in every subtree.
3157 *
3158 * This is used by any code which reduces the number of extent records
3159 * in a leaf. After removal, an empty record should be placed in the
3160 * leftmost list position.
3161 *
3162 * This won't handle a length update of the rightmost path records if
3163 * the rightmost tree leaf record is removed so the caller is
3164 * responsible for detecting and correcting that.
3165 */
3166static int ocfs2_rotate_tree_left(handle_t *handle,
3167                                  struct ocfs2_extent_tree *et,
3168                                  struct ocfs2_path *path,
3169                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
3170{
3171        int ret, orig_credits = handle->h_buffer_credits;
3172        struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3173        struct ocfs2_extent_block *eb;
3174        struct ocfs2_extent_list *el;
3175
3176        el = path_leaf_el(path);
3177        if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3178                return 0;
3179
3180        if (path->p_tree_depth == 0) {
3181rightmost_no_delete:
3182                /*
3183                 * Inline extents. This is trivially handled, so do
3184                 * it up front.
3185                 */
3186                ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3187                if (ret)
3188                        mlog_errno(ret);
3189                goto out;
3190        }
3191
3192        /*
3193         * Handle rightmost branch now. There's several cases:
3194         *  1) simple rotation leaving records in there. That's trivial.
3195         *  2) rotation requiring a branch delete - there's no more
3196         *     records left. Two cases of this:
3197         *     a) There are branches to the left.
3198         *     b) This is also the leftmost (the only) branch.
3199         *
3200         *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3201         *  2a) we need the left branch so that we can update it with the unlink
3202         *  2b) we need to bring the root back to inline extents.
3203         */
3204
3205        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3206        el = &eb->h_list;
3207        if (eb->h_next_leaf_blk == 0) {
3208                /*
3209                 * This gets a bit tricky if we're going to delete the
3210                 * rightmost path. Get the other cases out of the way
3211                 * 1st.
3212                 */
3213                if (le16_to_cpu(el->l_next_free_rec) > 1)
3214                        goto rightmost_no_delete;
3215
3216                if (le16_to_cpu(el->l_next_free_rec) == 0) {
3217                        ret = -EIO;
3218                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3219                                    "Owner %llu has empty extent block at %llu\n",
3220                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3221                                    (unsigned long long)le64_to_cpu(eb->h_blkno));
3222                        goto out;
3223                }
3224
3225                /*
3226                 * XXX: The caller can not trust "path" any more after
3227                 * this as it will have been deleted. What do we do?
3228                 *
3229                 * In theory the rotate-for-merge code will never get
3230                 * here because it'll always ask for a rotate in a
3231                 * nonempty list.
3232                 */
3233
3234                ret = ocfs2_remove_rightmost_path(handle, et, path,
3235                                                  dealloc);
3236                if (ret)
3237                        mlog_errno(ret);
3238                goto out;
3239        }
3240
3241        /*
3242         * Now we can loop, remembering the path we get from -EAGAIN
3243         * and restarting from there.
3244         */
3245try_rotate:
3246        ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3247                                       dealloc, &restart_path);
3248        if (ret && ret != -EAGAIN) {
3249                mlog_errno(ret);
3250                goto out;
3251        }
3252
3253        while (ret == -EAGAIN) {
3254                tmp_path = restart_path;
3255                restart_path = NULL;
3256
3257                ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3258                                               tmp_path, dealloc,
3259                                               &restart_path);
3260                if (ret && ret != -EAGAIN) {
3261                        mlog_errno(ret);
3262                        goto out;
3263                }
3264
3265                ocfs2_free_path(tmp_path);
3266                tmp_path = NULL;
3267
3268                if (ret == 0)
3269                        goto try_rotate;
3270        }
3271
3272out:
3273        ocfs2_free_path(tmp_path);
3274        ocfs2_free_path(restart_path);
3275        return ret;
3276}
3277
3278static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3279                                int index)
3280{
3281        struct ocfs2_extent_rec *rec = &el->l_recs[index];
3282        unsigned int size;
3283
3284        if (rec->e_leaf_clusters == 0) {
3285                /*
3286                 * We consumed all of the merged-from record. An empty
3287                 * extent cannot exist anywhere but the 1st array
3288                 * position, so move things over if the merged-from
3289                 * record doesn't occupy that position.
3290                 *
3291                 * This creates a new empty extent so the caller
3292                 * should be smart enough to have removed any existing
3293                 * ones.
3294                 */
3295                if (index > 0) {
3296                        BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3297                        size = index * sizeof(struct ocfs2_extent_rec);
3298                        memmove(&el->l_recs[1], &el->l_recs[0], size);
3299                }
3300
3301                /*
3302                 * Always memset - the caller doesn't check whether it
3303                 * created an empty extent, so there could be junk in
3304                 * the other fields.
3305                 */
3306                memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3307        }
3308}
3309
3310static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3311                                struct ocfs2_path *left_path,
3312                                struct ocfs2_path **ret_right_path)
3313{
3314        int ret;
3315        u32 right_cpos;
3316        struct ocfs2_path *right_path = NULL;
3317        struct ocfs2_extent_list *left_el;
3318
3319        *ret_right_path = NULL;
3320
3321        /* This function shouldn't be called for non-trees. */
3322        BUG_ON(left_path->p_tree_depth == 0);
3323
3324        left_el = path_leaf_el(left_path);
3325        BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3326
3327        ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3328                                             left_path, &right_cpos);
3329        if (ret) {
3330                mlog_errno(ret);
3331                goto out;
3332        }
3333
3334        /* This function shouldn't be called for the rightmost leaf. */
3335        BUG_ON(right_cpos == 0);
3336
3337        right_path = ocfs2_new_path_from_path(left_path);
3338        if (!right_path) {
3339                ret = -ENOMEM;
3340                mlog_errno(ret);
3341                goto out;
3342        }
3343
3344        ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3345        if (ret) {
3346                mlog_errno(ret);
3347                goto out;
3348        }
3349
3350        *ret_right_path = right_path;
3351out:
3352        if (ret)
3353                ocfs2_free_path(right_path);
3354        return ret;
3355}
3356
3357/*
3358 * Remove split_rec clusters from the record at index and merge them
3359 * onto the beginning of the record "next" to it.
3360 * For index < l_count - 1, the next means the extent rec at index + 1.
3361 * For index == l_count - 1, the "next" means the 1st extent rec of the
3362 * next extent block.
3363 */
3364static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3365                                 handle_t *handle,
3366                                 struct ocfs2_extent_tree *et,
3367                                 struct ocfs2_extent_rec *split_rec,
3368                                 int index)
3369{
3370        int ret, next_free, i;
3371        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3372        struct ocfs2_extent_rec *left_rec;
3373        struct ocfs2_extent_rec *right_rec;
3374        struct ocfs2_extent_list *right_el;
3375        struct ocfs2_path *right_path = NULL;
3376        int subtree_index = 0;
3377        struct ocfs2_extent_list *el = path_leaf_el(left_path);
3378        struct buffer_head *bh = path_leaf_bh(left_path);
3379        struct buffer_head *root_bh = NULL;
3380
3381        BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3382        left_rec = &el->l_recs[index];
3383
3384        if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3385            le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3386                /* we meet with a cross extent block merge. */
3387                ret = ocfs2_get_right_path(et, left_path, &right_path);
3388                if (ret) {
3389                        mlog_errno(ret);
3390                        return ret;
3391                }
3392
3393                right_el = path_leaf_el(right_path);
3394                next_free = le16_to_cpu(right_el->l_next_free_rec);
3395                BUG_ON(next_free <= 0);
3396                right_rec = &right_el->l_recs[0];
3397                if (ocfs2_is_empty_extent(right_rec)) {
3398                        BUG_ON(next_free <= 1);
3399                        right_rec = &right_el->l_recs[1];
3400                }
3401
3402                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3403                       le16_to_cpu(left_rec->e_leaf_clusters) !=
3404                       le32_to_cpu(right_rec->e_cpos));
3405
3406                subtree_index = ocfs2_find_subtree_root(et, left_path,
3407                                                        right_path);
3408
3409                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3410                                                      handle->h_buffer_credits,
3411                                                      right_path);
3412                if (ret) {
3413                        mlog_errno(ret);
3414                        goto out;
3415                }
3416
3417                root_bh = left_path->p_node[subtree_index].bh;
3418                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3419
3420                ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3421                                                   subtree_index);
3422                if (ret) {
3423                        mlog_errno(ret);
3424                        goto out;
3425                }
3426
3427                for (i = subtree_index + 1;
3428                     i < path_num_items(right_path); i++) {
3429                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3430                                                           right_path, i);
3431                        if (ret) {
3432                                mlog_errno(ret);
3433                                goto out;
3434                        }
3435
3436                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3437                                                           left_path, i);
3438                        if (ret) {
3439                                mlog_errno(ret);
3440                                goto out;
3441                        }
3442                }
3443
3444        } else {
3445                BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3446                right_rec = &el->l_recs[index + 1];
3447        }
3448
3449        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3450                                           path_num_items(left_path) - 1);
3451        if (ret) {
3452                mlog_errno(ret);
3453                goto out;
3454        }
3455
3456        le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3457
3458        le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3459        le64_add_cpu(&right_rec->e_blkno,
3460                     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3461                                               split_clusters));
3462        le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3463
3464        ocfs2_cleanup_merge(el, index);
3465
3466        ocfs2_journal_dirty(handle, bh);
3467        if (right_path) {
3468                ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3469                ocfs2_complete_edge_insert(handle, left_path, right_path,
3470                                           subtree_index);
3471        }
3472out:
3473        ocfs2_free_path(right_path);
3474        return ret;
3475}
3476
3477static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3478                               struct ocfs2_path *right_path,
3479                               struct ocfs2_path **ret_left_path)
3480{
3481        int ret;
3482        u32 left_cpos;
3483        struct ocfs2_path *left_path = NULL;
3484
3485        *ret_left_path = NULL;
3486
3487        /* This function shouldn't be called for non-trees. */
3488        BUG_ON(right_path->p_tree_depth == 0);
3489
3490        ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3491                                            right_path, &left_cpos);
3492        if (ret) {
3493                mlog_errno(ret);
3494                goto out;
3495        }
3496
3497        /* This function shouldn't be called for the leftmost leaf. */
3498        BUG_ON(left_cpos == 0);
3499
3500        left_path = ocfs2_new_path_from_path(right_path);
3501        if (!left_path) {
3502                ret = -ENOMEM;
3503                mlog_errno(ret);
3504                goto out;
3505        }
3506
3507        ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3508        if (ret) {
3509                mlog_errno(ret);
3510                goto out;
3511        }
3512
3513        *ret_left_path = left_path;
3514out:
3515        if (ret)
3516                ocfs2_free_path(left_path);
3517        return ret;
3518}
3519
3520/*
3521 * Remove split_rec clusters from the record at index and merge them
3522 * onto the tail of the record "before" it.
3523 * For index > 0, the "before" means the extent rec at index - 1.
3524 *
3525 * For index == 0, the "before" means the last record of the previous
3526 * extent block. And there is also a situation that we may need to
3527 * remove the rightmost leaf extent block in the right_path and change
3528 * the right path to indicate the new rightmost path.
3529 */
3530static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3531                                handle_t *handle,
3532                                struct ocfs2_extent_tree *et,
3533                                struct ocfs2_extent_rec *split_rec,
3534                                struct ocfs2_cached_dealloc_ctxt *dealloc,
3535                                int index)
3536{
3537        int ret, i, subtree_index = 0, has_empty_extent = 0;
3538        unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3539        struct ocfs2_extent_rec *left_rec;
3540        struct ocfs2_extent_rec *right_rec;
3541        struct ocfs2_extent_list *el = path_leaf_el(right_path);
3542        struct buffer_head *bh = path_leaf_bh(right_path);
3543        struct buffer_head *root_bh = NULL;
3544        struct ocfs2_path *left_path = NULL;
3545        struct ocfs2_extent_list *left_el;
3546
3547        BUG_ON(index < 0);
3548
3549        right_rec = &el->l_recs[index];
3550        if (index == 0) {
3551                /* we meet with a cross extent block merge. */
3552                ret = ocfs2_get_left_path(et, right_path, &left_path);
3553                if (ret) {
3554                        mlog_errno(ret);
3555                        return ret;
3556                }
3557
3558                left_el = path_leaf_el(left_path);
3559                BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3560                       le16_to_cpu(left_el->l_count));
3561
3562                left_rec = &left_el->l_recs[
3563                                le16_to_cpu(left_el->l_next_free_rec) - 1];
3564                BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3565                       le16_to_cpu(left_rec->e_leaf_clusters) !=
3566                       le32_to_cpu(split_rec->e_cpos));
3567
3568                subtree_index = ocfs2_find_subtree_root(et, left_path,
3569                                                        right_path);
3570
3571                ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3572                                                      handle->h_buffer_credits,
3573                                                      left_path);
3574                if (ret) {
3575                        mlog_errno(ret);
3576                        goto out;
3577                }
3578
3579                root_bh = left_path->p_node[subtree_index].bh;
3580                BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3581
3582                ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3583                                                   subtree_index);
3584                if (ret) {
3585                        mlog_errno(ret);
3586                        goto out;
3587                }
3588
3589                for (i = subtree_index + 1;
3590                     i < path_num_items(right_path); i++) {
3591                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3592                                                           right_path, i);
3593                        if (ret) {
3594                                mlog_errno(ret);
3595                                goto out;
3596                        }
3597
3598                        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3599                                                           left_path, i);
3600                        if (ret) {
3601                                mlog_errno(ret);
3602                                goto out;
3603                        }
3604                }
3605        } else {
3606                left_rec = &el->l_recs[index - 1];
3607                if (ocfs2_is_empty_extent(&el->l_recs[0]))
3608                        has_empty_extent = 1;
3609        }
3610
3611        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3612                                           path_num_items(right_path) - 1);
3613        if (ret) {
3614                mlog_errno(ret);
3615                goto out;
3616        }
3617
3618        if (has_empty_extent && index == 1) {
3619                /*
3620                 * The easy case - we can just plop the record right in.
3621                 */
3622                *left_rec = *split_rec;
3623        } else
3624                le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3625
3626        le32_add_cpu(&right_rec->e_cpos, split_clusters);
3627        le64_add_cpu(&right_rec->e_blkno,
3628                     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3629                                              split_clusters));
3630        le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3631
3632        ocfs2_cleanup_merge(el, index);
3633
3634        ocfs2_journal_dirty(handle, bh);
3635        if (left_path) {
3636                ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3637
3638                /*
3639                 * In the situation that the right_rec is empty and the extent
3640                 * block is empty also,  ocfs2_complete_edge_insert can't handle
3641                 * it and we need to delete the right extent block.
3642                 */
3643                if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3644                    le16_to_cpu(el->l_next_free_rec) == 1) {
3645                        /* extend credit for ocfs2_remove_rightmost_path */
3646                        ret = ocfs2_extend_rotate_transaction(handle, 0,
3647                                        handle->h_buffer_credits,
3648                                        right_path);
3649                        if (ret) {
3650                                mlog_errno(ret);
3651                                goto out;
3652                        }
3653
3654                        ret = ocfs2_remove_rightmost_path(handle, et,
3655                                                          right_path,
3656                                                          dealloc);
3657                        if (ret) {
3658                                mlog_errno(ret);
3659                                goto out;
3660                        }
3661
3662                        /* Now the rightmost extent block has been deleted.
3663                         * So we use the new rightmost path.
3664                         */
3665                        ocfs2_mv_path(right_path, left_path);
3666                        left_path = NULL;
3667                } else
3668                        ocfs2_complete_edge_insert(handle, left_path,
3669                                                   right_path, subtree_index);
3670        }
3671out:
3672        ocfs2_free_path(left_path);
3673        return ret;
3674}
3675
3676static int ocfs2_try_to_merge_extent(handle_t *handle,
3677                                     struct ocfs2_extent_tree *et,
3678                                     struct ocfs2_path *path,
3679                                     int split_index,
3680                                     struct ocfs2_extent_rec *split_rec,
3681                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
3682                                     struct ocfs2_merge_ctxt *ctxt)
3683{
3684        int ret = 0;
3685        struct ocfs2_extent_list *el = path_leaf_el(path);
3686        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3687
3688        BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3689
3690        if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3691                /* extend credit for ocfs2_remove_rightmost_path */
3692                ret = ocfs2_extend_rotate_transaction(handle, 0,
3693                                handle->h_buffer_credits,
3694                                path);
3695                if (ret) {
3696                        mlog_errno(ret);
3697                        goto out;
3698                }
3699                /*
3700                 * The merge code will need to create an empty
3701                 * extent to take the place of the newly
3702                 * emptied slot. Remove any pre-existing empty
3703                 * extents - having more than one in a leaf is
3704                 * illegal.
3705                 */
3706                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3707                if (ret) {
3708                        mlog_errno(ret);
3709                        goto out;
3710                }
3711                split_index--;
3712                rec = &el->l_recs[split_index];
3713        }
3714
3715        if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3716                /*
3717                 * Left-right contig implies this.
3718                 */
3719                BUG_ON(!ctxt->c_split_covers_rec);
3720
3721                /*
3722                 * Since the leftright insert always covers the entire
3723                 * extent, this call will delete the insert record
3724                 * entirely, resulting in an empty extent record added to
3725                 * the extent block.
3726                 *
3727                 * Since the adding of an empty extent shifts
3728                 * everything back to the right, there's no need to
3729                 * update split_index here.
3730                 *
3731                 * When the split_index is zero, we need to merge it to the
3732                 * prevoius extent block. It is more efficient and easier
3733                 * if we do merge_right first and merge_left later.
3734                 */
3735                ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3736                                            split_index);
3737                if (ret) {
3738                        mlog_errno(ret);
3739                        goto out;
3740                }
3741
3742                /*
3743                 * We can only get this from logic error above.
3744                 */
3745                BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3746
3747                /* extend credit for ocfs2_remove_rightmost_path */
3748                ret = ocfs2_extend_rotate_transaction(handle, 0,
3749                                        handle->h_buffer_credits,
3750                                        path);
3751                if (ret) {
3752                        mlog_errno(ret);
3753                        goto out;
3754                }
3755
3756                /* The merge left us with an empty extent, remove it. */
3757                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3758                if (ret) {
3759                        mlog_errno(ret);
3760                        goto out;
3761                }
3762
3763                rec = &el->l_recs[split_index];
3764
3765                /*
3766                 * Note that we don't pass split_rec here on purpose -
3767                 * we've merged it into the rec already.
3768                 */
3769                ret = ocfs2_merge_rec_left(path, handle, et, rec,
3770                                           dealloc, split_index);
3771
3772                if (ret) {
3773                        mlog_errno(ret);
3774                        goto out;
3775                }
3776
3777                /* extend credit for ocfs2_remove_rightmost_path */
3778                ret = ocfs2_extend_rotate_transaction(handle, 0,
3779                                handle->h_buffer_credits,
3780                                path);
3781                if (ret) {
3782                        mlog_errno(ret);
3783                        goto out;
3784                }
3785
3786                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3787                /*
3788                 * Error from this last rotate is not critical, so
3789                 * print but don't bubble it up.
3790                 */
3791                if (ret)
3792                        mlog_errno(ret);
3793                ret = 0;
3794        } else {
3795                /*
3796                 * Merge a record to the left or right.
3797                 *
3798                 * 'contig_type' is relative to the existing record,
3799                 * so for example, if we're "right contig", it's to
3800                 * the record on the left (hence the left merge).
3801                 */
3802                if (ctxt->c_contig_type == CONTIG_RIGHT) {
3803                        ret = ocfs2_merge_rec_left(path, handle, et,
3804                                                   split_rec, dealloc,
3805                                                   split_index);
3806                        if (ret) {
3807                                mlog_errno(ret);
3808                                goto out;
3809                        }
3810                } else {
3811                        ret = ocfs2_merge_rec_right(path, handle,
3812                                                    et, split_rec,
3813                                                    split_index);
3814                        if (ret) {
3815                                mlog_errno(ret);
3816                                goto out;
3817                        }
3818                }
3819
3820                if (ctxt->c_split_covers_rec) {
3821                        /* extend credit for ocfs2_remove_rightmost_path */
3822                        ret = ocfs2_extend_rotate_transaction(handle, 0,
3823                                        handle->h_buffer_credits,
3824                                        path);
3825                        if (ret) {
3826                                mlog_errno(ret);
3827                                ret = 0;
3828                                goto out;
3829                        }
3830
3831                        /*
3832                         * The merge may have left an empty extent in
3833                         * our leaf. Try to rotate it away.
3834                         */
3835                        ret = ocfs2_rotate_tree_left(handle, et, path,
3836                                                     dealloc);
3837                        if (ret)
3838                                mlog_errno(ret);
3839                        ret = 0;
3840                }
3841        }
3842
3843out:
3844        return ret;
3845}
3846
3847static void ocfs2_subtract_from_rec(struct super_block *sb,
3848                                    enum ocfs2_split_type split,
3849                                    struct ocfs2_extent_rec *rec,
3850                                    struct ocfs2_extent_rec *split_rec)
3851{
3852        u64 len_blocks;
3853
3854        len_blocks = ocfs2_clusters_to_blocks(sb,
3855                                le16_to_cpu(split_rec->e_leaf_clusters));
3856
3857        if (split == SPLIT_LEFT) {
3858                /*
3859                 * Region is on the left edge of the existing
3860                 * record.
3861                 */
3862                le32_add_cpu(&rec->e_cpos,
3863                             le16_to_cpu(split_rec->e_leaf_clusters));
3864                le64_add_cpu(&rec->e_blkno, len_blocks);
3865                le16_add_cpu(&rec->e_leaf_clusters,
3866                             -le16_to_cpu(split_rec->e_leaf_clusters));
3867        } else {
3868                /*
3869                 * Region is on the right edge of the existing
3870                 * record.
3871                 */
3872                le16_add_cpu(&rec->e_leaf_clusters,
3873                             -le16_to_cpu(split_rec->e_leaf_clusters));
3874        }
3875}
3876
3877/*
3878 * Do the final bits of extent record insertion at the target leaf
3879 * list. If this leaf is part of an allocation tree, it is assumed
3880 * that the tree above has been prepared.
3881 */
3882static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3883                                 struct ocfs2_extent_rec *insert_rec,
3884                                 struct ocfs2_extent_list *el,
3885                                 struct ocfs2_insert_type *insert)
3886{
3887        int i = insert->ins_contig_index;
3888        unsigned int range;
3889        struct ocfs2_extent_rec *rec;
3890
3891        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3892
3893        if (insert->ins_split != SPLIT_NONE) {
3894                i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3895                BUG_ON(i == -1);
3896                rec = &el->l_recs[i];
3897                ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3898                                        insert->ins_split, rec,
3899                                        insert_rec);
3900                goto rotate;
3901        }
3902
3903        /*
3904         * Contiguous insert - either left or right.
3905         */
3906        if (insert->ins_contig != CONTIG_NONE) {
3907                rec = &el->l_recs[i];
3908                if (insert->ins_contig == CONTIG_LEFT) {
3909                        rec->e_blkno = insert_rec->e_blkno;
3910                        rec->e_cpos = insert_rec->e_cpos;
3911                }
3912                le16_add_cpu(&rec->e_leaf_clusters,
3913                             le16_to_cpu(insert_rec->e_leaf_clusters));
3914                return;
3915        }
3916
3917        /*
3918         * Handle insert into an empty leaf.
3919         */
3920        if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3921            ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3922             ocfs2_is_empty_extent(&el->l_recs[0]))) {
3923                el->l_recs[0] = *insert_rec;
3924                el->l_next_free_rec = cpu_to_le16(1);
3925                return;
3926        }
3927
3928        /*
3929         * Appending insert.
3930         */
3931        if (insert->ins_appending == APPEND_TAIL) {
3932                i = le16_to_cpu(el->l_next_free_rec) - 1;
3933                rec = &el->l_recs[i];
3934                range = le32_to_cpu(rec->e_cpos)
3935                        + le16_to_cpu(rec->e_leaf_clusters);
3936                BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3937
3938                mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3939                                le16_to_cpu(el->l_count),
3940                                "owner %llu, depth %u, count %u, next free %u, "
3941                                "rec.cpos %u, rec.clusters %u, "
3942                                "insert.cpos %u, insert.clusters %u\n",
3943                                ocfs2_metadata_cache_owner(et->et_ci),
3944                                le16_to_cpu(el->l_tree_depth),
3945                                le16_to_cpu(el->l_count),
3946                                le16_to_cpu(el->l_next_free_rec),
3947                                le32_to_cpu(el->l_recs[i].e_cpos),
3948                                le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3949                                le32_to_cpu(insert_rec->e_cpos),
3950                                le16_to_cpu(insert_rec->e_leaf_clusters));
3951                i++;
3952                el->l_recs[i] = *insert_rec;
3953                le16_add_cpu(&el->l_next_free_rec, 1);
3954                return;
3955        }
3956
3957rotate:
3958        /*
3959         * Ok, we have to rotate.
3960         *
3961         * At this point, it is safe to assume that inserting into an
3962         * empty leaf and appending to a leaf have both been handled
3963         * above.
3964         *
3965         * This leaf needs to have space, either by the empty 1st
3966         * extent record, or by virtue of an l_next_rec < l_count.
3967         */
3968        ocfs2_rotate_leaf(el, insert_rec);
3969}
3970
3971static void ocfs2_adjust_rightmost_records(handle_t *handle,
3972                                           struct ocfs2_extent_tree *et,
3973                                           struct ocfs2_path *path,
3974                                           struct ocfs2_extent_rec *insert_rec)
3975{
3976        int i, next_free;
3977        struct buffer_head *bh;
3978        struct ocfs2_extent_list *el;
3979        struct ocfs2_extent_rec *rec;
3980
3981        /*
3982         * Update everything except the leaf block.
3983         */
3984        for (i = 0; i < path->p_tree_depth; i++) {
3985                bh = path->p_node[i].bh;
3986                el = path->p_node[i].el;
3987
3988                next_free = le16_to_cpu(el->l_next_free_rec);
3989                if (next_free == 0) {
3990                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3991                                    "Owner %llu has a bad extent list\n",
3992                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3993                        return;
3994                }
3995
3996                rec = &el->l_recs[next_free - 1];
3997
3998                rec->e_int_clusters = insert_rec->e_cpos;
3999                le32_add_cpu(&rec->e_int_clusters,
4000                             le16_to_cpu(insert_rec->e_leaf_clusters));
4001                le32_add_cpu(&rec->e_int_clusters,
4002                             -le32_to_cpu(rec->e_cpos));
4003
4004                ocfs2_journal_dirty(handle, bh);
4005        }
4006}
4007
4008static int ocfs2_append_rec_to_path(handle_t *handle,
4009                                    struct ocfs2_extent_tree *et,
4010                                    struct ocfs2_extent_rec *insert_rec,
4011                                    struct ocfs2_path *right_path,
4012                                    struct ocfs2_path **ret_left_path)
4013{
4014        int ret, next_free;
4015        struct ocfs2_extent_list *el;
4016        struct ocfs2_path *left_path = NULL;
4017
4018        *ret_left_path = NULL;
4019
4020        /*
4021         * This shouldn't happen for non-trees. The extent rec cluster
4022         * count manipulation below only works for interior nodes.
4023         */
4024        BUG_ON(right_path->p_tree_depth == 0);
4025
4026        /*
4027         * If our appending insert is at the leftmost edge of a leaf,
4028         * then we might need to update the rightmost records of the
4029         * neighboring path.
4030         */
4031        el = path_leaf_el(right_path);
4032        next_free = le16_to_cpu(el->l_next_free_rec);
4033        if (next_free == 0 ||
4034            (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
4035                u32 left_cpos;
4036
4037                ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
4038                                                    right_path, &left_cpos);
4039                if (ret) {
4040                        mlog_errno(ret);
4041                        goto out;
4042                }
4043
4044                trace_ocfs2_append_rec_to_path(
4045                        (unsigned long long)
4046                        ocfs2_metadata_cache_owner(et->et_ci),
4047                        le32_to_cpu(insert_rec->e_cpos),
4048                        left_cpos);
4049
4050                /*
4051                 * No need to worry if the append is already in the
4052                 * leftmost leaf.
4053                 */
4054                if (left_cpos) {
4055                        left_path = ocfs2_new_path_from_path(right_path);
4056                        if (!left_path) {
4057                                ret = -ENOMEM;
4058                                mlog_errno(ret);
4059                                goto out;
4060                        }
4061
4062                        ret = ocfs2_find_path(et->et_ci, left_path,
4063                                              left_cpos);
4064                        if (ret) {
4065                                mlog_errno(ret);
4066                                goto out;
4067                        }
4068
4069                        /*
4070                         * ocfs2_insert_path() will pass the left_path to the
4071                         * journal for us.
4072                         */
4073                }
4074        }
4075
4076        ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4077        if (ret) {
4078                mlog_errno(ret);
4079                goto out;
4080        }
4081
4082        ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4083
4084        *ret_left_path = left_path;
4085        ret = 0;
4086out:
4087        if (ret != 0)
4088                ocfs2_free_path(left_path);
4089
4090        return ret;
4091}
4092
4093static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4094                               struct ocfs2_path *left_path,
4095                               struct ocfs2_path *right_path,
4096                               struct ocfs2_extent_rec *split_rec,
4097                               enum ocfs2_split_type split)
4098{
4099        int index;
4100        u32 cpos = le32_to_cpu(split_rec->e_cpos);
4101        struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4102        struct ocfs2_extent_rec *rec, *tmprec;
4103
4104        right_el = path_leaf_el(right_path);
4105        if (left_path)
4106                left_el = path_leaf_el(left_path);
4107
4108        el = right_el;
4109        insert_el = right_el;
4110        index = ocfs2_search_extent_list(el, cpos);
4111        if (index != -1) {
4112                if (index == 0 && left_path) {
4113                        BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4114
4115                        /*
4116                         * This typically means that the record
4117                         * started in the left path but moved to the
4118                         * right as a result of rotation. We either
4119                         * move the existing record to the left, or we
4120                         * do the later insert there.
4121                         *
4122                         * In this case, the left path should always
4123                         * exist as the rotate code will have passed
4124                         * it back for a post-insert update.
4125                         */
4126
4127                        if (split == SPLIT_LEFT) {
4128                                /*
4129                                 * It's a left split. Since we know
4130                                 * that the rotate code gave us an
4131                                 * empty extent in the left path, we
4132                                 * can just do the insert there.
4133                                 */
4134                                insert_el = left_el;
4135                        } else {
4136                                /*
4137                                 * Right split - we have to move the
4138                                 * existing record over to the left
4139                                 * leaf. The insert will be into the
4140                                 * newly created empty extent in the
4141                                 * right leaf.
4142                                 */
4143                                tmprec = &right_el->l_recs[index];
4144                                ocfs2_rotate_leaf(left_el, tmprec);
4145                                el = left_el;
4146
4147                                memset(tmprec, 0, sizeof(*tmprec));
4148                                index = ocfs2_search_extent_list(left_el, cpos);
4149                                BUG_ON(index == -1);
4150                        }
4151                }
4152        } else {
4153                BUG_ON(!left_path);
4154                BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4155                /*
4156                 * Left path is easy - we can just allow the insert to
4157                 * happen.
4158                 */
4159                el = left_el;
4160                insert_el = left_el;
4161                index = ocfs2_search_extent_list(el, cpos);
4162                BUG_ON(index == -1);
4163        }
4164
4165        rec = &el->l_recs[index];
4166        ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4167                                split, rec, split_rec);
4168        ocfs2_rotate_leaf(insert_el, split_rec);
4169}
4170
4171/*
4172 * This function only does inserts on an allocation b-tree. For tree
4173 * depth = 0, ocfs2_insert_at_leaf() is called directly.
4174 *
4175 * right_path is the path we want to do the actual insert
4176 * in. left_path should only be passed in if we need to update that
4177 * portion of the tree after an edge insert.
4178 */
4179static int ocfs2_insert_path(handle_t *handle,
4180                             struct ocfs2_extent_tree *et,
4181                             struct ocfs2_path *left_path,
4182                             struct ocfs2_path *right_path,
4183                             struct ocfs2_extent_rec *insert_rec,
4184                             struct ocfs2_insert_type *insert)
4185{
4186        int ret, subtree_index;
4187        struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4188
4189        if (left_path) {
4190                /*
4191                 * There's a chance that left_path got passed back to
4192                 * us without being accounted for in the
4193                 * journal. Extend our transaction here to be sure we
4194                 * can change those blocks.
4195                 */
4196                ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4197                if (ret < 0) {
4198                        mlog_errno(ret);
4199                        goto out;
4200                }
4201
4202                ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4203                if (ret < 0) {
4204                        mlog_errno(ret);
4205                        goto out;
4206                }
4207        }
4208
4209        /*
4210         * Pass both paths to the journal. The majority of inserts
4211         * will be touching all components anyway.
4212         */
4213        ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4214        if (ret < 0) {
4215                mlog_errno(ret);
4216                goto out;
4217        }
4218
4219        if (insert->ins_split != SPLIT_NONE) {
4220                /*
4221                 * We could call ocfs2_insert_at_leaf() for some types
4222                 * of splits, but it's easier to just let one separate
4223                 * function sort it all out.
4224                 */
4225                ocfs2_split_record(et, left_path, right_path,
4226                                   insert_rec, insert->ins_split);
4227
4228                /*
4229                 * Split might have modified either leaf and we don't
4230                 * have a guarantee that the later edge insert will
4231                 * dirty this for us.
4232                 */
4233                if (left_path)
4234                        ocfs2_journal_dirty(handle,
4235                                            path_leaf_bh(left_path));
4236        } else
4237                ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4238                                     insert);
4239
4240        ocfs2_journal_dirty(handle, leaf_bh);
4241
4242        if (left_path) {
4243                /*
4244                 * The rotate code has indicated that we need to fix
4245                 * up portions of the tree after the insert.
4246                 *
4247                 * XXX: Should we extend the transaction here?
4248                 */
4249                subtree_index = ocfs2_find_subtree_root(et, left_path,
4250                                                        right_path);
4251                ocfs2_complete_edge_insert(handle, left_path, right_path,
4252                                           subtree_index);
4253        }
4254
4255        ret = 0;
4256out:
4257        return ret;
4258}
4259
4260static int ocfs2_do_insert_extent(handle_t *handle,
4261                                  struct ocfs2_extent_tree *et,
4262                                  struct ocfs2_extent_rec *insert_rec,
4263                                  struct ocfs2_insert_type *type)
4264{
4265        int ret, rotate = 0;
4266        u32 cpos;
4267        struct ocfs2_path *right_path = NULL;
4268        struct ocfs2_path *left_path = NULL;
4269        struct ocfs2_extent_list *el;
4270
4271        el = et->et_root_el;
4272
4273        ret = ocfs2_et_root_journal_access(handle, et,
4274                                           OCFS2_JOURNAL_ACCESS_WRITE);
4275        if (ret) {
4276                mlog_errno(ret);
4277                goto out;
4278        }
4279
4280        if (le16_to_cpu(el->l_tree_depth) == 0) {
4281                ocfs2_insert_at_leaf(et, insert_rec, el, type);
4282                goto out_update_clusters;
4283        }
4284
4285        right_path = ocfs2_new_path_from_et(et);
4286        if (!right_path) {
4287                ret = -ENOMEM;
4288                mlog_errno(ret);
4289                goto out;
4290        }
4291
4292        /*
4293         * Determine the path to start with. Rotations need the
4294         * rightmost path, everything else can go directly to the
4295         * target leaf.
4296         */
4297        cpos = le32_to_cpu(insert_rec->e_cpos);
4298        if (type->ins_appending == APPEND_NONE &&
4299            type->ins_contig == CONTIG_NONE) {
4300                rotate = 1;
4301                cpos = UINT_MAX;
4302        }
4303
4304        ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4305        if (ret) {
4306                mlog_errno(ret);
4307                goto out;
4308        }
4309
4310        /*
4311         * Rotations and appends need special treatment - they modify
4312         * parts of the tree's above them.
4313         *
4314         * Both might pass back a path immediate to the left of the
4315         * one being inserted to. This will be cause
4316         * ocfs2_insert_path() to modify the rightmost records of
4317         * left_path to account for an edge insert.
4318         *
4319         * XXX: When modifying this code, keep in mind that an insert
4320         * can wind up skipping both of these two special cases...
4321         */
4322        if (rotate) {
4323                ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4324                                              le32_to_cpu(insert_rec->e_cpos),
4325                                              right_path, &left_path);
4326                if (ret) {
4327                        mlog_errno(ret);
4328                        goto out;
4329                }
4330
4331                /*
4332                 * ocfs2_rotate_tree_right() might have extended the
4333                 * transaction without re-journaling our tree root.
4334                 */
4335                ret = ocfs2_et_root_journal_access(handle, et,
4336                                                   OCFS2_JOURNAL_ACCESS_WRITE);
4337                if (ret) {
4338                        mlog_errno(ret);
4339                        goto out;
4340                }
4341        } else if (type->ins_appending == APPEND_TAIL
4342                   && type->ins_contig != CONTIG_LEFT) {
4343                ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4344                                               right_path, &left_path);
4345                if (ret) {
4346                        mlog_errno(ret);
4347                        goto out;
4348                }
4349        }
4350
4351        ret = ocfs2_insert_path(handle, et, left_path, right_path,
4352                                insert_rec, type);
4353        if (ret) {
4354                mlog_errno(ret);
4355                goto out;
4356        }
4357
4358out_update_clusters:
4359        if (type->ins_split == SPLIT_NONE)
4360                ocfs2_et_update_clusters(et,
4361                                         le16_to_cpu(insert_rec->e_leaf_clusters));
4362
4363        ocfs2_journal_dirty(handle, et->et_root_bh);
4364
4365out:
4366        ocfs2_free_path(left_path);
4367        ocfs2_free_path(right_path);
4368
4369        return ret;
4370}
4371
4372static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4373                               struct ocfs2_path *path,
4374                               struct ocfs2_extent_list *el, int index,
4375                               struct ocfs2_extent_rec *split_rec,
4376                               struct ocfs2_merge_ctxt *ctxt)
4377{
4378        int status = 0;
4379        enum ocfs2_contig_type ret = CONTIG_NONE;
4380        u32 left_cpos, right_cpos;
4381        struct ocfs2_extent_rec *rec = NULL;
4382        struct ocfs2_extent_list *new_el;
4383        struct ocfs2_path *left_path = NULL, *right_path = NULL;
4384        struct buffer_head *bh;
4385        struct ocfs2_extent_block *eb;
4386        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4387
4388        if (index > 0) {
4389                rec = &el->l_recs[index - 1];
4390        } else if (path->p_tree_depth > 0) {
4391                status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4392                if (status)
4393                        goto exit;
4394
4395                if (left_cpos != 0) {
4396                        left_path = ocfs2_new_path_from_path(path);
4397                        if (!left_path) {
4398                                status = -ENOMEM;
4399                                mlog_errno(status);
4400                                goto exit;
4401                        }
4402
4403                        status = ocfs2_find_path(et->et_ci, left_path,
4404                                                 left_cpos);
4405                        if (status)
4406                                goto free_left_path;
4407
4408                        new_el = path_leaf_el(left_path);
4409
4410                        if (le16_to_cpu(new_el->l_next_free_rec) !=
4411                            le16_to_cpu(new_el->l_count)) {
4412                                bh = path_leaf_bh(left_path);
4413                                eb = (struct ocfs2_extent_block *)bh->b_data;
4414                                ocfs2_error(sb,
4415                                            "Extent block #%llu has an invalid l_next_free_rec of %d.  It should have matched the l_count of %d\n",
4416                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
4417                                            le16_to_cpu(new_el->l_next_free_rec),
4418                                            le16_to_cpu(new_el->l_count));
4419                                status = -EINVAL;
4420                                goto free_left_path;
4421                        }
4422                        rec = &new_el->l_recs[
4423                                le16_to_cpu(new_el->l_next_free_rec) - 1];
4424                }
4425        }
4426
4427        /*
4428         * We're careful to check for an empty extent record here -
4429         * the merge code will know what to do if it sees one.
4430         */
4431        if (rec) {
4432                if (index == 1 && ocfs2_is_empty_extent(rec)) {
4433                        if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4434                                ret = CONTIG_RIGHT;
4435                } else {
4436                        ret = ocfs2_et_extent_contig(et, rec, split_rec);
4437                }
4438        }
4439
4440        rec = NULL;
4441        if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4442                rec = &el->l_recs[index + 1];
4443        else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4444                 path->p_tree_depth > 0) {
4445                status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4446                if (status)
4447                        goto free_left_path;
4448
4449                if (right_cpos == 0)
4450                        goto free_left_path;
4451
4452                right_path = ocfs2_new_path_from_path(path);
4453                if (!right_path) {
4454                        status = -ENOMEM;
4455                        mlog_errno(status);
4456                        goto free_left_path;
4457                }
4458
4459                status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4460                if (status)
4461                        goto free_right_path;
4462
4463                new_el = path_leaf_el(right_path);
4464                rec = &new_el->l_recs[0];
4465                if (ocfs2_is_empty_extent(rec)) {
4466                        if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4467                                bh = path_leaf_bh(right_path);
4468                                eb = (struct ocfs2_extent_block *)bh->b_data;
4469                                ocfs2_error(sb,
4470                                            "Extent block #%llu has an invalid l_next_free_rec of %d\n",
4471                                            (unsigned long long)le64_to_cpu(eb->h_blkno),
4472                                            le16_to_cpu(new_el->l_next_free_rec));
4473                                status = -EINVAL;
4474                                goto free_right_path;
4475                        }
4476                        rec = &new_el->l_recs[1];
4477                }
4478        }
4479
4480        if (rec) {
4481                enum ocfs2_contig_type contig_type;
4482
4483                contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4484
4485                if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4486                        ret = CONTIG_LEFTRIGHT;
4487                else if (ret == CONTIG_NONE)
4488                        ret = contig_type;
4489        }
4490
4491free_right_path:
4492        ocfs2_free_path(right_path);
4493free_left_path:
4494        ocfs2_free_path(left_path);
4495exit:
4496        if (status == 0)
4497                ctxt->c_contig_type = ret;
4498
4499        return status;
4500}
4501
4502static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4503                                     struct ocfs2_insert_type *insert,
4504                                     struct ocfs2_extent_list *el,
4505                                     struct ocfs2_extent_rec *insert_rec)
4506{
4507        int i;
4508        enum ocfs2_contig_type contig_type = CONTIG_NONE;
4509
4510        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4511
4512        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4513                contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4514                                                     insert_rec);
4515                if (contig_type != CONTIG_NONE) {
4516                        insert->ins_contig_index = i;
4517                        break;
4518                }
4519        }
4520        insert->ins_contig = contig_type;
4521
4522        if (insert->ins_contig != CONTIG_NONE) {
4523                struct ocfs2_extent_rec *rec =
4524                                &el->l_recs[insert->ins_contig_index];
4525                unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4526                                   le16_to_cpu(insert_rec->e_leaf_clusters);
4527
4528                /*
4529                 * Caller might want us to limit the size of extents, don't
4530                 * calculate contiguousness if we might exceed that limit.
4531                 */
4532                if (et->et_max_leaf_clusters &&
4533                    (len > et->et_max_leaf_clusters))
4534                        insert->ins_contig = CONTIG_NONE;
4535        }
4536}
4537
4538/*
4539 * This should only be called against the righmost leaf extent list.
4540 *
4541 * ocfs2_figure_appending_type() will figure out whether we'll have to
4542 * insert at the tail of the rightmost leaf.
4543 *
4544 * This should also work against the root extent list for tree's with 0
4545 * depth. If we consider the root extent list to be the rightmost leaf node
4546 * then the logic here makes sense.
4547 */
4548static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4549                                        struct ocfs2_extent_list *el,
4550                                        struct ocfs2_extent_rec *insert_rec)
4551{
4552        int i;
4553        u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4554        struct ocfs2_extent_rec *rec;
4555
4556        insert->ins_appending = APPEND_NONE;
4557
4558        BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4559
4560        if (!el->l_next_free_rec)
4561                goto set_tail_append;
4562
4563        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4564                /* Were all records empty? */
4565                if (le16_to_cpu(el->l_next_free_rec) == 1)
4566                        goto set_tail_append;
4567        }
4568
4569        i = le16_to_cpu(el->l_next_free_rec) - 1;
4570        rec = &el->l_recs[i];
4571
4572        if (cpos >=
4573            (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4574                goto set_tail_append;
4575
4576        return;
4577
4578set_tail_append:
4579        insert->ins_appending = APPEND_TAIL;
4580}
4581
4582/*
4583 * Helper function called at the beginning of an insert.
4584 *
4585 * This computes a few things that are commonly used in the process of
4586 * inserting into the btree:
4587 *   - Whether the new extent is contiguous with an existing one.
4588 *   - The current tree depth.
4589 *   - Whether the insert is an appending one.
4590 *   - The total # of free records in the tree.
4591 *
4592 * All of the information is stored on the ocfs2_insert_type
4593 * structure.
4594 */
4595static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4596                                    struct buffer_head **last_eb_bh,
4597                                    struct ocfs2_extent_rec *insert_rec,
4598                                    int *free_records,
4599                                    struct ocfs2_insert_type *insert)
4600{
4601        int ret;
4602        struct ocfs2_extent_block *eb;
4603        struct ocfs2_extent_list *el;
4604        struct ocfs2_path *path = NULL;
4605        struct buffer_head *bh = NULL;
4606
4607        insert->ins_split = SPLIT_NONE;
4608
4609        el = et->et_root_el;
4610        insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4611
4612        if (el->l_tree_depth) {
4613                /*
4614                 * If we have tree depth, we read in the
4615                 * rightmost extent block ahead of time as
4616                 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4617                 * may want it later.
4618                 */
4619                ret = ocfs2_read_extent_block(et->et_ci,
4620                                              ocfs2_et_get_last_eb_blk(et),
4621                                              &bh);
4622                if (ret) {
4623                        mlog_errno(ret);
4624                        goto out;
4625                }
4626                eb = (struct ocfs2_extent_block *) bh->b_data;
4627                el = &eb->h_list;
4628        }
4629
4630        /*
4631         * Unless we have a contiguous insert, we'll need to know if
4632         * there is room left in our allocation tree for another
4633         * extent record.
4634         *
4635         * XXX: This test is simplistic, we can search for empty
4636         * extent records too.
4637         */
4638        *free_records = le16_to_cpu(el->l_count) -
4639                le16_to_cpu(el->l_next_free_rec);
4640
4641        if (!insert->ins_tree_depth) {
4642                ocfs2_figure_contig_type(et, insert, el, insert_rec);
4643                ocfs2_figure_appending_type(insert, el, insert_rec);
4644                return 0;
4645        }
4646
4647        path = ocfs2_new_path_from_et(et);
4648        if (!path) {
4649                ret = -ENOMEM;
4650                mlog_errno(ret);
4651                goto out;
4652        }
4653
4654        /*
4655         * In the case that we're inserting past what the tree
4656         * currently accounts for, ocfs2_find_path() will return for
4657         * us the rightmost tree path. This is accounted for below in
4658         * the appending code.
4659         */
4660        ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4661        if (ret) {
4662                mlog_errno(ret);
4663                goto out;
4664        }
4665
4666        el = path_leaf_el(path);
4667
4668        /*
4669         * Now that we have the path, there's two things we want to determine:
4670         * 1) Contiguousness (also set contig_index if this is so)
4671         *
4672         * 2) Are we doing an append? We can trivially break this up
4673         *     into two types of appends: simple record append, or a
4674         *     rotate inside the tail leaf.
4675         */
4676        ocfs2_figure_contig_type(et, insert, el, insert_rec);
4677
4678        /*
4679         * The insert code isn't quite ready to deal with all cases of
4680         * left contiguousness. Specifically, if it's an insert into
4681         * the 1st record in a leaf, it will require the adjustment of
4682         * cluster count on the last record of the path directly to it's
4683         * left. For now, just catch that case and fool the layers
4684         * above us. This works just fine for tree_depth == 0, which
4685         * is why we allow that above.
4686         */
4687        if (insert->ins_contig == CONTIG_LEFT &&
4688            insert->ins_contig_index == 0)
4689                insert->ins_contig = CONTIG_NONE;
4690
4691        /*
4692         * Ok, so we can simply compare against last_eb to figure out
4693         * whether the path doesn't exist. This will only happen in
4694         * the case that we're doing a tail append, so maybe we can
4695         * take advantage of that information somehow.
4696         */
4697        if (ocfs2_et_get_last_eb_blk(et) ==
4698            path_leaf_bh(path)->b_blocknr) {
4699                /*
4700                 * Ok, ocfs2_find_path() returned us the rightmost
4701                 * tree path. This might be an appending insert. There are
4702                 * two cases:
4703                 *    1) We're doing a true append at the tail:
4704                 *      -This might even be off the end of the leaf
4705                 *    2) We're "appending" by rotating in the tail
4706                 */
4707                ocfs2_figure_appending_type(insert, el, insert_rec);
4708        }
4709
4710out:
4711        ocfs2_free_path(path);
4712
4713        if (ret == 0)
4714                *last_eb_bh = bh;
4715        else
4716                brelse(bh);
4717        return ret;
4718}
4719
4720/*
4721 * Insert an extent into a btree.
4722 *
4723 * The caller needs to update the owning btree's cluster count.
4724 */
4725int ocfs2_insert_extent(handle_t *handle,
4726                        struct ocfs2_extent_tree *et,
4727                        u32 cpos,
4728                        u64 start_blk,
4729                        u32 new_clusters,
4730                        u8 flags,
4731                        struct ocfs2_alloc_context *meta_ac)
4732{
4733        int status;
4734        int uninitialized_var(free_records);
4735        struct buffer_head *last_eb_bh = NULL;
4736        struct ocfs2_insert_type insert = {0, };
4737        struct ocfs2_extent_rec rec;
4738
4739        trace_ocfs2_insert_extent_start(
4740                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4741                cpos, new_clusters);
4742
4743        memset(&rec, 0, sizeof(rec));
4744        rec.e_cpos = cpu_to_le32(cpos);
4745        rec.e_blkno = cpu_to_le64(start_blk);
4746        rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4747        rec.e_flags = flags;
4748        status = ocfs2_et_insert_check(et, &rec);
4749        if (status) {
4750                mlog_errno(status);
4751                goto bail;
4752        }
4753
4754        status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4755                                          &free_records, &insert);
4756        if (status < 0) {
4757                mlog_errno(status);
4758                goto bail;
4759        }
4760
4761        trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
4762                                  insert.ins_contig_index, free_records,
4763                                  insert.ins_tree_depth);
4764
4765        if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4766                status = ocfs2_grow_tree(handle, et,
4767                                         &insert.ins_tree_depth, &last_eb_bh,
4768                                         meta_ac);
4769                if (status) {
4770                        mlog_errno(status);
4771                        goto bail;
4772                }
4773        }
4774
4775        /* Finally, we can add clusters. This might rotate the tree for us. */
4776        status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4777        if (status < 0)
4778                mlog_errno(status);
4779        else
4780                ocfs2_et_extent_map_insert(et, &rec);
4781
4782bail:
4783        brelse(last_eb_bh);
4784
4785        return status;
4786}
4787
4788/*
4789 * Allcate and add clusters into the extent b-tree.
4790 * The new clusters(clusters_to_add) will be inserted at logical_offset.
4791 * The extent b-tree's root is specified by et, and
4792 * it is not limited to the file storage. Any extent tree can use this
4793 * function if it implements the proper ocfs2_extent_tree.
4794 */
4795int ocfs2_add_clusters_in_btree(handle_t *handle,
4796                                struct ocfs2_extent_tree *et,
4797                                u32 *logical_offset,
4798                                u32 clusters_to_add,
4799                                int mark_unwritten,
4800                                struct ocfs2_alloc_context *data_ac,
4801                                struct ocfs2_alloc_context *meta_ac,
4802                                enum ocfs2_alloc_restarted *reason_ret)
4803{
4804        int status = 0, err = 0;
4805        int need_free = 0;
4806        int free_extents;
4807        enum ocfs2_alloc_restarted reason = RESTART_NONE;
4808        u32 bit_off, num_bits;
4809        u64 block;
4810        u8 flags = 0;
4811        struct ocfs2_super *osb =
4812                OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4813
4814        BUG_ON(!clusters_to_add);
4815
4816        if (mark_unwritten)
4817                flags = OCFS2_EXT_UNWRITTEN;
4818
4819        free_extents = ocfs2_num_free_extents(et);
4820        if (free_extents < 0) {
4821                status = free_extents;
4822                mlog_errno(status);
4823                goto leave;
4824        }
4825
4826        /* there are two cases which could cause us to EAGAIN in the
4827         * we-need-more-metadata case:
4828         * 1) we haven't reserved *any*
4829         * 2) we are so fragmented, we've needed to add metadata too
4830         *    many times. */
4831        if (!free_extents && !meta_ac) {
4832                err = -1;
4833                status = -EAGAIN;
4834                reason = RESTART_META;
4835                goto leave;
4836        } else if ((!free_extents)
4837                   && (ocfs2_alloc_context_bits_left(meta_ac)
4838                       < ocfs2_extend_meta_needed(et->et_root_el))) {
4839                err = -2;
4840                status = -EAGAIN;
4841                reason = RESTART_META;
4842                goto leave;
4843        }
4844
4845        status = __ocfs2_claim_clusters(handle, data_ac, 1,
4846                                        clusters_to_add, &bit_off, &num_bits);
4847        if (status < 0) {
4848                if (status != -ENOSPC)
4849                        mlog_errno(status);
4850                goto leave;
4851        }
4852
4853        BUG_ON(num_bits > clusters_to_add);
4854
4855        /* reserve our write early -- insert_extent may update the tree root */
4856        status = ocfs2_et_root_journal_access(handle, et,
4857                                              OCFS2_JOURNAL_ACCESS_WRITE);
4858        if (status < 0) {
4859                mlog_errno(status);
4860                need_free = 1;
4861                goto bail;
4862        }
4863
4864        block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4865        trace_ocfs2_add_clusters_in_btree(
4866             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4867             bit_off, num_bits);
4868        status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4869                                     num_bits, flags, meta_ac);
4870        if (status < 0) {
4871                mlog_errno(status);
4872                need_free = 1;
4873                goto bail;
4874        }
4875
4876        ocfs2_journal_dirty(handle, et->et_root_bh);
4877
4878        clusters_to_add -= num_bits;
4879        *logical_offset += num_bits;
4880
4881        if (clusters_to_add) {
4882                err = clusters_to_add;
4883                status = -EAGAIN;
4884                reason = RESTART_TRANS;
4885        }
4886
4887bail:
4888        if (need_free) {
4889                if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
4890                        ocfs2_free_local_alloc_bits(osb, handle, data_ac,
4891                                        bit_off, num_bits);
4892                else
4893                        ocfs2_free_clusters(handle,
4894                                        data_ac->ac_inode,
4895                                        data_ac->ac_bh,
4896                                        ocfs2_clusters_to_blocks(osb->sb, bit_off),
4897                                        num_bits);
4898        }
4899
4900leave:
4901        if (reason_ret)
4902                *reason_ret = reason;
4903        trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
4904        return status;
4905}
4906
4907static void ocfs2_make_right_split_rec(struct super_block *sb,
4908                                       struct ocfs2_extent_rec *split_rec,
4909                                       u32 cpos,
4910                                       struct ocfs2_extent_rec *rec)
4911{
4912        u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4913        u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4914
4915        memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4916
4917        split_rec->e_cpos = cpu_to_le32(cpos);
4918        split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4919
4920        split_rec->e_blkno = rec->e_blkno;
4921        le64_add_cpu(&split_rec->e_blkno,
4922                     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4923
4924        split_rec->e_flags = rec->e_flags;
4925}
4926
4927static int ocfs2_split_and_insert(handle_t *handle,
4928                                  struct ocfs2_extent_tree *et,
4929                                  struct ocfs2_path *path,
4930                                  struct buffer_head **last_eb_bh,
4931                                  int split_index,
4932                                  struct ocfs2_extent_rec *orig_split_rec,
4933                                  struct ocfs2_alloc_context *meta_ac)
4934{
4935        int ret = 0, depth;
4936        unsigned int insert_range, rec_range, do_leftright = 0;
4937        struct ocfs2_extent_rec tmprec;
4938        struct ocfs2_extent_list *rightmost_el;
4939        struct ocfs2_extent_rec rec;
4940        struct ocfs2_extent_rec split_rec = *orig_split_rec;
4941        struct ocfs2_insert_type insert;
4942        struct ocfs2_extent_block *eb;
4943
4944leftright:
4945        /*
4946         * Store a copy of the record on the stack - it might move
4947         * around as the tree is manipulated below.
4948         */
4949        rec = path_leaf_el(path)->l_recs[split_index];
4950
4951        rightmost_el = et->et_root_el;
4952
4953        depth = le16_to_cpu(rightmost_el->l_tree_depth);
4954        if (depth) {
4955                BUG_ON(!(*last_eb_bh));
4956                eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4957                rightmost_el = &eb->h_list;
4958        }
4959
4960        if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4961            le16_to_cpu(rightmost_el->l_count)) {
4962                ret = ocfs2_grow_tree(handle, et,
4963                                      &depth, last_eb_bh, meta_ac);
4964                if (ret) {
4965                        mlog_errno(ret);
4966                        goto out;
4967                }
4968        }
4969
4970        memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4971        insert.ins_appending = APPEND_NONE;
4972        insert.ins_contig = CONTIG_NONE;
4973        insert.ins_tree_depth = depth;
4974
4975        insert_range = le32_to_cpu(split_rec.e_cpos) +
4976                le16_to_cpu(split_rec.e_leaf_clusters);
4977        rec_range = le32_to_cpu(rec.e_cpos) +
4978                le16_to_cpu(rec.e_leaf_clusters);
4979
4980        if (split_rec.e_cpos == rec.e_cpos) {
4981                insert.ins_split = SPLIT_LEFT;
4982        } else if (insert_range == rec_range) {
4983                insert.ins_split = SPLIT_RIGHT;
4984        } else {
4985                /*
4986                 * Left/right split. We fake this as a right split
4987                 * first and then make a second pass as a left split.
4988                 */
4989                insert.ins_split = SPLIT_RIGHT;
4990
4991                ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4992                                           &tmprec, insert_range, &rec);
4993
4994                split_rec = tmprec;
4995
4996                BUG_ON(do_leftright);
4997                do_leftright = 1;
4998        }
4999
5000        ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5001        if (ret) {
5002                mlog_errno(ret);
5003                goto out;
5004        }
5005
5006        if (do_leftright == 1) {
5007                u32 cpos;
5008                struct ocfs2_extent_list *el;
5009
5010                do_leftright++;
5011                split_rec = *orig_split_rec;
5012
5013                ocfs2_reinit_path(path, 1);
5014
5015                cpos = le32_to_cpu(split_rec.e_cpos);
5016                ret = ocfs2_find_path(et->et_ci, path, cpos);
5017                if (ret) {
5018                        mlog_errno(ret);
5019                        goto out;
5020                }
5021
5022                el = path_leaf_el(path);
5023                split_index = ocfs2_search_extent_list(el, cpos);
5024                if (split_index == -1) {
5025                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5026                                    "Owner %llu has an extent at cpos %u which can no longer be found\n",
5027                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5028                                    cpos);
5029                        ret = -EROFS;
5030                        goto out;
5031                }
5032                goto leftright;
5033        }
5034out:
5035
5036        return ret;
5037}
5038
5039static int ocfs2_replace_extent_rec(handle_t *handle,
5040                                    struct ocfs2_extent_tree *et,
5041                                    struct ocfs2_path *path,
5042                                    struct ocfs2_extent_list *el,
5043                                    int split_index,
5044                                    struct ocfs2_extent_rec *split_rec)
5045{
5046        int ret;
5047
5048        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
5049                                           path_num_items(path) - 1);
5050        if (ret) {
5051                mlog_errno(ret);
5052                goto out;
5053        }
5054
5055        el->l_recs[split_index] = *split_rec;
5056
5057        ocfs2_journal_dirty(handle, path_leaf_bh(path));
5058out:
5059        return ret;
5060}
5061
5062/*
5063 * Split part or all of the extent record at split_index in the leaf
5064 * pointed to by path. Merge with the contiguous extent record if needed.
5065 *
5066 * Care is taken to handle contiguousness so as to not grow the tree.
5067 *
5068 * meta_ac is not strictly necessary - we only truly need it if growth
5069 * of the tree is required. All other cases will degrade into a less
5070 * optimal tree layout.
5071 *
5072 * last_eb_bh should be the rightmost leaf block for any extent
5073 * btree. Since a split may grow the tree or a merge might shrink it,
5074 * the caller cannot trust the contents of that buffer after this call.
5075 *
5076 * This code is optimized for readability - several passes might be
5077 * made over certain portions of the tree. All of those blocks will
5078 * have been brought into cache (and pinned via the journal), so the
5079 * extra overhead is not expressed in terms of disk reads.
5080 */
5081int ocfs2_split_extent(handle_t *handle,
5082                       struct ocfs2_extent_tree *et,
5083                       struct ocfs2_path *path,
5084                       int split_index,
5085                       struct ocfs2_extent_rec *split_rec,
5086                       struct ocfs2_alloc_context *meta_ac,
5087                       struct ocfs2_cached_dealloc_ctxt *dealloc)
5088{
5089        int ret = 0;
5090        struct ocfs2_extent_list *el = path_leaf_el(path);
5091        struct buffer_head *last_eb_bh = NULL;
5092        struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5093        struct ocfs2_merge_ctxt ctxt;
5094
5095        if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5096            ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5097             (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5098                ret = -EIO;
5099                mlog_errno(ret);
5100                goto out;
5101        }
5102
5103        ret = ocfs2_figure_merge_contig_type(et, path, el,
5104                                             split_index,
5105                                             split_rec,
5106                                             &ctxt);
5107        if (ret) {
5108                mlog_errno(ret);
5109                goto out;
5110        }
5111
5112        /*
5113         * The core merge / split code wants to know how much room is
5114         * left in this allocation tree, so we pass the
5115         * rightmost extent list.
5116         */
5117        if (path->p_tree_depth) {
5118                struct ocfs2_extent_block *eb;
5119
5120                ret = ocfs2_read_extent_block(et->et_ci,
5121                                              ocfs2_et_get_last_eb_blk(et),
5122                                              &last_eb_bh);
5123                if (ret) {
5124                        mlog_errno(ret);
5125                        goto out;
5126                }
5127
5128                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5129        }
5130
5131        if (rec->e_cpos == split_rec->e_cpos &&
5132            rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5133                ctxt.c_split_covers_rec = 1;
5134        else
5135                ctxt.c_split_covers_rec = 0;
5136
5137        ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5138
5139        trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
5140                                 ctxt.c_has_empty_extent,
5141                                 ctxt.c_split_covers_rec);
5142
5143        if (ctxt.c_contig_type == CONTIG_NONE) {
5144                if (ctxt.c_split_covers_rec)
5145                        ret = ocfs2_replace_extent_rec(handle, et, path, el,
5146                                                       split_index, split_rec);
5147                else
5148                        ret = ocfs2_split_and_insert(handle, et, path,
5149                                                     &last_eb_bh, split_index,
5150                                                     split_rec, meta_ac);
5151                if (ret)
5152                        mlog_errno(ret);
5153        } else {
5154                ret = ocfs2_try_to_merge_extent(handle, et, path,
5155                                                split_index, split_rec,
5156                                                dealloc, &ctxt);
5157                if (ret)
5158                        mlog_errno(ret);
5159        }
5160
5161out:
5162        brelse(last_eb_bh);
5163        return ret;
5164}
5165
5166/*
5167 * Change the flags of the already-existing extent at cpos for len clusters.
5168 *
5169 * new_flags: the flags we want to set.
5170 * clear_flags: the flags we want to clear.
5171 * phys: the new physical offset we want this new extent starts from.
5172 *
5173 * If the existing extent is larger than the request, initiate a
5174 * split. An attempt will be made at merging with adjacent extents.
5175 *
5176 * The caller is responsible for passing down meta_ac if we'll need it.
5177 */
5178int ocfs2_change_extent_flag(handle_t *handle,
5179                             struct ocfs2_extent_tree *et,
5180                             u32 cpos, u32 len, u32 phys,
5181                             struct ocfs2_alloc_context *meta_ac,
5182                             struct ocfs2_cached_dealloc_ctxt *dealloc,
5183                             int new_flags, int clear_flags)
5184{
5185        int ret, index;
5186        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5187        u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5188        struct ocfs2_extent_rec split_rec;
5189        struct ocfs2_path *left_path = NULL;
5190        struct ocfs2_extent_list *el;
5191        struct ocfs2_extent_rec *rec;
5192
5193        left_path = ocfs2_new_path_from_et(et);
5194        if (!left_path) {
5195                ret = -ENOMEM;
5196                mlog_errno(ret);
5197                goto out;
5198        }
5199
5200        ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5201        if (ret) {
5202                mlog_errno(ret);
5203                goto out;
5204        }
5205        el = path_leaf_el(left_path);
5206
5207        index = ocfs2_search_extent_list(el, cpos);
5208        if (index == -1) {
5209                ocfs2_error(sb,
5210                            "Owner %llu has an extent at cpos %u which can no longer be found\n",
5211                            (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5212                            cpos);
5213                ret = -EROFS;
5214                goto out;
5215        }
5216
5217        ret = -EIO;
5218        rec = &el->l_recs[index];
5219        if (new_flags && (rec->e_flags & new_flags)) {
5220                mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5221                     "extent that already had them\n",
5222                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5223                     new_flags);
5224                goto out;
5225        }
5226
5227        if (clear_flags && !(rec->e_flags & clear_flags)) {
5228                mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5229                     "extent that didn't have them\n",
5230                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5231                     clear_flags);
5232                goto out;
5233        }
5234
5235        memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5236        split_rec.e_cpos = cpu_to_le32(cpos);
5237        split_rec.e_leaf_clusters = cpu_to_le16(len);
5238        split_rec.e_blkno = cpu_to_le64(start_blkno);
5239        split_rec.e_flags = rec->e_flags;
5240        if (new_flags)
5241                split_rec.e_flags |= new_flags;
5242        if (clear_flags)
5243                split_rec.e_flags &= ~clear_flags;
5244
5245        ret = ocfs2_split_extent(handle, et, left_path,
5246                                 index, &split_rec, meta_ac,
5247                                 dealloc);
5248        if (ret)
5249                mlog_errno(ret);
5250
5251out:
5252        ocfs2_free_path(left_path);
5253        return ret;
5254
5255}
5256
5257/*
5258 * Mark the already-existing extent at cpos as written for len clusters.
5259 * This removes the unwritten extent flag.
5260 *
5261 * If the existing extent is larger than the request, initiate a
5262 * split. An attempt will be made at merging with adjacent extents.
5263 *
5264 * The caller is responsible for passing down meta_ac if we'll need it.
5265 */
5266int ocfs2_mark_extent_written(struct inode *inode,
5267                              struct ocfs2_extent_tree *et,
5268                              handle_t *handle, u32 cpos, u32 len, u32 phys,
5269                              struct ocfs2_alloc_context *meta_ac,
5270                              struct ocfs2_cached_dealloc_ctxt *dealloc)
5271{
5272        int ret;
5273
5274        trace_ocfs2_mark_extent_written(
5275                (unsigned long long)OCFS2_I(inode)->ip_blkno,
5276                cpos, len, phys);
5277
5278        if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5279                ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
5280                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
5281                ret = -EROFS;
5282                goto out;
5283        }
5284
5285        /*
5286         * XXX: This should be fixed up so that we just re-insert the
5287         * next extent records.
5288         */
5289        ocfs2_et_extent_map_truncate(et, 0);
5290
5291        ret = ocfs2_change_extent_flag(handle, et, cpos,
5292                                       len, phys, meta_ac, dealloc,
5293                                       0, OCFS2_EXT_UNWRITTEN);
5294        if (ret)
5295                mlog_errno(ret);
5296
5297out:
5298        return ret;
5299}
5300
5301static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5302                            struct ocfs2_path *path,
5303                            int index, u32 new_range,
5304                            struct ocfs2_alloc_context *meta_ac)
5305{
5306        int ret, depth, credits;
5307        struct buffer_head *last_eb_bh = NULL;
5308        struct ocfs2_extent_block *eb;
5309        struct ocfs2_extent_list *rightmost_el, *el;
5310        struct ocfs2_extent_rec split_rec;
5311        struct ocfs2_extent_rec *rec;
5312        struct ocfs2_insert_type insert;
5313
5314        /*
5315         * Setup the record to split before we grow the tree.
5316         */
5317        el = path_leaf_el(path);
5318        rec = &el->l_recs[index];
5319        ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5320                                   &split_rec, new_range, rec);
5321
5322        depth = path->p_tree_depth;
5323        if (depth > 0) {
5324                ret = ocfs2_read_extent_block(et->et_ci,
5325                                              ocfs2_et_get_last_eb_blk(et),
5326                                              &last_eb_bh);
5327                if (ret < 0) {
5328                        mlog_errno(ret);
5329                        goto out;
5330                }
5331
5332                eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5333                rightmost_el = &eb->h_list;
5334        } else
5335                rightmost_el = path_leaf_el(path);
5336
5337        credits = path->p_tree_depth +
5338                  ocfs2_extend_meta_needed(et->et_root_el);
5339        ret = ocfs2_extend_trans(handle, credits);
5340        if (ret) {
5341                mlog_errno(ret);
5342                goto out;
5343        }
5344
5345        if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5346            le16_to_cpu(rightmost_el->l_count)) {
5347                ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5348                                      meta_ac);
5349                if (ret) {
5350                        mlog_errno(ret);
5351                        goto out;
5352                }
5353        }
5354
5355        memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5356        insert.ins_appending = APPEND_NONE;
5357        insert.ins_contig = CONTIG_NONE;
5358        insert.ins_split = SPLIT_RIGHT;
5359        insert.ins_tree_depth = depth;
5360
5361        ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5362        if (ret)
5363                mlog_errno(ret);
5364
5365out:
5366        brelse(last_eb_bh);
5367        return ret;
5368}
5369
5370static int ocfs2_truncate_rec(handle_t *handle,
5371                              struct ocfs2_extent_tree *et,
5372                              struct ocfs2_path *path, int index,
5373                              struct ocfs2_cached_dealloc_ctxt *dealloc,
5374                              u32 cpos, u32 len)
5375{
5376        int ret;
5377        u32 left_cpos, rec_range, trunc_range;
5378        int is_rightmost_tree_rec = 0;
5379        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5380        struct ocfs2_path *left_path = NULL;
5381        struct ocfs2_extent_list *el = path_leaf_el(path);
5382        struct ocfs2_extent_rec *rec;
5383        struct ocfs2_extent_block *eb;
5384
5385        if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5386                /* extend credit for ocfs2_remove_rightmost_path */
5387                ret = ocfs2_extend_rotate_transaction(handle, 0,
5388                                handle->h_buffer_credits,
5389                                path);
5390                if (ret) {
5391                        mlog_errno(ret);
5392                        goto out;
5393                }
5394
5395                ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5396                if (ret) {
5397                        mlog_errno(ret);
5398                        goto out;
5399                }
5400
5401                index--;
5402        }
5403
5404        if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5405            path->p_tree_depth) {
5406                /*
5407                 * Check whether this is the rightmost tree record. If
5408                 * we remove all of this record or part of its right
5409                 * edge then an update of the record lengths above it
5410                 * will be required.
5411                 */
5412                eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5413                if (eb->h_next_leaf_blk == 0)
5414                        is_rightmost_tree_rec = 1;
5415        }
5416
5417        rec = &el->l_recs[index];
5418        if (index == 0 && path->p_tree_depth &&
5419            le32_to_cpu(rec->e_cpos) == cpos) {
5420                /*
5421                 * Changing the leftmost offset (via partial or whole
5422                 * record truncate) of an interior (or rightmost) path
5423                 * means we have to update the subtree that is formed
5424                 * by this leaf and the one to it's left.
5425                 *
5426                 * There are two cases we can skip:
5427                 *   1) Path is the leftmost one in our btree.
5428                 *   2) The leaf is rightmost and will be empty after
5429                 *      we remove the extent record - the rotate code
5430                 *      knows how to update the newly formed edge.
5431                 */
5432
5433                ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5434                if (ret) {
5435                        mlog_errno(ret);
5436                        goto out;
5437                }
5438
5439                if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5440                        left_path = ocfs2_new_path_from_path(path);
5441                        if (!left_path) {
5442                                ret = -ENOMEM;
5443                                mlog_errno(ret);
5444                                goto out;
5445                        }
5446
5447                        ret = ocfs2_find_path(et->et_ci, left_path,
5448                                              left_cpos);
5449                        if (ret) {
5450                                mlog_errno(ret);
5451                                goto out;
5452                        }
5453                }
5454        }
5455
5456        ret = ocfs2_extend_rotate_transaction(handle, 0,
5457                                              handle->h_buffer_credits,
5458                                              path);
5459        if (ret) {
5460                mlog_errno(ret);
5461                goto out;
5462        }
5463
5464        ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5465        if (ret) {
5466                mlog_errno(ret);
5467                goto out;
5468        }
5469
5470        ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5471        if (ret) {
5472                mlog_errno(ret);
5473                goto out;
5474        }
5475
5476        rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5477        trunc_range = cpos + len;
5478
5479        if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5480                int next_free;
5481
5482                memset(rec, 0, sizeof(*rec));
5483                ocfs2_cleanup_merge(el, index);
5484
5485                next_free = le16_to_cpu(el->l_next_free_rec);
5486                if (is_rightmost_tree_rec && next_free > 1) {
5487                        /*
5488                         * We skip the edge update if this path will
5489                         * be deleted by the rotate code.
5490                         */
5491                        rec = &el->l_recs[next_free - 1];
5492                        ocfs2_adjust_rightmost_records(handle, et, path,
5493                                                       rec);
5494                }
5495        } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5496                /* Remove leftmost portion of the record. */
5497                le32_add_cpu(&rec->e_cpos, len);
5498                le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5499                le16_add_cpu(&rec->e_leaf_clusters, -len);
5500        } else if (rec_range == trunc_range) {
5501                /* Remove rightmost portion of the record */
5502                le16_add_cpu(&rec->e_leaf_clusters, -len);
5503                if (is_rightmost_tree_rec)
5504                        ocfs2_adjust_rightmost_records(handle, et, path, rec);
5505        } else {
5506                /* Caller should have trapped this. */
5507                mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5508                     "(%u, %u)\n",
5509                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5510                     le32_to_cpu(rec->e_cpos),
5511                     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5512                BUG();
5513        }
5514
5515        if (left_path) {
5516                int subtree_index;
5517
5518                subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5519                ocfs2_complete_edge_insert(handle, left_path, path,
5520                                           subtree_index);
5521        }
5522
5523        ocfs2_journal_dirty(handle, path_leaf_bh(path));
5524
5525        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5526        if (ret) {
5527                mlog_errno(ret);
5528                goto out;
5529        }
5530
5531out:
5532        ocfs2_free_path(left_path);
5533        return ret;
5534}
5535
5536int ocfs2_remove_extent(handle_t *handle,
5537                        struct ocfs2_extent_tree *et,
5538                        u32 cpos, u32 len,
5539                        struct ocfs2_alloc_context *meta_ac,
5540                        struct ocfs2_cached_dealloc_ctxt *dealloc)
5541{
5542        int ret, index;
5543        u32 rec_range, trunc_range;
5544        struct ocfs2_extent_rec *rec;
5545        struct ocfs2_extent_list *el;
5546        struct ocfs2_path *path = NULL;
5547
5548        /*
5549         * XXX: Why are we truncating to 0 instead of wherever this
5550         * affects us?
5551         */
5552        ocfs2_et_extent_map_truncate(et, 0);
5553
5554        path = ocfs2_new_path_from_et(et);
5555        if (!path) {
5556                ret = -ENOMEM;
5557                mlog_errno(ret);
5558                goto out;
5559        }
5560
5561        ret = ocfs2_find_path(et->et_ci, path, cpos);
5562        if (ret) {
5563                mlog_errno(ret);
5564                goto out;
5565        }
5566
5567        el = path_leaf_el(path);
5568        index = ocfs2_search_extent_list(el, cpos);
5569        if (index == -1) {
5570                ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5571                            "Owner %llu has an extent at cpos %u which can no longer be found\n",
5572                            (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5573                            cpos);
5574                ret = -EROFS;
5575                goto out;
5576        }
5577
5578        /*
5579         * We have 3 cases of extent removal:
5580         *   1) Range covers the entire extent rec
5581         *   2) Range begins or ends on one edge of the extent rec
5582         *   3) Range is in the middle of the extent rec (no shared edges)
5583         *
5584         * For case 1 we remove the extent rec and left rotate to
5585         * fill the hole.
5586         *
5587         * For case 2 we just shrink the existing extent rec, with a
5588         * tree update if the shrinking edge is also the edge of an
5589         * extent block.
5590         *
5591         * For case 3 we do a right split to turn the extent rec into
5592         * something case 2 can handle.
5593         */
5594        rec = &el->l_recs[index];
5595        rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5596        trunc_range = cpos + len;
5597
5598        BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5599
5600        trace_ocfs2_remove_extent(
5601                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5602                cpos, len, index, le32_to_cpu(rec->e_cpos),
5603                ocfs2_rec_clusters(el, rec));
5604
5605        if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5606                ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5607                                         cpos, len);
5608                if (ret) {
5609                        mlog_errno(ret);
5610                        goto out;
5611                }
5612        } else {
5613                ret = ocfs2_split_tree(handle, et, path, index,
5614                                       trunc_range, meta_ac);
5615                if (ret) {
5616                        mlog_errno(ret);
5617                        goto out;
5618                }
5619
5620                /*
5621                 * The split could have manipulated the tree enough to
5622                 * move the record location, so we have to look for it again.
5623                 */
5624                ocfs2_reinit_path(path, 1);
5625
5626                ret = ocfs2_find_path(et->et_ci, path, cpos);
5627                if (ret) {
5628                        mlog_errno(ret);
5629                        goto out;
5630                }
5631
5632                el = path_leaf_el(path);
5633                index = ocfs2_search_extent_list(el, cpos);
5634                if (index == -1) {
5635                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5636                                    "Owner %llu: split at cpos %u lost record\n",
5637                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5638                                    cpos);
5639                        ret = -EROFS;
5640                        goto out;
5641                }
5642
5643                /*
5644                 * Double check our values here. If anything is fishy,
5645                 * it's easier to catch it at the top level.
5646                 */
5647                rec = &el->l_recs[index];
5648                rec_range = le32_to_cpu(rec->e_cpos) +
5649                        ocfs2_rec_clusters(el, rec);
5650                if (rec_range != trunc_range) {
5651                        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5652                                    "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
5653                                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5654                                    cpos, len, le32_to_cpu(rec->e_cpos),
5655                                    ocfs2_rec_clusters(el, rec));
5656                        ret = -EROFS;
5657                        goto out;
5658                }
5659
5660                ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5661                                         cpos, len);
5662                if (ret) {
5663                        mlog_errno(ret);
5664                        goto out;
5665                }
5666        }
5667
5668out:
5669        ocfs2_free_path(path);
5670        return ret;
5671}
5672
5673/*
5674 * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
5675 * same as ocfs2_lock_alloctors(), except for it accepts a blocks
5676 * number to reserve some extra blocks, and it only handles meta
5677 * data allocations.
5678 *
5679 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5680 * and punching holes.
5681 */
5682static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5683                                              struct ocfs2_extent_tree *et,
5684                                              u32 extents_to_split,
5685                                              struct ocfs2_alloc_context **ac,
5686                                              int extra_blocks)
5687{
5688        int ret = 0, num_free_extents;
5689        unsigned int max_recs_needed = 2 * extents_to_split;
5690        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5691
5692        *ac = NULL;
5693
5694        num_free_extents = ocfs2_num_free_extents(et);
5695        if (num_free_extents < 0) {
5696                ret = num_free_extents;
5697                mlog_errno(ret);
5698                goto out;
5699        }
5700
5701        if (!num_free_extents ||
5702            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
5703                extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
5704
5705        if (extra_blocks) {
5706                ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, ac);
5707                if (ret < 0) {
5708                        if (ret != -ENOSPC)
5709                                mlog_errno(ret);
5710                        goto out;
5711                }
5712        }
5713
5714out:
5715        if (ret) {
5716                if (*ac) {
5717                        ocfs2_free_alloc_context(*ac);
5718                        *ac = NULL;
5719                }
5720        }
5721
5722        return ret;
5723}
5724
5725int ocfs2_remove_btree_range(struct inode *inode,
5726                             struct ocfs2_extent_tree *et,
5727                             u32 cpos, u32 phys_cpos, u32 len, int flags,
5728                             struct ocfs2_cached_dealloc_ctxt *dealloc,
5729                             u64 refcount_loc, bool refcount_tree_locked)
5730{
5731        int ret, credits = 0, extra_blocks = 0;
5732        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5733        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5734        struct inode *tl_inode = osb->osb_tl_inode;
5735        handle_t *handle;
5736        struct ocfs2_alloc_context *meta_ac = NULL;
5737        struct ocfs2_refcount_tree *ref_tree = NULL;
5738
5739        if ((flags & OCFS2_EXT_REFCOUNTED) && len) {
5740                BUG_ON(!ocfs2_is_refcount_inode(inode));
5741
5742                if (!refcount_tree_locked) {
5743                        ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
5744                                                       &ref_tree, NULL);
5745                        if (ret) {
5746                                mlog_errno(ret);
5747                                goto bail;
5748                        }
5749                }
5750
5751                ret = ocfs2_prepare_refcount_change_for_del(inode,
5752                                                            refcount_loc,
5753                                                            phys_blkno,
5754                                                            len,
5755                                                            &credits,
5756                                                            &extra_blocks);
5757                if (ret < 0) {
5758                        mlog_errno(ret);
5759                        goto bail;
5760                }
5761        }
5762
5763        ret = ocfs2_reserve_blocks_for_rec_trunc(inode, et, 1, &meta_ac,
5764                                                 extra_blocks);
5765        if (ret) {
5766                mlog_errno(ret);
5767                goto bail;
5768        }
5769
5770        inode_lock(tl_inode);
5771
5772        if (ocfs2_truncate_log_needs_flush(osb)) {
5773                ret = __ocfs2_flush_truncate_log(osb);
5774                if (ret < 0) {
5775                        mlog_errno(ret);
5776                        goto out;
5777                }
5778        }
5779
5780        handle = ocfs2_start_trans(osb,
5781                        ocfs2_remove_extent_credits(osb->sb) + credits);
5782        if (IS_ERR(handle)) {
5783                ret = PTR_ERR(handle);
5784                mlog_errno(ret);
5785                goto out;
5786        }
5787
5788        ret = ocfs2_et_root_journal_access(handle, et,
5789                                           OCFS2_JOURNAL_ACCESS_WRITE);
5790        if (ret) {
5791                mlog_errno(ret);
5792                goto out_commit;
5793        }
5794
5795        dquot_free_space_nodirty(inode,
5796                                  ocfs2_clusters_to_bytes(inode->i_sb, len));
5797
5798        ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5799        if (ret) {
5800                mlog_errno(ret);
5801                goto out_commit;
5802        }
5803
5804        ocfs2_et_update_clusters(et, -len);
5805        ocfs2_update_inode_fsync_trans(handle, inode, 1);
5806
5807        ocfs2_journal_dirty(handle, et->et_root_bh);
5808
5809        if (phys_blkno) {
5810                if (flags & OCFS2_EXT_REFCOUNTED)
5811                        ret = ocfs2_decrease_refcount(inode, handle,
5812                                        ocfs2_blocks_to_clusters(osb->sb,
5813                                                                 phys_blkno),
5814                                        len, meta_ac,
5815                                        dealloc, 1);
5816                else
5817                        ret = ocfs2_truncate_log_append(osb, handle,
5818                                                        phys_blkno, len);
5819                if (ret)
5820                        mlog_errno(ret);
5821
5822        }
5823
5824out_commit:
5825        ocfs2_commit_trans(osb, handle);
5826out:
5827        inode_unlock(tl_inode);
5828bail:
5829        if (meta_ac)
5830                ocfs2_free_alloc_context(meta_ac);
5831
5832        if (ref_tree)
5833                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
5834
5835        return ret;
5836}
5837
5838int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5839{
5840        struct buffer_head *tl_bh = osb->osb_tl_bh;
5841        struct ocfs2_dinode *di;
5842        struct ocfs2_truncate_log *tl;
5843
5844        di = (struct ocfs2_dinode *) tl_bh->b_data;
5845        tl = &di->id2.i_dealloc;
5846
5847        mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5848                        "slot %d, invalid truncate log parameters: used = "
5849                        "%u, count = %u\n", osb->slot_num,
5850                        le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5851        return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5852}
5853
5854static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5855                                           unsigned int new_start)
5856{
5857        unsigned int tail_index;
5858        unsigned int current_tail;
5859
5860        /* No records, nothing to coalesce */
5861        if (!le16_to_cpu(tl->tl_used))
5862                return 0;
5863
5864        tail_index = le16_to_cpu(tl->tl_used) - 1;
5865        current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5866        current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5867
5868        return current_tail == new_start;
5869}
5870
5871int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5872                              handle_t *handle,
5873                              u64 start_blk,
5874                              unsigned int num_clusters)
5875{
5876        int status, index;
5877        unsigned int start_cluster, tl_count;
5878        struct inode *tl_inode = osb->osb_tl_inode;
5879        struct buffer_head *tl_bh = osb->osb_tl_bh;
5880        struct ocfs2_dinode *di;
5881        struct ocfs2_truncate_log *tl;
5882
5883        BUG_ON(inode_trylock(tl_inode));
5884
5885        start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5886
5887        di = (struct ocfs2_dinode *) tl_bh->b_data;
5888
5889        /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5890         * by the underlying call to ocfs2_read_inode_block(), so any
5891         * corruption is a code bug */
5892        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5893
5894        tl = &di->id2.i_dealloc;
5895        tl_count = le16_to_cpu(tl->tl_count);
5896        mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5897                        tl_count == 0,
5898                        "Truncate record count on #%llu invalid "
5899                        "wanted %u, actual %u\n",
5900                        (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5901                        ocfs2_truncate_recs_per_inode(osb->sb),
5902                        le16_to_cpu(tl->tl_count));
5903
5904        /* Caller should have known to flush before calling us. */
5905        index = le16_to_cpu(tl->tl_used);
5906        if (index >= tl_count) {
5907                status = -ENOSPC;
5908                mlog_errno(status);
5909                goto bail;
5910        }
5911
5912        status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5913                                         OCFS2_JOURNAL_ACCESS_WRITE);
5914        if (status < 0) {
5915                mlog_errno(status);
5916                goto bail;
5917        }
5918
5919        trace_ocfs2_truncate_log_append(
5920                (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
5921                start_cluster, num_clusters);
5922        if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5923                /*
5924                 * Move index back to the record we are coalescing with.
5925                 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5926                 */
5927                index--;
5928
5929                num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5930                trace_ocfs2_truncate_log_append(
5931                        (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5932                        index, le32_to_cpu(tl->tl_recs[index].t_start),
5933                        num_clusters);
5934        } else {
5935                tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5936                tl->tl_used = cpu_to_le16(index + 1);
5937        }
5938        tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5939
5940        ocfs2_journal_dirty(handle, tl_bh);
5941
5942        osb->truncated_clusters += num_clusters;
5943bail:
5944        return status;
5945}
5946
5947static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5948                                         struct inode *data_alloc_inode,
5949                                         struct buffer_head *data_alloc_bh)
5950{
5951        int status = 0;
5952        int i;
5953        unsigned int num_clusters;
5954        u64 start_blk;
5955        struct ocfs2_truncate_rec rec;
5956        struct ocfs2_dinode *di;
5957        struct ocfs2_truncate_log *tl;
5958        struct inode *tl_inode = osb->osb_tl_inode;
5959        struct buffer_head *tl_bh = osb->osb_tl_bh;
5960        handle_t *handle;
5961
5962        di = (struct ocfs2_dinode *) tl_bh->b_data;
5963        tl = &di->id2.i_dealloc;
5964        i = le16_to_cpu(tl->tl_used) - 1;
5965        while (i >= 0) {
5966                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5967                if (IS_ERR(handle)) {
5968                        status = PTR_ERR(handle);
5969                        mlog_errno(status);
5970                        goto bail;
5971                }
5972
5973                /* Caller has given us at least enough credits to
5974                 * update the truncate log dinode */
5975                status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5976                                                 OCFS2_JOURNAL_ACCESS_WRITE);
5977                if (status < 0) {
5978                        mlog_errno(status);
5979                        goto bail;
5980                }
5981
5982                tl->tl_used = cpu_to_le16(i);
5983
5984                ocfs2_journal_dirty(handle, tl_bh);
5985
5986                rec = tl->tl_recs[i];
5987                start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5988                                                    le32_to_cpu(rec.t_start));
5989                num_clusters = le32_to_cpu(rec.t_clusters);
5990
5991                /* if start_blk is not set, we ignore the record as
5992                 * invalid. */
5993                if (start_blk) {
5994                        trace_ocfs2_replay_truncate_records(
5995                                (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5996                                i, le32_to_cpu(rec.t_start), num_clusters);
5997
5998                        status = ocfs2_free_clusters(handle, data_alloc_inode,
5999                                                     data_alloc_bh, start_blk,
6000                                                     num_clusters);
6001                        if (status < 0) {
6002                                mlog_errno(status);
6003                                goto bail;
6004                        }
6005                }
6006
6007                ocfs2_commit_trans(osb, handle);
6008                i--;
6009        }
6010
6011        osb->truncated_clusters = 0;
6012
6013bail:
6014        return status;
6015}
6016
6017/* Expects you to already be holding tl_inode->i_mutex */
6018int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6019{
6020        int status;
6021        unsigned int num_to_flush;
6022        struct inode *tl_inode = osb->osb_tl_inode;
6023        struct inode *data_alloc_inode = NULL;
6024        struct buffer_head *tl_bh = osb->osb_tl_bh;
6025        struct buffer_head *data_alloc_bh = NULL;
6026        struct ocfs2_dinode *di;
6027        struct ocfs2_truncate_log *tl;
6028
6029        BUG_ON(inode_trylock(tl_inode));
6030
6031        di = (struct ocfs2_dinode *) tl_bh->b_data;
6032
6033        /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
6034         * by the underlying call to ocfs2_read_inode_block(), so any
6035         * corruption is a code bug */
6036        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6037
6038        tl = &di->id2.i_dealloc;
6039        num_to_flush = le16_to_cpu(tl->tl_used);
6040        trace_ocfs2_flush_truncate_log(
6041                (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
6042                num_to_flush);
6043        if (!num_to_flush) {
6044                status = 0;
6045                goto out;
6046        }
6047
6048        data_alloc_inode = ocfs2_get_system_file_inode(osb,
6049                                                       GLOBAL_BITMAP_SYSTEM_INODE,
6050                                                       OCFS2_INVALID_SLOT);
6051        if (!data_alloc_inode) {
6052                status = -EINVAL;
6053                mlog(ML_ERROR, "Could not get bitmap inode!\n");
6054                goto out;
6055        }
6056
6057        inode_lock(data_alloc_inode);
6058
6059        status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
6060        if (status < 0) {
6061                mlog_errno(status);
6062                goto out_mutex;
6063        }
6064
6065        status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
6066                                               data_alloc_bh);
6067        if (status < 0)
6068                mlog_errno(status);
6069
6070        brelse(data_alloc_bh);
6071        ocfs2_inode_unlock(data_alloc_inode, 1);
6072
6073out_mutex:
6074        inode_unlock(data_alloc_inode);
6075        iput(data_alloc_inode);
6076
6077out:
6078        return status;
6079}
6080
6081int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6082{
6083        int status;
6084        struct inode *tl_inode = osb->osb_tl_inode;
6085
6086        inode_lock(tl_inode);
6087        status = __ocfs2_flush_truncate_log(osb);
6088        inode_unlock(tl_inode);
6089
6090        return status;
6091}
6092
6093static void ocfs2_truncate_log_worker(struct work_struct *work)
6094{
6095        int status;
6096        struct ocfs2_super *osb =
6097                container_of(work, struct ocfs2_super,
6098                             osb_truncate_log_wq.work);
6099
6100        status = ocfs2_flush_truncate_log(osb);
6101        if (status < 0)
6102                mlog_errno(status);
6103        else
6104                ocfs2_init_steal_slots(osb);
6105}
6106
6107#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
6108void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
6109                                       int cancel)
6110{
6111        if (osb->osb_tl_inode &&
6112                        atomic_read(&osb->osb_tl_disable) == 0) {
6113                /* We want to push off log flushes while truncates are
6114                 * still running. */
6115                if (cancel)
6116                        cancel_delayed_work(&osb->osb_truncate_log_wq);
6117
6118                queue_delayed_work(osb->ocfs2_wq, &osb->osb_truncate_log_wq,
6119                                   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
6120        }
6121}
6122
6123/*
6124 * Try to flush truncate logs if we can free enough clusters from it.
6125 * As for return value, "< 0" means error, "0" no space and "1" means
6126 * we have freed enough spaces and let the caller try to allocate again.
6127 */
6128int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
6129                                        unsigned int needed)
6130{
6131        tid_t target;
6132        int ret = 0;
6133        unsigned int truncated_clusters;
6134
6135        inode_lock(osb->osb_tl_inode);
6136        truncated_clusters = osb->truncated_clusters;
6137        inode_unlock(osb->osb_tl_inode);
6138
6139        /*
6140         * Check whether we can succeed in allocating if we free
6141         * the truncate log.
6142         */
6143        if (truncated_clusters < needed)
6144                goto out;
6145
6146        ret = ocfs2_flush_truncate_log(osb);
6147        if (ret) {
6148                mlog_errno(ret);
6149                goto out;
6150        }
6151
6152        if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
6153                jbd2_log_wait_commit(osb->journal->j_journal, target);
6154                ret = 1;
6155        }
6156out:
6157        return ret;
6158}
6159
6160static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
6161                                       int slot_num,
6162                                       struct inode **tl_inode,
6163                                       struct buffer_head **tl_bh)
6164{
6165        int status;
6166        struct inode *inode = NULL;
6167        struct buffer_head *bh = NULL;
6168
6169        inode = ocfs2_get_system_file_inode(osb,
6170                                           TRUNCATE_LOG_SYSTEM_INODE,
6171                                           slot_num);
6172        if (!inode) {
6173                status = -EINVAL;
6174                mlog(ML_ERROR, "Could not get load truncate log inode!\n");
6175                goto bail;
6176        }
6177
6178        status = ocfs2_read_inode_block(inode, &bh);
6179        if (status < 0) {
6180                iput(inode);
6181                mlog_errno(status);
6182                goto bail;
6183        }
6184
6185        *tl_inode = inode;
6186        *tl_bh    = bh;
6187bail:
6188        return status;
6189}
6190
6191/* called during the 1st stage of node recovery. we stamp a clean
6192 * truncate log and pass back a copy for processing later. if the
6193 * truncate log does not require processing, a *tl_copy is set to
6194 * NULL. */
6195int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6196                                      int slot_num,
6197                                      struct ocfs2_dinode **tl_copy)
6198{
6199        int status;
6200        struct inode *tl_inode = NULL;
6201        struct buffer_head *tl_bh = NULL;
6202        struct ocfs2_dinode *di;
6203        struct ocfs2_truncate_log *tl;
6204
6205        *tl_copy = NULL;
6206
6207        trace_ocfs2_begin_truncate_log_recovery(slot_num);
6208
6209        status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6210        if (status < 0) {
6211                mlog_errno(status);
6212                goto bail;
6213        }
6214
6215        di = (struct ocfs2_dinode *) tl_bh->b_data;
6216
6217        /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
6218         * validated by the underlying call to ocfs2_read_inode_block(),
6219         * so any corruption is a code bug */
6220        BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6221
6222        tl = &di->id2.i_dealloc;
6223        if (le16_to_cpu(tl->tl_used)) {
6224                trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
6225
6226                *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6227                if (!(*tl_copy)) {
6228                        status = -ENOMEM;
6229                        mlog_errno(status);
6230                        goto bail;
6231                }
6232
6233                /* Assuming the write-out below goes well, this copy
6234                 * will be passed back to recovery for processing. */
6235                memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
6236
6237                /* All we need to do to clear the truncate log is set
6238                 * tl_used. */
6239                tl->tl_used = 0;
6240
6241                ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6242                status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6243                if (status < 0) {
6244                        mlog_errno(status);
6245                        goto bail;
6246                }
6247        }
6248
6249bail:
6250        iput(tl_inode);
6251        brelse(tl_bh);
6252
6253        if (status < 0) {
6254                kfree(*tl_copy);
6255                *tl_copy = NULL;
6256                mlog_errno(status);
6257        }
6258
6259        return status;
6260}
6261
6262int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6263                                         struct ocfs2_dinode *tl_copy)
6264{
6265        int status = 0;
6266        int i;
6267        unsigned int clusters, num_recs, start_cluster;
6268        u64 start_blk;
6269        handle_t *handle;
6270        struct inode *tl_inode = osb->osb_tl_inode;
6271        struct ocfs2_truncate_log *tl;
6272
6273        if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6274                mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6275                return -EINVAL;
6276        }
6277
6278        tl = &tl_copy->id2.i_dealloc;
6279        num_recs = le16_to_cpu(tl->tl_used);
6280        trace_ocfs2_complete_truncate_log_recovery(
6281                (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
6282                num_recs);
6283
6284        inode_lock(tl_inode);
6285        for(i = 0; i < num_recs; i++) {
6286                if (ocfs2_truncate_log_needs_flush(osb)) {
6287                        status = __ocfs2_flush_truncate_log(osb);
6288                        if (status < 0) {
6289                                mlog_errno(status);
6290                                goto bail_up;
6291                        }
6292                }
6293
6294                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6295                if (IS_ERR(handle)) {
6296                        status = PTR_ERR(handle);
6297                        mlog_errno(status);
6298                        goto bail_up;
6299                }
6300
6301                clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
6302                start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
6303                start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
6304
6305                status = ocfs2_truncate_log_append(osb, handle,
6306                                                   start_blk, clusters);
6307                ocfs2_commit_trans(osb, handle);
6308                if (status < 0) {
6309                        mlog_errno(status);
6310                        goto bail_up;
6311                }
6312        }
6313
6314bail_up:
6315        inode_unlock(tl_inode);
6316
6317        return status;
6318}
6319
6320void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6321{
6322        int status;
6323        struct inode *tl_inode = osb->osb_tl_inode;
6324
6325        atomic_set(&osb->osb_tl_disable, 1);
6326
6327        if (tl_inode) {
6328                cancel_delayed_work(&osb->osb_truncate_log_wq);
6329                flush_workqueue(osb->ocfs2_wq);
6330
6331                status = ocfs2_flush_truncate_log(osb);
6332                if (status < 0)
6333                        mlog_errno(status);
6334
6335                brelse(osb->osb_tl_bh);
6336                iput(osb->osb_tl_inode);
6337        }
6338}
6339
6340int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6341{
6342        int status;
6343        struct inode *tl_inode = NULL;
6344        struct buffer_head *tl_bh = NULL;
6345
6346        status = ocfs2_get_truncate_log_info(osb,
6347                                             osb->slot_num,
6348                                             &tl_inode,
6349                                             &tl_bh);
6350        if (status < 0)
6351                mlog_errno(status);
6352
6353        /* ocfs2_truncate_log_shutdown keys on the existence of
6354         * osb->osb_tl_inode so we don't set any of the osb variables
6355         * until we're sure all is well. */
6356        INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6357                          ocfs2_truncate_log_worker);
6358        atomic_set(&osb->osb_tl_disable, 0);
6359        osb->osb_tl_bh    = tl_bh;
6360        osb->osb_tl_inode = tl_inode;
6361
6362        return status;
6363}
6364
6365/*
6366 * Delayed de-allocation of suballocator blocks.
6367 *
6368 * Some sets of block de-allocations might involve multiple suballocator inodes.
6369 *
6370 * The locking for this can get extremely complicated, especially when
6371 * the suballocator inodes to delete from aren't known until deep
6372 * within an unrelated codepath.
6373 *
6374 * ocfs2_extent_block structures are a good example of this - an inode
6375 * btree could have been grown by any number of nodes each allocating
6376 * out of their own suballoc inode.
6377 *
6378 * These structures allow the delay of block de-allocation until a
6379 * later time, when locking of multiple cluster inodes won't cause
6380 * deadlock.
6381 */
6382
/*
 * Describe a single item freed from an allocator.  For the block
 * suballocators, it represents one block.  For the global cluster
 * allocator, it represents a run of clusters: free_blk is the first
 * block of the run and free_bit holds the number of clusters in it.
 */
struct ocfs2_cached_block_free {
        /* Next item on the same singly linked list, NULL at the tail. */
        struct ocfs2_cached_block_free          *free_next;
        /*
         * Suballocator group block.  When zero, the group is derived
         * from free_blk and free_bit at free time.
         */
        u64                                     free_bg;
        /* Block number of the freed block (or first block of the run). */
        u64                                     free_blk;
        /* Bit within the group, or cluster count for the global allocator. */
        unsigned int                            free_bit;
};
6395
/*
 * One list of cached block frees for a single (system inode type, slot)
 * pair.  Lists are chained off the dealloc context's
 * c_first_suballocator pointer.
 */
struct ocfs2_per_slot_free_list {
        /* Next per-slot list in the dealloc context, NULL at the tail. */
        struct ocfs2_per_slot_free_list         *f_next_suballocator;
        /* System file type used to look up the suballocator inode. */
        int                                     f_inode_type;
        /* Slot number used to look up the suballocator inode. */
        int                                     f_slot;
        /* Head of the cached frees for this suballocator, may be NULL. */
        struct ocfs2_cached_block_free          *f_first;
};
6402
6403static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6404                                    int sysfile_type,
6405                                    int slot,
6406                                    struct ocfs2_cached_block_free *head)
6407{
6408        int ret;
6409        u64 bg_blkno;
6410        handle_t *handle;
6411        struct inode *inode;
6412        struct buffer_head *di_bh = NULL;
6413        struct ocfs2_cached_block_free *tmp;
6414
6415        inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6416        if (!inode) {
6417                ret = -EINVAL;
6418                mlog_errno(ret);
6419                goto out;
6420        }
6421
6422        inode_lock(inode);
6423
6424        ret = ocfs2_inode_lock(inode, &di_bh, 1);
6425        if (ret) {
6426                mlog_errno(ret);
6427                goto out_mutex;
6428        }
6429
6430        while (head) {
6431                if (head->free_bg)
6432                        bg_blkno = head->free_bg;
6433                else
6434                        bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6435                                                              head->free_bit);
6436                handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6437                if (IS_ERR(handle)) {
6438                        ret = PTR_ERR(handle);
6439                        mlog_errno(ret);
6440                        goto out_unlock;
6441                }
6442
6443                trace_ocfs2_free_cached_blocks(
6444                     (unsigned long long)head->free_blk, head->free_bit);
6445
6446                ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6447                                               head->free_bit, bg_blkno, 1);
6448                if (ret)
6449                        mlog_errno(ret);
6450
6451                ocfs2_commit_trans(osb, handle);
6452
6453                tmp = head;
6454                head = head->free_next;
6455                kfree(tmp);
6456        }
6457
6458out_unlock:
6459        ocfs2_inode_unlock(inode, 1);
6460        brelse(di_bh);
6461out_mutex:
6462        inode_unlock(inode);
6463        iput(inode);
6464out:
6465        while(head) {
6466                /* Premature exit may have left some dangling items. */
6467                tmp = head;
6468                head = head->free_next;
6469                kfree(tmp);
6470        }
6471
6472        return ret;
6473}
6474
6475int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6476                                u64 blkno, unsigned int bit)
6477{
6478        int ret = 0;
6479        struct ocfs2_cached_block_free *item;
6480
6481        item = kzalloc(sizeof(*item), GFP_NOFS);
6482        if (item == NULL) {
6483                ret = -ENOMEM;
6484                mlog_errno(ret);
6485                return ret;
6486        }
6487
6488        trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
6489
6490        item->free_blk = blkno;
6491        item->free_bit = bit;
6492        item->free_next = ctxt->c_global_allocator;
6493
6494        ctxt->c_global_allocator = item;
6495        return ret;
6496}
6497
6498static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6499                                      struct ocfs2_cached_block_free *head)
6500{
6501        struct ocfs2_cached_block_free *tmp;
6502        struct inode *tl_inode = osb->osb_tl_inode;
6503        handle_t *handle;
6504        int ret = 0;
6505
6506        inode_lock(tl_inode);
6507
6508        while (head) {
6509                if (ocfs2_truncate_log_needs_flush(osb)) {
6510                        ret = __ocfs2_flush_truncate_log(osb);
6511                        if (ret < 0) {
6512                                mlog_errno(ret);
6513                                break;
6514                        }
6515                }
6516
6517                handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6518                if (IS_ERR(handle)) {
6519                        ret = PTR_ERR(handle);
6520                        mlog_errno(ret);
6521                        break;
6522                }
6523
6524                ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6525                                                head->free_bit);
6526
6527                ocfs2_commit_trans(osb, handle);
6528                tmp = head;
6529                head = head->free_next;
6530                kfree(tmp);
6531
6532                if (ret < 0) {
6533                        mlog_errno(ret);
6534                        break;
6535                }
6536        }
6537
6538        inode_unlock(tl_inode);
6539
6540        while (head) {
6541                /* Premature exit may have left some dangling items. */
6542                tmp = head;
6543                head = head->free_next;
6544                kfree(tmp);
6545        }
6546
6547        return ret;
6548}
6549
6550int ocfs2_run_deallocs(struct ocfs2_super *osb,
6551                       struct ocfs2_cached_dealloc_ctxt *ctxt)
6552{
6553        int ret = 0, ret2;
6554        struct ocfs2_per_slot_free_list *fl;
6555
6556        if (!ctxt)
6557                return 0;
6558
6559        while (ctxt->c_first_suballocator) {
6560                fl = ctxt->c_first_suballocator;
6561
6562                if (fl->f_first) {
6563                        trace_ocfs2_run_deallocs(fl->f_inode_type,
6564                                                 fl->f_slot);
6565                        ret2 = ocfs2_free_cached_blocks(osb,
6566                                                        fl->f_inode_type,
6567                                                        fl->f_slot,
6568                                                        fl->f_first);
6569                        if (ret2)
6570                                mlog_errno(ret2);
6571                        if (!ret)
6572                                ret = ret2;
6573                }
6574
6575                ctxt->c_first_suballocator = fl->f_next_suballocator;
6576                kfree(fl);
6577        }
6578
6579        if (ctxt->c_global_allocator) {
6580                ret2 = ocfs2_free_cached_clusters(osb,
6581                                                  ctxt->c_global_allocator);
6582                if (ret2)
6583                        mlog_errno(ret2);
6584                if (!ret)
6585                        ret = ret2;
6586
6587                ctxt->c_global_allocator = NULL;
6588        }
6589
6590        return ret;
6591}
6592
6593static struct ocfs2_per_slot_free_list *
6594ocfs2_find_per_slot_free_list(int type,
6595                              int slot,
6596                              struct ocfs2_cached_dealloc_ctxt *ctxt)
6597{
6598        struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6599
6600        while (fl) {
6601                if (fl->f_inode_type == type && fl->f_slot == slot)
6602                        return fl;
6603
6604                fl = fl->f_next_suballocator;
6605        }
6606
6607        fl = kmalloc(sizeof(*fl), GFP_NOFS);
6608        if (fl) {
6609                fl->f_inode_type = type;
6610                fl->f_slot = slot;
6611                fl->f_first = NULL;
6612                fl->f_next_suballocator = ctxt->c_first_suballocator;
6613
6614                ctxt->c_first_suballocator = fl;
6615        }
6616        return fl;
6617}
6618
6619static struct ocfs2_per_slot_free_list *
6620ocfs2_find_preferred_free_list(int type,
6621                               int preferred_slot,
6622                               int *real_slot,
6623                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6624{
6625        struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6626
6627        while (fl) {
6628                if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
6629                        *real_slot = fl->f_slot;
6630                        return fl;
6631                }
6632
6633                fl = fl->f_next_suballocator;
6634        }
6635
6636        /* If we can't find any free list matching preferred slot, just use
6637         * the first one.
6638         */
6639        fl = ctxt->c_first_suballocator;
6640        *real_slot = fl->f_slot;
6641
6642        return fl;
6643}
6644
6645/* Return Value 1 indicates empty */
6646static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
6647{
6648        struct ocfs2_per_slot_free_list *fl = NULL;
6649
6650        if (!et->et_dealloc)
6651                return 1;
6652
6653        fl = et->et_dealloc->c_first_suballocator;
6654        if (!fl)
6655                return 1;
6656
6657        if (!fl->f_first)
6658                return 1;
6659
6660        return 0;
6661}
6662
6663/* If extent was deleted from tree due to extent rotation and merging, and
6664 * no metadata is reserved ahead of time. Try to reuse some extents
6665 * just deleted. This is only used to reuse extent blocks.
6666 * It is supposed to find enough extent blocks in dealloc if our estimation
6667 * on metadata is accurate.
6668 */
6669static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
6670                                        struct ocfs2_extent_tree *et,
6671                                        struct buffer_head **new_eb_bh,
6672                                        int blk_wanted, int *blk_given)
6673{
6674        int i, status = 0, real_slot;
6675        struct ocfs2_cached_dealloc_ctxt *dealloc;
6676        struct ocfs2_per_slot_free_list *fl;
6677        struct ocfs2_cached_block_free *bf;
6678        struct ocfs2_extent_block *eb;
6679        struct ocfs2_super *osb =
6680                OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
6681
6682        *blk_given = 0;
6683
6684        /* If extent tree doesn't have a dealloc, this is not faulty. Just
6685         * tell upper caller dealloc can't provide any block and it should
6686         * ask for alloc to claim more space.
6687         */
6688        dealloc = et->et_dealloc;
6689        if (!dealloc)
6690                goto bail;
6691
6692        for (i = 0; i < blk_wanted; i++) {
6693                /* Prefer to use local slot */
6694                fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
6695                                                    osb->slot_num, &real_slot,
6696                                                    dealloc);
6697                /* If no more block can be reused, we should claim more
6698                 * from alloc. Just return here normally.
6699                 */
6700                if (!fl) {
6701                        status = 0;
6702                        break;
6703                }
6704
6705                bf = fl->f_first;
6706                fl->f_first = bf->free_next;
6707
6708                new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
6709                if (new_eb_bh[i] == NULL) {
6710                        status = -ENOMEM;
6711                        mlog_errno(status);
6712                        goto bail;
6713                }
6714
6715                mlog(0, "Reusing block(%llu) from "
6716                     "dealloc(local slot:%d, real slot:%d)\n",
6717                     bf->free_blk, osb->slot_num, real_slot);
6718
6719                ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
6720
6721                status = ocfs2_journal_access_eb(handle, et->et_ci,
6722                                                 new_eb_bh[i],
6723                                                 OCFS2_JOURNAL_ACCESS_CREATE);
6724                if (status < 0) {
6725                        mlog_errno(status);
6726                        goto bail;
6727                }
6728
6729                memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
6730                eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
6731
6732                /* We can't guarantee that buffer head is still cached, so
6733                 * polutlate the extent block again.
6734                 */
6735                strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
6736                eb->h_blkno = cpu_to_le64(bf->free_blk);
6737                eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
6738                eb->h_suballoc_slot = cpu_to_le16(real_slot);
6739                eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
6740                eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
6741                eb->h_list.l_count =
6742                        cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
6743
6744                /* We'll also be dirtied by the caller, so
6745                 * this isn't absolutely necessary.
6746                 */
6747                ocfs2_journal_dirty(handle, new_eb_bh[i]);
6748
6749                if (!fl->f_first) {
6750                        dealloc->c_first_suballocator = fl->f_next_suballocator;
6751                        kfree(fl);
6752                }
6753                kfree(bf);
6754        }
6755
6756        *blk_given = i;
6757
6758bail:
6759        if (unlikely(status < 0)) {
6760                for (i = 0; i < blk_wanted; i++)
6761                        brelse(new_eb_bh[i]);
6762        }
6763
6764        return status;
6765}
6766
6767int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6768                              int type, int slot, u64 suballoc,
6769                              u64 blkno, unsigned int bit)
6770{
6771        int ret;
6772        struct ocfs2_per_slot_free_list *fl;
6773        struct ocfs2_cached_block_free *item;
6774
6775        fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6776        if (fl == NULL) {
6777                ret = -ENOMEM;
6778                mlog_errno(ret);
6779                goto out;
6780        }
6781
6782        item = kzalloc(sizeof(*item), GFP_NOFS);
6783        if (item == NULL) {
6784                ret = -ENOMEM;
6785                mlog_errno(ret);
6786                goto out;
6787        }
6788
6789        trace_ocfs2_cache_block_dealloc(type, slot,
6790                                        (unsigned long long)suballoc,
6791                                        (unsigned long long)blkno, bit);
6792
6793        item->free_bg = suballoc;
6794        item->free_blk = blkno;
6795        item->free_bit = bit;
6796        item->free_next = fl->f_first;
6797
6798        fl->f_first = item;
6799
6800        ret = 0;
6801out:
6802        return ret;
6803}
6804
6805static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6806                                         struct ocfs2_extent_block *eb)
6807{
6808        return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6809                                         le16_to_cpu(eb->h_suballoc_slot),
6810                                         le64_to_cpu(eb->h_suballoc_loc),
6811                                         le64_to_cpu(eb->h_blkno),
6812                                         le16_to_cpu(eb->h_suballoc_bit));
6813}
6814
6815static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6816{
6817        set_buffer_uptodate(bh);
6818        mark_buffer_dirty(bh);
6819        return 0;
6820}
6821
6822void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6823                              unsigned int from, unsigned int to,
6824                              struct page *page, int zero, u64 *phys)
6825{
6826        int ret, partial = 0;
6827
6828        ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6829        if (ret)
6830                mlog_errno(ret);
6831
6832        if (zero)
6833                zero_user_segment(page, from, to);
6834
6835        /*
6836         * Need to set the buffers we zero'd into uptodate
6837         * here if they aren't - ocfs2_map_page_blocks()
6838         * might've skipped some
6839         */
6840        ret = walk_page_buffers(handle, page_buffers(page),
6841                                from, to, &partial,
6842                                ocfs2_zero_func);
6843        if (ret < 0)
6844                mlog_errno(ret);
6845        else if (ocfs2_should_order_data(inode)) {
6846                ret = ocfs2_jbd2_file_inode(handle, inode);
6847                if (ret < 0)
6848                        mlog_errno(ret);
6849        }
6850
6851        if (!partial)
6852                SetPageUptodate(page);
6853
6854        flush_dcache_page(page);
6855}
6856
6857static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6858                                     loff_t end, struct page **pages,
6859                                     int numpages, u64 phys, handle_t *handle)
6860{
6861        int i;
6862        struct page *page;
6863        unsigned int from, to = PAGE_SIZE;
6864        struct super_block *sb = inode->i_sb;
6865
6866        BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6867
6868        if (numpages == 0)
6869                goto out;
6870
6871        to = PAGE_SIZE;
6872        for(i = 0; i < numpages; i++) {
6873                page = pages[i];
6874
6875                from = start & (PAGE_SIZE - 1);
6876                if ((end >> PAGE_SHIFT) == page->index)
6877                        to = end & (PAGE_SIZE - 1);
6878
6879                BUG_ON(from > PAGE_SIZE);
6880                BUG_ON(to > PAGE_SIZE);
6881
6882                ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6883                                         &phys);
6884
6885                start = (page->index + 1) << PAGE_SHIFT;
6886        }
6887out:
6888        if (pages)
6889                ocfs2_unlock_and_free_pages(pages, numpages);
6890}
6891
6892int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6893                     struct page **pages, int *num)
6894{
6895        int numpages, ret = 0;
6896        struct address_space *mapping = inode->i_mapping;
6897        unsigned long index;
6898        loff_t last_page_bytes;
6899
6900        BUG_ON(start > end);
6901
6902        numpages = 0;
6903        last_page_bytes = PAGE_ALIGN(end);
6904        index = start >> PAGE_SHIFT;
6905        do {
6906                pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
6907                if (!pages[numpages]) {
6908                        ret = -ENOMEM;
6909                        mlog_errno(ret);
6910                        goto out;
6911                }
6912
6913                numpages++;
6914                index++;
6915        } while (index < (last_page_bytes >> PAGE_SHIFT));
6916
6917out:
6918        if (ret != 0) {
6919                if (pages)
6920                        ocfs2_unlock_and_free_pages(pages, numpages);
6921                numpages = 0;
6922        }
6923
6924        *num = numpages;
6925
6926        return ret;
6927}
6928
6929static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6930                                struct page **pages, int *num)
6931{
6932        struct super_block *sb = inode->i_sb;
6933
6934        BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6935               (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6936
6937        return ocfs2_grab_pages(inode, start, end, pages, num);
6938}
6939
6940/*
6941 * Zero the area past i_size but still within an allocated
6942 * cluster. This avoids exposing nonzero data on subsequent file
6943 * extends.
6944 *
6945 * We need to call this before i_size is updated on the inode because
6946 * otherwise block_write_full_page() will skip writeout of pages past
6947 * i_size. The new_i_size parameter is passed for this reason.
6948 */
6949int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6950                                  u64 range_start, u64 range_end)
6951{
6952        int ret = 0, numpages;
6953        struct page **pages = NULL;
6954        u64 phys;
6955        unsigned int ext_flags;
6956        struct super_block *sb = inode->i_sb;
6957
6958        /*
6959         * File systems which don't support sparse files zero on every
6960         * extend.
6961         */
6962        if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6963                return 0;
6964
6965        pages = kcalloc(ocfs2_pages_per_cluster(sb),
6966                        sizeof(struct page *), GFP_NOFS);
6967        if (pages == NULL) {
6968                ret = -ENOMEM;
6969                mlog_errno(ret);
6970                goto out;
6971        }
6972
6973        if (range_start == range_end)
6974                goto out;
6975
6976        ret = ocfs2_extent_map_get_blocks(inode,
6977                                          range_start >> sb->s_blocksize_bits,
6978                                          &phys, NULL, &ext_flags);
6979        if (ret) {
6980                mlog_errno(ret);
6981                goto out;
6982        }
6983
6984        /*
6985         * Tail is a hole, or is marked unwritten. In either case, we
6986         * can count on read and write to return/push zero's.
6987         */
6988        if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6989                goto out;
6990
6991        ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6992                                   &numpages);
6993        if (ret) {
6994                mlog_errno(ret);
6995                goto out;
6996        }
6997
6998        ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6999                                 numpages, phys, handle);
7000
7001        /*
7002         * Initiate writeout of the pages we zero'd here. We don't
7003         * wait on them - the truncate_inode_pages() call later will
7004         * do that for us.
7005         */
7006        ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
7007                                       range_end - 1);
7008        if (ret)
7009                mlog_errno(ret);
7010
7011out:
7012        kfree(pages);
7013
7014        return ret;
7015}
7016
7017static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
7018                                             struct ocfs2_dinode *di)
7019{
7020        unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
7021        unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
7022
7023        if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
7024                memset(&di->id2, 0, blocksize -
7025                                    offsetof(struct ocfs2_dinode, id2) -
7026                                    xattrsize);
7027        else
7028                memset(&di->id2, 0, blocksize -
7029                                    offsetof(struct ocfs2_dinode, id2));
7030}
7031
7032void ocfs2_dinode_new_extent_list(struct inode *inode,
7033                                  struct ocfs2_dinode *di)
7034{
7035        ocfs2_zero_dinode_id2_with_xattr(inode, di);
7036        di->id2.i_list.l_tree_depth = 0;
7037        di->id2.i_list.l_next_free_rec = 0;
7038        di->id2.i_list.l_count = cpu_to_le16(
7039                ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
7040}
7041
7042void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
7043{
7044        struct ocfs2_inode_info *oi = OCFS2_I(inode);
7045        struct ocfs2_inline_data *idata = &di->id2.i_data;
7046
7047        spin_lock(&oi->ip_lock);
7048        oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
7049        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7050        spin_unlock(&oi->ip_lock);
7051
7052        /*
7053         * We clear the entire i_data structure here so that all
7054         * fields can be properly initialized.
7055         */
7056        ocfs2_zero_dinode_id2_with_xattr(inode, di);
7057
7058        idata->id_count = cpu_to_le16(
7059                        ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
7060}
7061
7062int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7063                                         struct buffer_head *di_bh)
7064{
7065        int ret, i, has_data, num_pages = 0;
7066        int need_free = 0;
7067        u32 bit_off, num;
7068        handle_t *handle;
7069        u64 uninitialized_var(block);
7070        struct ocfs2_inode_info *oi = OCFS2_I(inode);
7071        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7072        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7073        struct ocfs2_alloc_context *data_ac = NULL;
7074        struct page **pages = NULL;
7075        loff_t end = osb->s_clustersize;
7076        struct ocfs2_extent_tree et;
7077        int did_quota = 0;
7078
7079        has_data = i_size_read(inode) ? 1 : 0;
7080
7081        if (has_data) {
7082                pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
7083                                sizeof(struct page *), GFP_NOFS);
7084                if (pages == NULL) {
7085                        ret = -ENOMEM;
7086                        mlog_errno(ret);
7087                        return ret;
7088                }
7089
7090                ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
7091                if (ret) {
7092                        mlog_errno(ret);
7093                        goto free_pages;
7094                }
7095        }
7096
7097        handle = ocfs2_start_trans(osb,
7098                                   ocfs2_inline_to_extents_credits(osb->sb));
7099        if (IS_ERR(handle)) {
7100                ret = PTR_ERR(handle);
7101                mlog_errno(ret);
7102                goto out;
7103        }
7104
7105        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7106                                      OCFS2_JOURNAL_ACCESS_WRITE);
7107        if (ret) {
7108                mlog_errno(ret);
7109                goto out_commit;
7110        }
7111
7112        if (has_data) {
7113                unsigned int page_end;
7114                u64 phys;
7115
7116                ret = dquot_alloc_space_nodirty(inode,
7117                                       ocfs2_clusters_to_bytes(osb->sb, 1));
7118                if (ret)
7119                        goto out_commit;
7120                did_quota = 1;
7121
7122                data_ac->ac_resv = &oi->ip_la_data_resv;
7123
7124                ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7125                                           &num);
7126                if (ret) {
7127                        mlog_errno(ret);
7128                        goto out_commit;
7129                }
7130
7131                /*
7132                 * Save two copies, one for insert, and one that can
7133                 * be changed by ocfs2_map_and_dirty_page() below.
7134                 */
7135                block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
7136
7137                /*
7138                 * Non sparse file systems zero on extend, so no need
7139                 * to do that now.
7140                 */
7141                if (!ocfs2_sparse_alloc(osb) &&
7142                    PAGE_SIZE < osb->s_clustersize)
7143                        end = PAGE_SIZE;
7144
7145                ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
7146                if (ret) {
7147                        mlog_errno(ret);
7148                        need_free = 1;
7149                        goto out_commit;
7150                }
7151
7152                /*
7153                 * This should populate the 1st page for us and mark
7154                 * it up to date.
7155                 */
7156                ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
7157                if (ret) {
7158                        mlog_errno(ret);
7159                        need_free = 1;
7160                        goto out_unlock;
7161                }
7162
7163                page_end = PAGE_SIZE;
7164                if (PAGE_SIZE > osb->s_clustersize)
7165                        page_end = osb->s_clustersize;
7166
7167                for (i = 0; i < num_pages; i++)
7168                        ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
7169                                                 pages[i], i > 0, &phys);
7170        }
7171
7172        spin_lock(&oi->ip_lock);
7173        oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
7174        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7175        spin_unlock(&oi->ip_lock);
7176
7177        ocfs2_update_inode_fsync_trans(handle, inode, 1);
7178        ocfs2_dinode_new_extent_list(inode, di);
7179
7180        ocfs2_journal_dirty(handle, di_bh);
7181
7182        if (has_data) {
7183                /*
7184                 * An error at this point should be extremely rare. If
7185                 * this proves to be false, we could always re-build
7186                 * the in-inode data from our pages.
7187                 */
7188                ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7189                ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
7190                if (ret) {
7191                        mlog_errno(ret);
7192                        need_free = 1;
7193                        goto out_unlock;
7194                }
7195
7196                inode->i_blocks = ocfs2_inode_sector_count(inode);
7197        }
7198
7199out_unlock:
7200        if (pages)
7201                ocfs2_unlock_and_free_pages(pages, num_pages);
7202
7203out_commit:
7204        if (ret < 0 && did_quota)
7205                dquot_free_space_nodirty(inode,
7206                                          ocfs2_clusters_to_bytes(osb->sb, 1));
7207
7208        if (need_free) {
7209                if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
7210                        ocfs2_free_local_alloc_bits(osb, handle, data_ac,
7211                                        bit_off, num);
7212                else
7213                        ocfs2_free_clusters(handle,
7214                                        data_ac->ac_inode,
7215                                        data_ac->ac_bh,
7216                                        ocfs2_clusters_to_blocks(osb->sb, bit_off),
7217                                        num);
7218        }
7219
7220        ocfs2_commit_trans(osb, handle);
7221
7222out:
7223        if (data_ac)
7224                ocfs2_free_alloc_context(data_ac);
7225free_pages:
7226        kfree(pages);
7227        return ret;
7228}
7229
7230/*
7231 * It is expected, that by the time you call this function,
7232 * inode->i_size and fe->i_size have been adjusted.
7233 *
7234 * WARNING: This will kfree the truncate context
7235 */
7236int ocfs2_commit_truncate(struct ocfs2_super *osb,
7237                          struct inode *inode,
7238                          struct buffer_head *di_bh)
7239{
7240        int status = 0, i, flags = 0;
7241        u32 new_highest_cpos, range, trunc_cpos, trunc_len, phys_cpos, coff;
7242        u64 blkno = 0;
7243        struct ocfs2_extent_list *el;
7244        struct ocfs2_extent_rec *rec;
7245        struct ocfs2_path *path = NULL;
7246        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7247        struct ocfs2_extent_list *root_el = &(di->id2.i_list);
7248        u64 refcount_loc = le64_to_cpu(di->i_refcount_loc);
7249        struct ocfs2_extent_tree et;
7250        struct ocfs2_cached_dealloc_ctxt dealloc;
7251        struct ocfs2_refcount_tree *ref_tree = NULL;
7252
7253        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7254        ocfs2_init_dealloc_ctxt(&dealloc);
7255
7256        new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7257                                                     i_size_read(inode));
7258
7259        path = ocfs2_new_path(di_bh, &di->id2.i_list,
7260                              ocfs2_journal_access_di);
7261        if (!path) {
7262                status = -ENOMEM;
7263                mlog_errno(status);
7264                goto bail;
7265        }
7266
7267        ocfs2_extent_map_trunc(inode, new_highest_cpos);
7268
7269start:
7270        /*
7271         * Check that we still have allocation to delete.
7272         */
7273        if (OCFS2_I(inode)->ip_clusters == 0) {
7274                status = 0;
7275                goto bail;
7276        }
7277
7278        /*
7279         * Truncate always works against the rightmost tree branch.
7280         */
7281        status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7282        if (status) {
7283                mlog_errno(status);
7284                goto bail;
7285        }
7286
7287        trace_ocfs2_commit_truncate(
7288                (unsigned long long)OCFS2_I(inode)->ip_blkno,
7289                new_highest_cpos,
7290                OCFS2_I(inode)->ip_clusters,
7291                path->p_tree_depth);
7292
7293        /*
7294         * By now, el will point to the extent list on the bottom most
7295         * portion of this tree. Only the tail record is considered in
7296         * each pass.
7297         *
7298         * We handle the following cases, in order:
7299         * - empty extent: delete the remaining branch
7300         * - remove the entire record
7301         * - remove a partial record
7302         * - no record needs to be removed (truncate has completed)
7303         */
7304        el = path_leaf_el(path);
7305        if (le16_to_cpu(el->l_next_free_rec) == 0) {
7306                ocfs2_error(inode->i_sb,
7307                            "Inode %llu has empty extent block at %llu\n",
7308                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
7309                            (unsigned long long)path_leaf_bh(path)->b_blocknr);
7310                status = -EROFS;
7311                goto bail;
7312        }
7313
7314        i = le16_to_cpu(el->l_next_free_rec) - 1;
7315        rec = &el->l_recs[i];
7316        flags = rec->e_flags;
7317        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
7318
7319        if (i == 0 && ocfs2_is_empty_extent(rec)) {
7320                /*
7321                 * Lower levels depend on this never happening, but it's best
7322                 * to check it up here before changing the tree.
7323                */
7324                if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
7325                        mlog(ML_ERROR, "Inode %lu has an empty "
7326                                    "extent record, depth %u\n", inode->i_ino,
7327                                    le16_to_cpu(root_el->l_tree_depth));
7328                        status = ocfs2_remove_rightmost_empty_extent(osb,
7329                                        &et, path, &dealloc);
7330                        if (status) {
7331                                mlog_errno(status);
7332                                goto bail;
7333                        }
7334
7335                        ocfs2_reinit_path(path, 1);
7336                        goto start;
7337                } else {
7338                        trunc_cpos = le32_to_cpu(rec->e_cpos);
7339                        trunc_len = 0;
7340                        blkno = 0;
7341                }
7342        } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
7343                /*
7344                 * Truncate entire record.
7345                 */
7346                trunc_cpos = le32_to_cpu(rec->e_cpos);
7347                trunc_len = ocfs2_rec_clusters(el, rec);
7348                blkno = le64_to_cpu(rec->e_blkno);
7349        } else if (range > new_highest_cpos) {
7350                /*
7351                 * Partial truncate. it also should be
7352                 * the last truncate we're doing.
7353                 */
7354                trunc_cpos = new_highest_cpos;
7355                trunc_len = range - new_highest_cpos;
7356                coff = new_highest_cpos - le32_to_cpu(rec->e_cpos);
7357                blkno = le64_to_cpu(rec->e_blkno) +
7358                                ocfs2_clusters_to_blocks(inode->i_sb, coff);
7359        } else {
7360                /*
7361                 * Truncate completed, leave happily.
7362                 */
7363                status = 0;
7364                goto bail;
7365        }
7366
7367        phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
7368
7369        if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) {
7370                status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
7371                                &ref_tree, NULL);
7372                if (status) {
7373                        mlog_errno(status);
7374                        goto bail;
7375                }
7376        }
7377
7378        status = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
7379                                          phys_cpos, trunc_len, flags, &dealloc,
7380                                          refcount_loc, true);
7381        if (status < 0) {
7382                mlog_errno(status);
7383                goto bail;
7384        }
7385
7386        ocfs2_reinit_path(path, 1);
7387
7388        /*
7389         * The check above will catch the case where we've truncated
7390         * away all allocation.
7391         */
7392        goto start;
7393
7394bail:
7395        if (ref_tree)
7396                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7397
7398        ocfs2_schedule_truncate_log_flush(osb, 1);
7399
7400        ocfs2_run_deallocs(osb, &dealloc);
7401
7402        ocfs2_free_path(path);
7403
7404        return status;
7405}
7406
7407/*
7408 * 'start' is inclusive, 'end' is not.
7409 */
7410int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7411                          unsigned int start, unsigned int end, int trunc)
7412{
7413        int ret;
7414        unsigned int numbytes;
7415        handle_t *handle;
7416        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7417        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7418        struct ocfs2_inline_data *idata = &di->id2.i_data;
7419
7420        if (end > i_size_read(inode))
7421                end = i_size_read(inode);
7422
7423        BUG_ON(start > end);
7424
7425        if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7426            !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7427            !ocfs2_supports_inline_data(osb)) {
7428                ocfs2_error(inode->i_sb,
7429                            "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7430                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
7431                            le16_to_cpu(di->i_dyn_features),
7432                            OCFS2_I(inode)->ip_dyn_features,
7433                            osb->s_feature_incompat);
7434                ret = -EROFS;
7435                goto out;
7436        }
7437
7438        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7439        if (IS_ERR(handle)) {
7440                ret = PTR_ERR(handle);
7441                mlog_errno(ret);
7442                goto out;
7443        }
7444
7445        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7446                                      OCFS2_JOURNAL_ACCESS_WRITE);
7447        if (ret) {
7448                mlog_errno(ret);
7449                goto out_commit;
7450        }
7451
7452        numbytes = end - start;
7453        memset(idata->id_data + start, 0, numbytes);
7454
7455        /*
7456         * No need to worry about the data page here - it's been
7457         * truncated already and inline data doesn't need it for
7458         * pushing zero's to disk, so we'll let readpage pick it up
7459         * later.
7460         */
7461        if (trunc) {
7462                i_size_write(inode, start);
7463                di->i_size = cpu_to_le64(start);
7464        }
7465
7466        inode->i_blocks = ocfs2_inode_sector_count(inode);
7467        inode->i_ctime = inode->i_mtime = current_time(inode);
7468
7469        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7470        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7471
7472        ocfs2_update_inode_fsync_trans(handle, inode, 1);
7473        ocfs2_journal_dirty(handle, di_bh);
7474
7475out_commit:
7476        ocfs2_commit_trans(osb, handle);
7477
7478out:
7479        return ret;
7480}
7481
7482static int ocfs2_trim_extent(struct super_block *sb,
7483                             struct ocfs2_group_desc *gd,
7484                             u64 group, u32 start, u32 count)
7485{
7486        u64 discard, bcount;
7487        struct ocfs2_super *osb = OCFS2_SB(sb);
7488
7489        bcount = ocfs2_clusters_to_blocks(sb, count);
7490        discard = ocfs2_clusters_to_blocks(sb, start);
7491
7492        /*
7493         * For the first cluster group, the gd->bg_blkno is not at the start
7494         * of the group, but at an offset from the start. If we add it while
7495         * calculating discard for first group, we will wrongly start fstrim a
7496         * few blocks after the desried start block and the range can cross
7497         * over into the next cluster group. So, add it only if this is not
7498         * the first cluster group.
7499         */
7500        if (group != osb->first_cluster_group_blkno)
7501                discard += le64_to_cpu(gd->bg_blkno);
7502
7503        trace_ocfs2_trim_extent(sb, (unsigned long long)discard, bcount);
7504
7505        return sb_issue_discard(sb, discard, bcount, GFP_NOFS, 0);
7506}
7507
7508static int ocfs2_trim_group(struct super_block *sb,
7509                            struct ocfs2_group_desc *gd, u64 group,
7510                            u32 start, u32 max, u32 minbits)
7511{
7512        int ret = 0, count = 0, next;
7513        void *bitmap = gd->bg_bitmap;
7514
7515        if (le16_to_cpu(gd->bg_free_bits_count) < minbits)
7516                return 0;
7517
7518        trace_ocfs2_trim_group((unsigned long long)le64_to_cpu(gd->bg_blkno),
7519                               start, max, minbits);
7520
7521        while (start < max) {
7522                start = ocfs2_find_next_zero_bit(bitmap, max, start);
7523                if (start >= max)
7524                        break;
7525                next = ocfs2_find_next_bit(bitmap, max, start);
7526
7527                if ((next - start) >= minbits) {
7528                        ret = ocfs2_trim_extent(sb, gd, group,
7529                                                start, next - start);
7530                        if (ret < 0) {
7531                                mlog_errno(ret);
7532                                break;
7533                        }
7534                        count += next - start;
7535                }
7536                start = next + 1;
7537
7538                if (fatal_signal_pending(current)) {
7539                        count = -ERESTARTSYS;
7540                        break;
7541                }
7542
7543                if ((le16_to_cpu(gd->bg_free_bits_count) - count) < minbits)
7544                        break;
7545        }
7546
7547        if (ret < 0)
7548                count = ret;
7549
7550        return count;
7551}
7552
7553int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
7554{
7555        struct ocfs2_super *osb = OCFS2_SB(sb);
7556        u64 start, len, trimmed, first_group, last_group, group;
7557        int ret, cnt;
7558        u32 first_bit, last_bit, minlen;
7559        struct buffer_head *main_bm_bh = NULL;
7560        struct inode *main_bm_inode = NULL;
7561        struct buffer_head *gd_bh = NULL;
7562        struct ocfs2_dinode *main_bm;
7563        struct ocfs2_group_desc *gd = NULL;
7564        struct ocfs2_trim_fs_info info, *pinfo = NULL;
7565
7566        start = range->start >> osb->s_clustersize_bits;
7567        len = range->len >> osb->s_clustersize_bits;
7568        minlen = range->minlen >> osb->s_clustersize_bits;
7569
7570        if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
7571                return -EINVAL;
7572
7573        main_bm_inode = ocfs2_get_system_file_inode(osb,
7574                                                    GLOBAL_BITMAP_SYSTEM_INODE,
7575                                                    OCFS2_INVALID_SLOT);
7576        if (!main_bm_inode) {
7577                ret = -EIO;
7578                mlog_errno(ret);
7579                goto out;
7580        }
7581
7582        inode_lock(main_bm_inode);
7583
7584        ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
7585        if (ret < 0) {
7586                mlog_errno(ret);
7587                goto out_mutex;
7588        }
7589        main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
7590
7591        if (start >= le32_to_cpu(main_bm->i_clusters)) {
7592                ret = -EINVAL;
7593                goto out_unlock;
7594        }
7595
7596        len = range->len >> osb->s_clustersize_bits;
7597        if (start + len > le32_to_cpu(main_bm->i_clusters))
7598                len = le32_to_cpu(main_bm->i_clusters) - start;
7599
7600        trace_ocfs2_trim_fs(start, len, minlen);
7601
7602        ocfs2_trim_fs_lock_res_init(osb);
7603        ret = ocfs2_trim_fs_lock(osb, NULL, 1);
7604        if (ret < 0) {
7605                if (ret != -EAGAIN) {
7606                        mlog_errno(ret);
7607                        ocfs2_trim_fs_lock_res_uninit(osb);
7608                        goto out_unlock;
7609                }
7610
7611                mlog(ML_NOTICE, "Wait for trim on device (%s) to "
7612                     "finish, which is running from another node.\n",
7613                     osb->dev_str);
7614                ret = ocfs2_trim_fs_lock(osb, &info, 0);
7615                if (ret < 0) {
7616                        mlog_errno(ret);
7617                        ocfs2_trim_fs_lock_res_uninit(osb);
7618                        goto out_unlock;
7619                }
7620
7621                if (info.tf_valid && info.tf_success &&
7622                    info.tf_start == start && info.tf_len == len &&
7623                    info.tf_minlen == minlen) {
7624                        /* Avoid sending duplicated trim to a shared device */
7625                        mlog(ML_NOTICE, "The same trim on device (%s) was "
7626                             "just done from node (%u), return.\n",
7627                             osb->dev_str, info.tf_nodenum);
7628                        range->len = info.tf_trimlen;
7629                        goto out_trimunlock;
7630                }
7631        }
7632
7633        info.tf_nodenum = osb->node_num;
7634        info.tf_start = start;
7635        info.tf_len = len;
7636        info.tf_minlen = minlen;
7637
7638        /* Determine first and last group to examine based on start and len */
7639        first_group = ocfs2_which_cluster_group(main_bm_inode, start);
7640        if (first_group == osb->first_cluster_group_blkno)
7641                first_bit = start;
7642        else
7643                first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
7644        last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
7645        last_bit = osb->bitmap_cpg;
7646
7647        trimmed = 0;
7648        for (group = first_group; group <= last_group;) {
7649                if (first_bit + len >= osb->bitmap_cpg)
7650                        last_bit = osb->bitmap_cpg;
7651                else
7652                        last_bit = first_bit + len;
7653
7654                ret = ocfs2_read_group_descriptor(main_bm_inode,
7655                                                  main_bm, group,
7656                                                  &gd_bh);
7657                if (ret < 0) {
7658                        mlog_errno(ret);
7659                        break;
7660                }
7661
7662                gd = (struct ocfs2_group_desc *)gd_bh->b_data;
7663                cnt = ocfs2_trim_group(sb, gd, group,
7664                                       first_bit, last_bit, minlen);
7665                brelse(gd_bh);
7666                gd_bh = NULL;
7667                if (cnt < 0) {
7668                        ret = cnt;
7669                        mlog_errno(ret);
7670                        break;
7671                }
7672
7673                trimmed += cnt;
7674                len -= osb->bitmap_cpg - first_bit;
7675                first_bit = 0;
7676                if (group == osb->first_cluster_group_blkno)
7677                        group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7678                else
7679                        group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
7680        }
7681        range->len = trimmed * sb->s_blocksize;
7682
7683        info.tf_trimlen = range->len;
7684        info.tf_success = (ret ? 0 : 1);
7685        pinfo = &info;
7686out_trimunlock:
7687        ocfs2_trim_fs_unlock(osb, pinfo);
7688        ocfs2_trim_fs_lock_res_uninit(osb);
7689out_unlock:
7690        ocfs2_inode_unlock(main_bm_inode, 0);
7691        brelse(main_bm_bh);
7692out_mutex:
7693        inode_unlock(main_bm_inode);
7694        iput(main_bm_inode);
7695out:
7696        return ret;
7697}
7698