linux/fs/ocfs2/refcounttree.c
<<
>>
Prefs
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * refcounttree.c
   5 *
   6 * Copyright (C) 2009 Oracle.  All rights reserved.
   7 *
   8 * This program is free software; you can redistribute it and/or
   9 * modify it under the terms of the GNU General Public
  10 * License version 2 as published by the Free Software Foundation.
  11 *
  12 * This program is distributed in the hope that it will be useful,
  13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 * General Public License for more details.
  16 */
  17
  18#include <linux/sort.h>
  19#include <cluster/masklog.h>
  20#include "ocfs2.h"
  21#include "inode.h"
  22#include "alloc.h"
  23#include "suballoc.h"
  24#include "journal.h"
  25#include "uptodate.h"
  26#include "super.h"
  27#include "buffer_head_io.h"
  28#include "blockcheck.h"
  29#include "refcounttree.h"
  30#include "sysfile.h"
  31#include "dlmglue.h"
  32#include "extent_map.h"
  33#include "aops.h"
  34#include "xattr.h"
  35#include "namei.h"
  36#include "ocfs2_trace.h"
  37
  38#include <linux/bio.h>
  39#include <linux/blkdev.h>
  40#include <linux/slab.h>
  41#include <linux/writeback.h>
  42#include <linux/pagevec.h>
  43#include <linux/swap.h>
  44#include <linux/security.h>
  45#include <linux/fsnotify.h>
  46#include <linux/quotaops.h>
  47#include <linux/namei.h>
  48#include <linux/mount.h>
  49
  /*
   * State bundle for one copy-on-write pass.
   *
   * NOTE(review): the code that fills in and consumes this structure lies
   * outside this excerpt; the field notes below are inferred from names
   * and types only and should be confirmed against the users.
   */
  50struct ocfs2_cow_context {
  51        struct inode *inode;
            /* presumably the first cluster and cluster count to CoW - confirm */
  52        u32 cow_start;
  53        u32 cow_len;
  54        struct ocfs2_extent_tree data_et;
  55        struct ocfs2_refcount_tree *ref_tree;
  56        struct buffer_head *ref_root_bh;
  57        struct ocfs2_alloc_context *meta_ac;
  58        struct ocfs2_alloc_context *data_ac;
  59        struct ocfs2_cached_dealloc_ctxt dealloc;
  60        void *cow_object;
  61        struct ocfs2_post_refcount *post_refcount;
  62        int extra_credits;
            /* Callback; takes a virtual cluster and fills the out-parameters. */
  63        int (*get_clusters)(struct ocfs2_cow_context *context,
  64                            u32 v_cluster, u32 *p_cluster,
  65                            u32 *num_clusters,
  66                            unsigned int *extent_flags);
            /* Callback; run under a journal handle for a cluster range. */
  67        int (*cow_duplicate_clusters)(handle_t *handle,
  68                                      struct inode *inode,
  69                                      u32 cpos, u32 old_cluster,
  70                                      u32 new_cluster, u32 new_len);
  71};
  72
  73static inline struct ocfs2_refcount_tree *
  74cache_info_to_refcount(struct ocfs2_caching_info *ci)
  75{
  76        return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
  77}
  78
  79static int ocfs2_validate_refcount_block(struct super_block *sb,
  80                                         struct buffer_head *bh)
  81{
  82        int rc;
  83        struct ocfs2_refcount_block *rb =
  84                (struct ocfs2_refcount_block *)bh->b_data;
  85
  86        trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
  87
  88        BUG_ON(!buffer_uptodate(bh));
  89
  90        /*
  91         * If the ecc fails, we return the error but otherwise
  92         * leave the filesystem running.  We know any error is
  93         * local to this block.
  94         */
  95        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
  96        if (rc) {
  97                mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
  98                     (unsigned long long)bh->b_blocknr);
  99                return rc;
 100        }
 101
 102
 103        if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
 104                ocfs2_error(sb,
 105                            "Refcount block #%llu has bad signature %.*s",
 106                            (unsigned long long)bh->b_blocknr, 7,
 107                            rb->rf_signature);
 108                return -EINVAL;
 109        }
 110
 111        if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
 112                ocfs2_error(sb,
 113                            "Refcount block #%llu has an invalid rf_blkno "
 114                            "of %llu",
 115                            (unsigned long long)bh->b_blocknr,
 116                            (unsigned long long)le64_to_cpu(rb->rf_blkno));
 117                return -EINVAL;
 118        }
 119
 120        if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
 121                ocfs2_error(sb,
 122                            "Refcount block #%llu has an invalid "
 123                            "rf_fs_generation of #%u",
 124                            (unsigned long long)bh->b_blocknr,
 125                            le32_to_cpu(rb->rf_fs_generation));
 126                return -EINVAL;
 127        }
 128
 129        return 0;
 130}
 131
 132static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
 133                                     u64 rb_blkno,
 134                                     struct buffer_head **bh)
 135{
 136        int rc;
 137        struct buffer_head *tmp = *bh;
 138
 139        rc = ocfs2_read_block(ci, rb_blkno, &tmp,
 140                              ocfs2_validate_refcount_block);
 141
 142        /* If ocfs2_read_block() got us a new bh, pass it up. */
 143        if (!rc && !*bh)
 144                *bh = tmp;
 145
 146        return rc;
 147}
 148
 149static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
 150{
 151        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
 152
 153        return rf->rf_blkno;
 154}
 155
 156static struct super_block *
 157ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
 158{
 159        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
 160
 161        return rf->rf_sb;
 162}
 163
 164static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
 165{
 166        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
 167
 168        spin_lock(&rf->rf_lock);
 169}
 170
 171static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
 172{
 173        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
 174
 175        spin_unlock(&rf->rf_lock);
 176}
 177
 178static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
 179{
 180        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
 181
 182        mutex_lock(&rf->rf_io_mutex);
 183}
 184
 185static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
 186{
 187        struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
 188
 189        mutex_unlock(&rf->rf_io_mutex);
 190}
 191
 192static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
 193        .co_owner               = ocfs2_refcount_cache_owner,
 194        .co_get_super           = ocfs2_refcount_cache_get_super,
 195        .co_cache_lock          = ocfs2_refcount_cache_lock,
 196        .co_cache_unlock        = ocfs2_refcount_cache_unlock,
 197        .co_io_lock             = ocfs2_refcount_cache_io_lock,
 198        .co_io_unlock           = ocfs2_refcount_cache_io_unlock,
 199};
 200
 201static struct ocfs2_refcount_tree *
 202ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
 203{
 204        struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
 205        struct ocfs2_refcount_tree *tree = NULL;
 206
 207        while (n) {
 208                tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
 209
 210                if (blkno < tree->rf_blkno)
 211                        n = n->rb_left;
 212                else if (blkno > tree->rf_blkno)
 213                        n = n->rb_right;
 214                else
 215                        return tree;
 216        }
 217
 218        return NULL;
 219}
 220
 221/* osb_lock is already locked. */
 222static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
 223                                       struct ocfs2_refcount_tree *new)
 224{
 225        u64 rf_blkno = new->rf_blkno;
 226        struct rb_node *parent = NULL;
 227        struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
 228        struct ocfs2_refcount_tree *tmp;
 229
 230        while (*p) {
 231                parent = *p;
 232
 233                tmp = rb_entry(parent, struct ocfs2_refcount_tree,
 234                               rf_node);
 235
 236                if (rf_blkno < tmp->rf_blkno)
 237                        p = &(*p)->rb_left;
 238                else if (rf_blkno > tmp->rf_blkno)
 239                        p = &(*p)->rb_right;
 240                else {
 241                        /* This should never happen! */
 242                        mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
 243                             (unsigned long long)rf_blkno);
 244                        BUG();
 245                }
 246        }
 247
 248        rb_link_node(&new->rf_node, parent, p);
 249        rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
 250}
 251
 252static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
 253{
 254        ocfs2_metadata_cache_exit(&tree->rf_ci);
 255        ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
 256        ocfs2_lock_res_free(&tree->rf_lockres);
 257        kfree(tree);
 258}
 259
 260static inline void
 261ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
 262                                        struct ocfs2_refcount_tree *tree)
 263{
 264        rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
 265        if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
 266                osb->osb_ref_tree_lru = NULL;
 267}
 268
 269static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
 270                                        struct ocfs2_refcount_tree *tree)
 271{
 272        spin_lock(&osb->osb_lock);
 273        ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
 274        spin_unlock(&osb->osb_lock);
 275}
 276
 277static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
 278{
 279        struct ocfs2_refcount_tree *tree =
 280                container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
 281
 282        ocfs2_free_refcount_tree(tree);
 283}
 284
 285static inline void
 286ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
 287{
 288        kref_get(&tree->rf_getcnt);
 289}
 290
 291static inline void
 292ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
 293{
 294        kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
 295}
 296
 297static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
 298                                               struct super_block *sb)
 299{
 300        ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
 301        mutex_init(&new->rf_io_mutex);
 302        new->rf_sb = sb;
 303        spin_lock_init(&new->rf_lock);
 304}
 305
 306static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
 307                                        struct ocfs2_refcount_tree *new,
 308                                        u64 rf_blkno, u32 generation)
 309{
 310        init_rwsem(&new->rf_sem);
 311        ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
 312                                     rf_blkno, generation);
 313}
 314
 315static struct ocfs2_refcount_tree*
 316ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
 317{
 318        struct ocfs2_refcount_tree *new;
 319
 320        new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
 321        if (!new)
 322                return NULL;
 323
 324        new->rf_blkno = rf_blkno;
 325        kref_init(&new->rf_getcnt);
 326        ocfs2_init_refcount_tree_ci(new, osb->sb);
 327
 328        return new;
 329}
 330
 331static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
 332                                   struct ocfs2_refcount_tree **ret_tree)
 333{
 334        int ret = 0;
 335        struct ocfs2_refcount_tree *tree, *new = NULL;
 336        struct buffer_head *ref_root_bh = NULL;
 337        struct ocfs2_refcount_block *ref_rb;
 338
 339        spin_lock(&osb->osb_lock);
 340        if (osb->osb_ref_tree_lru &&
 341            osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
 342                tree = osb->osb_ref_tree_lru;
 343        else
 344                tree = ocfs2_find_refcount_tree(osb, rf_blkno);
 345        if (tree)
 346                goto out;
 347
 348        spin_unlock(&osb->osb_lock);
 349
 350        new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
 351        if (!new) {
 352                ret = -ENOMEM;
 353                mlog_errno(ret);
 354                return ret;
 355        }
 356        /*
 357         * We need the generation to create the refcount tree lock and since
 358         * it isn't changed during the tree modification, we are safe here to
 359         * read without protection.
 360         * We also have to purge the cache after we create the lock since the
 361         * refcount block may have the stale data. It can only be trusted when
 362         * we hold the refcount lock.
 363         */
 364        ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
 365        if (ret) {
 366                mlog_errno(ret);
 367                ocfs2_metadata_cache_exit(&new->rf_ci);
 368                kfree(new);
 369                return ret;
 370        }
 371
 372        ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
 373        new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
 374        ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
 375                                      new->rf_generation);
 376        ocfs2_metadata_cache_purge(&new->rf_ci);
 377
 378        spin_lock(&osb->osb_lock);
 379        tree = ocfs2_find_refcount_tree(osb, rf_blkno);
 380        if (tree)
 381                goto out;
 382
 383        ocfs2_insert_refcount_tree(osb, new);
 384
 385        tree = new;
 386        new = NULL;
 387
 388out:
 389        *ret_tree = tree;
 390
 391        osb->osb_ref_tree_lru = tree;
 392
 393        spin_unlock(&osb->osb_lock);
 394
 395        if (new)
 396                ocfs2_free_refcount_tree(new);
 397
 398        brelse(ref_root_bh);
 399        return ret;
 400}
 401
 402static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
 403{
 404        int ret;
 405        struct buffer_head *di_bh = NULL;
 406        struct ocfs2_dinode *di;
 407
 408        ret = ocfs2_read_inode_block(inode, &di_bh);
 409        if (ret) {
 410                mlog_errno(ret);
 411                goto out;
 412        }
 413
 414        BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
 415
 416        di = (struct ocfs2_dinode *)di_bh->b_data;
 417        *ref_blkno = le64_to_cpu(di->i_refcount_loc);
 418        brelse(di_bh);
 419out:
 420        return ret;
 421}
 422
 423static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
 424                                      struct ocfs2_refcount_tree *tree, int rw)
 425{
 426        int ret;
 427
 428        ret = ocfs2_refcount_lock(tree, rw);
 429        if (ret) {
 430                mlog_errno(ret);
 431                goto out;
 432        }
 433
 434        if (rw)
 435                down_write(&tree->rf_sem);
 436        else
 437                down_read(&tree->rf_sem);
 438
 439out:
 440        return ret;
 441}
 442
 443/*
 444 * Lock the refcount tree pointed by ref_blkno and return the tree.
 445 * In most case, we lock the tree and read the refcount block.
 446 * So read it here if the caller really needs it.
 447 *
 448 * If the tree has been re-created by other node, it will free the
 449 * old one and re-create it.
 450 */
 451int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
 452                             u64 ref_blkno, int rw,
 453                             struct ocfs2_refcount_tree **ret_tree,
 454                             struct buffer_head **ref_bh)
 455{
 456        int ret, delete_tree = 0;
 457        struct ocfs2_refcount_tree *tree = NULL;
 458        struct buffer_head *ref_root_bh = NULL;
 459        struct ocfs2_refcount_block *rb;
 460
 461again:
 462        ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
 463        if (ret) {
 464                mlog_errno(ret);
 465                return ret;
 466        }
 467
 468        ocfs2_refcount_tree_get(tree);
 469
 470        ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
 471        if (ret) {
 472                mlog_errno(ret);
 473                ocfs2_refcount_tree_put(tree);
 474                goto out;
 475        }
 476
 477        ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
 478                                        &ref_root_bh);
 479        if (ret) {
 480                mlog_errno(ret);
 481                ocfs2_unlock_refcount_tree(osb, tree, rw);
 482                ocfs2_refcount_tree_put(tree);
 483                goto out;
 484        }
 485
 486        rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
 487        /*
 488         * If the refcount block has been freed and re-created, we may need
 489         * to recreate the refcount tree also.
 490         *
 491         * Here we just remove the tree from the rb-tree, and the last
 492         * kref holder will unlock and delete this refcount_tree.
 493         * Then we goto "again" and ocfs2_get_refcount_tree will create
 494         * the new refcount tree for us.
 495         */
 496        if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
 497                if (!tree->rf_removed) {
 498                        ocfs2_erase_refcount_tree_from_list(osb, tree);
 499                        tree->rf_removed = 1;
 500                        delete_tree = 1;
 501                }
 502
 503                ocfs2_unlock_refcount_tree(osb, tree, rw);
 504                /*
 505                 * We get an extra reference when we create the refcount
 506                 * tree, so another put will destroy it.
 507                 */
 508                if (delete_tree)
 509                        ocfs2_refcount_tree_put(tree);
 510                brelse(ref_root_bh);
 511                ref_root_bh = NULL;
 512                goto again;
 513        }
 514
 515        *ret_tree = tree;
 516        if (ref_bh) {
 517                *ref_bh = ref_root_bh;
 518                ref_root_bh = NULL;
 519        }
 520out:
 521        brelse(ref_root_bh);
 522        return ret;
 523}
 524
 525void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
 526                                struct ocfs2_refcount_tree *tree, int rw)
 527{
 528        if (rw)
 529                up_write(&tree->rf_sem);
 530        else
 531                up_read(&tree->rf_sem);
 532
 533        ocfs2_refcount_unlock(tree, rw);
 534        ocfs2_refcount_tree_put(tree);
 535}
 536
 537void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
 538{
 539        struct rb_node *node;
 540        struct ocfs2_refcount_tree *tree;
 541        struct rb_root *root = &osb->osb_rf_lock_tree;
 542
 543        while ((node = rb_last(root)) != NULL) {
 544                tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
 545
 546                trace_ocfs2_purge_refcount_trees(
 547                                (unsigned long long) tree->rf_blkno);
 548
 549                rb_erase(&tree->rf_node, root);
 550                ocfs2_free_refcount_tree(tree);
 551        }
 552}
 553
 554/*
 555 * Create a refcount tree for an inode.
 556 * We take for granted that the inode is already locked.
 557 */
 558static int ocfs2_create_refcount_tree(struct inode *inode,
 559                                      struct buffer_head *di_bh)
 560{
 561        int ret;
 562        handle_t *handle = NULL;
 563        struct ocfs2_alloc_context *meta_ac = NULL;
 564        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 565        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 566        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 567        struct buffer_head *new_bh = NULL;
 568        struct ocfs2_refcount_block *rb;
 569        struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
 570        u16 suballoc_bit_start;
 571        u32 num_got;
 572        u64 suballoc_loc, first_blkno;
 573
 574        BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
 575
 576        trace_ocfs2_create_refcount_tree(
 577                (unsigned long long)OCFS2_I(inode)->ip_blkno);
 578
 579        ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
 580        if (ret) {
 581                mlog_errno(ret);
 582                goto out;
 583        }
 584
 585        handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
 586        if (IS_ERR(handle)) {
 587                ret = PTR_ERR(handle);
 588                mlog_errno(ret);
 589                goto out;
 590        }
 591
 592        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 593                                      OCFS2_JOURNAL_ACCESS_WRITE);
 594        if (ret) {
 595                mlog_errno(ret);
 596                goto out_commit;
 597        }
 598
 599        ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
 600                                   &suballoc_bit_start, &num_got,
 601                                   &first_blkno);
 602        if (ret) {
 603                mlog_errno(ret);
 604                goto out_commit;
 605        }
 606
 607        new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
 608        if (!new_tree) {
 609                ret = -ENOMEM;
 610                mlog_errno(ret);
 611                goto out_commit;
 612        }
 613
 614        new_bh = sb_getblk(inode->i_sb, first_blkno);
 615        ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
 616
 617        ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
 618                                      OCFS2_JOURNAL_ACCESS_CREATE);
 619        if (ret) {
 620                mlog_errno(ret);
 621                goto out_commit;
 622        }
 623
 624        /* Initialize ocfs2_refcount_block. */
 625        rb = (struct ocfs2_refcount_block *)new_bh->b_data;
 626        memset(rb, 0, inode->i_sb->s_blocksize);
 627        strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
 628        rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
 629        rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
 630        rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
 631        rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
 632        rb->rf_blkno = cpu_to_le64(first_blkno);
 633        rb->rf_count = cpu_to_le32(1);
 634        rb->rf_records.rl_count =
 635                        cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
 636        spin_lock(&osb->osb_lock);
 637        rb->rf_generation = osb->s_next_generation++;
 638        spin_unlock(&osb->osb_lock);
 639
 640        ocfs2_journal_dirty(handle, new_bh);
 641
 642        spin_lock(&oi->ip_lock);
 643        oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
 644        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 645        di->i_refcount_loc = cpu_to_le64(first_blkno);
 646        spin_unlock(&oi->ip_lock);
 647
 648        trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
 649
 650        ocfs2_journal_dirty(handle, di_bh);
 651
 652        /*
 653         * We have to init the tree lock here since it will use
 654         * the generation number to create it.
 655         */
 656        new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
 657        ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
 658                                      new_tree->rf_generation);
 659
 660        spin_lock(&osb->osb_lock);
 661        tree = ocfs2_find_refcount_tree(osb, first_blkno);
 662
 663        /*
 664         * We've just created a new refcount tree in this block.  If
 665         * we found a refcount tree on the ocfs2_super, it must be
 666         * one we just deleted.  We free the old tree before
 667         * inserting the new tree.
 668         */
 669        BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
 670        if (tree)
 671                ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
 672        ocfs2_insert_refcount_tree(osb, new_tree);
 673        spin_unlock(&osb->osb_lock);
 674        new_tree = NULL;
 675        if (tree)
 676                ocfs2_refcount_tree_put(tree);
 677
 678out_commit:
 679        ocfs2_commit_trans(osb, handle);
 680
 681out:
 682        if (new_tree) {
 683                ocfs2_metadata_cache_exit(&new_tree->rf_ci);
 684                kfree(new_tree);
 685        }
 686
 687        brelse(new_bh);
 688        if (meta_ac)
 689                ocfs2_free_alloc_context(meta_ac);
 690
 691        return ret;
 692}
 693
 694static int ocfs2_set_refcount_tree(struct inode *inode,
 695                                   struct buffer_head *di_bh,
 696                                   u64 refcount_loc)
 697{
 698        int ret;
 699        handle_t *handle = NULL;
 700        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 701        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 702        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 703        struct buffer_head *ref_root_bh = NULL;
 704        struct ocfs2_refcount_block *rb;
 705        struct ocfs2_refcount_tree *ref_tree;
 706
 707        BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
 708
 709        ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
 710                                       &ref_tree, &ref_root_bh);
 711        if (ret) {
 712                mlog_errno(ret);
 713                return ret;
 714        }
 715
 716        handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
 717        if (IS_ERR(handle)) {
 718                ret = PTR_ERR(handle);
 719                mlog_errno(ret);
 720                goto out;
 721        }
 722
 723        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 724                                      OCFS2_JOURNAL_ACCESS_WRITE);
 725        if (ret) {
 726                mlog_errno(ret);
 727                goto out_commit;
 728        }
 729
 730        ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
 731                                      OCFS2_JOURNAL_ACCESS_WRITE);
 732        if (ret) {
 733                mlog_errno(ret);
 734                goto out_commit;
 735        }
 736
 737        rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
 738        le32_add_cpu(&rb->rf_count, 1);
 739
 740        ocfs2_journal_dirty(handle, ref_root_bh);
 741
 742        spin_lock(&oi->ip_lock);
 743        oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
 744        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 745        di->i_refcount_loc = cpu_to_le64(refcount_loc);
 746        spin_unlock(&oi->ip_lock);
 747        ocfs2_journal_dirty(handle, di_bh);
 748
 749out_commit:
 750        ocfs2_commit_trans(osb, handle);
 751out:
 752        ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 753        brelse(ref_root_bh);
 754
 755        return ret;
 756}
 757
 /*
  * Detach @inode from its refcount tree.
  *
  * Returns 0 straight away when the inode does not carry
  * OCFS2_HAS_REFCOUNT_FL.  Otherwise the tree named by the dinode's
  * i_refcount_loc is locked for writing and, inside one transaction, the
  * flag and i_refcount_loc are cleared in the dinode and the root block's
  * rf_count is decremented.  When the count reaches zero the in-memory
  * tree is unlinked from the per-super rb-tree and the root block's
  * suballocator bit is freed.
  *
  * Returns 0 on success or a negative errno.
  */
 758int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
 759{
 760        int ret, delete_tree = 0;
 761        handle_t *handle = NULL;
 762        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 763        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 764        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 765        struct ocfs2_refcount_block *rb;
 766        struct inode *alloc_inode = NULL;
 767        struct buffer_head *alloc_bh = NULL;
 768        struct buffer_head *blk_bh = NULL;
 769        struct ocfs2_refcount_tree *ref_tree;
 770        int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
 771        u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
 772        u16 bit = 0;
 773
 774        if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
 775                return 0;
 776
 777        BUG_ON(!ref_blkno);
 778        ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
 779        if (ret) {
 780                mlog_errno(ret);
 781                return ret;
 782        }
 783
 784        rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
 785
 786        /*
 787         * If we are the last user, we need to free the block.
 788         * So lock the allocator ahead.
 789         */
 790        if (le32_to_cpu(rb->rf_count) == 1) {
 791                blk = le64_to_cpu(rb->rf_blkno);
 792                bit = le16_to_cpu(rb->rf_suballoc_bit);
                    /* Use the recorded group when set, else derive it from blk/bit. */
 793                if (rb->rf_suballoc_loc)
 794                        bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
 795                else
 796                        bg_blkno = ocfs2_which_suballoc_group(blk, bit);
 797
 798                alloc_inode = ocfs2_get_system_file_inode(osb,
 799                                        EXTENT_ALLOC_SYSTEM_INODE,
 800                                        le16_to_cpu(rb->rf_suballoc_slot));
 801                if (!alloc_inode) {
 802                        ret = -ENOMEM;
 803                        mlog_errno(ret);
 804                        goto out;
 805                }
 806                mutex_lock(&alloc_inode->i_mutex);
 807
 808                ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
 809                if (ret) {
 810                        mlog_errno(ret);
 811                        goto out_mutex;
 812                }
 813
                    /* Freeing the block needs extra journal credits. */
 814                credits += OCFS2_SUBALLOC_FREE;
 815        }
 816
 817        handle = ocfs2_start_trans(osb, credits);
 818        if (IS_ERR(handle)) {
 819                ret = PTR_ERR(handle);
 820                mlog_errno(ret);
 821                goto out_unlock;
 822        }
 823
 824        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 825                                      OCFS2_JOURNAL_ACCESS_WRITE);
 826        if (ret) {
 827                mlog_errno(ret);
 828                goto out_commit;
 829        }
 830
 831        ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
 832                                      OCFS2_JOURNAL_ACCESS_WRITE);
 833        if (ret) {
 834                mlog_errno(ret);
 835                goto out_commit;
 836        }
 837
            /* Clear the refcount flag and location in memory and on disk. */
 838        spin_lock(&oi->ip_lock);
 839        oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
 840        di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 841        di->i_refcount_loc = 0;
 842        spin_unlock(&oi->ip_lock);
 843        ocfs2_journal_dirty(handle, di_bh);
 844
 845        le32_add_cpu(&rb->rf_count , -1);
 846        ocfs2_journal_dirty(handle, blk_bh);
 847
            /* Last user gone: unlink the tree and free the root block's bit. */
 848        if (!rb->rf_count) {
 849                delete_tree = 1;
 850                ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
 851                ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
 852                                               alloc_bh, bit, bg_blkno, 1);
 853                if (ret)
 854                        mlog_errno(ret);
 855        }
 856
            /* Unwind in reverse order of acquisition. */
 857out_commit:
 858        ocfs2_commit_trans(osb, handle);
 859out_unlock:
 860        if (alloc_inode) {
 861                ocfs2_inode_unlock(alloc_inode, 1);
 862                brelse(alloc_bh);
 863        }
 864out_mutex:
 865        if (alloc_inode) {
 866                mutex_unlock(&alloc_inode->i_mutex);
 867                iput(alloc_inode);
 868        }
 869out:
            /*
             * The unlock drops one reference on ref_tree; the extra put
             * below is issued only when this call unlinked the tree.
             */
 870        ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 871        if (delete_tree)
 872                ocfs2_refcount_tree_put(ref_tree);
 873        brelse(blk_bh);
 874
 875        return ret;
 876}
 877
 878static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
 879                                          struct buffer_head *ref_leaf_bh,
 880                                          u64 cpos, unsigned int len,
 881                                          struct ocfs2_refcount_rec *ret_rec,
 882                                          int *index)
 883{
 884        int i = 0;
 885        struct ocfs2_refcount_block *rb =
 886                (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
 887        struct ocfs2_refcount_rec *rec = NULL;
 888
 889        for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
 890                rec = &rb->rf_records.rl_recs[i];
 891
 892                if (le64_to_cpu(rec->r_cpos) +
 893                    le32_to_cpu(rec->r_clusters) <= cpos)
 894                        continue;
 895                else if (le64_to_cpu(rec->r_cpos) > cpos)
 896                        break;
 897
 898                /* ok, cpos fail in this rec. Just return. */
 899                if (ret_rec)
 900                        *ret_rec = *rec;
 901                goto out;
 902        }
 903
 904        if (ret_rec) {
 905                /* We meet with a hole here, so fake the rec. */
 906                ret_rec->r_cpos = cpu_to_le64(cpos);
 907                ret_rec->r_refcount = 0;
 908                if (i < le16_to_cpu(rb->rf_records.rl_used) &&
 909                    le64_to_cpu(rec->r_cpos) < cpos + len)
 910                        ret_rec->r_clusters =
 911                                cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
 912                else
 913                        ret_rec->r_clusters = cpu_to_le32(len);
 914        }
 915
 916out:
 917        *index = i;
 918}
 919
 920/*
 921 * Try to remove refcount tree. The mechanism is:
 922 * 1) Check whether i_clusters == 0, if no, exit.
 923 * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
 924 * 3) Check whether we have inline xattr stored outside, if yes, exit.
 925 * 4) Remove the tree.
 926 */
 927int ocfs2_try_remove_refcount_tree(struct inode *inode,
 928                                   struct buffer_head *di_bh)
 929{
 930        int ret;
 931        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 932        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 933
 934        down_write(&oi->ip_xattr_sem);
 935        down_write(&oi->ip_alloc_sem);
 936
 937        if (oi->ip_clusters)
 938                goto out;
 939
 940        if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
 941                goto out;
 942
 943        if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
 944            ocfs2_has_inline_xattr_value_outside(inode, di))
 945                goto out;
 946
 947        ret = ocfs2_remove_refcount_tree(inode, di_bh);
 948        if (ret)
 949                mlog_errno(ret);
 950out:
 951        up_write(&oi->ip_alloc_sem);
 952        up_write(&oi->ip_xattr_sem);
 953        return 0;
 954}
 955
 956/*
 957 * Find the end range for a leaf refcount block indicated by
 958 * el->l_recs[index].e_blkno.
 959 */
 960static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
 961                                       struct buffer_head *ref_root_bh,
 962                                       struct ocfs2_extent_block *eb,
 963                                       struct ocfs2_extent_list *el,
 964                                       int index,  u32 *cpos_end)
 965{
 966        int ret, i, subtree_root;
 967        u32 cpos;
 968        u64 blkno;
 969        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 970        struct ocfs2_path *left_path = NULL, *right_path = NULL;
 971        struct ocfs2_extent_tree et;
 972        struct ocfs2_extent_list *tmp_el;
 973
 974        if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
 975                /*
 976                 * We have a extent rec after index, so just use the e_cpos
 977                 * of the next extent rec.
 978                 */
 979                *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
 980                return 0;
 981        }
 982
 983        if (!eb || (eb && !eb->h_next_leaf_blk)) {
 984                /*
 985                 * We are the last extent rec, so any high cpos should
 986                 * be stored in this leaf refcount block.
 987                 */
 988                *cpos_end = UINT_MAX;
 989                return 0;
 990        }
 991
 992        /*
 993         * If the extent block isn't the last one, we have to find
 994         * the subtree root between this extent block and the next
 995         * leaf extent block and get the corresponding e_cpos from
 996         * the subroot. Otherwise we may corrupt the b-tree.
 997         */
 998        ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
 999
1000        left_path = ocfs2_new_path_from_et(&et);
1001        if (!left_path) {
1002                ret = -ENOMEM;
1003                mlog_errno(ret);
1004                goto out;
1005        }
1006
1007        cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1008        ret = ocfs2_find_path(ci, left_path, cpos);
1009        if (ret) {
1010                mlog_errno(ret);
1011                goto out;
1012        }
1013
1014        right_path = ocfs2_new_path_from_path(left_path);
1015        if (!right_path) {
1016                ret = -ENOMEM;
1017                mlog_errno(ret);
1018                goto out;
1019        }
1020
1021        ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1022        if (ret) {
1023                mlog_errno(ret);
1024                goto out;
1025        }
1026
1027        ret = ocfs2_find_path(ci, right_path, cpos);
1028        if (ret) {
1029                mlog_errno(ret);
1030                goto out;
1031        }
1032
1033        subtree_root = ocfs2_find_subtree_root(&et, left_path,
1034                                               right_path);
1035
1036        tmp_el = left_path->p_node[subtree_root].el;
1037        blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1038        for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
1039                if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1040                        *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1041                        break;
1042                }
1043        }
1044
1045        BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
1046
1047out:
1048        ocfs2_free_path(left_path);
1049        ocfs2_free_path(right_path);
1050        return ret;
1051}
1052
1053/*
1054 * Given a cpos and len, try to find the refcount record which contains cpos.
1055 * 1. If cpos can be found in one refcount record, return the record.
1056 * 2. If cpos can't be found, return a fake record which start from cpos
1057 *    and end at a small value between cpos+len and start of the next record.
1058 *    This fake record has r_refcount = 0.
1059 */
1060static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1061                                  struct buffer_head *ref_root_bh,
1062                                  u64 cpos, unsigned int len,
1063                                  struct ocfs2_refcount_rec *ret_rec,
1064                                  int *index,
1065                                  struct buffer_head **ret_bh)
1066{
1067        int ret = 0, i, found;
1068        u32 low_cpos, uninitialized_var(cpos_end);
1069        struct ocfs2_extent_list *el;
1070        struct ocfs2_extent_rec *rec = NULL;
1071        struct ocfs2_extent_block *eb = NULL;
1072        struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
1073        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1074        struct ocfs2_refcount_block *rb =
1075                        (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1076
1077        if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
1078                ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
1079                                              ret_rec, index);
1080                *ret_bh = ref_root_bh;
1081                get_bh(ref_root_bh);
1082                return 0;
1083        }
1084
1085        el = &rb->rf_list;
1086        low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1087
1088        if (el->l_tree_depth) {
1089                ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1090                if (ret) {
1091                        mlog_errno(ret);
1092                        goto out;
1093                }
1094
1095                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1096                el = &eb->h_list;
1097
1098                if (el->l_tree_depth) {
1099                        ocfs2_error(sb,
1100                        "refcount tree %llu has non zero tree "
1101                        "depth in leaf btree tree block %llu\n",
1102                        (unsigned long long)ocfs2_metadata_cache_owner(ci),
1103                        (unsigned long long)eb_bh->b_blocknr);
1104                        ret = -EROFS;
1105                        goto out;
1106                }
1107        }
1108
1109        found = 0;
1110        for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1111                rec = &el->l_recs[i];
1112
1113                if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1114                        found = 1;
1115                        break;
1116                }
1117        }
1118
1119        if (found) {
1120                ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1121                                                  eb, el, i, &cpos_end);
1122                if (ret) {
1123                        mlog_errno(ret);
1124                        goto out;
1125                }
1126
1127                if (cpos_end < low_cpos + len)
1128                        len = cpos_end - low_cpos;
1129        }
1130
1131        ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1132                                        &ref_leaf_bh);
1133        if (ret) {
1134                mlog_errno(ret);
1135                goto out;
1136        }
1137
1138        ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1139                                      ret_rec, index);
1140        *ret_bh = ref_leaf_bh;
1141out:
1142        brelse(eb_bh);
1143        return ret;
1144}
1145
/* How a refcount record can be merged with its neighbours in a record list. */
enum ocfs2_ref_rec_contig {
	REF_CONTIG_NONE		= 0,	/* not contiguous with any neighbour */
	REF_CONTIG_LEFT		= 1,	/* contiguous with the previous record */
	REF_CONTIG_RIGHT	= 2,	/* contiguous with the next record */
	REF_CONTIG_LEFTRIGHT	= 3,	/* contiguous with both neighbours */
};
1152
1153static enum ocfs2_ref_rec_contig
1154        ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1155                                    int index)
1156{
1157        if ((rb->rf_records.rl_recs[index].r_refcount ==
1158            rb->rf_records.rl_recs[index + 1].r_refcount) &&
1159            (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1160            le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1161            le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1162                return REF_CONTIG_RIGHT;
1163
1164        return REF_CONTIG_NONE;
1165}
1166
1167static enum ocfs2_ref_rec_contig
1168        ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1169                                  int index)
1170{
1171        enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1172
1173        if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1174                ret = ocfs2_refcount_rec_adjacent(rb, index);
1175
1176        if (index > 0) {
1177                enum ocfs2_ref_rec_contig tmp;
1178
1179                tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1180
1181                if (tmp == REF_CONTIG_RIGHT) {
1182                        if (ret == REF_CONTIG_RIGHT)
1183                                ret = REF_CONTIG_LEFTRIGHT;
1184                        else
1185                                ret = REF_CONTIG_LEFT;
1186                }
1187        }
1188
1189        return ret;
1190}
1191
1192static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1193                                           int index)
1194{
1195        BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1196               rb->rf_records.rl_recs[index+1].r_refcount);
1197
1198        le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1199                     le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1200
1201        if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1202                memmove(&rb->rf_records.rl_recs[index + 1],
1203                        &rb->rf_records.rl_recs[index + 2],
1204                        sizeof(struct ocfs2_refcount_rec) *
1205                        (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1206
1207        memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1208               0, sizeof(struct ocfs2_refcount_rec));
1209        le16_add_cpu(&rb->rf_records.rl_used, -1);
1210}
1211
1212/*
1213 * Merge the refcount rec if we are contiguous with the adjacent recs.
1214 */
1215static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1216                                     int index)
1217{
1218        enum ocfs2_ref_rec_contig contig =
1219                                ocfs2_refcount_rec_contig(rb, index);
1220
1221        if (contig == REF_CONTIG_NONE)
1222                return;
1223
1224        if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1225                BUG_ON(index == 0);
1226                index--;
1227        }
1228
1229        ocfs2_rotate_refcount_rec_left(rb, index);
1230
1231        if (contig == REF_CONTIG_LEFTRIGHT)
1232                ocfs2_rotate_refcount_rec_left(rb, index);
1233}
1234
1235/*
1236 * Change the refcount indexed by "index" in ref_bh.
1237 * If refcount reaches 0, remove it.
1238 */
1239static int ocfs2_change_refcount_rec(handle_t *handle,
1240                                     struct ocfs2_caching_info *ci,
1241                                     struct buffer_head *ref_leaf_bh,
1242                                     int index, int merge, int change)
1243{
1244        int ret;
1245        struct ocfs2_refcount_block *rb =
1246                        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1247        struct ocfs2_refcount_list *rl = &rb->rf_records;
1248        struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1249
1250        ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1251                                      OCFS2_JOURNAL_ACCESS_WRITE);
1252        if (ret) {
1253                mlog_errno(ret);
1254                goto out;
1255        }
1256
1257        trace_ocfs2_change_refcount_rec(
1258                (unsigned long long)ocfs2_metadata_cache_owner(ci),
1259                index, le32_to_cpu(rec->r_refcount), change);
1260        le32_add_cpu(&rec->r_refcount, change);
1261
1262        if (!rec->r_refcount) {
1263                if (index != le16_to_cpu(rl->rl_used) - 1) {
1264                        memmove(rec, rec + 1,
1265                                (le16_to_cpu(rl->rl_used) - index - 1) *
1266                                sizeof(struct ocfs2_refcount_rec));
1267                        memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1268                               0, sizeof(struct ocfs2_refcount_rec));
1269                }
1270
1271                le16_add_cpu(&rl->rl_used, -1);
1272        } else if (merge)
1273                ocfs2_refcount_rec_merge(rb, index);
1274
1275        ocfs2_journal_dirty(handle, ref_leaf_bh);
1276out:
1277        return ret;
1278}
1279
1280static int ocfs2_expand_inline_ref_root(handle_t *handle,
1281                                        struct ocfs2_caching_info *ci,
1282                                        struct buffer_head *ref_root_bh,
1283                                        struct buffer_head **ref_leaf_bh,
1284                                        struct ocfs2_alloc_context *meta_ac)
1285{
1286        int ret;
1287        u16 suballoc_bit_start;
1288        u32 num_got;
1289        u64 suballoc_loc, blkno;
1290        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1291        struct buffer_head *new_bh = NULL;
1292        struct ocfs2_refcount_block *new_rb;
1293        struct ocfs2_refcount_block *root_rb =
1294                        (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1295
1296        ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1297                                      OCFS2_JOURNAL_ACCESS_WRITE);
1298        if (ret) {
1299                mlog_errno(ret);
1300                goto out;
1301        }
1302
1303        ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1304                                   &suballoc_bit_start, &num_got,
1305                                   &blkno);
1306        if (ret) {
1307                mlog_errno(ret);
1308                goto out;
1309        }
1310
1311        new_bh = sb_getblk(sb, blkno);
1312        if (new_bh == NULL) {
1313                ret = -EIO;
1314                mlog_errno(ret);
1315                goto out;
1316        }
1317        ocfs2_set_new_buffer_uptodate(ci, new_bh);
1318
1319        ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1320                                      OCFS2_JOURNAL_ACCESS_CREATE);
1321        if (ret) {
1322                mlog_errno(ret);
1323                goto out;
1324        }
1325
1326        /*
1327         * Initialize ocfs2_refcount_block.
1328         * It should contain the same information as the old root.
1329         * so just memcpy it and change the corresponding field.
1330         */
1331        memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1332
1333        new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1334        new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1335        new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1336        new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1337        new_rb->rf_blkno = cpu_to_le64(blkno);
1338        new_rb->rf_cpos = cpu_to_le32(0);
1339        new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1340        new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1341        ocfs2_journal_dirty(handle, new_bh);
1342
1343        /* Now change the root. */
1344        memset(&root_rb->rf_list, 0, sb->s_blocksize -
1345               offsetof(struct ocfs2_refcount_block, rf_list));
1346        root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1347        root_rb->rf_clusters = cpu_to_le32(1);
1348        root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1349        root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1350        root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1351        root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1352
1353        ocfs2_journal_dirty(handle, ref_root_bh);
1354
1355        trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
1356                le16_to_cpu(new_rb->rf_records.rl_used));
1357
1358        *ref_leaf_bh = new_bh;
1359        new_bh = NULL;
1360out:
1361        brelse(new_bh);
1362        return ret;
1363}
1364
1365static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1366                                           struct ocfs2_refcount_rec *next)
1367{
1368        if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1369                ocfs2_get_ref_rec_low_cpos(next))
1370                return 1;
1371
1372        return 0;
1373}
1374
1375static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1376{
1377        const struct ocfs2_refcount_rec *l = a, *r = b;
1378        u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1379        u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1380
1381        if (l_cpos > r_cpos)
1382                return 1;
1383        if (l_cpos < r_cpos)
1384                return -1;
1385        return 0;
1386}
1387
1388static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1389{
1390        const struct ocfs2_refcount_rec *l = a, *r = b;
1391        u64 l_cpos = le64_to_cpu(l->r_cpos);
1392        u64 r_cpos = le64_to_cpu(r->r_cpos);
1393
1394        if (l_cpos > r_cpos)
1395                return 1;
1396        if (l_cpos < r_cpos)
1397                return -1;
1398        return 0;
1399}
1400
1401static void swap_refcount_rec(void *a, void *b, int size)
1402{
1403        struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1404
1405        tmp = *(struct ocfs2_refcount_rec *)l;
1406        *(struct ocfs2_refcount_rec *)l =
1407                        *(struct ocfs2_refcount_rec *)r;
1408        *(struct ocfs2_refcount_rec *)r = tmp;
1409}
1410
1411/*
1412 * The refcount cpos are ordered by their 64bit cpos,
1413 * But we will use the low 32 bit to be the e_cpos in the b-tree.
1414 * So we need to make sure that this pos isn't intersected with others.
1415 *
1416 * Note: The refcount block is already sorted by their low 32 bit cpos,
1417 *       So just try the middle pos first, and we will exit when we find
1418 *       the good position.
1419 */
1420static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1421                                         u32 *split_pos, int *split_index)
1422{
1423        int num_used = le16_to_cpu(rl->rl_used);
1424        int delta, middle = num_used / 2;
1425
1426        for (delta = 0; delta < middle; delta++) {
1427                /* Let's check delta earlier than middle */
1428                if (ocfs2_refcount_rec_no_intersect(
1429                                        &rl->rl_recs[middle - delta - 1],
1430                                        &rl->rl_recs[middle - delta])) {
1431                        *split_index = middle - delta;
1432                        break;
1433                }
1434
1435                /* For even counts, don't walk off the end */
1436                if ((middle + delta + 1) == num_used)
1437                        continue;
1438
1439                /* Now try delta past middle */
1440                if (ocfs2_refcount_rec_no_intersect(
1441                                        &rl->rl_recs[middle + delta],
1442                                        &rl->rl_recs[middle + delta + 1])) {
1443                        *split_index = middle + delta + 1;
1444                        break;
1445                }
1446        }
1447
1448        if (delta >= middle)
1449                return -ENOSPC;
1450
1451        *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1452        return 0;
1453}
1454
1455static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1456                                            struct buffer_head *new_bh,
1457                                            u32 *split_cpos)
1458{
1459        int split_index = 0, num_moved, ret;
1460        u32 cpos = 0;
1461        struct ocfs2_refcount_block *rb =
1462                        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1463        struct ocfs2_refcount_list *rl = &rb->rf_records;
1464        struct ocfs2_refcount_block *new_rb =
1465                        (struct ocfs2_refcount_block *)new_bh->b_data;
1466        struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1467
1468        trace_ocfs2_divide_leaf_refcount_block(
1469                (unsigned long long)ref_leaf_bh->b_blocknr,
1470                le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
1471
1472        /*
1473         * XXX: Improvement later.
1474         * If we know all the high 32 bit cpos is the same, no need to sort.
1475         *
1476         * In order to make the whole process safe, we do:
1477         * 1. sort the entries by their low 32 bit cpos first so that we can
1478         *    find the split cpos easily.
1479         * 2. call ocfs2_insert_extent to insert the new refcount block.
1480         * 3. move the refcount rec to the new block.
1481         * 4. sort the entries by their 64 bit cpos.
1482         * 5. dirty the new_rb and rb.
1483         */
1484        sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1485             sizeof(struct ocfs2_refcount_rec),
1486             cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1487
1488        ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1489        if (ret) {
1490                mlog_errno(ret);
1491                return ret;
1492        }
1493
1494        new_rb->rf_cpos = cpu_to_le32(cpos);
1495
1496        /* move refcount records starting from split_index to the new block. */
1497        num_moved = le16_to_cpu(rl->rl_used) - split_index;
1498        memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1499               num_moved * sizeof(struct ocfs2_refcount_rec));
1500
1501        /*ok, remove the entries we just moved over to the other block. */
1502        memset(&rl->rl_recs[split_index], 0,
1503               num_moved * sizeof(struct ocfs2_refcount_rec));
1504
1505        /* change old and new rl_used accordingly. */
1506        le16_add_cpu(&rl->rl_used, -num_moved);
1507        new_rl->rl_used = cpu_to_le16(num_moved);
1508
1509        sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1510             sizeof(struct ocfs2_refcount_rec),
1511             cmp_refcount_rec_by_cpos, swap_refcount_rec);
1512
1513        sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1514             sizeof(struct ocfs2_refcount_rec),
1515             cmp_refcount_rec_by_cpos, swap_refcount_rec);
1516
1517        *split_cpos = cpos;
1518        return 0;
1519}
1520
1521static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1522                                         struct ocfs2_caching_info *ci,
1523                                         struct buffer_head *ref_root_bh,
1524                                         struct buffer_head *ref_leaf_bh,
1525                                         struct ocfs2_alloc_context *meta_ac)
1526{
1527        int ret;
1528        u16 suballoc_bit_start;
1529        u32 num_got, new_cpos;
1530        u64 suballoc_loc, blkno;
1531        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1532        struct ocfs2_refcount_block *root_rb =
1533                        (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1534        struct buffer_head *new_bh = NULL;
1535        struct ocfs2_refcount_block *new_rb;
1536        struct ocfs2_extent_tree ref_et;
1537
1538        BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1539
1540        ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1541                                      OCFS2_JOURNAL_ACCESS_WRITE);
1542        if (ret) {
1543                mlog_errno(ret);
1544                goto out;
1545        }
1546
1547        ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1548                                      OCFS2_JOURNAL_ACCESS_WRITE);
1549        if (ret) {
1550                mlog_errno(ret);
1551                goto out;
1552        }
1553
1554        ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1555                                   &suballoc_bit_start, &num_got,
1556                                   &blkno);
1557        if (ret) {
1558                mlog_errno(ret);
1559                goto out;
1560        }
1561
1562        new_bh = sb_getblk(sb, blkno);
1563        if (new_bh == NULL) {
1564                ret = -EIO;
1565                mlog_errno(ret);
1566                goto out;
1567        }
1568        ocfs2_set_new_buffer_uptodate(ci, new_bh);
1569
1570        ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1571                                      OCFS2_JOURNAL_ACCESS_CREATE);
1572        if (ret) {
1573                mlog_errno(ret);
1574                goto out;
1575        }
1576
1577        /* Initialize ocfs2_refcount_block. */
1578        new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1579        memset(new_rb, 0, sb->s_blocksize);
1580        strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1581        new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1582        new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1583        new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1584        new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1585        new_rb->rf_blkno = cpu_to_le64(blkno);
1586        new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1587        new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1588        new_rb->rf_records.rl_count =
1589                                cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1590        new_rb->rf_generation = root_rb->rf_generation;
1591
1592        ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1593        if (ret) {
1594                mlog_errno(ret);
1595                goto out;
1596        }
1597
1598        ocfs2_journal_dirty(handle, ref_leaf_bh);
1599        ocfs2_journal_dirty(handle, new_bh);
1600
1601        ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1602
1603        trace_ocfs2_new_leaf_refcount_block(
1604                        (unsigned long long)new_bh->b_blocknr, new_cpos);
1605
1606        /* Insert the new leaf block with the specific offset cpos. */
1607        ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1608                                  1, 0, meta_ac);
1609        if (ret)
1610                mlog_errno(ret);
1611
1612out:
1613        brelse(new_bh);
1614        return ret;
1615}
1616
1617static int ocfs2_expand_refcount_tree(handle_t *handle,
1618                                      struct ocfs2_caching_info *ci,
1619                                      struct buffer_head *ref_root_bh,
1620                                      struct buffer_head *ref_leaf_bh,
1621                                      struct ocfs2_alloc_context *meta_ac)
1622{
1623        int ret;
1624        struct buffer_head *expand_bh = NULL;
1625
1626        if (ref_root_bh == ref_leaf_bh) {
1627                /*
1628                 * the old root bh hasn't been expanded to a b-tree,
1629                 * so expand it first.
1630                 */
1631                ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1632                                                   &expand_bh, meta_ac);
1633                if (ret) {
1634                        mlog_errno(ret);
1635                        goto out;
1636                }
1637        } else {
1638                expand_bh = ref_leaf_bh;
1639                get_bh(expand_bh);
1640        }
1641
1642
1643        /* Now add a new refcount block into the tree.*/
1644        ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1645                                            expand_bh, meta_ac);
1646        if (ret)
1647                mlog_errno(ret);
1648out:
1649        brelse(expand_bh);
1650        return ret;
1651}
1652
1653/*
1654 * Adjust the extent rec in b-tree representing ref_leaf_bh.
1655 *
1656 * Only called when we have inserted a new refcount rec at index 0
1657 * which means ocfs2_extent_rec.e_cpos may need some change.
1658 */
1659static int ocfs2_adjust_refcount_rec(handle_t *handle,
1660                                     struct ocfs2_caching_info *ci,
1661                                     struct buffer_head *ref_root_bh,
1662                                     struct buffer_head *ref_leaf_bh,
1663                                     struct ocfs2_refcount_rec *rec)
1664{
1665        int ret = 0, i;
1666        u32 new_cpos, old_cpos;
1667        struct ocfs2_path *path = NULL;
1668        struct ocfs2_extent_tree et;
1669        struct ocfs2_refcount_block *rb =
1670                (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1671        struct ocfs2_extent_list *el;
1672
1673        if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1674                goto out;
1675
1676        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1677        old_cpos = le32_to_cpu(rb->rf_cpos);
1678        new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1679        if (old_cpos <= new_cpos)
1680                goto out;
1681
1682        ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1683
1684        path = ocfs2_new_path_from_et(&et);
1685        if (!path) {
1686                ret = -ENOMEM;
1687                mlog_errno(ret);
1688                goto out;
1689        }
1690
1691        ret = ocfs2_find_path(ci, path, old_cpos);
1692        if (ret) {
1693                mlog_errno(ret);
1694                goto out;
1695        }
1696
1697        /*
1698         * 2 more credits, one for the leaf refcount block, one for
1699         * the extent block contains the extent rec.
1700         */
1701        ret = ocfs2_extend_trans(handle, 2);
1702        if (ret < 0) {
1703                mlog_errno(ret);
1704                goto out;
1705        }
1706
1707        ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1708                                      OCFS2_JOURNAL_ACCESS_WRITE);
1709        if (ret < 0) {
1710                mlog_errno(ret);
1711                goto out;
1712        }
1713
1714        ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1715                                      OCFS2_JOURNAL_ACCESS_WRITE);
1716        if (ret < 0) {
1717                mlog_errno(ret);
1718                goto out;
1719        }
1720
1721        /* change the leaf extent block first. */
1722        el = path_leaf_el(path);
1723
1724        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1725                if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1726                        break;
1727
1728        BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1729
1730        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1731
1732        /* change the r_cpos in the leaf block. */
1733        rb->rf_cpos = cpu_to_le32(new_cpos);
1734
1735        ocfs2_journal_dirty(handle, path_leaf_bh(path));
1736        ocfs2_journal_dirty(handle, ref_leaf_bh);
1737
1738out:
1739        ocfs2_free_path(path);
1740        return ret;
1741}
1742
1743static int ocfs2_insert_refcount_rec(handle_t *handle,
1744                                     struct ocfs2_caching_info *ci,
1745                                     struct buffer_head *ref_root_bh,
1746                                     struct buffer_head *ref_leaf_bh,
1747                                     struct ocfs2_refcount_rec *rec,
1748                                     int index, int merge,
1749                                     struct ocfs2_alloc_context *meta_ac)
1750{
1751        int ret;
1752        struct ocfs2_refcount_block *rb =
1753                        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1754        struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1755        struct buffer_head *new_bh = NULL;
1756
1757        BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1758
1759        if (rf_list->rl_used == rf_list->rl_count) {
1760                u64 cpos = le64_to_cpu(rec->r_cpos);
1761                u32 len = le32_to_cpu(rec->r_clusters);
1762
1763                ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1764                                                 ref_leaf_bh, meta_ac);
1765                if (ret) {
1766                        mlog_errno(ret);
1767                        goto out;
1768                }
1769
1770                ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1771                                             cpos, len, NULL, &index,
1772                                             &new_bh);
1773                if (ret) {
1774                        mlog_errno(ret);
1775                        goto out;
1776                }
1777
1778                ref_leaf_bh = new_bh;
1779                rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1780                rf_list = &rb->rf_records;
1781        }
1782
1783        ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1784                                      OCFS2_JOURNAL_ACCESS_WRITE);
1785        if (ret) {
1786                mlog_errno(ret);
1787                goto out;
1788        }
1789
1790        if (index < le16_to_cpu(rf_list->rl_used))
1791                memmove(&rf_list->rl_recs[index + 1],
1792                        &rf_list->rl_recs[index],
1793                        (le16_to_cpu(rf_list->rl_used) - index) *
1794                         sizeof(struct ocfs2_refcount_rec));
1795
1796        trace_ocfs2_insert_refcount_rec(
1797                (unsigned long long)ref_leaf_bh->b_blocknr, index,
1798                (unsigned long long)le64_to_cpu(rec->r_cpos),
1799                le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
1800
1801        rf_list->rl_recs[index] = *rec;
1802
1803        le16_add_cpu(&rf_list->rl_used, 1);
1804
1805        if (merge)
1806                ocfs2_refcount_rec_merge(rb, index);
1807
1808        ocfs2_journal_dirty(handle, ref_leaf_bh);
1809
1810        if (index == 0) {
1811                ret = ocfs2_adjust_refcount_rec(handle, ci,
1812                                                ref_root_bh,
1813                                                ref_leaf_bh, rec);
1814                if (ret)
1815                        mlog_errno(ret);
1816        }
1817out:
1818        brelse(new_bh);
1819        return ret;
1820}
1821
/*
 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
 * This is much simpler than our b-tree code.
 * split_rec is the new refcount rec we want to insert.
 * If split_rec->r_refcount > 0, we are changing the refcount(in case we
 * increase refcount or decrease a refcount to non-zero).
 * If split_rec->r_refcount == 0, we are punching a hole in current refcount
 * rec( in case we decrease a refcount to zero).
 *
 * If the leaf cannot hold the extra records, the tree is expanded and the
 * original record is looked up again; the split is then applied to the
 * block and index returned by that lookup.  When "merge" is non-zero and
 * split_rec has a refcount, the slot written with split_rec is handed to
 * ocfs2_refcount_rec_merge().
 *
 * It is a BUG for ref_leaf_bh to have OCFS2_REFCOUNT_TREE_FL set.
 * "dealloc" is accepted but not referenced in this function.
 *
 * Returns 0 on success, otherwise the error from the failing helper.
 */
static int ocfs2_split_refcount_rec(handle_t *handle,
                                    struct ocfs2_caching_info *ci,
                                    struct buffer_head *ref_root_bh,
                                    struct buffer_head *ref_leaf_bh,
                                    struct ocfs2_refcount_rec *split_rec,
                                    int index, int merge,
                                    struct ocfs2_alloc_context *meta_ac,
                                    struct ocfs2_cached_dealloc_ctxt *dealloc)
{
        int ret, recs_need;
        u32 len;
        struct ocfs2_refcount_block *rb =
                        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
        struct ocfs2_refcount_list *rf_list = &rb->rf_records;
        struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
        struct ocfs2_refcount_rec *tail_rec = NULL;
        struct buffer_head *new_bh = NULL;

        BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);

        trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
                le32_to_cpu(orig_rec->r_clusters),
                le32_to_cpu(orig_rec->r_refcount),
                le64_to_cpu(split_rec->r_cpos),
                le32_to_cpu(split_rec->r_clusters),
                le32_to_cpu(split_rec->r_refcount));

        /*
         * If we just need to split the header or tail clusters,
         * no more recs are needed, just split is OK.
         * Otherwise we need at least one new rec.
         */
        if (!split_rec->r_refcount &&
            (split_rec->r_cpos == orig_rec->r_cpos ||
             le64_to_cpu(split_rec->r_cpos) +
             le32_to_cpu(split_rec->r_clusters) ==
             le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
                recs_need = 0;
        else
                recs_need = 1;

        /*
         * We need one more rec if we split in the middle and the new rec has
         * some refcount in it.
         */
        if (split_rec->r_refcount &&
            (split_rec->r_cpos != orig_rec->r_cpos &&
             le64_to_cpu(split_rec->r_cpos) +
             le32_to_cpu(split_rec->r_clusters) !=
             le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
                recs_need++;

        /* If the leaf block doesn't have enough free records, expand it. */
        if (le16_to_cpu(rf_list->rl_used) + recs_need >
                                         le16_to_cpu(rf_list->rl_count)) {
                /* Filled in by the lookup below; not read afterwards. */
                struct ocfs2_refcount_rec tmp_rec;
                u64 cpos = le64_to_cpu(orig_rec->r_cpos);
                len = le32_to_cpu(orig_rec->r_clusters);
                ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
                                                 ref_leaf_bh, meta_ac);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                /*
                 * We have to re-get it since now cpos may be moved to
                 * another leaf block.
                 */
                ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
                                             cpos, len, &tmp_rec, &index,
                                             &new_bh);
                if (ret) {
                        mlog_errno(ret);
                        goto out;
                }

                /* From here on work on the block the lookup returned. */
                ref_leaf_bh = new_bh;
                rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
                rf_list = &rb->rf_records;
                orig_rec = &rf_list->rl_recs[index];
        }

        ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
                goto out;
        }

        /*
         * We have calculated out how many new records we need and store
         * in recs_need, so spare enough space first by moving the records
         * after "index" to the end.
         */
        if (index != le16_to_cpu(rf_list->rl_used) - 1)
                memmove(&rf_list->rl_recs[index + 1 + recs_need],
                        &rf_list->rl_recs[index + 1],
                        (le16_to_cpu(rf_list->rl_used) - index - 1) *
                         sizeof(struct ocfs2_refcount_rec));

        /* Clusters of orig_rec that lie beyond the end of split_rec. */
        len = (le64_to_cpu(orig_rec->r_cpos) +
              le32_to_cpu(orig_rec->r_clusters)) -
              (le64_to_cpu(split_rec->r_cpos) +
              le32_to_cpu(split_rec->r_clusters));

        /*
         * If we have "len", then we will split in the tail and move it
         * to the end of the space we have just spared.
         */
        if (len) {
                /* With recs_need == 0 this is orig_rec itself. */
                tail_rec = &rf_list->rl_recs[index + recs_need];

                memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
                le64_add_cpu(&tail_rec->r_cpos,
                             le32_to_cpu(tail_rec->r_clusters) - len);
                tail_rec->r_clusters = cpu_to_le32(len);
        }

        /*
         * If the split pos isn't the same as the original one, we need to
         * split in the head.
         *
         * Note: We have the chance that split_rec.r_refcount = 0,
         * recs_need = 0 and len > 0, which means we just cut the head from
         * the orig_rec and in that case we have done some modification in
         * orig_rec above, so the check for r_cpos is faked.
         */
        if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
                /* Shrink orig_rec to the part in front of split_rec. */
                len = le64_to_cpu(split_rec->r_cpos) -
                      le64_to_cpu(orig_rec->r_cpos);
                orig_rec->r_clusters = cpu_to_le32(len);
                index++;
        }

        le16_add_cpu(&rf_list->rl_used, recs_need);

        /* A zero refcount means a hole was punched: nothing to store. */
        if (split_rec->r_refcount) {
                rf_list->rl_recs[index] = *split_rec;
                trace_ocfs2_split_refcount_rec_insert(
                        (unsigned long long)ref_leaf_bh->b_blocknr, index,
                        (unsigned long long)le64_to_cpu(split_rec->r_cpos),
                        le32_to_cpu(split_rec->r_clusters),
                        le32_to_cpu(split_rec->r_refcount));

                if (merge)
                        ocfs2_refcount_rec_merge(rb, index);
        }

        ocfs2_journal_dirty(handle, ref_leaf_bh);

out:
        brelse(new_bh);
        return ret;
}
1986
1987static int __ocfs2_increase_refcount(handle_t *handle,
1988                                     struct ocfs2_caching_info *ci,
1989                                     struct buffer_head *ref_root_bh,
1990                                     u64 cpos, u32 len, int merge,
1991                                     struct ocfs2_alloc_context *meta_ac,
1992                                     struct ocfs2_cached_dealloc_ctxt *dealloc)
1993{
1994        int ret = 0, index;
1995        struct buffer_head *ref_leaf_bh = NULL;
1996        struct ocfs2_refcount_rec rec;
1997        unsigned int set_len = 0;
1998
1999        trace_ocfs2_increase_refcount_begin(
2000             (unsigned long long)ocfs2_metadata_cache_owner(ci),
2001             (unsigned long long)cpos, len);
2002
2003        while (len) {
2004                ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2005                                             cpos, len, &rec, &index,
2006                                             &ref_leaf_bh);
2007                if (ret) {
2008                        mlog_errno(ret);
2009                        goto out;
2010                }
2011
2012                set_len = le32_to_cpu(rec.r_clusters);
2013
2014                /*
2015                 * Here we may meet with 3 situations:
2016                 *
2017                 * 1. If we find an already existing record, and the length
2018                 *    is the same, cool, we just need to increase the r_refcount
2019                 *    and it is OK.
2020                 * 2. If we find a hole, just insert it with r_refcount = 1.
2021                 * 3. If we are in the middle of one extent record, split
2022                 *    it.
2023                 */
2024                if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2025                    set_len <= len) {
2026                        trace_ocfs2_increase_refcount_change(
2027                                (unsigned long long)cpos, set_len,
2028                                le32_to_cpu(rec.r_refcount));
2029                        ret = ocfs2_change_refcount_rec(handle, ci,
2030                                                        ref_leaf_bh, index,
2031                                                        merge, 1);
2032                        if (ret) {
2033                                mlog_errno(ret);
2034                                goto out;
2035                        }
2036                } else if (!rec.r_refcount) {
2037                        rec.r_refcount = cpu_to_le32(1);
2038
2039                        trace_ocfs2_increase_refcount_insert(
2040                             (unsigned long long)le64_to_cpu(rec.r_cpos),
2041                             set_len);
2042                        ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
2043                                                        ref_leaf_bh,
2044                                                        &rec, index,
2045                                                        merge, meta_ac);
2046                        if (ret) {
2047                                mlog_errno(ret);
2048                                goto out;
2049                        }
2050                } else  {
2051                        set_len = min((u64)(cpos + len),
2052                                      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
2053                        rec.r_cpos = cpu_to_le64(cpos);
2054                        rec.r_clusters = cpu_to_le32(set_len);
2055                        le32_add_cpu(&rec.r_refcount, 1);
2056
2057                        trace_ocfs2_increase_refcount_split(
2058                             (unsigned long long)le64_to_cpu(rec.r_cpos),
2059                             set_len, le32_to_cpu(rec.r_refcount));
2060                        ret = ocfs2_split_refcount_rec(handle, ci,
2061                                                       ref_root_bh, ref_leaf_bh,
2062                                                       &rec, index, merge,
2063                                                       meta_ac, dealloc);
2064                        if (ret) {
2065                                mlog_errno(ret);
2066                                goto out;
2067                        }
2068                }
2069
2070                cpos += set_len;
2071                len -= set_len;
2072                brelse(ref_leaf_bh);
2073                ref_leaf_bh = NULL;
2074        }
2075
2076out:
2077        brelse(ref_leaf_bh);
2078        return ret;
2079}
2080
2081static int ocfs2_remove_refcount_extent(handle_t *handle,
2082                                struct ocfs2_caching_info *ci,
2083                                struct buffer_head *ref_root_bh,
2084                                struct buffer_head *ref_leaf_bh,
2085                                struct ocfs2_alloc_context *meta_ac,
2086                                struct ocfs2_cached_dealloc_ctxt *dealloc)
2087{
2088        int ret;
2089        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2090        struct ocfs2_refcount_block *rb =
2091                        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2092        struct ocfs2_extent_tree et;
2093
2094        BUG_ON(rb->rf_records.rl_used);
2095
2096        trace_ocfs2_remove_refcount_extent(
2097                (unsigned long long)ocfs2_metadata_cache_owner(ci),
2098                (unsigned long long)ref_leaf_bh->b_blocknr,
2099                le32_to_cpu(rb->rf_cpos));
2100
2101        ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2102        ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2103                                  1, meta_ac, dealloc);
2104        if (ret) {
2105                mlog_errno(ret);
2106                goto out;
2107        }
2108
2109        ocfs2_remove_from_cache(ci, ref_leaf_bh);
2110
2111        /*
2112         * add the freed block to the dealloc so that it will be freed
2113         * when we run dealloc.
2114         */
2115        ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2116                                        le16_to_cpu(rb->rf_suballoc_slot),
2117                                        le64_to_cpu(rb->rf_suballoc_loc),
2118                                        le64_to_cpu(rb->rf_blkno),
2119                                        le16_to_cpu(rb->rf_suballoc_bit));
2120        if (ret) {
2121                mlog_errno(ret);
2122                goto out;
2123        }
2124
2125        ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2126                                      OCFS2_JOURNAL_ACCESS_WRITE);
2127        if (ret) {
2128                mlog_errno(ret);
2129                goto out;
2130        }
2131
2132        rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2133
2134        le32_add_cpu(&rb->rf_clusters, -1);
2135
2136        /*
2137         * check whether we need to restore the root refcount block if
2138         * there is no leaf extent block at atll.
2139         */
2140        if (!rb->rf_list.l_next_free_rec) {
2141                BUG_ON(rb->rf_clusters);
2142
2143                trace_ocfs2_restore_refcount_block(
2144                     (unsigned long long)ref_root_bh->b_blocknr);
2145
2146                rb->rf_flags = 0;
2147                rb->rf_parent = 0;
2148                rb->rf_cpos = 0;
2149                memset(&rb->rf_records, 0, sb->s_blocksize -
2150                       offsetof(struct ocfs2_refcount_block, rf_records));
2151                rb->rf_records.rl_count =
2152                                cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2153        }
2154
2155        ocfs2_journal_dirty(handle, ref_root_bh);
2156
2157out:
2158        return ret;
2159}
2160
2161int ocfs2_increase_refcount(handle_t *handle,
2162                            struct ocfs2_caching_info *ci,
2163                            struct buffer_head *ref_root_bh,
2164                            u64 cpos, u32 len,
2165                            struct ocfs2_alloc_context *meta_ac,
2166                            struct ocfs2_cached_dealloc_ctxt *dealloc)
2167{
2168        return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2169                                         cpos, len, 1,
2170                                         meta_ac, dealloc);
2171}
2172
2173static int ocfs2_decrease_refcount_rec(handle_t *handle,
2174                                struct ocfs2_caching_info *ci,
2175                                struct buffer_head *ref_root_bh,
2176                                struct buffer_head *ref_leaf_bh,
2177                                int index, u64 cpos, unsigned int len,
2178                                struct ocfs2_alloc_context *meta_ac,
2179                                struct ocfs2_cached_dealloc_ctxt *dealloc)
2180{
2181        int ret;
2182        struct ocfs2_refcount_block *rb =
2183                        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2184        struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2185
2186        BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2187        BUG_ON(cpos + len >
2188               le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2189
2190        trace_ocfs2_decrease_refcount_rec(
2191                (unsigned long long)ocfs2_metadata_cache_owner(ci),
2192                (unsigned long long)cpos, len);
2193
2194        if (cpos == le64_to_cpu(rec->r_cpos) &&
2195            len == le32_to_cpu(rec->r_clusters))
2196                ret = ocfs2_change_refcount_rec(handle, ci,
2197                                                ref_leaf_bh, index, 1, -1);
2198        else {
2199                struct ocfs2_refcount_rec split = *rec;
2200                split.r_cpos = cpu_to_le64(cpos);
2201                split.r_clusters = cpu_to_le32(len);
2202
2203                le32_add_cpu(&split.r_refcount, -1);
2204
2205                ret = ocfs2_split_refcount_rec(handle, ci,
2206                                               ref_root_bh, ref_leaf_bh,
2207                                               &split, index, 1,
2208                                               meta_ac, dealloc);
2209        }
2210
2211        if (ret) {
2212                mlog_errno(ret);
2213                goto out;
2214        }
2215
2216        /* Remove the leaf refcount block if it contains no refcount record. */
2217        if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2218                ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2219                                                   ref_leaf_bh, meta_ac,
2220                                                   dealloc);
2221                if (ret)
2222                        mlog_errno(ret);
2223        }
2224
2225out:
2226        return ret;
2227}
2228
2229static int __ocfs2_decrease_refcount(handle_t *handle,
2230                                     struct ocfs2_caching_info *ci,
2231                                     struct buffer_head *ref_root_bh,
2232                                     u64 cpos, u32 len,
2233                                     struct ocfs2_alloc_context *meta_ac,
2234                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2235                                     int delete)
2236{
2237        int ret = 0, index = 0;
2238        struct ocfs2_refcount_rec rec;
2239        unsigned int r_count = 0, r_len;
2240        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2241        struct buffer_head *ref_leaf_bh = NULL;
2242
2243        trace_ocfs2_decrease_refcount(
2244                (unsigned long long)ocfs2_metadata_cache_owner(ci),
2245                (unsigned long long)cpos, len, delete);
2246
2247        while (len) {
2248                ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2249                                             cpos, len, &rec, &index,
2250                                             &ref_leaf_bh);
2251                if (ret) {
2252                        mlog_errno(ret);
2253                        goto out;
2254                }
2255
2256                r_count = le32_to_cpu(rec.r_refcount);
2257                BUG_ON(r_count == 0);
2258                if (!delete)
2259                        BUG_ON(r_count > 1);
2260
2261                r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2262                              le32_to_cpu(rec.r_clusters)) - cpos;
2263
2264                ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2265                                                  ref_leaf_bh, index,
2266                                                  cpos, r_len,
2267                                                  meta_ac, dealloc);
2268                if (ret) {
2269                        mlog_errno(ret);
2270                        goto out;
2271                }
2272
2273                if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2274                        ret = ocfs2_cache_cluster_dealloc(dealloc,
2275                                          ocfs2_clusters_to_blocks(sb, cpos),
2276                                                          r_len);
2277                        if (ret) {
2278                                mlog_errno(ret);
2279                                goto out;
2280                        }
2281                }
2282
2283                cpos += r_len;
2284                len -= r_len;
2285                brelse(ref_leaf_bh);
2286                ref_leaf_bh = NULL;
2287        }
2288
2289out:
2290        brelse(ref_leaf_bh);
2291        return ret;
2292}
2293
2294/* Caller must hold refcount tree lock. */
2295int ocfs2_decrease_refcount(struct inode *inode,
2296                            handle_t *handle, u32 cpos, u32 len,
2297                            struct ocfs2_alloc_context *meta_ac,
2298                            struct ocfs2_cached_dealloc_ctxt *dealloc,
2299                            int delete)
2300{
2301        int ret;
2302        u64 ref_blkno;
2303        struct ocfs2_inode_info *oi = OCFS2_I(inode);
2304        struct buffer_head *ref_root_bh = NULL;
2305        struct ocfs2_refcount_tree *tree;
2306
2307        BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2308
2309        ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2310        if (ret) {
2311                mlog_errno(ret);
2312                goto out;
2313        }
2314
2315        ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2316        if (ret) {
2317                mlog_errno(ret);
2318                goto out;
2319        }
2320
2321        ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2322                                        &ref_root_bh);
2323        if (ret) {
2324                mlog_errno(ret);
2325                goto out;
2326        }
2327
2328        ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2329                                        cpos, len, meta_ac, dealloc, delete);
2330        if (ret)
2331                mlog_errno(ret);
2332out:
2333        brelse(ref_root_bh);
2334        return ret;
2335}
2336
2337/*
2338 * Mark the already-existing extent at cpos as refcounted for len clusters.
2339 * This adds the refcount extent flag.
2340 *
2341 * If the existing extent is larger than the request, initiate a
2342 * split. An attempt will be made at merging with adjacent extents.
2343 *
2344 * The caller is responsible for passing down meta_ac if we'll need it.
2345 */
2346static int ocfs2_mark_extent_refcounted(struct inode *inode,
2347                                struct ocfs2_extent_tree *et,
2348                                handle_t *handle, u32 cpos,
2349                                u32 len, u32 phys,
2350                                struct ocfs2_alloc_context *meta_ac,
2351                                struct ocfs2_cached_dealloc_ctxt *dealloc)
2352{
2353        int ret;
2354
2355        trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
2356                                           cpos, len, phys);
2357
2358        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2359                ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2360                            "tree, but the feature bit is not set in the "
2361                            "super block.", inode->i_ino);
2362                ret = -EROFS;
2363                goto out;
2364        }
2365
2366        ret = ocfs2_change_extent_flag(handle, et, cpos,
2367                                       len, phys, meta_ac, dealloc,
2368                                       OCFS2_EXT_REFCOUNTED, 0);
2369        if (ret)
2370                mlog_errno(ret);
2371
2372out:
2373        return ret;
2374}
2375
2376/*
2377 * Given some contiguous physical clusters, calculate what we need
2378 * for modifying their refcount.
2379 */
2380static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2381                                            struct ocfs2_caching_info *ci,
2382                                            struct buffer_head *ref_root_bh,
2383                                            u64 start_cpos,
2384                                            u32 clusters,
2385                                            int *meta_add,
2386                                            int *credits)
2387{
2388        int ret = 0, index, ref_blocks = 0, recs_add = 0;
2389        u64 cpos = start_cpos;
2390        struct ocfs2_refcount_block *rb;
2391        struct ocfs2_refcount_rec rec;
2392        struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2393        u32 len;
2394
2395        while (clusters) {
2396                ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2397                                             cpos, clusters, &rec,
2398                                             &index, &ref_leaf_bh);
2399                if (ret) {
2400                        mlog_errno(ret);
2401                        goto out;
2402                }
2403
2404                if (ref_leaf_bh != prev_bh) {
2405                        /*
2406                         * Now we encounter a new leaf block, so calculate
2407                         * whether we need to extend the old leaf.
2408                         */
2409                        if (prev_bh) {
2410                                rb = (struct ocfs2_refcount_block *)
2411                                                        prev_bh->b_data;
2412
2413                                if (le16_to_cpu(rb->rf_records.rl_used) +
2414                                    recs_add >
2415                                    le16_to_cpu(rb->rf_records.rl_count))
2416                                        ref_blocks++;
2417                        }
2418
2419                        recs_add = 0;
2420                        *credits += 1;
2421                        brelse(prev_bh);
2422                        prev_bh = ref_leaf_bh;
2423                        get_bh(prev_bh);
2424                }
2425
2426                rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2427
2428                trace_ocfs2_calc_refcount_meta_credits_iterate(
2429                                recs_add, (unsigned long long)cpos, clusters,
2430                                (unsigned long long)le64_to_cpu(rec.r_cpos),
2431                                le32_to_cpu(rec.r_clusters),
2432                                le32_to_cpu(rec.r_refcount), index);
2433
2434                len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2435                          le32_to_cpu(rec.r_clusters)) - cpos;
2436                /*
2437                 * We record all the records which will be inserted to the
2438                 * same refcount block, so that we can tell exactly whether
2439                 * we need a new refcount block or not.
2440                 *
2441                 * If we will insert a new one, this is easy and only happens
2442                 * during adding refcounted flag to the extent, so we don't
2443                 * have a chance of spliting. We just need one record.
2444                 *
2445                 * If the refcount rec already exists, that would be a little
2446                 * complicated. we may have to:
2447                 * 1) split at the beginning if the start pos isn't aligned.
2448                 *    we need 1 more record in this case.
2449                 * 2) split int the end if the end pos isn't aligned.
2450                 *    we need 1 more record in this case.
2451                 * 3) split in the middle because of file system fragmentation.
2452                 *    we need 2 more records in this case(we can't detect this
2453                 *    beforehand, so always think of the worst case).
2454                 */
2455                if (rec.r_refcount) {
2456                        recs_add += 2;
2457                        /* Check whether we need a split at the beginning. */
2458                        if (cpos == start_cpos &&
2459                            cpos != le64_to_cpu(rec.r_cpos))
2460                                recs_add++;
2461
2462                        /* Check whether we need a split in the end. */
2463                        if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2464                            le32_to_cpu(rec.r_clusters))
2465                                recs_add++;
2466                } else
2467                        recs_add++;
2468
2469                brelse(ref_leaf_bh);
2470                ref_leaf_bh = NULL;
2471                clusters -= len;
2472                cpos += len;
2473        }
2474
2475        if (prev_bh) {
2476                rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2477
2478                if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
2479                    le16_to_cpu(rb->rf_records.rl_count))
2480                        ref_blocks++;
2481
2482                *credits += 1;
2483        }
2484
2485        if (!ref_blocks)
2486                goto out;
2487
2488        *meta_add += ref_blocks;
2489        *credits += ref_blocks;
2490
2491        /*
2492         * So we may need ref_blocks to insert into the tree.
2493         * That also means we need to change the b-tree and add that number
2494         * of records since we never merge them.
2495         * We need one more block for expansion since the new created leaf
2496         * block is also full and needs split.
2497         */
2498        rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2499        if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2500                struct ocfs2_extent_tree et;
2501
2502                ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2503                *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2504                *credits += ocfs2_calc_extend_credits(sb,
2505                                                      et.et_root_el,
2506                                                      ref_blocks);
2507        } else {
2508                *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2509                *meta_add += 1;
2510        }
2511
2512out:
2513
2514        trace_ocfs2_calc_refcount_meta_credits(
2515                (unsigned long long)start_cpos, clusters,
2516                *meta_add, *credits);
2517        brelse(ref_leaf_bh);
2518        brelse(prev_bh);
2519        return ret;
2520}
2521
2522/*
2523 * For refcount tree, we will decrease some contiguous clusters
2524 * refcount count, so just go through it to see how many blocks
2525 * we gonna touch and whether we need to create new blocks.
2526 *
2527 * Normally the refcount blocks store these refcount should be
2528 * contiguous also, so that we can get the number easily.
2529 * We will at most add split 2 refcount records and 2 more
2530 * refcount blocks, so just check it in a rough way.
2531 *
2532 * Caller must hold refcount tree lock.
2533 */
2534int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2535                                          u64 refcount_loc,
2536                                          u64 phys_blkno,
2537                                          u32 clusters,
2538                                          int *credits,
2539                                          int *ref_blocks)
2540{
2541        int ret;
2542        struct ocfs2_inode_info *oi = OCFS2_I(inode);
2543        struct buffer_head *ref_root_bh = NULL;
2544        struct ocfs2_refcount_tree *tree;
2545        u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2546
2547        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2548                ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2549                            "tree, but the feature bit is not set in the "
2550                            "super block.", inode->i_ino);
2551                ret = -EROFS;
2552                goto out;
2553        }
2554
2555        BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2556
2557        ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2558                                      refcount_loc, &tree);
2559        if (ret) {
2560                mlog_errno(ret);
2561                goto out;
2562        }
2563
2564        ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2565                                        &ref_root_bh);
2566        if (ret) {
2567                mlog_errno(ret);
2568                goto out;
2569        }
2570
2571        ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2572                                               &tree->rf_ci,
2573                                               ref_root_bh,
2574                                               start_cpos, clusters,
2575                                               ref_blocks, credits);
2576        if (ret) {
2577                mlog_errno(ret);
2578                goto out;
2579        }
2580
2581        trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
2582
2583out:
2584        brelse(ref_root_bh);
2585        return ret;
2586}
2587
/* Byte size (1MB) that CoW tries to keep contiguous. */
#define MAX_CONTIG_BYTES	1048576

/*
 * MAX_CONTIG_BYTES converted to a cluster count for this super block
 * by ocfs2_clusters_for_bytes().
 */
static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
{
	unsigned int nr_clusters;

	nr_clusters = ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);

	return nr_clusters;
}
2594
/*
 * Bit mask that clears the low bits below ocfs2_cow_contig_clusters().
 * NOTE(review): this is only a proper alignment mask if that count is a
 * power of two - confirm.
 */
static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
{
	unsigned int contig = ocfs2_cow_contig_clusters(sb);

	return ~(contig - 1);
}
2599
/*
 * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
 * find an offset (start + (n * contig_clusters)) that is closest to cpos
 * while still being less than or equal to it.
 *
 * The goal is to break the extent at a multiple of contig_clusters.
 * It is a bug to call this with start > cpos.
 */
static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
						 unsigned int start,
						 unsigned int cpos)
{
	unsigned int delta;

	BUG_ON(start > cpos);

	/* Distance into the extent, with the low bits masked off. */
	delta = cpos - start;
	delta &= ocfs2_cow_contig_mask(sb);

	return start + delta;
}
2615
/*
 * Given a cluster count of len, pad it out so that it is a multiple
 * of contig_clusters.  If the padding wraps around unsigned int,
 * UINT_MAX is returned instead.
 */
static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
						  unsigned int len)
{
	unsigned int contig = ocfs2_cow_contig_clusters(sb);
	unsigned int mask = ocfs2_cow_contig_mask(sb);
	unsigned int rounded = (len + (contig - 1)) & mask;

	/* A result smaller than the input means the addition wrapped. */
	return rounded < len ? UINT_MAX : rounded;
}
2633
2634/*
2635 * Calculate out the start and number of virtual clusters we need to to CoW.
2636 *
2637 * cpos is vitual start cluster position we want to do CoW in a
2638 * file and write_len is the cluster length.
2639 * max_cpos is the place where we want to stop CoW intentionally.
2640 *
2641 * Normal we will start CoW from the beginning of extent record cotaining cpos.
2642 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
2643 * get good I/O from the resulting extent tree.
2644 */
2645static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2646                                           struct ocfs2_extent_list *el,
2647                                           u32 cpos,
2648                                           u32 write_len,
2649                                           u32 max_cpos,
2650                                           u32 *cow_start,
2651                                           u32 *cow_len)
2652{
2653        int ret = 0;
2654        int tree_height = le16_to_cpu(el->l_tree_depth), i;
2655        struct buffer_head *eb_bh = NULL;
2656        struct ocfs2_extent_block *eb = NULL;
2657        struct ocfs2_extent_rec *rec;
2658        unsigned int want_clusters, rec_end = 0;
2659        int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2660        int leaf_clusters;
2661
2662        BUG_ON(cpos + write_len > max_cpos);
2663
2664        if (tree_height > 0) {
2665                ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2666                if (ret) {
2667                        mlog_errno(ret);
2668                        goto out;
2669                }
2670
2671                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2672                el = &eb->h_list;
2673
2674                if (el->l_tree_depth) {
2675                        ocfs2_error(inode->i_sb,
2676                                    "Inode %lu has non zero tree depth in "
2677                                    "leaf block %llu\n", inode->i_ino,
2678                                    (unsigned long long)eb_bh->b_blocknr);
2679                        ret = -EROFS;
2680                        goto out;
2681                }
2682        }
2683
2684        *cow_len = 0;
2685        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2686                rec = &el->l_recs[i];
2687
2688                if (ocfs2_is_empty_extent(rec)) {
2689                        mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2690                                        "index %d\n", inode->i_ino, i);
2691                        continue;
2692                }
2693
2694                if (le32_to_cpu(rec->e_cpos) +
2695                    le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2696                        continue;
2697
2698                if (*cow_len == 0) {
2699                        /*
2700                         * We should find a refcounted record in the
2701                         * first pass.
2702                         */
2703                        BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2704                        *cow_start = le32_to_cpu(rec->e_cpos);
2705                }
2706
2707                /*
2708                 * If we encounter a hole, a non-refcounted record or
2709                 * pass the max_cpos, stop the search.
2710                 */
2711                if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2712                    (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2713                    (max_cpos <= le32_to_cpu(rec->e_cpos)))
2714                        break;
2715
2716                leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2717                rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2718                if (rec_end > max_cpos) {
2719                        rec_end = max_cpos;
2720                        leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2721                }
2722
2723                /*
2724                 * How many clusters do we actually need from
2725                 * this extent?  First we see how many we actually
2726                 * need to complete the write.  If that's smaller
2727                 * than contig_clusters, we try for contig_clusters.
2728                 */
2729                if (!*cow_len)
2730                        want_clusters = write_len;
2731                else
2732                        want_clusters = (cpos + write_len) -
2733                                (*cow_start + *cow_len);
2734                if (want_clusters < contig_clusters)
2735                        want_clusters = contig_clusters;
2736
2737                /*
2738                 * If the write does not cover the whole extent, we
2739                 * need to calculate how we're going to split the extent.
2740                 * We try to do it on contig_clusters boundaries.
2741                 *
2742                 * Any extent smaller than contig_clusters will be
2743                 * CoWed in its entirety.
2744                 */
2745                if (leaf_clusters <= contig_clusters)
2746                        *cow_len += leaf_clusters;
2747                else if (*cow_len || (*cow_start == cpos)) {
2748                        /*
2749                         * This extent needs to be CoW'd from its
2750                         * beginning, so all we have to do is compute
2751                         * how many clusters to grab.  We align
2752                         * want_clusters to the edge of contig_clusters
2753                         * to get better I/O.
2754                         */
2755                        want_clusters = ocfs2_cow_align_length(inode->i_sb,
2756                                                               want_clusters);
2757
2758                        if (leaf_clusters < want_clusters)
2759                                *cow_len += leaf_clusters;
2760                        else
2761                                *cow_len += want_clusters;
2762                } else if ((*cow_start + contig_clusters) >=
2763                           (cpos + write_len)) {
2764                        /*
2765                         * Breaking off contig_clusters at the front
2766                         * of the extent will cover our write.  That's
2767                         * easy.
2768                         */
2769                        *cow_len = contig_clusters;
2770                } else if ((rec_end - cpos) <= contig_clusters) {
2771                        /*
2772                         * Breaking off contig_clusters at the tail of
2773                         * this extent will cover cpos.
2774                         */
2775                        *cow_start = rec_end - contig_clusters;
2776                        *cow_len = contig_clusters;
2777                } else if ((rec_end - cpos) <= want_clusters) {
2778                        /*
2779                         * While we can't fit the entire write in this
2780                         * extent, we know that the write goes from cpos
2781                         * to the end of the extent.  Break that off.
2782                         * We try to break it at some multiple of
2783                         * contig_clusters from the front of the extent.
2784                         * Failing that (ie, cpos is within
2785                         * contig_clusters of the front), we'll CoW the
2786                         * entire extent.
2787                         */
2788                        *cow_start = ocfs2_cow_align_start(inode->i_sb,
2789                                                           *cow_start, cpos);
2790                        *cow_len = rec_end - *cow_start;
2791                } else {
2792                        /*
2793                         * Ok, the entire write lives in the middle of
2794                         * this extent.  Let's try to slice the extent up
2795                         * nicely.  Optimally, our CoW region starts at
2796                         * m*contig_clusters from the beginning of the
2797                         * extent and goes for n*contig_clusters,
2798                         * covering the entire write.
2799                         */
2800                        *cow_start = ocfs2_cow_align_start(inode->i_sb,
2801                                                           *cow_start, cpos);
2802
2803                        want_clusters = (cpos + write_len) - *cow_start;
2804                        want_clusters = ocfs2_cow_align_length(inode->i_sb,
2805                                                               want_clusters);
2806                        if (*cow_start + want_clusters <= rec_end)
2807                                *cow_len = want_clusters;
2808                        else
2809                                *cow_len = rec_end - *cow_start;
2810                }
2811
2812                /* Have we covered our entire write yet? */
2813                if ((*cow_start + *cow_len) >= (cpos + write_len))
2814                        break;
2815
2816                /*
2817                 * If we reach the end of the extent block and don't get enough
2818                 * clusters, continue with the next extent block if possible.
2819                 */
2820                if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2821                    eb && eb->h_next_leaf_blk) {
2822                        brelse(eb_bh);
2823                        eb_bh = NULL;
2824
2825                        ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2826                                               le64_to_cpu(eb->h_next_leaf_blk),
2827                                               &eb_bh);
2828                        if (ret) {
2829                                mlog_errno(ret);
2830                                goto out;
2831                        }
2832
2833                        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2834                        el = &eb->h_list;
2835                        i = -1;
2836                }
2837        }
2838
2839out:
2840        brelse(eb_bh);
2841        return ret;
2842}
2843
2844/*
2845 * Prepare meta_ac, data_ac and calculate credits when we want to add some
2846 * num_clusters in data_tree "et" and change the refcount for the old
2847 * clusters(starting form p_cluster) in the refcount tree.
2848 *
2849 * Note:
2850 * 1. since we may split the old tree, so we at most will need num_clusters + 2
2851 *    more new leaf records.
2852 * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
2853 *    just give data_ac = NULL.
2854 */
2855static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2856                                        u32 p_cluster, u32 num_clusters,
2857                                        struct ocfs2_extent_tree *et,
2858                                        struct ocfs2_caching_info *ref_ci,
2859                                        struct buffer_head *ref_root_bh,
2860                                        struct ocfs2_alloc_context **meta_ac,
2861                                        struct ocfs2_alloc_context **data_ac,
2862                                        int *credits)
2863{
2864        int ret = 0, meta_add = 0;
2865        int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2866
2867        if (num_free_extents < 0) {
2868                ret = num_free_extents;
2869                mlog_errno(ret);
2870                goto out;
2871        }
2872
2873        if (num_free_extents < num_clusters + 2)
2874                meta_add =
2875                        ocfs2_extend_meta_needed(et->et_root_el);
2876
2877        *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2878                                              num_clusters + 2);
2879
2880        ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2881                                               p_cluster, num_clusters,
2882                                               &meta_add, credits);
2883        if (ret) {
2884                mlog_errno(ret);
2885                goto out;
2886        }
2887
2888        trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
2889        ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2890                                                meta_ac);
2891        if (ret) {
2892                mlog_errno(ret);
2893                goto out;
2894        }
2895
2896        if (data_ac) {
2897                ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2898                                             data_ac);
2899                if (ret)
2900                        mlog_errno(ret);
2901        }
2902
2903out:
2904        if (ret) {
2905                if (*meta_ac) {
2906                        ocfs2_free_alloc_context(*meta_ac);
2907                        *meta_ac = NULL;
2908                }
2909        }
2910
2911        return ret;
2912}
2913
2914static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2915{
2916        BUG_ON(buffer_dirty(bh));
2917
2918        clear_buffer_mapped(bh);
2919
2920        return 0;
2921}
2922
2923int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2924                                     struct inode *inode,
2925                                     u32 cpos, u32 old_cluster,
2926                                     u32 new_cluster, u32 new_len)
2927{
2928        int ret = 0, partial;
2929        struct super_block *sb = inode->i_sb;
2930        u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2931        struct page *page;
2932        pgoff_t page_index;
2933        unsigned int from, to, readahead_pages;
2934        loff_t offset, end, map_end;
2935        struct address_space *mapping = inode->i_mapping;
2936
2937        trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2938                                               new_cluster, new_len);
2939
2940        readahead_pages =
2941                (ocfs2_cow_contig_clusters(sb) <<
2942                 OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2943        offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2944        end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2945        /*
2946         * We only duplicate pages until we reach the page contains i_size - 1.
2947         * So trim 'end' to i_size.
2948         */
2949        if (end > i_size_read(inode))
2950                end = i_size_read(inode);
2951
2952        while (offset < end) {
2953                page_index = offset >> PAGE_CACHE_SHIFT;
2954                map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2955                if (map_end > end)
2956                        map_end = end;
2957
2958                /* from, to is the offset within the page. */
2959                from = offset & (PAGE_CACHE_SIZE - 1);
2960                to = PAGE_CACHE_SIZE;
2961                if (map_end & (PAGE_CACHE_SIZE - 1))
2962                        to = map_end & (PAGE_CACHE_SIZE - 1);
2963
2964                page = find_or_create_page(mapping, page_index, GFP_NOFS);
2965                if (!page) {
2966                        ret = -ENOMEM;
2967                        mlog_errno(ret);
2968                        break;
2969                }
2970
2971                /*
2972                 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
2973                 * can't be dirtied before we CoW it out.
2974                 */
2975                if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2976                        BUG_ON(PageDirty(page));
2977
2978                if (!PageUptodate(page)) {
2979                        ret = block_read_full_page(page, ocfs2_get_block);
2980                        if (ret) {
2981                                mlog_errno(ret);
2982                                goto unlock;
2983                        }
2984                        lock_page(page);
2985                }
2986
2987                if (page_has_buffers(page)) {
2988                        ret = walk_page_buffers(handle, page_buffers(page),
2989                                                from, to, &partial,
2990                                                ocfs2_clear_cow_buffer);
2991                        if (ret) {
2992                                mlog_errno(ret);
2993                                goto unlock;
2994                        }
2995                }
2996
2997                ocfs2_map_and_dirty_page(inode,
2998                                         handle, from, to,
2999                                         page, 0, &new_block);
3000                mark_page_accessed(page);
3001unlock:
3002                unlock_page(page);
3003                page_cache_release(page);
3004                page = NULL;
3005                offset = map_end;
3006                if (ret)
3007                        break;
3008        }
3009
3010        return ret;
3011}
3012
3013int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3014                                    struct inode *inode,
3015                                    u32 cpos, u32 old_cluster,
3016                                    u32 new_cluster, u32 new_len)
3017{
3018        int ret = 0;
3019        struct super_block *sb = inode->i_sb;
3020        struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3021        int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3022        u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3023        u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
3024        struct ocfs2_super *osb = OCFS2_SB(sb);
3025        struct buffer_head *old_bh = NULL;
3026        struct buffer_head *new_bh = NULL;
3027
3028        trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
3029                                               new_cluster, new_len);
3030
3031        for (i = 0; i < blocks; i++, old_block++, new_block++) {
3032                new_bh = sb_getblk(osb->sb, new_block);
3033                if (new_bh == NULL) {
3034                        ret = -EIO;
3035                        mlog_errno(ret);
3036                        break;
3037                }
3038
3039                ocfs2_set_new_buffer_uptodate(ci, new_bh);
3040
3041                ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
3042                if (ret) {
3043                        mlog_errno(ret);
3044                        break;
3045                }
3046
3047                ret = ocfs2_journal_access(handle, ci, new_bh,
3048                                           OCFS2_JOURNAL_ACCESS_CREATE);
3049                if (ret) {
3050                        mlog_errno(ret);
3051                        break;
3052                }
3053
3054                memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3055                ocfs2_journal_dirty(handle, new_bh);
3056
3057                brelse(new_bh);
3058                brelse(old_bh);
3059                new_bh = NULL;
3060                old_bh = NULL;
3061        }
3062
3063        brelse(new_bh);
3064        brelse(old_bh);
3065        return ret;
3066}
3067
3068static int ocfs2_clear_ext_refcount(handle_t *handle,
3069                                    struct ocfs2_extent_tree *et,
3070                                    u32 cpos, u32 p_cluster, u32 len,
3071                                    unsigned int ext_flags,
3072                                    struct ocfs2_alloc_context *meta_ac,
3073                                    struct ocfs2_cached_dealloc_ctxt *dealloc)
3074{
3075        int ret, index;
3076        struct ocfs2_extent_rec replace_rec;
3077        struct ocfs2_path *path = NULL;
3078        struct ocfs2_extent_list *el;
3079        struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3080        u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3081
3082        trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
3083                                       cpos, len, p_cluster, ext_flags);
3084
3085        memset(&replace_rec, 0, sizeof(replace_rec));
3086        replace_rec.e_cpos = cpu_to_le32(cpos);
3087        replace_rec.e_leaf_clusters = cpu_to_le16(len);
3088        replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
3089                                                                   p_cluster));
3090        replace_rec.e_flags = ext_flags;
3091        replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
3092
3093        path = ocfs2_new_path_from_et(et);
3094        if (!path) {
3095                ret = -ENOMEM;
3096                mlog_errno(ret);
3097                goto out;
3098        }
3099
3100        ret = ocfs2_find_path(et->et_ci, path, cpos);
3101        if (ret) {
3102                mlog_errno(ret);
3103                goto out;
3104        }
3105
3106        el = path_leaf_el(path);
3107
3108        index = ocfs2_search_extent_list(el, cpos);
3109        if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3110                ocfs2_error(sb,
3111                            "Inode %llu has an extent at cpos %u which can no "
3112                            "longer be found.\n",
3113                            (unsigned long long)ino, cpos);
3114                ret = -EROFS;
3115                goto out;
3116        }
3117
3118        ret = ocfs2_split_extent(handle, et, path, index,
3119                                 &replace_rec, meta_ac, dealloc);
3120        if (ret)
3121                mlog_errno(ret);
3122
3123out:
3124        ocfs2_free_path(path);
3125        return ret;
3126}
3127
3128static int ocfs2_replace_clusters(handle_t *handle,
3129                                  struct ocfs2_cow_context *context,
3130                                  u32 cpos, u32 old,
3131                                  u32 new, u32 len,
3132                                  unsigned int ext_flags)
3133{
3134        int ret;
3135        struct ocfs2_caching_info *ci = context->data_et.et_ci;
3136        u64 ino = ocfs2_metadata_cache_owner(ci);
3137
3138        trace_ocfs2_replace_clusters((unsigned long long)ino,
3139                                     cpos, old, new, len, ext_flags);
3140
3141        /*If the old clusters is unwritten, no need to duplicate. */
3142        if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3143                ret = context->cow_duplicate_clusters(handle, context->inode,
3144                                                      cpos, old, new, len);
3145                if (ret) {
3146                        mlog_errno(ret);
3147                        goto out;
3148                }
3149        }
3150
3151        ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3152                                       cpos, new, len, ext_flags,
3153                                       context->meta_ac, &context->dealloc);
3154        if (ret)
3155                mlog_errno(ret);
3156out:
3157        return ret;
3158}
3159
3160int ocfs2_cow_sync_writeback(struct super_block *sb,
3161                             struct inode *inode,
3162                             u32 cpos, u32 num_clusters)
3163{
3164        int ret = 0;
3165        loff_t offset, end, map_end;
3166        pgoff_t page_index;
3167        struct page *page;
3168
3169        if (ocfs2_should_order_data(inode))
3170                return 0;
3171
3172        offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3173        end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3174
3175        ret = filemap_fdatawrite_range(inode->i_mapping,
3176                                       offset, end - 1);
3177        if (ret < 0) {
3178                mlog_errno(ret);
3179                return ret;
3180        }
3181
3182        while (offset < end) {
3183                page_index = offset >> PAGE_CACHE_SHIFT;
3184                map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3185                if (map_end > end)
3186                        map_end = end;
3187
3188                page = find_or_create_page(inode->i_mapping,
3189                                           page_index, GFP_NOFS);
3190                BUG_ON(!page);
3191
3192                wait_on_page_writeback(page);
3193                if (PageError(page)) {
3194                        ret = -EIO;
3195                        mlog_errno(ret);
3196                } else
3197                        mark_page_accessed(page);
3198
3199                unlock_page(page);
3200                page_cache_release(page);
3201                page = NULL;
3202                offset = map_end;
3203                if (ret)
3204                        break;
3205        }
3206
3207        return ret;
3208}
3209
3210static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3211                                 u32 v_cluster, u32 *p_cluster,
3212                                 u32 *num_clusters,
3213                                 unsigned int *extent_flags)
3214{
3215        return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3216                                  num_clusters, extent_flags);
3217}
3218
3219static int ocfs2_make_clusters_writable(struct super_block *sb,
3220                                        struct ocfs2_cow_context *context,
3221                                        u32 cpos, u32 p_cluster,
3222                                        u32 num_clusters, unsigned int e_flags)
3223{
3224        int ret, delete, index, credits =  0;
3225        u32 new_bit, new_len, orig_num_clusters;
3226        unsigned int set_len;
3227        struct ocfs2_super *osb = OCFS2_SB(sb);
3228        handle_t *handle;
3229        struct buffer_head *ref_leaf_bh = NULL;
3230        struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3231        struct ocfs2_refcount_rec rec;
3232
3233        trace_ocfs2_make_clusters_writable(cpos, p_cluster,
3234                                           num_clusters, e_flags);
3235
3236        ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3237                                             &context->data_et,
3238                                             ref_ci,
3239                                             context->ref_root_bh,
3240                                             &context->meta_ac,
3241                                             &context->data_ac, &credits);
3242        if (ret) {
3243                mlog_errno(ret);
3244                return ret;
3245        }
3246
3247        if (context->post_refcount)
3248                credits += context->post_refcount->credits;
3249
3250        credits += context->extra_credits;
3251        handle = ocfs2_start_trans(osb, credits);
3252        if (IS_ERR(handle)) {
3253                ret = PTR_ERR(handle);
3254                mlog_errno(ret);
3255                goto out;
3256        }
3257
3258        orig_num_clusters = num_clusters;
3259
3260        while (num_clusters) {
3261                ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3262                                             p_cluster, num_clusters,
3263                                             &rec, &index, &ref_leaf_bh);
3264                if (ret) {
3265                        mlog_errno(ret);
3266                        goto out_commit;
3267                }
3268
3269                BUG_ON(!rec.r_refcount);
3270                set_len = min((u64)p_cluster + num_clusters,
3271                              le64_to_cpu(rec.r_cpos) +
3272                              le32_to_cpu(rec.r_clusters)) - p_cluster;
3273
3274                /*
3275                 * There are many different situation here.
3276                 * 1. If refcount == 1, remove the flag and don't COW.
3277                 * 2. If refcount > 1, allocate clusters.
3278                 *    Here we may not allocate r_len once at a time, so continue
3279                 *    until we reach num_clusters.
3280                 */
3281                if (le32_to_cpu(rec.r_refcount) == 1) {
3282                        delete = 0;
3283                        ret = ocfs2_clear_ext_refcount(handle,
3284                                                       &context->data_et,
3285                                                       cpos, p_cluster,
3286                                                       set_len, e_flags,
3287                                                       context->meta_ac,
3288                                                       &context->dealloc);
3289                        if (ret) {
3290                                mlog_errno(ret);
3291                                goto out_commit;
3292                        }
3293                } else {
3294                        delete = 1;
3295
3296                        ret = __ocfs2_claim_clusters(handle,
3297                                                     context->data_ac,
3298                                                     1, set_len,
3299                                                     &new_bit, &new_len);
3300                        if (ret) {
3301                                mlog_errno(ret);
3302                                goto out_commit;
3303                        }
3304
3305                        ret = ocfs2_replace_clusters(handle, context,
3306                                                     cpos, p_cluster, new_bit,
3307                                                     new_len, e_flags);
3308                        if (ret) {
3309                                mlog_errno(ret);
3310                                goto out_commit;
3311                        }
3312                        set_len = new_len;
3313                }
3314
3315                ret = __ocfs2_decrease_refcount(handle, ref_ci,
3316                                                context->ref_root_bh,
3317                                                p_cluster, set_len,
3318                                                context->meta_ac,
3319                                                &context->dealloc, delete);
3320                if (ret) {
3321                        mlog_errno(ret);
3322                        goto out_commit;
3323                }
3324
3325                cpos += set_len;
3326                p_cluster += set_len;
3327                num_clusters -= set_len;
3328                brelse(ref_leaf_bh);
3329                ref_leaf_bh = NULL;
3330        }
3331
3332        /* handle any post_cow action. */
3333        if (context->post_refcount && context->post_refcount->func) {
3334                ret = context->post_refcount->func(context->inode, handle,
3335                                                context->post_refcount->para);
3336                if (ret) {
3337                        mlog_errno(ret);
3338                        goto out_commit;
3339                }
3340        }
3341
3342        /*
3343         * Here we should write the new page out first if we are
3344         * in write-back mode.
3345         */
3346        if (context->get_clusters == ocfs2_di_get_clusters) {
3347                ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3348                                               orig_num_clusters);
3349                if (ret)
3350                        mlog_errno(ret);
3351        }
3352
3353out_commit:
3354        ocfs2_commit_trans(osb, handle);
3355
3356out:
3357        if (context->data_ac) {
3358                ocfs2_free_alloc_context(context->data_ac);
3359                context->data_ac = NULL;
3360        }
3361        if (context->meta_ac) {
3362                ocfs2_free_alloc_context(context->meta_ac);
3363                context->meta_ac = NULL;
3364        }
3365        brelse(ref_leaf_bh);
3366
3367        return ret;
3368}
3369
3370static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3371{
3372        int ret = 0;
3373        struct inode *inode = context->inode;
3374        u32 cow_start = context->cow_start, cow_len = context->cow_len;
3375        u32 p_cluster, num_clusters;
3376        unsigned int ext_flags;
3377        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3378
3379        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3380                ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
3381                            "tree, but the feature bit is not set in the "
3382                            "super block.", inode->i_ino);
3383                return -EROFS;
3384        }
3385
3386        ocfs2_init_dealloc_ctxt(&context->dealloc);
3387
3388        while (cow_len) {
3389                ret = context->get_clusters(context, cow_start, &p_cluster,
3390                                            &num_clusters, &ext_flags);
3391                if (ret) {
3392                        mlog_errno(ret);
3393                        break;
3394                }
3395
3396                BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3397
3398                if (cow_len < num_clusters)
3399                        num_clusters = cow_len;
3400
3401                ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3402                                                   cow_start, p_cluster,
3403                                                   num_clusters, ext_flags);
3404                if (ret) {
3405                        mlog_errno(ret);
3406                        break;
3407                }
3408
3409                cow_len -= num_clusters;
3410                cow_start += num_clusters;
3411        }
3412
3413        if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3414                ocfs2_schedule_truncate_log_flush(osb, 1);
3415                ocfs2_run_deallocs(osb, &context->dealloc);
3416        }
3417
3418        return ret;
3419}
3420
3421/*
3422 * Starting at cpos, try to CoW write_len clusters.  Don't CoW
3423 * past max_cpos.  This will stop when it runs into a hole or an
3424 * unrefcounted extent.
3425 */
3426static int ocfs2_refcount_cow_hunk(struct inode *inode,
3427                                   struct buffer_head *di_bh,
3428                                   u32 cpos, u32 write_len, u32 max_cpos)
3429{
3430        int ret;
3431        u32 cow_start = 0, cow_len = 0;
3432        struct ocfs2_inode_info *oi = OCFS2_I(inode);
3433        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3434        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3435        struct buffer_head *ref_root_bh = NULL;
3436        struct ocfs2_refcount_tree *ref_tree;
3437        struct ocfs2_cow_context *context = NULL;
3438
3439        BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3440
3441        ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3442                                              cpos, write_len, max_cpos,
3443                                              &cow_start, &cow_len);
3444        if (ret) {
3445                mlog_errno(ret);
3446                goto out;
3447        }
3448
3449        trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
3450                                      cpos, write_len, max_cpos,
3451                                      cow_start, cow_len);
3452
3453        BUG_ON(cow_len == 0);
3454
3455        context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3456        if (!context) {
3457                ret = -ENOMEM;
3458                mlog_errno(ret);
3459                goto out;
3460        }
3461
3462        ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3463                                       1, &ref_tree, &ref_root_bh);
3464        if (ret) {
3465                mlog_errno(ret);
3466                goto out;
3467        }
3468
3469        context->inode = inode;
3470        context->cow_start = cow_start;
3471        context->cow_len = cow_len;
3472        context->ref_tree = ref_tree;
3473        context->ref_root_bh = ref_root_bh;
3474        context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3475        context->get_clusters = ocfs2_di_get_clusters;
3476
3477        ocfs2_init_dinode_extent_tree(&context->data_et,
3478                                      INODE_CACHE(inode), di_bh);
3479
3480        ret = ocfs2_replace_cow(context);
3481        if (ret)
3482                mlog_errno(ret);
3483
3484        /*
3485         * truncate the extent map here since no matter whether we meet with
3486         * any error during the action, we shouldn't trust cached extent map
3487         * any more.
3488         */
3489        ocfs2_extent_map_trunc(inode, cow_start);
3490
3491        ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3492        brelse(ref_root_bh);
3493out:
3494        kfree(context);
3495        return ret;
3496}
3497
3498/*
3499 * CoW any and all clusters between cpos and cpos+write_len.
3500 * Don't CoW past max_cpos.  If this returns successfully, all
3501 * clusters between cpos and cpos+write_len are safe to modify.
3502 */
3503int ocfs2_refcount_cow(struct inode *inode,
3504                       struct buffer_head *di_bh,
3505                       u32 cpos, u32 write_len, u32 max_cpos)
3506{
3507        int ret = 0;
3508        u32 p_cluster, num_clusters;
3509        unsigned int ext_flags;
3510
3511        while (write_len) {
3512                ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3513                                         &num_clusters, &ext_flags);
3514                if (ret) {
3515                        mlog_errno(ret);
3516                        break;
3517                }
3518
3519                if (write_len < num_clusters)
3520                        num_clusters = write_len;
3521
3522                if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3523                        ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3524                                                      num_clusters, max_cpos);
3525                        if (ret) {
3526                                mlog_errno(ret);
3527                                break;
3528                        }
3529                }
3530
3531                write_len -= num_clusters;
3532                cpos += num_clusters;
3533        }
3534
3535        return ret;
3536}
3537
3538static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3539                                          u32 v_cluster, u32 *p_cluster,
3540                                          u32 *num_clusters,
3541                                          unsigned int *extent_flags)
3542{
3543        struct inode *inode = context->inode;
3544        struct ocfs2_xattr_value_root *xv = context->cow_object;
3545
3546        return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3547                                        num_clusters, &xv->xr_list,
3548                                        extent_flags);
3549}
3550
3551/*
3552 * Given a xattr value root, calculate the most meta/credits we need for
3553 * refcount tree change if we truncate it to 0.
3554 */
3555int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3556                                       struct ocfs2_caching_info *ref_ci,
3557                                       struct buffer_head *ref_root_bh,
3558                                       struct ocfs2_xattr_value_root *xv,
3559                                       int *meta_add, int *credits)
3560{
3561        int ret = 0, index, ref_blocks = 0;
3562        u32 p_cluster, num_clusters;
3563        u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3564        struct ocfs2_refcount_block *rb;
3565        struct ocfs2_refcount_rec rec;
3566        struct buffer_head *ref_leaf_bh = NULL;
3567
3568        while (cpos < clusters) {
3569                ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3570                                               &num_clusters, &xv->xr_list,
3571                                               NULL);
3572                if (ret) {
3573                        mlog_errno(ret);
3574                        goto out;
3575                }
3576
3577                cpos += num_clusters;
3578
3579                while (num_clusters) {
3580                        ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3581                                                     p_cluster, num_clusters,
3582                                                     &rec, &index,
3583                                                     &ref_leaf_bh);
3584                        if (ret) {
3585                                mlog_errno(ret);
3586                                goto out;
3587                        }
3588
3589                        BUG_ON(!rec.r_refcount);
3590
3591                        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3592
3593                        /*
3594                         * We really don't know whether the other clusters is in
3595                         * this refcount block or not, so just take the worst
3596                         * case that all the clusters are in this block and each
3597                         * one will split a refcount rec, so totally we need
3598                         * clusters * 2 new refcount rec.
3599                         */
3600                        if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3601                            le16_to_cpu(rb->rf_records.rl_count))
3602                                ref_blocks++;
3603
3604                        *credits += 1;
3605                        brelse(ref_leaf_bh);
3606                        ref_leaf_bh = NULL;
3607
3608                        if (num_clusters <= le32_to_cpu(rec.r_clusters))
3609                                break;
3610                        else
3611                                num_clusters -= le32_to_cpu(rec.r_clusters);
3612                        p_cluster += num_clusters;
3613                }
3614        }
3615
3616        *meta_add += ref_blocks;
3617        if (!ref_blocks)
3618                goto out;
3619
3620        rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3621        if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3622                *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3623        else {
3624                struct ocfs2_extent_tree et;
3625
3626                ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3627                *credits += ocfs2_calc_extend_credits(inode->i_sb,
3628                                                      et.et_root_el,
3629                                                      ref_blocks);
3630        }
3631
3632out:
3633        brelse(ref_leaf_bh);
3634        return ret;
3635}
3636
3637/*
3638 * Do CoW for xattr.
3639 */
3640int ocfs2_refcount_cow_xattr(struct inode *inode,
3641                             struct ocfs2_dinode *di,
3642                             struct ocfs2_xattr_value_buf *vb,
3643                             struct ocfs2_refcount_tree *ref_tree,
3644                             struct buffer_head *ref_root_bh,
3645                             u32 cpos, u32 write_len,
3646                             struct ocfs2_post_refcount *post)
3647{
3648        int ret;
3649        struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3650        struct ocfs2_inode_info *oi = OCFS2_I(inode);
3651        struct ocfs2_cow_context *context = NULL;
3652        u32 cow_start, cow_len;
3653
3654        BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3655
3656        ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3657                                              cpos, write_len, UINT_MAX,
3658                                              &cow_start, &cow_len);
3659        if (ret) {
3660                mlog_errno(ret);
3661                goto out;
3662        }
3663
3664        BUG_ON(cow_len == 0);
3665
3666        context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3667        if (!context) {
3668                ret = -ENOMEM;
3669                mlog_errno(ret);
3670                goto out;
3671        }
3672
3673        context->inode = inode;
3674        context->cow_start = cow_start;
3675        context->cow_len = cow_len;
3676        context->ref_tree = ref_tree;
3677        context->ref_root_bh = ref_root_bh;
3678        context->cow_object = xv;
3679
3680        context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3681        /* We need the extra credits for duplicate_clusters by jbd. */
3682        context->extra_credits =
3683                ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3684        context->get_clusters = ocfs2_xattr_value_get_clusters;
3685        context->post_refcount = post;
3686
3687        ocfs2_init_xattr_value_extent_tree(&context->data_et,
3688                                           INODE_CACHE(inode), vb);
3689
3690        ret = ocfs2_replace_cow(context);
3691        if (ret)
3692                mlog_errno(ret);
3693
3694out:
3695        kfree(context);
3696        return ret;
3697}
3698
3699/*
3700 * Insert a new extent into refcount tree and mark a extent rec
3701 * as refcounted in the dinode tree.
3702 */
3703int ocfs2_add_refcount_flag(struct inode *inode,
3704                            struct ocfs2_extent_tree *data_et,
3705                            struct ocfs2_caching_info *ref_ci,
3706                            struct buffer_head *ref_root_bh,
3707                            u32 cpos, u32 p_cluster, u32 num_clusters,
3708                            struct ocfs2_cached_dealloc_ctxt *dealloc,
3709                            struct ocfs2_post_refcount *post)
3710{
3711        int ret;
3712        handle_t *handle;
3713        int credits = 1, ref_blocks = 0;
3714        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3715        struct ocfs2_alloc_context *meta_ac = NULL;
3716
3717        ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3718                                               ref_ci, ref_root_bh,
3719                                               p_cluster, num_clusters,
3720                                               &ref_blocks, &credits);
3721        if (ret) {
3722                mlog_errno(ret);
3723                goto out;
3724        }
3725
3726        trace_ocfs2_add_refcount_flag(ref_blocks, credits);
3727
3728        if (ref_blocks) {
3729                ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
3730                                                        ref_blocks, &meta_ac);
3731                if (ret) {
3732                        mlog_errno(ret);
3733                        goto out;
3734                }
3735        }
3736
3737        if (post)
3738                credits += post->credits;
3739
3740        handle = ocfs2_start_trans(osb, credits);
3741        if (IS_ERR(handle)) {
3742                ret = PTR_ERR(handle);
3743                mlog_errno(ret);
3744                goto out;
3745        }
3746
3747        ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3748                                           cpos, num_clusters, p_cluster,
3749                                           meta_ac, dealloc);
3750        if (ret) {
3751                mlog_errno(ret);
3752                goto out_commit;
3753        }
3754
3755        ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3756                                        p_cluster, num_clusters, 0,
3757                                        meta_ac, dealloc);
3758        if (ret) {
3759                mlog_errno(ret);
3760                goto out_commit;
3761        }
3762
3763        if (post && post->func) {
3764                ret = post->func(inode, handle, post->para);
3765                if (ret)
3766                        mlog_errno(ret);
3767        }
3768
3769out_commit:
3770        ocfs2_commit_trans(osb, handle);
3771out:
3772        if (meta_ac)
3773                ocfs2_free_alloc_context(meta_ac);
3774        return ret;
3775}
3776
3777static int ocfs2_change_ctime(struct inode *inode,
3778                              struct buffer_head *di_bh)
3779{
3780        int ret;
3781        handle_t *handle;
3782        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3783
3784        handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3785                                   OCFS2_INODE_UPDATE_CREDITS);
3786        if (IS_ERR(handle)) {
3787                ret = PTR_ERR(handle);
3788                mlog_errno(ret);
3789                goto out;
3790        }
3791
3792        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3793                                      OCFS2_JOURNAL_ACCESS_WRITE);
3794        if (ret) {
3795                mlog_errno(ret);
3796                goto out_commit;
3797        }
3798
3799        inode->i_ctime = CURRENT_TIME;
3800        di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3801        di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3802
3803        ocfs2_journal_dirty(handle, di_bh);
3804
3805out_commit:
3806        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3807out:
3808        return ret;
3809}
3810
3811static int ocfs2_attach_refcount_tree(struct inode *inode,
3812                                      struct buffer_head *di_bh)
3813{
3814        int ret, data_changed = 0;
3815        struct buffer_head *ref_root_bh = NULL;
3816        struct ocfs2_inode_info *oi = OCFS2_I(inode);
3817        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3818        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3819        struct ocfs2_refcount_tree *ref_tree;
3820        unsigned int ext_flags;
3821        loff_t size;
3822        u32 cpos, num_clusters, clusters, p_cluster;
3823        struct ocfs2_cached_dealloc_ctxt dealloc;
3824        struct ocfs2_extent_tree di_et;
3825
3826        ocfs2_init_dealloc_ctxt(&dealloc);
3827
3828        if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3829                ret = ocfs2_create_refcount_tree(inode, di_bh);
3830                if (ret) {
3831                        mlog_errno(ret);
3832                        goto out;
3833                }
3834        }
3835
3836        BUG_ON(!di->i_refcount_loc);
3837        ret = ocfs2_lock_refcount_tree(osb,
3838                                       le64_to_cpu(di->i_refcount_loc), 1,
3839                                       &ref_tree, &ref_root_bh);
3840        if (ret) {
3841                mlog_errno(ret);
3842                goto out;
3843        }
3844
3845        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3846                goto attach_xattr;
3847
3848        ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3849
3850        size = i_size_read(inode);
3851        clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3852
3853        cpos = 0;
3854        while (cpos < clusters) {
3855                ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3856                                         &num_clusters, &ext_flags);
3857
3858                if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3859                        ret = ocfs2_add_refcount_flag(inode, &di_et,
3860                                                      &ref_tree->rf_ci,
3861                                                      ref_root_bh, cpos,
3862                                                      p_cluster, num_clusters,
3863                                                      &dealloc, NULL);
3864                        if (ret) {
3865                                mlog_errno(ret);
3866                                goto unlock;
3867                        }
3868
3869                        data_changed = 1;
3870                }
3871                cpos += num_clusters;
3872        }
3873
3874attach_xattr:
3875        if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3876                ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3877                                                       &ref_tree->rf_ci,
3878                                                       ref_root_bh,
3879                                                       &dealloc);
3880                if (ret) {
3881                        mlog_errno(ret);
3882                        goto unlock;
3883                }
3884        }
3885
3886        if (data_changed) {
3887                ret = ocfs2_change_ctime(inode, di_bh);
3888                if (ret)
3889                        mlog_errno(ret);
3890        }
3891
3892unlock:
3893        ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3894        brelse(ref_root_bh);
3895
3896        if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3897                ocfs2_schedule_truncate_log_flush(osb, 1);
3898                ocfs2_run_deallocs(osb, &dealloc);
3899        }
3900out:
3901        /*
3902         * Empty the extent map so that we may get the right extent
3903         * record from the disk.
3904         */
3905        ocfs2_extent_map_trunc(inode, 0);
3906
3907        return ret;
3908}
3909
3910static int ocfs2_add_refcounted_extent(struct inode *inode,
3911                                   struct ocfs2_extent_tree *et,
3912                                   struct ocfs2_caching_info *ref_ci,
3913                                   struct buffer_head *ref_root_bh,
3914                                   u32 cpos, u32 p_cluster, u32 num_clusters,
3915                                   unsigned int ext_flags,
3916                                   struct ocfs2_cached_dealloc_ctxt *dealloc)
3917{
3918        int ret;
3919        handle_t *handle;
3920        int credits = 0;
3921        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3922        struct ocfs2_alloc_context *meta_ac = NULL;
3923
3924        ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3925                                             p_cluster, num_clusters,
3926                                             et, ref_ci,
3927                                             ref_root_bh, &meta_ac,
3928                                             NULL, &credits);
3929        if (ret) {
3930                mlog_errno(ret);
3931                goto out;
3932        }
3933
3934        handle = ocfs2_start_trans(osb, credits);
3935        if (IS_ERR(handle)) {
3936                ret = PTR_ERR(handle);
3937                mlog_errno(ret);
3938                goto out;
3939        }
3940
3941        ret = ocfs2_insert_extent(handle, et, cpos,
3942                        ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3943                        num_clusters, ext_flags, meta_ac);
3944        if (ret) {
3945                mlog_errno(ret);
3946                goto out_commit;
3947        }
3948
3949        ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3950                                      p_cluster, num_clusters,
3951                                      meta_ac, dealloc);
3952        if (ret)
3953                mlog_errno(ret);
3954
3955out_commit:
3956        ocfs2_commit_trans(osb, handle);
3957out:
3958        if (meta_ac)
3959                ocfs2_free_alloc_context(meta_ac);
3960        return ret;
3961}
3962
3963static int ocfs2_duplicate_inline_data(struct inode *s_inode,
3964                                       struct buffer_head *s_bh,
3965                                       struct inode *t_inode,
3966                                       struct buffer_head *t_bh)
3967{
3968        int ret;
3969        handle_t *handle;
3970        struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3971        struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3972        struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
3973
3974        BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
3975
3976        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
3977        if (IS_ERR(handle)) {
3978                ret = PTR_ERR(handle);
3979                mlog_errno(ret);
3980                goto out;
3981        }
3982
3983        ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3984                                      OCFS2_JOURNAL_ACCESS_WRITE);
3985        if (ret) {
3986                mlog_errno(ret);
3987                goto out_commit;
3988        }
3989
3990        t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
3991        memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
3992               le16_to_cpu(s_di->id2.i_data.id_count));
3993        spin_lock(&OCFS2_I(t_inode)->ip_lock);
3994        OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
3995        t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
3996        spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3997
3998        ocfs2_journal_dirty(handle, t_bh);
3999
4000out_commit:
4001        ocfs2_commit_trans(osb, handle);
4002out:
4003        return ret;
4004}
4005
4006static int ocfs2_duplicate_extent_list(struct inode *s_inode,
4007                                struct inode *t_inode,
4008                                struct buffer_head *t_bh,
4009                                struct ocfs2_caching_info *ref_ci,
4010                                struct buffer_head *ref_root_bh,
4011                                struct ocfs2_cached_dealloc_ctxt *dealloc)
4012{
4013        int ret = 0;
4014        u32 p_cluster, num_clusters, clusters, cpos;
4015        loff_t size;
4016        unsigned int ext_flags;
4017        struct ocfs2_extent_tree et;
4018
4019        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
4020
4021        size = i_size_read(s_inode);
4022        clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
4023
4024        cpos = 0;
4025        while (cpos < clusters) {
4026                ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4027                                         &num_clusters, &ext_flags);
4028
4029                if (p_cluster) {
4030                        ret = ocfs2_add_refcounted_extent(t_inode, &et,
4031                                                          ref_ci, ref_root_bh,
4032                                                          cpos, p_cluster,
4033                                                          num_clusters,
4034                                                          ext_flags,
4035                                                          dealloc);
4036                        if (ret) {
4037                                mlog_errno(ret);
4038                                goto out;
4039                        }
4040                }
4041
4042                cpos += num_clusters;
4043        }
4044
4045out:
4046        return ret;
4047}
4048
4049/*
4050 * change the new file's attributes to the src.
4051 *
4052 * reflink creates a snapshot of a file, that means the attributes
4053 * must be identical except for three exceptions - nlink, ino, and ctime.
4054 */
4055static int ocfs2_complete_reflink(struct inode *s_inode,
4056                                  struct buffer_head *s_bh,
4057                                  struct inode *t_inode,
4058                                  struct buffer_head *t_bh,
4059                                  bool preserve)
4060{
4061        int ret;
4062        handle_t *handle;
4063        struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4064        struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
4065        loff_t size = i_size_read(s_inode);
4066
4067        handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
4068                                   OCFS2_INODE_UPDATE_CREDITS);
4069        if (IS_ERR(handle)) {
4070                ret = PTR_ERR(handle);
4071                mlog_errno(ret);
4072                return ret;
4073        }
4074
4075        ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4076                                      OCFS2_JOURNAL_ACCESS_WRITE);
4077        if (ret) {
4078                mlog_errno(ret);
4079                goto out_commit;
4080        }
4081
4082        spin_lock(&OCFS2_I(t_inode)->ip_lock);
4083        OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
4084        OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
4085        OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4086        spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4087        i_size_write(t_inode, size);
4088        t_inode->i_blocks = s_inode->i_blocks;
4089
4090        di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4091        di->i_clusters = s_di->i_clusters;
4092        di->i_size = s_di->i_size;
4093        di->i_dyn_features = s_di->i_dyn_features;
4094        di->i_attr = s_di->i_attr;
4095
4096        if (preserve) {
4097                t_inode->i_uid = s_inode->i_uid;
4098                t_inode->i_gid = s_inode->i_gid;
4099                t_inode->i_mode = s_inode->i_mode;
4100                di->i_uid = s_di->i_uid;
4101                di->i_gid = s_di->i_gid;
4102                di->i_mode = s_di->i_mode;
4103
4104                /*
4105                 * update time.
4106                 * we want mtime to appear identical to the source and
4107                 * update ctime.
4108                 */
4109                t_inode->i_ctime = CURRENT_TIME;
4110
4111                di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
4112                di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
4113
4114                t_inode->i_mtime = s_inode->i_mtime;
4115                di->i_mtime = s_di->i_mtime;
4116                di->i_mtime_nsec = s_di->i_mtime_nsec;
4117        }
4118
4119        ocfs2_journal_dirty(handle, t_bh);
4120
4121out_commit:
4122        ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
4123        return ret;
4124}
4125
4126static int ocfs2_create_reflink_node(struct inode *s_inode,
4127                                     struct buffer_head *s_bh,
4128                                     struct inode *t_inode,
4129                                     struct buffer_head *t_bh,
4130                                     bool preserve)
4131{
4132        int ret;
4133        struct buffer_head *ref_root_bh = NULL;
4134        struct ocfs2_cached_dealloc_ctxt dealloc;
4135        struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4136        struct ocfs2_refcount_block *rb;
4137        struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
4138        struct ocfs2_refcount_tree *ref_tree;
4139
4140        ocfs2_init_dealloc_ctxt(&dealloc);
4141
4142        ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4143                                      le64_to_cpu(di->i_refcount_loc));
4144        if (ret) {
4145                mlog_errno(ret);
4146                goto out;
4147        }
4148
4149        if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4150                ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
4151                                                  t_inode, t_bh);
4152                if (ret)
4153                        mlog_errno(ret);
4154                goto out;
4155        }
4156
4157        ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4158                                       1, &ref_tree, &ref_root_bh);
4159        if (ret) {
4160                mlog_errno(ret);
4161                goto out;
4162        }
4163        rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4164
4165        ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4166                                          &ref_tree->rf_ci, ref_root_bh,
4167                                          &dealloc);
4168        if (ret) {
4169                mlog_errno(ret);
4170                goto out_unlock_refcount;
4171        }
4172
4173out_unlock_refcount:
4174        ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4175        brelse(ref_root_bh);
4176out:
4177        if (ocfs2_dealloc_has_cluster(&dealloc)) {
4178                ocfs2_schedule_truncate_log_flush(osb, 1);
4179                ocfs2_run_deallocs(osb, &dealloc);
4180        }
4181
4182        return ret;
4183}
4184
4185static int __ocfs2_reflink(struct dentry *old_dentry,
4186                           struct buffer_head *old_bh,
4187                           struct inode *new_inode,
4188                           bool preserve)
4189{
4190        int ret;
4191        struct inode *inode = old_dentry->d_inode;
4192        struct buffer_head *new_bh = NULL;
4193
4194        if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4195                ret = -EINVAL;
4196                mlog_errno(ret);
4197                goto out;
4198        }
4199
4200        ret = filemap_fdatawrite(inode->i_mapping);
4201        if (ret) {
4202                mlog_errno(ret);
4203                goto out;
4204        }
4205
4206        ret = ocfs2_attach_refcount_tree(inode, old_bh);
4207        if (ret) {
4208                mlog_errno(ret);
4209                goto out;
4210        }
4211
4212        mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4213        ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4214                                      OI_LS_REFLINK_TARGET);
4215        if (ret) {
4216                mlog_errno(ret);
4217                goto out_unlock;
4218        }
4219
4220        ret = ocfs2_create_reflink_node(inode, old_bh,
4221                                        new_inode, new_bh, preserve);
4222        if (ret) {
4223                mlog_errno(ret);
4224                goto inode_unlock;
4225        }
4226
4227        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4228                ret = ocfs2_reflink_xattrs(inode, old_bh,
4229                                           new_inode, new_bh,
4230                                           preserve);
4231                if (ret) {
4232                        mlog_errno(ret);
4233                        goto inode_unlock;
4234                }
4235        }
4236
4237        ret = ocfs2_complete_reflink(inode, old_bh,
4238                                     new_inode, new_bh, preserve);
4239        if (ret)
4240                mlog_errno(ret);
4241
4242inode_unlock:
4243        ocfs2_inode_unlock(new_inode, 1);
4244        brelse(new_bh);
4245out_unlock:
4246        mutex_unlock(&new_inode->i_mutex);
4247out:
4248        if (!ret) {
4249                ret = filemap_fdatawait(inode->i_mapping);
4250                if (ret)
4251                        mlog_errno(ret);
4252        }
4253        return ret;
4254}
4255
4256static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4257                         struct dentry *new_dentry, bool preserve)
4258{
4259        int error;
4260        struct inode *inode = old_dentry->d_inode;
4261        struct buffer_head *old_bh = NULL;
4262        struct inode *new_orphan_inode = NULL;
4263
4264        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4265                return -EOPNOTSUPP;
4266
4267        error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4268                                             &new_orphan_inode);
4269        if (error) {
4270                mlog_errno(error);
4271                goto out;
4272        }
4273
4274        error = ocfs2_inode_lock(inode, &old_bh, 1);
4275        if (error) {
4276                mlog_errno(error);
4277                goto out;
4278        }
4279
4280        down_write(&OCFS2_I(inode)->ip_xattr_sem);
4281        down_write(&OCFS2_I(inode)->ip_alloc_sem);
4282        error = __ocfs2_reflink(old_dentry, old_bh,
4283                                new_orphan_inode, preserve);
4284        up_write(&OCFS2_I(inode)->ip_alloc_sem);
4285        up_write(&OCFS2_I(inode)->ip_xattr_sem);
4286
4287        ocfs2_inode_unlock(inode, 1);
4288        brelse(old_bh);
4289
4290        if (error) {
4291                mlog_errno(error);
4292                goto out;
4293        }
4294
4295        /* If the security isn't preserved, we need to re-initialize them. */
4296        if (!preserve) {
4297                error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4298                                                    &new_dentry->d_name);
4299                if (error)
4300                        mlog_errno(error);
4301        }
4302out:
4303        if (!error) {
4304                error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4305                                                       new_dentry);
4306                if (error)
4307                        mlog_errno(error);
4308        }
4309
4310        if (new_orphan_inode) {
4311                /*
4312                 * We need to open_unlock the inode no matter whether we
4313                 * succeed or not, so that other nodes can delete it later.
4314                 */
4315                ocfs2_open_unlock(new_orphan_inode);
4316                if (error)
4317                        iput(new_orphan_inode);
4318        }
4319
4320        return error;
4321}
4322
4323/*
4324 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4325 * sys_reflink().  This will go away when vfs_reflink() exists in
4326 * fs/namei.c.
4327 */
4328
4329/* copied from may_create in VFS. */
4330static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4331{
4332        if (child->d_inode)
4333                return -EEXIST;
4334        if (IS_DEADDIR(dir))
4335                return -ENOENT;
4336        return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4337}
4338
4339/**
4340 * ocfs2_vfs_reflink - Create a reference-counted link
4341 *
4342 * @old_dentry:        source dentry + inode
4343 * @dir:       directory to create the target
4344 * @new_dentry:        target dentry
4345 * @preserve:  if true, preserve all file attributes
4346 */
4347static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4348                             struct dentry *new_dentry, bool preserve)
4349{
4350        struct inode *inode = old_dentry->d_inode;
4351        int error;
4352
4353        if (!inode)
4354                return -ENOENT;
4355
4356        error = ocfs2_may_create(dir, new_dentry);
4357        if (error)
4358                return error;
4359
4360        if (dir->i_sb != inode->i_sb)
4361                return -EXDEV;
4362
4363        /*
4364         * A reflink to an append-only or immutable file cannot be created.
4365         */
4366        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4367                return -EPERM;
4368
4369        /* Only regular files can be reflinked. */
4370        if (!S_ISREG(inode->i_mode))
4371                return -EPERM;
4372
4373        /*
4374         * If the caller wants to preserve ownership, they require the
4375         * rights to do so.
4376         */
4377        if (preserve) {
4378                if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
4379                        return -EPERM;
4380                if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4381                        return -EPERM;
4382        }
4383
4384        /*
4385         * If the caller is modifying any aspect of the attributes, they
4386         * are not creating a snapshot.  They need read permission on the
4387         * file.
4388         */
4389        if (!preserve) {
4390                error = inode_permission(inode, MAY_READ);
4391                if (error)
4392                        return error;
4393        }
4394
4395        mutex_lock(&inode->i_mutex);
4396        dquot_initialize(dir);
4397        error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4398        mutex_unlock(&inode->i_mutex);
4399        if (!error)
4400                fsnotify_create(dir, new_dentry);
4401        return error;
4402}
4403/*
4404 * Most codes are copied from sys_linkat.
4405 */
4406int ocfs2_reflink_ioctl(struct inode *inode,
4407                        const char __user *oldname,
4408                        const char __user *newname,
4409                        bool preserve)
4410{
4411        struct dentry *new_dentry;
4412        struct path old_path, new_path;
4413        int error;
4414
4415        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4416                return -EOPNOTSUPP;
4417
4418        error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4419        if (error) {
4420                mlog_errno(error);
4421                return error;
4422        }
4423
4424        new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
4425        error = PTR_ERR(new_dentry);
4426        if (IS_ERR(new_dentry)) {
4427                mlog_errno(error);
4428                goto out;
4429        }
4430
4431        error = -EXDEV;
4432        if (old_path.mnt != new_path.mnt) {
4433                mlog_errno(error);
4434                goto out_dput;
4435        }
4436
4437        error = ocfs2_vfs_reflink(old_path.dentry,
4438                                  new_path.dentry->d_inode,
4439                                  new_dentry, preserve);
4440out_dput:
4441        done_path_create(&new_path, new_dentry);
4442out:
4443        path_put(&old_path);
4444
4445        return error;
4446}
4447