linux/fs/ocfs2/move_extents.c
<<
>>
Prefs
   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * move_extents.c
   5 *
   6 * Copyright (C) 2011 Oracle.  All rights reserved.
   7 *
   8 * This program is free software; you can redistribute it and/or
   9 * modify it under the terms of the GNU General Public
  10 * License version 2 as published by the Free Software Foundation.
  11 *
  12 * This program is distributed in the hope that it will be useful,
  13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 * General Public License for more details.
  16 */
  17#include <linux/fs.h>
  18#include <linux/types.h>
  19#include <linux/mount.h>
  20#include <linux/swap.h>
  21
  22#include <cluster/masklog.h>
  23
  24#include "ocfs2.h"
  25#include "ocfs2_ioctl.h"
  26
  27#include "alloc.h"
  28#include "aops.h"
  29#include "dlmglue.h"
  30#include "extent_map.h"
  31#include "inode.h"
  32#include "journal.h"
  33#include "suballoc.h"
  34#include "uptodate.h"
  35#include "super.h"
  36#include "dir.h"
  37#include "buffer_head_io.h"
  38#include "sysfile.h"
  39#include "refcounttree.h"
  40#include "move_extents.h"
  41
  42struct ocfs2_move_extents_context {
  43        struct inode *inode;
  44        struct file *file;
  45        int auto_defrag;
  46        int partial;
  47        int credits;
  48        u32 new_phys_cpos;
  49        u32 clusters_moved;
  50        u64 refcount_loc;
  51        struct ocfs2_move_extents *range;
  52        struct ocfs2_extent_tree et;
  53        struct ocfs2_alloc_context *meta_ac;
  54        struct ocfs2_alloc_context *data_ac;
  55        struct ocfs2_cached_dealloc_ctxt dealloc;
  56};
  57
  58static int __ocfs2_move_extent(handle_t *handle,
  59                               struct ocfs2_move_extents_context *context,
  60                               u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
  61                               int ext_flags)
  62{
  63        int ret = 0, index;
  64        struct inode *inode = context->inode;
  65        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  66        struct ocfs2_extent_rec *rec, replace_rec;
  67        struct ocfs2_path *path = NULL;
  68        struct ocfs2_extent_list *el;
  69        u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
  70        u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
  71
  72        ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
  73                                               p_cpos, new_p_cpos, len);
  74        if (ret) {
  75                mlog_errno(ret);
  76                goto out;
  77        }
  78
  79        memset(&replace_rec, 0, sizeof(replace_rec));
  80        replace_rec.e_cpos = cpu_to_le32(cpos);
  81        replace_rec.e_leaf_clusters = cpu_to_le16(len);
  82        replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
  83                                                                   new_p_cpos));
  84
  85        path = ocfs2_new_path_from_et(&context->et);
  86        if (!path) {
  87                ret = -ENOMEM;
  88                mlog_errno(ret);
  89                goto out;
  90        }
  91
  92        ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
  93        if (ret) {
  94                mlog_errno(ret);
  95                goto out;
  96        }
  97
  98        el = path_leaf_el(path);
  99
 100        index = ocfs2_search_extent_list(el, cpos);
 101        if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
 102                ocfs2_error(inode->i_sb,
 103                            "Inode %llu has an extent at cpos %u which can no "
 104                            "longer be found.\n",
 105                            (unsigned long long)ino, cpos);
 106                ret = -EROFS;
 107                goto out;
 108        }
 109
 110        rec = &el->l_recs[index];
 111
 112        BUG_ON(ext_flags != rec->e_flags);
 113        /*
 114         * after moving/defraging to new location, the extent is not going
 115         * to be refcounted anymore.
 116         */
 117        replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
 118
 119        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
 120                                      context->et.et_root_bh,
 121                                      OCFS2_JOURNAL_ACCESS_WRITE);
 122        if (ret) {
 123                mlog_errno(ret);
 124                goto out;
 125        }
 126
 127        ret = ocfs2_split_extent(handle, &context->et, path, index,
 128                                 &replace_rec, context->meta_ac,
 129                                 &context->dealloc);
 130        if (ret) {
 131                mlog_errno(ret);
 132                goto out;
 133        }
 134
 135        ocfs2_journal_dirty(handle, context->et.et_root_bh);
 136
 137        context->new_phys_cpos = new_p_cpos;
 138
 139        /*
 140         * need I to append truncate log for old clusters?
 141         */
 142        if (old_blkno) {
 143                if (ext_flags & OCFS2_EXT_REFCOUNTED)
 144                        ret = ocfs2_decrease_refcount(inode, handle,
 145                                        ocfs2_blocks_to_clusters(osb->sb,
 146                                                                 old_blkno),
 147                                        len, context->meta_ac,
 148                                        &context->dealloc, 1);
 149                else
 150                        ret = ocfs2_truncate_log_append(osb, handle,
 151                                                        old_blkno, len);
 152        }
 153
 154out:
 155        return ret;
 156}
 157
 158/*
 159 * lock allocators, and reserving appropriate number of bits for
 160 * meta blocks and data clusters.
 161 *
 162 * in some cases, we don't need to reserve clusters, just let data_ac
 163 * be NULL.
 164 */
 165static int ocfs2_lock_allocators_move_extents(struct inode *inode,
 166                                        struct ocfs2_extent_tree *et,
 167                                        u32 clusters_to_move,
 168                                        u32 extents_to_split,
 169                                        struct ocfs2_alloc_context **meta_ac,
 170                                        struct ocfs2_alloc_context **data_ac,
 171                                        int extra_blocks,
 172                                        int *credits)
 173{
 174        int ret, num_free_extents;
 175        unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
 176        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 177
 178        num_free_extents = ocfs2_num_free_extents(osb, et);
 179        if (num_free_extents < 0) {
 180                ret = num_free_extents;
 181                mlog_errno(ret);
 182                goto out;
 183        }
 184
 185        if (!num_free_extents ||
 186            (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
 187                extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
 188
 189        ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
 190        if (ret) {
 191                mlog_errno(ret);
 192                goto out;
 193        }
 194
 195        if (data_ac) {
 196                ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
 197                if (ret) {
 198                        mlog_errno(ret);
 199                        goto out;
 200                }
 201        }
 202
 203        *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
 204                                              clusters_to_move + 2);
 205
 206        mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
 207             extra_blocks, clusters_to_move, *credits);
 208out:
 209        if (ret) {
 210                if (*meta_ac) {
 211                        ocfs2_free_alloc_context(*meta_ac);
 212                        *meta_ac = NULL;
 213                }
 214        }
 215
 216        return ret;
 217}
 218
 219/*
 220 * Using one journal handle to guarantee the data consistency in case
 221 * crash happens anywhere.
 222 *
 223 *  XXX: defrag can end up with finishing partial extent as requested,
 224 * due to not enough contiguous clusters can be found in allocator.
 225 */
 226static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
 227                               u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
 228{
 229        int ret, credits = 0, extra_blocks = 0, partial = context->partial;
 230        handle_t *handle;
 231        struct inode *inode = context->inode;
 232        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 233        struct inode *tl_inode = osb->osb_tl_inode;
 234        struct ocfs2_refcount_tree *ref_tree = NULL;
 235        u32 new_phys_cpos, new_len;
 236        u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 237
 238        if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
 239
 240                BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
 241                         OCFS2_HAS_REFCOUNT_FL));
 242
 243                BUG_ON(!context->refcount_loc);
 244
 245                ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
 246                                               &ref_tree, NULL);
 247                if (ret) {
 248                        mlog_errno(ret);
 249                        return ret;
 250                }
 251
 252                ret = ocfs2_prepare_refcount_change_for_del(inode,
 253                                                        context->refcount_loc,
 254                                                        phys_blkno,
 255                                                        *len,
 256                                                        &credits,
 257                                                        &extra_blocks);
 258                if (ret) {
 259                        mlog_errno(ret);
 260                        goto out;
 261                }
 262        }
 263
 264        ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
 265                                                 &context->meta_ac,
 266                                                 &context->data_ac,
 267                                                 extra_blocks, &credits);
 268        if (ret) {
 269                mlog_errno(ret);
 270                goto out;
 271        }
 272
 273        /*
 274         * should be using allocation reservation strategy there?
 275         *
 276         * if (context->data_ac)
 277         *      context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
 278         */
 279
 280        mutex_lock(&tl_inode->i_mutex);
 281
 282        if (ocfs2_truncate_log_needs_flush(osb)) {
 283                ret = __ocfs2_flush_truncate_log(osb);
 284                if (ret < 0) {
 285                        mlog_errno(ret);
 286                        goto out_unlock_mutex;
 287                }
 288        }
 289
 290        handle = ocfs2_start_trans(osb, credits);
 291        if (IS_ERR(handle)) {
 292                ret = PTR_ERR(handle);
 293                mlog_errno(ret);
 294                goto out_unlock_mutex;
 295        }
 296
 297        ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
 298                                     &new_phys_cpos, &new_len);
 299        if (ret) {
 300                mlog_errno(ret);
 301                goto out_commit;
 302        }
 303
 304        /*
 305         * allowing partial extent moving is kind of 'pros and cons', it makes
 306         * whole defragmentation less likely to fail, on the contrary, the bad
 307         * thing is it may make the fs even more fragmented after moving, let
 308         * userspace make a good decision here.
 309         */
 310        if (new_len != *len) {
 311                mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
 312                if (!partial) {
 313                        context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
 314                        ret = -ENOSPC;
 315                        goto out_commit;
 316                }
 317        }
 318
 319        mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
 320             phys_cpos, new_phys_cpos);
 321
 322        ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
 323                                  new_phys_cpos, ext_flags);
 324        if (ret)
 325                mlog_errno(ret);
 326
 327        if (partial && (new_len != *len))
 328                *len = new_len;
 329
 330        /*
 331         * Here we should write the new page out first if we are
 332         * in write-back mode.
 333         */
 334        ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
 335        if (ret)
 336                mlog_errno(ret);
 337
 338out_commit:
 339        ocfs2_commit_trans(osb, handle);
 340
 341out_unlock_mutex:
 342        mutex_unlock(&tl_inode->i_mutex);
 343
 344        if (context->data_ac) {
 345                ocfs2_free_alloc_context(context->data_ac);
 346                context->data_ac = NULL;
 347        }
 348
 349        if (context->meta_ac) {
 350                ocfs2_free_alloc_context(context->meta_ac);
 351                context->meta_ac = NULL;
 352        }
 353
 354out:
 355        if (ref_tree)
 356                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 357
 358        return ret;
 359}
 360
 361/*
 362 * find the victim alloc group, where #blkno fits.
 363 */
 364static int ocfs2_find_victim_alloc_group(struct inode *inode,
 365                                         u64 vict_blkno,
 366                                         int type, int slot,
 367                                         int *vict_bit,
 368                                         struct buffer_head **ret_bh)
 369{
 370        int ret, i, bits_per_unit = 0;
 371        u64 blkno;
 372        char namebuf[40];
 373
 374        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 375        struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
 376        struct ocfs2_chain_list *cl;
 377        struct ocfs2_chain_rec *rec;
 378        struct ocfs2_dinode *ac_dinode;
 379        struct ocfs2_group_desc *bg;
 380
 381        ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
 382        ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
 383                                         strlen(namebuf), &blkno);
 384        if (ret) {
 385                ret = -ENOENT;
 386                goto out;
 387        }
 388
 389        ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
 390        if (ret) {
 391                mlog_errno(ret);
 392                goto out;
 393        }
 394
 395        ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
 396        cl = &(ac_dinode->id2.i_chain);
 397        rec = &(cl->cl_recs[0]);
 398
 399        if (type == GLOBAL_BITMAP_SYSTEM_INODE)
 400                bits_per_unit = osb->s_clustersize_bits -
 401                                        inode->i_sb->s_blocksize_bits;
 402        /*
 403         * 'vict_blkno' was out of the valid range.
 404         */
 405        if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
 406            (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
 407                                bits_per_unit))) {
 408                ret = -EINVAL;
 409                goto out;
 410        }
 411
 412        for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
 413
 414                rec = &(cl->cl_recs[i]);
 415                if (!rec)
 416                        continue;
 417
 418                bg = NULL;
 419
 420                do {
 421                        if (!bg)
 422                                blkno = le64_to_cpu(rec->c_blkno);
 423                        else
 424                                blkno = le64_to_cpu(bg->bg_next_group);
 425
 426                        if (gd_bh) {
 427                                brelse(gd_bh);
 428                                gd_bh = NULL;
 429                        }
 430
 431                        ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
 432                        if (ret) {
 433                                mlog_errno(ret);
 434                                goto out;
 435                        }
 436
 437                        bg = (struct ocfs2_group_desc *)gd_bh->b_data;
 438
 439                        if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
 440                                                le16_to_cpu(bg->bg_bits))) {
 441
 442                                *ret_bh = gd_bh;
 443                                *vict_bit = (vict_blkno - blkno) >>
 444                                                        bits_per_unit;
 445                                mlog(0, "find the victim group: #%llu, "
 446                                     "total_bits: %u, vict_bit: %u\n",
 447                                     blkno, le16_to_cpu(bg->bg_bits),
 448                                     *vict_bit);
 449                                goto out;
 450                        }
 451
 452                } while (le64_to_cpu(bg->bg_next_group));
 453        }
 454
 455        ret = -EINVAL;
 456out:
 457        brelse(ac_bh);
 458
 459        /*
 460         * caller has to release the gd_bh properly.
 461         */
 462        return ret;
 463}
 464
 465/*
 466 * XXX: helper to validate and adjust moving goal.
 467 */
 468static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
 469                                               struct ocfs2_move_extents *range)
 470{
 471        int ret, goal_bit = 0;
 472
 473        struct buffer_head *gd_bh = NULL;
 474        struct ocfs2_group_desc *bg;
 475        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 476        int c_to_b = 1 << (osb->s_clustersize_bits -
 477                                        inode->i_sb->s_blocksize_bits);
 478
 479        /*
 480         * make goal become cluster aligned.
 481         */
 482        range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
 483                                                      range->me_goal);
 484        /*
 485         * validate goal sits within global_bitmap, and return the victim
 486         * group desc
 487         */
 488        ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
 489                                            GLOBAL_BITMAP_SYSTEM_INODE,
 490                                            OCFS2_INVALID_SLOT,
 491                                            &goal_bit, &gd_bh);
 492        if (ret)
 493                goto out;
 494
 495        bg = (struct ocfs2_group_desc *)gd_bh->b_data;
 496
 497        /*
 498         * moving goal is not allowd to start with a group desc blok(#0 blk)
 499         * let's compromise to the latter cluster.
 500         */
 501        if (range->me_goal == le64_to_cpu(bg->bg_blkno))
 502                range->me_goal += c_to_b;
 503
 504        /*
 505         * movement is not gonna cross two groups.
 506         */
 507        if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
 508                                                                range->me_len) {
 509                ret = -EINVAL;
 510                goto out;
 511        }
 512        /*
 513         * more exact validations/adjustments will be performed later during
 514         * moving operation for each extent range.
 515         */
 516        mlog(0, "extents get ready to be moved to #%llu block\n",
 517             range->me_goal);
 518
 519out:
 520        brelse(gd_bh);
 521
 522        return ret;
 523}
 524
 525static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
 526                                    int *goal_bit, u32 move_len, u32 max_hop,
 527                                    u32 *phys_cpos)
 528{
 529        int i, used, last_free_bits = 0, base_bit = *goal_bit;
 530        struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 531        u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
 532                                                 le64_to_cpu(gd->bg_blkno));
 533
 534        for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
 535
 536                used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
 537                if (used) {
 538                        /*
 539                         * we even tried searching the free chunk by jumping
 540                         * a 'max_hop' distance, but still failed.
 541                         */
 542                        if ((i - base_bit) > max_hop) {
 543                                *phys_cpos = 0;
 544                                break;
 545                        }
 546
 547                        if (last_free_bits)
 548                                last_free_bits = 0;
 549
 550                        continue;
 551                } else
 552                        last_free_bits++;
 553
 554                if (last_free_bits == move_len) {
 555                        *goal_bit = i;
 556                        *phys_cpos = base_cpos + i;
 557                        break;
 558                }
 559        }
 560
 561        mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
 562}
 563
 564static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 565                                       handle_t *handle,
 566                                       struct buffer_head *di_bh,
 567                                       u32 num_bits,
 568                                       u16 chain)
 569{
 570        int ret;
 571        u32 tmp_used;
 572        struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 573        struct ocfs2_chain_list *cl =
 574                                (struct ocfs2_chain_list *) &di->id2.i_chain;
 575
 576        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 577                                      OCFS2_JOURNAL_ACCESS_WRITE);
 578        if (ret < 0) {
 579                mlog_errno(ret);
 580                goto out;
 581        }
 582
 583        tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
 584        di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
 585        le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
 586        ocfs2_journal_dirty(handle, di_bh);
 587
 588out:
 589        return ret;
 590}
 591
 592static inline int ocfs2_block_group_set_bits(handle_t *handle,
 593                                             struct inode *alloc_inode,
 594                                             struct ocfs2_group_desc *bg,
 595                                             struct buffer_head *group_bh,
 596                                             unsigned int bit_off,
 597                                             unsigned int num_bits)
 598{
 599        int status;
 600        void *bitmap = bg->bg_bitmap;
 601        int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
 602
 603        /* All callers get the descriptor via
 604         * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 605        BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 606        BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 607
 608        mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
 609             num_bits);
 610
 611        if (ocfs2_is_cluster_bitmap(alloc_inode))
 612                journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 613
 614        status = ocfs2_journal_access_gd(handle,
 615                                         INODE_CACHE(alloc_inode),
 616                                         group_bh,
 617                                         journal_type);
 618        if (status < 0) {
 619                mlog_errno(status);
 620                goto bail;
 621        }
 622
 623        le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
 624        if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
 625                ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
 626                            " count %u but claims %u are freed. num_bits %d",
 627                            (unsigned long long)le64_to_cpu(bg->bg_blkno),
 628                            le16_to_cpu(bg->bg_bits),
 629                            le16_to_cpu(bg->bg_free_bits_count), num_bits);
 630                return -EROFS;
 631        }
 632        while (num_bits--)
 633                ocfs2_set_bit(bit_off++, bitmap);
 634
 635        ocfs2_journal_dirty(handle, group_bh);
 636
 637bail:
 638        return status;
 639}
 640
 641static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 642                             u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
 643                             u32 len, int ext_flags)
 644{
 645        int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
 646        handle_t *handle;
 647        struct inode *inode = context->inode;
 648        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 649        struct inode *tl_inode = osb->osb_tl_inode;
 650        struct inode *gb_inode = NULL;
 651        struct buffer_head *gb_bh = NULL;
 652        struct buffer_head *gd_bh = NULL;
 653        struct ocfs2_group_desc *gd;
 654        struct ocfs2_refcount_tree *ref_tree = NULL;
 655        u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
 656                                                    context->range->me_threshold);
 657        u64 phys_blkno, new_phys_blkno;
 658
 659        phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 660
 661        if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
 662
 663                BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
 664                         OCFS2_HAS_REFCOUNT_FL));
 665
 666                BUG_ON(!context->refcount_loc);
 667
 668                ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
 669                                               &ref_tree, NULL);
 670                if (ret) {
 671                        mlog_errno(ret);
 672                        return ret;
 673                }
 674
 675                ret = ocfs2_prepare_refcount_change_for_del(inode,
 676                                                        context->refcount_loc,
 677                                                        phys_blkno,
 678                                                        len,
 679                                                        &credits,
 680                                                        &extra_blocks);
 681                if (ret) {
 682                        mlog_errno(ret);
 683                        goto out;
 684                }
 685        }
 686
 687        ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
 688                                                 &context->meta_ac,
 689                                                 NULL, extra_blocks, &credits);
 690        if (ret) {
 691                mlog_errno(ret);
 692                goto out;
 693        }
 694
 695        /*
 696         * need to count 2 extra credits for global_bitmap inode and
 697         * group descriptor.
 698         */
 699        credits += OCFS2_INODE_UPDATE_CREDITS + 1;
 700
 701        /*
 702         * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
 703         * logic, while we still need to lock the global_bitmap.
 704         */
 705        gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
 706                                               OCFS2_INVALID_SLOT);
 707        if (!gb_inode) {
 708                mlog(ML_ERROR, "unable to get global_bitmap inode\n");
 709                ret = -EIO;
 710                goto out;
 711        }
 712
 713        mutex_lock(&gb_inode->i_mutex);
 714
 715        ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
 716        if (ret) {
 717                mlog_errno(ret);
 718                goto out_unlock_gb_mutex;
 719        }
 720
 721        mutex_lock(&tl_inode->i_mutex);
 722
 723        handle = ocfs2_start_trans(osb, credits);
 724        if (IS_ERR(handle)) {
 725                ret = PTR_ERR(handle);
 726                mlog_errno(ret);
 727                goto out_unlock_tl_inode;
 728        }
 729
 730        new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
 731        ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
 732                                            GLOBAL_BITMAP_SYSTEM_INODE,
 733                                            OCFS2_INVALID_SLOT,
 734                                            &goal_bit, &gd_bh);
 735        if (ret) {
 736                mlog_errno(ret);
 737                goto out_commit;
 738        }
 739
 740        /*
 741         * probe the victim cluster group to find a proper
 742         * region to fit wanted movement, it even will perfrom
 743         * a best-effort attempt by compromising to a threshold
 744         * around the goal.
 745         */
 746        ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
 747                                new_phys_cpos);
 748        if (!*new_phys_cpos) {
 749                ret = -ENOSPC;
 750                goto out_commit;
 751        }
 752
 753        ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
 754                                  *new_phys_cpos, ext_flags);
 755        if (ret) {
 756                mlog_errno(ret);
 757                goto out_commit;
 758        }
 759
 760        gd = (struct ocfs2_group_desc *)gd_bh->b_data;
 761        ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
 762                                               le16_to_cpu(gd->bg_chain));
 763        if (ret) {
 764                mlog_errno(ret);
 765                goto out_commit;
 766        }
 767
 768        ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
 769                                         goal_bit, len);
 770        if (ret)
 771                mlog_errno(ret);
 772
 773        /*
 774         * Here we should write the new page out first if we are
 775         * in write-back mode.
 776         */
 777        ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
 778        if (ret)
 779                mlog_errno(ret);
 780
 781out_commit:
 782        ocfs2_commit_trans(osb, handle);
 783        brelse(gd_bh);
 784
 785out_unlock_tl_inode:
 786        mutex_unlock(&tl_inode->i_mutex);
 787
 788        ocfs2_inode_unlock(gb_inode, 1);
 789out_unlock_gb_mutex:
 790        mutex_unlock(&gb_inode->i_mutex);
 791        brelse(gb_bh);
 792        iput(gb_inode);
 793
 794out:
 795        if (context->meta_ac) {
 796                ocfs2_free_alloc_context(context->meta_ac);
 797                context->meta_ac = NULL;
 798        }
 799
 800        if (ref_tree)
 801                ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 802
 803        return ret;
 804}
 805
 806/*
 807 * Helper to calculate the defraging length in one run according to threshold.
 808 */
 809static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
 810                                         u32 threshold, int *skip)
 811{
 812        if ((*alloc_size + *len_defraged) < threshold) {
 813                /*
 814                 * proceed defragmentation until we meet the thresh
 815                 */
 816                *len_defraged += *alloc_size;
 817        } else if (*len_defraged == 0) {
 818                /*
 819                 * XXX: skip a large extent.
 820                 */
 821                *skip = 1;
 822        } else {
 823                /*
 824                 * split this extent to coalesce with former pieces as
 825                 * to reach the threshold.
 826                 *
 827                 * we're done here with one cycle of defragmentation
 828                 * in a size of 'thresh', resetting 'len_defraged'
 829                 * forces a new defragmentation.
 830                 */
 831                *alloc_size = threshold - *len_defraged;
 832                *len_defraged = 0;
 833        }
 834}
 835
 836static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
 837                                struct ocfs2_move_extents_context *context)
 838{
 839        int ret = 0, flags, do_defrag, skip = 0;
 840        u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
 841        u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
 842
 843        struct inode *inode = context->inode;
 844        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 845        struct ocfs2_move_extents *range = context->range;
 846        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 847
 848        if ((inode->i_size == 0) || (range->me_len == 0))
 849                return 0;
 850
 851        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 852                return 0;
 853
 854        context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
 855
 856        ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
 857        ocfs2_init_dealloc_ctxt(&context->dealloc);
 858
 859        /*
 860         * TO-DO XXX:
 861         *
 862         * - xattr extents.
 863         */
 864
 865        do_defrag = context->auto_defrag;
 866
 867        /*
 868         * extents moving happens in unit of clusters, for the sake
 869         * of simplicity, we may ignore two clusters where 'byte_start'
 870         * and 'byte_start + len' were within.
 871         */
 872        move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
 873        len_to_move = (range->me_start + range->me_len) >>
 874                                                osb->s_clustersize_bits;
 875        if (len_to_move >= move_start)
 876                len_to_move -= move_start;
 877        else
 878                len_to_move = 0;
 879
 880        if (do_defrag) {
 881                defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
 882                if (defrag_thresh <= 1)
 883                        goto done;
 884        } else
 885                new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
 886                                                         range->me_goal);
 887
 888        mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
 889             "thresh: %u\n",
 890             (unsigned long long)OCFS2_I(inode)->ip_blkno,
 891             (unsigned long long)range->me_start,
 892             (unsigned long long)range->me_len,
 893             move_start, len_to_move, defrag_thresh);
 894
 895        cpos = move_start;
 896        while (len_to_move) {
 897                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
 898                                         &flags);
 899                if (ret) {
 900                        mlog_errno(ret);
 901                        goto out;
 902                }
 903
 904                if (alloc_size > len_to_move)
 905                        alloc_size = len_to_move;
 906
 907                /*
 908                 * XXX: how to deal with a hole:
 909                 *
 910                 * - skip the hole of course
 911                 * - force a new defragmentation
 912                 */
 913                if (!phys_cpos) {
 914                        if (do_defrag)
 915                                len_defraged = 0;
 916
 917                        goto next;
 918                }
 919
 920                if (do_defrag) {
 921                        ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
 922                                                     defrag_thresh, &skip);
 923                        /*
 924                         * skip large extents
 925                         */
 926                        if (skip) {
 927                                skip = 0;
 928                                goto next;
 929                        }
 930
 931                        mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
 932                             "alloc_size: %u, len_defraged: %u\n",
 933                             cpos, phys_cpos, alloc_size, len_defraged);
 934
 935                        ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
 936                                                  &alloc_size, flags);
 937                } else {
 938                        ret = ocfs2_move_extent(context, cpos, phys_cpos,
 939                                                &new_phys_cpos, alloc_size,
 940                                                flags);
 941
 942                        new_phys_cpos += alloc_size;
 943                }
 944
 945                if (ret < 0) {
 946                        mlog_errno(ret);
 947                        goto out;
 948                }
 949
 950                context->clusters_moved += alloc_size;
 951next:
 952                cpos += alloc_size;
 953                len_to_move -= alloc_size;
 954        }
 955
 956done:
 957        range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
 958
 959out:
 960        range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
 961                                                      context->clusters_moved);
 962        range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
 963                                                       context->new_phys_cpos);
 964
 965        ocfs2_schedule_truncate_log_flush(osb, 1);
 966        ocfs2_run_deallocs(osb, &context->dealloc);
 967
 968        return ret;
 969}
 970
 971static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
 972{
 973        int status;
 974        handle_t *handle;
 975        struct inode *inode = context->inode;
 976        struct ocfs2_dinode *di;
 977        struct buffer_head *di_bh = NULL;
 978        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 979
 980        if (!inode)
 981                return -ENOENT;
 982
 983        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 984                return -EROFS;
 985
 986        mutex_lock(&inode->i_mutex);
 987
 988        /*
 989         * This prevents concurrent writes from other nodes
 990         */
 991        status = ocfs2_rw_lock(inode, 1);
 992        if (status) {
 993                mlog_errno(status);
 994                goto out;
 995        }
 996
 997        status = ocfs2_inode_lock(inode, &di_bh, 1);
 998        if (status) {
 999                mlog_errno(status);
1000                goto out_rw_unlock;
1001        }
1002
1003        /*
1004         * rememer ip_xattr_sem also needs to be held if necessary
1005         */
1006        down_write(&OCFS2_I(inode)->ip_alloc_sem);
1007
1008        status = __ocfs2_move_extents_range(di_bh, context);
1009
1010        up_write(&OCFS2_I(inode)->ip_alloc_sem);
1011        if (status) {
1012                mlog_errno(status);
1013                goto out_inode_unlock;
1014        }
1015
1016        /*
1017         * We update ctime for these changes
1018         */
1019        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1020        if (IS_ERR(handle)) {
1021                status = PTR_ERR(handle);
1022                mlog_errno(status);
1023                goto out_inode_unlock;
1024        }
1025
1026        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1027                                         OCFS2_JOURNAL_ACCESS_WRITE);
1028        if (status) {
1029                mlog_errno(status);
1030                goto out_commit;
1031        }
1032
1033        di = (struct ocfs2_dinode *)di_bh->b_data;
1034        inode->i_ctime = CURRENT_TIME;
1035        di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1036        di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1037
1038        ocfs2_journal_dirty(handle, di_bh);
1039
1040out_commit:
1041        ocfs2_commit_trans(osb, handle);
1042
1043out_inode_unlock:
1044        brelse(di_bh);
1045        ocfs2_inode_unlock(inode, 1);
1046out_rw_unlock:
1047        ocfs2_rw_unlock(inode, 1);
1048out:
1049        mutex_unlock(&inode->i_mutex);
1050
1051        return status;
1052}
1053
1054int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
1055{
1056        int status;
1057
1058        struct inode *inode = file_inode(filp);
1059        struct ocfs2_move_extents range;
1060        struct ocfs2_move_extents_context *context;
1061
1062        if (!argp)
1063                return -EINVAL;
1064
1065        status = mnt_want_write_file(filp);
1066        if (status)
1067                return status;
1068
1069        if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
1070                goto out_drop;
1071
1072        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1073                status = -EPERM;
1074                goto out_drop;
1075        }
1076
1077        context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1078        if (!context) {
1079                status = -ENOMEM;
1080                mlog_errno(status);
1081                goto out_drop;
1082        }
1083
1084        context->inode = inode;
1085        context->file = filp;
1086
1087        if (copy_from_user(&range, argp, sizeof(range))) {
1088                status = -EFAULT;
1089                goto out_free;
1090        }
1091
1092        if (range.me_start > i_size_read(inode))
1093                goto out_free;
1094
1095        if (range.me_start + range.me_len > i_size_read(inode))
1096                        range.me_len = i_size_read(inode) - range.me_start;
1097
1098        context->range = &range;
1099
1100        if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1101                context->auto_defrag = 1;
1102                /*
1103                 * ok, the default theshold for the defragmentation
1104                 * is 1M, since our maximum clustersize was 1M also.
1105                 * any thought?
1106                 */
1107                if (!range.me_threshold)
1108                        range.me_threshold = 1024 * 1024;
1109
1110                if (range.me_threshold > i_size_read(inode))
1111                        range.me_threshold = i_size_read(inode);
1112
1113                if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1114                        context->partial = 1;
1115        } else {
1116                /*
1117                 * first best-effort attempt to validate and adjust the goal
1118                 * (physical address in block), while it can't guarantee later
1119                 * operation can succeed all the time since global_bitmap may
1120                 * change a bit over time.
1121                 */
1122
1123                status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1124                if (status)
1125                        goto out_copy;
1126        }
1127
1128        status = ocfs2_move_extents(context);
1129        if (status)
1130                mlog_errno(status);
1131out_copy:
1132        /*
1133         * movement/defragmentation may end up being partially completed,
1134         * that's the reason why we need to return userspace the finished
1135         * length and new_offset even if failure happens somewhere.
1136         */
1137        if (copy_to_user(argp, &range, sizeof(range)))
1138                status = -EFAULT;
1139
1140out_free:
1141        kfree(context);
1142out_drop:
1143        mnt_drop_write_file(filp);
1144
1145        return status;
1146}
1147