linux/fs/ocfs2/file.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * file.c
   4 *
   5 * File open, close, extend, truncate
   6 *
   7 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   8 */
   9
  10#include <linux/capability.h>
  11#include <linux/fs.h>
  12#include <linux/types.h>
  13#include <linux/slab.h>
  14#include <linux/highmem.h>
  15#include <linux/pagemap.h>
  16#include <linux/uio.h>
  17#include <linux/sched.h>
  18#include <linux/splice.h>
  19#include <linux/mount.h>
  20#include <linux/writeback.h>
  21#include <linux/falloc.h>
  22#include <linux/quotaops.h>
  23#include <linux/blkdev.h>
  24#include <linux/backing-dev.h>
  25
  26#include <cluster/masklog.h>
  27
  28#include "ocfs2.h"
  29
  30#include "alloc.h"
  31#include "aops.h"
  32#include "dir.h"
  33#include "dlmglue.h"
  34#include "extent_map.h"
  35#include "file.h"
  36#include "sysfile.h"
  37#include "inode.h"
  38#include "ioctl.h"
  39#include "journal.h"
  40#include "locks.h"
  41#include "mmap.h"
  42#include "suballoc.h"
  43#include "super.h"
  44#include "xattr.h"
  45#include "acl.h"
  46#include "quota.h"
  47#include "refcounttree.h"
  48#include "ocfs2_trace.h"
  49
  50#include "buffer_head_io.h"
  51
  52static int ocfs2_init_file_private(struct inode *inode, struct file *file)
  53{
  54        struct ocfs2_file_private *fp;
  55
  56        fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
  57        if (!fp)
  58                return -ENOMEM;
  59
  60        fp->fp_file = file;
  61        mutex_init(&fp->fp_mutex);
  62        ocfs2_file_lock_res_init(&fp->fp_flock, fp);
  63        file->private_data = fp;
  64
  65        return 0;
  66}
  67
  68static void ocfs2_free_file_private(struct inode *inode, struct file *file)
  69{
  70        struct ocfs2_file_private *fp = file->private_data;
  71        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  72
  73        if (fp) {
  74                ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
  75                ocfs2_lock_res_free(&fp->fp_flock);
  76                kfree(fp);
  77                file->private_data = NULL;
  78        }
  79}
  80
  81static int ocfs2_file_open(struct inode *inode, struct file *file)
  82{
  83        int status;
  84        int mode = file->f_flags;
  85        struct ocfs2_inode_info *oi = OCFS2_I(inode);
  86
  87        trace_ocfs2_file_open(inode, file, file->f_path.dentry,
  88                              (unsigned long long)oi->ip_blkno,
  89                              file->f_path.dentry->d_name.len,
  90                              file->f_path.dentry->d_name.name, mode);
  91
  92        if (file->f_mode & FMODE_WRITE) {
  93                status = dquot_initialize(inode);
  94                if (status)
  95                        goto leave;
  96        }
  97
  98        spin_lock(&oi->ip_lock);
  99
 100        /* Check that the inode hasn't been wiped from disk by another
 101         * node. If it hasn't then we're safe as long as we hold the
 102         * spin lock until our increment of open count. */
 103        if (oi->ip_flags & OCFS2_INODE_DELETED) {
 104                spin_unlock(&oi->ip_lock);
 105
 106                status = -ENOENT;
 107                goto leave;
 108        }
 109
 110        if (mode & O_DIRECT)
 111                oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
 112
 113        oi->ip_open_count++;
 114        spin_unlock(&oi->ip_lock);
 115
 116        status = ocfs2_init_file_private(inode, file);
 117        if (status) {
 118                /*
 119                 * We want to set open count back if we're failing the
 120                 * open.
 121                 */
 122                spin_lock(&oi->ip_lock);
 123                oi->ip_open_count--;
 124                spin_unlock(&oi->ip_lock);
 125        }
 126
 127        file->f_mode |= FMODE_NOWAIT;
 128
 129leave:
 130        return status;
 131}
 132
 133static int ocfs2_file_release(struct inode *inode, struct file *file)
 134{
 135        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 136
 137        spin_lock(&oi->ip_lock);
 138        if (!--oi->ip_open_count)
 139                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
 140
 141        trace_ocfs2_file_release(inode, file, file->f_path.dentry,
 142                                 oi->ip_blkno,
 143                                 file->f_path.dentry->d_name.len,
 144                                 file->f_path.dentry->d_name.name,
 145                                 oi->ip_open_count);
 146        spin_unlock(&oi->ip_lock);
 147
 148        ocfs2_free_file_private(inode, file);
 149
 150        return 0;
 151}
 152
 153static int ocfs2_dir_open(struct inode *inode, struct file *file)
 154{
 155        return ocfs2_init_file_private(inode, file);
 156}
 157
 158static int ocfs2_dir_release(struct inode *inode, struct file *file)
 159{
 160        ocfs2_free_file_private(inode, file);
 161        return 0;
 162}
 163
 164static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 165                           int datasync)
 166{
 167        int err = 0;
 168        struct inode *inode = file->f_mapping->host;
 169        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 170        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 171        journal_t *journal = osb->journal->j_journal;
 172        int ret;
 173        tid_t commit_tid;
 174        bool needs_barrier = false;
 175
 176        trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
 177                              oi->ip_blkno,
 178                              file->f_path.dentry->d_name.len,
 179                              file->f_path.dentry->d_name.name,
 180                              (unsigned long long)datasync);
 181
 182        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 183                return -EROFS;
 184
 185        err = file_write_and_wait_range(file, start, end);
 186        if (err)
 187                return err;
 188
 189        commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
 190        if (journal->j_flags & JBD2_BARRIER &&
 191            !jbd2_trans_will_send_data_barrier(journal, commit_tid))
 192                needs_barrier = true;
 193        err = jbd2_complete_transaction(journal, commit_tid);
 194        if (needs_barrier) {
 195                ret = blkdev_issue_flush(inode->i_sb->s_bdev);
 196                if (!err)
 197                        err = ret;
 198        }
 199
 200        if (err)
 201                mlog_errno(err);
 202
 203        return (err < 0) ? -EIO : 0;
 204}
 205
 206int ocfs2_should_update_atime(struct inode *inode,
 207                              struct vfsmount *vfsmnt)
 208{
 209        struct timespec64 now;
 210        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 211
 212        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 213                return 0;
 214
 215        if ((inode->i_flags & S_NOATIME) ||
 216            ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
 217                return 0;
 218
 219        /*
 220         * We can be called with no vfsmnt structure - NFSD will
 221         * sometimes do this.
 222         *
 223         * Note that our action here is different than touch_atime() -
 224         * if we can't tell whether this is a noatime mount, then we
 225         * don't know whether to trust the value of s_atime_quantum.
 226         */
 227        if (vfsmnt == NULL)
 228                return 0;
 229
 230        if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
 231            ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 232                return 0;
 233
 234        if (vfsmnt->mnt_flags & MNT_RELATIME) {
 235                if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
 236                    (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
 237                        return 1;
 238
 239                return 0;
 240        }
 241
 242        now = current_time(inode);
 243        if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
 244                return 0;
 245        else
 246                return 1;
 247}
 248
 249int ocfs2_update_inode_atime(struct inode *inode,
 250                             struct buffer_head *bh)
 251{
 252        int ret;
 253        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 254        handle_t *handle;
 255        struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 256
 257        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 258        if (IS_ERR(handle)) {
 259                ret = PTR_ERR(handle);
 260                mlog_errno(ret);
 261                goto out;
 262        }
 263
 264        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 265                                      OCFS2_JOURNAL_ACCESS_WRITE);
 266        if (ret) {
 267                mlog_errno(ret);
 268                goto out_commit;
 269        }
 270
 271        /*
 272         * Don't use ocfs2_mark_inode_dirty() here as we don't always
 273         * have i_mutex to guard against concurrent changes to other
 274         * inode fields.
 275         */
 276        inode->i_atime = current_time(inode);
 277        di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 278        di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
 279        ocfs2_update_inode_fsync_trans(handle, inode, 0);
 280        ocfs2_journal_dirty(handle, bh);
 281
 282out_commit:
 283        ocfs2_commit_trans(osb, handle);
 284out:
 285        return ret;
 286}
 287
 288int ocfs2_set_inode_size(handle_t *handle,
 289                                struct inode *inode,
 290                                struct buffer_head *fe_bh,
 291                                u64 new_i_size)
 292{
 293        int status;
 294
 295        i_size_write(inode, new_i_size);
 296        inode->i_blocks = ocfs2_inode_sector_count(inode);
 297        inode->i_ctime = inode->i_mtime = current_time(inode);
 298
 299        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 300        if (status < 0) {
 301                mlog_errno(status);
 302                goto bail;
 303        }
 304
 305bail:
 306        return status;
 307}
 308
 309int ocfs2_simple_size_update(struct inode *inode,
 310                             struct buffer_head *di_bh,
 311                             u64 new_i_size)
 312{
 313        int ret;
 314        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 315        handle_t *handle = NULL;
 316
 317        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 318        if (IS_ERR(handle)) {
 319                ret = PTR_ERR(handle);
 320                mlog_errno(ret);
 321                goto out;
 322        }
 323
 324        ret = ocfs2_set_inode_size(handle, inode, di_bh,
 325                                   new_i_size);
 326        if (ret < 0)
 327                mlog_errno(ret);
 328
 329        ocfs2_update_inode_fsync_trans(handle, inode, 0);
 330        ocfs2_commit_trans(osb, handle);
 331out:
 332        return ret;
 333}
 334
 335static int ocfs2_cow_file_pos(struct inode *inode,
 336                              struct buffer_head *fe_bh,
 337                              u64 offset)
 338{
 339        int status;
 340        u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 341        unsigned int num_clusters = 0;
 342        unsigned int ext_flags = 0;
 343
 344        /*
 345         * If the new offset is aligned to the range of the cluster, there is
 346         * no space for ocfs2_zero_range_for_truncate to fill, so no need to
 347         * CoW either.
 348         */
 349        if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
 350                return 0;
 351
 352        status = ocfs2_get_clusters(inode, cpos, &phys,
 353                                    &num_clusters, &ext_flags);
 354        if (status) {
 355                mlog_errno(status);
 356                goto out;
 357        }
 358
 359        if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
 360                goto out;
 361
 362        return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
 363
 364out:
 365        return status;
 366}
 367
 368static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 369                                     struct inode *inode,
 370                                     struct buffer_head *fe_bh,
 371                                     u64 new_i_size)
 372{
 373        int status;
 374        handle_t *handle;
 375        struct ocfs2_dinode *di;
 376        u64 cluster_bytes;
 377
 378        /*
 379         * We need to CoW the cluster contains the offset if it is reflinked
 380         * since we will call ocfs2_zero_range_for_truncate later which will
 381         * write "0" from offset to the end of the cluster.
 382         */
 383        status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
 384        if (status) {
 385                mlog_errno(status);
 386                return status;
 387        }
 388
 389        /* TODO: This needs to actually orphan the inode in this
 390         * transaction. */
 391
 392        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 393        if (IS_ERR(handle)) {
 394                status = PTR_ERR(handle);
 395                mlog_errno(status);
 396                goto out;
 397        }
 398
 399        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
 400                                         OCFS2_JOURNAL_ACCESS_WRITE);
 401        if (status < 0) {
 402                mlog_errno(status);
 403                goto out_commit;
 404        }
 405
 406        /*
 407         * Do this before setting i_size.
 408         */
 409        cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
 410        status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
 411                                               cluster_bytes);
 412        if (status) {
 413                mlog_errno(status);
 414                goto out_commit;
 415        }
 416
 417        i_size_write(inode, new_i_size);
 418        inode->i_ctime = inode->i_mtime = current_time(inode);
 419
 420        di = (struct ocfs2_dinode *) fe_bh->b_data;
 421        di->i_size = cpu_to_le64(new_i_size);
 422        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 423        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 424        ocfs2_update_inode_fsync_trans(handle, inode, 0);
 425
 426        ocfs2_journal_dirty(handle, fe_bh);
 427
 428out_commit:
 429        ocfs2_commit_trans(osb, handle);
 430out:
 431        return status;
 432}
 433
 434int ocfs2_truncate_file(struct inode *inode,
 435                               struct buffer_head *di_bh,
 436                               u64 new_i_size)
 437{
 438        int status = 0;
 439        struct ocfs2_dinode *fe = NULL;
 440        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 441
 442        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
 443         * already validated it */
 444        fe = (struct ocfs2_dinode *) di_bh->b_data;
 445
 446        trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
 447                                  (unsigned long long)le64_to_cpu(fe->i_size),
 448                                  (unsigned long long)new_i_size);
 449
 450        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 451                        "Inode %llu, inode i_size = %lld != di "
 452                        "i_size = %llu, i_flags = 0x%x\n",
 453                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 454                        i_size_read(inode),
 455                        (unsigned long long)le64_to_cpu(fe->i_size),
 456                        le32_to_cpu(fe->i_flags));
 457
 458        if (new_i_size > le64_to_cpu(fe->i_size)) {
 459                trace_ocfs2_truncate_file_error(
 460                        (unsigned long long)le64_to_cpu(fe->i_size),
 461                        (unsigned long long)new_i_size);
 462                status = -EINVAL;
 463                mlog_errno(status);
 464                goto bail;
 465        }
 466
 467        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 468
 469        ocfs2_resv_discard(&osb->osb_la_resmap,
 470                           &OCFS2_I(inode)->ip_la_data_resv);
 471
 472        /*
 473         * The inode lock forced other nodes to sync and drop their
 474         * pages, which (correctly) happens even if we have a truncate
 475         * without allocation change - ocfs2 cluster sizes can be much
 476         * greater than page size, so we have to truncate them
 477         * anyway.
 478         */
 479        unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
 480        truncate_inode_pages(inode->i_mapping, new_i_size);
 481
 482        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 483                status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
 484                                               i_size_read(inode), 1);
 485                if (status)
 486                        mlog_errno(status);
 487
 488                goto bail_unlock_sem;
 489        }
 490
 491        /* alright, we're going to need to do a full blown alloc size
 492         * change. Orphan the inode so that recovery can complete the
 493         * truncate if necessary. This does the task of marking
 494         * i_size. */
 495        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 496        if (status < 0) {
 497                mlog_errno(status);
 498                goto bail_unlock_sem;
 499        }
 500
 501        status = ocfs2_commit_truncate(osb, inode, di_bh);
 502        if (status < 0) {
 503                mlog_errno(status);
 504                goto bail_unlock_sem;
 505        }
 506
 507        /* TODO: orphan dir cleanup here. */
 508bail_unlock_sem:
 509        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 510
 511bail:
 512        if (!status && OCFS2_I(inode)->ip_clusters == 0)
 513                status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 514
 515        return status;
 516}
 517
 518/*
 519 * extend file allocation only here.
 520 * we'll update all the disk stuff, and oip->alloc_size
 521 *
 522 * expect stuff to be locked, a transaction started and enough data /
 523 * metadata reservations in the contexts.
 524 *
 525 * Will return -EAGAIN, and a reason if a restart is needed.
 526 * If passed in, *reason will always be set, even in error.
 527 */
 528int ocfs2_add_inode_data(struct ocfs2_super *osb,
 529                         struct inode *inode,
 530                         u32 *logical_offset,
 531                         u32 clusters_to_add,
 532                         int mark_unwritten,
 533                         struct buffer_head *fe_bh,
 534                         handle_t *handle,
 535                         struct ocfs2_alloc_context *data_ac,
 536                         struct ocfs2_alloc_context *meta_ac,
 537                         enum ocfs2_alloc_restarted *reason_ret)
 538{
 539        int ret;
 540        struct ocfs2_extent_tree et;
 541
 542        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
 543        ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
 544                                          clusters_to_add, mark_unwritten,
 545                                          data_ac, meta_ac, reason_ret);
 546
 547        return ret;
 548}
 549
 550static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 551                                   u32 clusters_to_add, int mark_unwritten)
 552{
 553        int status = 0;
 554        int restart_func = 0;
 555        int credits;
 556        u32 prev_clusters;
 557        struct buffer_head *bh = NULL;
 558        struct ocfs2_dinode *fe = NULL;
 559        handle_t *handle = NULL;
 560        struct ocfs2_alloc_context *data_ac = NULL;
 561        struct ocfs2_alloc_context *meta_ac = NULL;
 562        enum ocfs2_alloc_restarted why = RESTART_NONE;
 563        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 564        struct ocfs2_extent_tree et;
 565        int did_quota = 0;
 566
 567        /*
 568         * Unwritten extent only exists for file systems which
 569         * support holes.
 570         */
 571        BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 572
 573        status = ocfs2_read_inode_block(inode, &bh);
 574        if (status < 0) {
 575                mlog_errno(status);
 576                goto leave;
 577        }
 578        fe = (struct ocfs2_dinode *) bh->b_data;
 579
 580restart_all:
 581        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 582
 583        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
 584        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 585                                       &data_ac, &meta_ac);
 586        if (status) {
 587                mlog_errno(status);
 588                goto leave;
 589        }
 590
 591        credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
 592        handle = ocfs2_start_trans(osb, credits);
 593        if (IS_ERR(handle)) {
 594                status = PTR_ERR(handle);
 595                handle = NULL;
 596                mlog_errno(status);
 597                goto leave;
 598        }
 599
 600restarted_transaction:
 601        trace_ocfs2_extend_allocation(
 602                (unsigned long long)OCFS2_I(inode)->ip_blkno,
 603                (unsigned long long)i_size_read(inode),
 604                le32_to_cpu(fe->i_clusters), clusters_to_add,
 605                why, restart_func);
 606
 607        status = dquot_alloc_space_nodirty(inode,
 608                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 609        if (status)
 610                goto leave;
 611        did_quota = 1;
 612
 613        /* reserve a write to the file entry early on - that we if we
 614         * run out of credits in the allocation path, we can still
 615         * update i_size. */
 616        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 617                                         OCFS2_JOURNAL_ACCESS_WRITE);
 618        if (status < 0) {
 619                mlog_errno(status);
 620                goto leave;
 621        }
 622
 623        prev_clusters = OCFS2_I(inode)->ip_clusters;
 624
 625        status = ocfs2_add_inode_data(osb,
 626                                      inode,
 627                                      &logical_start,
 628                                      clusters_to_add,
 629                                      mark_unwritten,
 630                                      bh,
 631                                      handle,
 632                                      data_ac,
 633                                      meta_ac,
 634                                      &why);
 635        if ((status < 0) && (status != -EAGAIN)) {
 636                if (status != -ENOSPC)
 637                        mlog_errno(status);
 638                goto leave;
 639        }
 640        ocfs2_update_inode_fsync_trans(handle, inode, 1);
 641        ocfs2_journal_dirty(handle, bh);
 642
 643        spin_lock(&OCFS2_I(inode)->ip_lock);
 644        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 645        spin_unlock(&OCFS2_I(inode)->ip_lock);
 646        /* Release unused quota reservation */
 647        dquot_free_space(inode,
 648                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 649        did_quota = 0;
 650
 651        if (why != RESTART_NONE && clusters_to_add) {
 652                if (why == RESTART_META) {
 653                        restart_func = 1;
 654                        status = 0;
 655                } else {
 656                        BUG_ON(why != RESTART_TRANS);
 657
 658                        status = ocfs2_allocate_extend_trans(handle, 1);
 659                        if (status < 0) {
 660                                /* handle still has to be committed at
 661                                 * this point. */
 662                                status = -ENOMEM;
 663                                mlog_errno(status);
 664                                goto leave;
 665                        }
 666                        goto restarted_transaction;
 667                }
 668        }
 669
 670        trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
 671             le32_to_cpu(fe->i_clusters),
 672             (unsigned long long)le64_to_cpu(fe->i_size),
 673             OCFS2_I(inode)->ip_clusters,
 674             (unsigned long long)i_size_read(inode));
 675
 676leave:
 677        if (status < 0 && did_quota)
 678                dquot_free_space(inode,
 679                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 680        if (handle) {
 681                ocfs2_commit_trans(osb, handle);
 682                handle = NULL;
 683        }
 684        if (data_ac) {
 685                ocfs2_free_alloc_context(data_ac);
 686                data_ac = NULL;
 687        }
 688        if (meta_ac) {
 689                ocfs2_free_alloc_context(meta_ac);
 690                meta_ac = NULL;
 691        }
 692        if ((!status) && restart_func) {
 693                restart_func = 0;
 694                goto restart_all;
 695        }
 696        brelse(bh);
 697        bh = NULL;
 698
 699        return status;
 700}
 701
 702/*
 703 * While a write will already be ordering the data, a truncate will not.
 704 * Thus, we need to explicitly order the zeroed pages.
 705 */
 706static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
 707                                                      struct buffer_head *di_bh,
 708                                                      loff_t start_byte,
 709                                                      loff_t length)
 710{
 711        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 712        handle_t *handle = NULL;
 713        int ret = 0;
 714
 715        if (!ocfs2_should_order_data(inode))
 716                goto out;
 717
 718        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 719        if (IS_ERR(handle)) {
 720                ret = -ENOMEM;
 721                mlog_errno(ret);
 722                goto out;
 723        }
 724
 725        ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
 726        if (ret < 0) {
 727                mlog_errno(ret);
 728                goto out;
 729        }
 730
 731        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 732                                      OCFS2_JOURNAL_ACCESS_WRITE);
 733        if (ret)
 734                mlog_errno(ret);
 735        ocfs2_update_inode_fsync_trans(handle, inode, 1);
 736
 737out:
 738        if (ret) {
 739                if (!IS_ERR(handle))
 740                        ocfs2_commit_trans(osb, handle);
 741                handle = ERR_PTR(ret);
 742        }
 743        return handle;
 744}
 745
 746/* Some parts of this taken from generic_cont_expand, which turned out
 747 * to be too fragile to do exactly what we need without us having to
 748 * worry about recursive locking in ->write_begin() and ->write_end(). */
 749static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 750                                 u64 abs_to, struct buffer_head *di_bh)
 751{
 752        struct address_space *mapping = inode->i_mapping;
 753        struct page *page;
 754        unsigned long index = abs_from >> PAGE_SHIFT;
 755        handle_t *handle;
 756        int ret = 0;
 757        unsigned zero_from, zero_to, block_start, block_end;
 758        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 759
 760        BUG_ON(abs_from >= abs_to);
 761        BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
 762        BUG_ON(abs_from & (inode->i_blkbits - 1));
 763
 764        handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
 765                                                      abs_from,
 766                                                      abs_to - abs_from);
 767        if (IS_ERR(handle)) {
 768                ret = PTR_ERR(handle);
 769                goto out;
 770        }
 771
 772        page = find_or_create_page(mapping, index, GFP_NOFS);
 773        if (!page) {
 774                ret = -ENOMEM;
 775                mlog_errno(ret);
 776                goto out_commit_trans;
 777        }
 778
 779        /* Get the offsets within the page that we want to zero */
 780        zero_from = abs_from & (PAGE_SIZE - 1);
 781        zero_to = abs_to & (PAGE_SIZE - 1);
 782        if (!zero_to)
 783                zero_to = PAGE_SIZE;
 784
 785        trace_ocfs2_write_zero_page(
 786                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 787                        (unsigned long long)abs_from,
 788                        (unsigned long long)abs_to,
 789                        index, zero_from, zero_to);
 790
 791        /* We know that zero_from is block aligned */
 792        for (block_start = zero_from; block_start < zero_to;
 793             block_start = block_end) {
 794                block_end = block_start + i_blocksize(inode);
 795
 796                /*
 797                 * block_start is block-aligned.  Bump it by one to force
 798                 * __block_write_begin and block_commit_write to zero the
 799                 * whole block.
 800                 */
 801                ret = __block_write_begin(page, block_start + 1, 0,
 802                                          ocfs2_get_block);
 803                if (ret < 0) {
 804                        mlog_errno(ret);
 805                        goto out_unlock;
 806                }
 807
 808
 809                /* must not update i_size! */
 810                ret = block_commit_write(page, block_start + 1,
 811                                         block_start + 1);
 812                if (ret < 0)
 813                        mlog_errno(ret);
 814                else
 815                        ret = 0;
 816        }
 817
 818        /*
 819         * fs-writeback will release the dirty pages without page lock
 820         * whose offset are over inode size, the release happens at
 821         * block_write_full_page().
 822         */
 823        i_size_write(inode, abs_to);
 824        inode->i_blocks = ocfs2_inode_sector_count(inode);
 825        di->i_size = cpu_to_le64((u64)i_size_read(inode));
 826        inode->i_mtime = inode->i_ctime = current_time(inode);
 827        di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 828        di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 829        di->i_mtime_nsec = di->i_ctime_nsec;
 830        if (handle) {
 831                ocfs2_journal_dirty(handle, di_bh);
 832                ocfs2_update_inode_fsync_trans(handle, inode, 1);
 833        }
 834
 835out_unlock:
 836        unlock_page(page);
 837        put_page(page);
 838out_commit_trans:
 839        if (handle)
 840                ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 841out:
 842        return ret;
 843}
 844
 845/*
 846 * Find the next range to zero.  We do this in terms of bytes because
 847 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 848 * pagecache.  We may return multiple extents.
 849 *
 850 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
 851 * needs to be zeroed.  range_start and range_end return the next zeroing
 852 * range.  A subsequent call should pass the previous range_end as its
 853 * zero_start.  If range_end is 0, there's nothing to do.
 854 *
 855 * Unwritten extents are skipped over.  Refcounted extents are CoWd.
 856 */
 857static int ocfs2_zero_extend_get_range(struct inode *inode,
 858                                       struct buffer_head *di_bh,
 859                                       u64 zero_start, u64 zero_end,
 860                                       u64 *range_start, u64 *range_end)
 861{
 862        int rc = 0, needs_cow = 0;
 863        u32 p_cpos, zero_clusters = 0;
 864        u32 zero_cpos =
 865                zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 866        u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
 867        unsigned int num_clusters = 0;
 868        unsigned int ext_flags = 0;
 869
 870        while (zero_cpos < last_cpos) {
 871                rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
 872                                        &num_clusters, &ext_flags);
 873                if (rc) {
 874                        mlog_errno(rc);
 875                        goto out;
 876                }
 877
 878                if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
 879                        zero_clusters = num_clusters;
 880                        if (ext_flags & OCFS2_EXT_REFCOUNTED)
 881                                needs_cow = 1;
 882                        break;
 883                }
 884
 885                zero_cpos += num_clusters;
 886        }
 887        if (!zero_clusters) {
 888                *range_end = 0;
 889                goto out;
 890        }
 891
 892        while ((zero_cpos + zero_clusters) < last_cpos) {
 893                rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
 894                                        &p_cpos, &num_clusters,
 895                                        &ext_flags);
 896                if (rc) {
 897                        mlog_errno(rc);
 898                        goto out;
 899                }
 900
 901                if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
 902                        break;
 903                if (ext_flags & OCFS2_EXT_REFCOUNTED)
 904                        needs_cow = 1;
 905                zero_clusters += num_clusters;
 906        }
 907        if ((zero_cpos + zero_clusters) > last_cpos)
 908                zero_clusters = last_cpos - zero_cpos;
 909
 910        if (needs_cow) {
 911                rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
 912                                        zero_clusters, UINT_MAX);
 913                if (rc) {
 914                        mlog_errno(rc);
 915                        goto out;
 916                }
 917        }
 918
 919        *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
 920        *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
 921                                             zero_cpos + zero_clusters);
 922
 923out:
 924        return rc;
 925}
 926
 927/*
 928 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 929 * has made sure that the entire range needs zeroing.
 930 */
 931static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
 932                                   u64 range_end, struct buffer_head *di_bh)
 933{
 934        int rc = 0;
 935        u64 next_pos;
 936        u64 zero_pos = range_start;
 937
 938        trace_ocfs2_zero_extend_range(
 939                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 940                        (unsigned long long)range_start,
 941                        (unsigned long long)range_end);
 942        BUG_ON(range_start >= range_end);
 943
 944        while (zero_pos < range_end) {
 945                next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
 946                if (next_pos > range_end)
 947                        next_pos = range_end;
 948                rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
 949                if (rc < 0) {
 950                        mlog_errno(rc);
 951                        break;
 952                }
 953                zero_pos = next_pos;
 954
 955                /*
 956                 * Very large extends have the potential to lock up
 957                 * the cpu for extended periods of time.
 958                 */
 959                cond_resched();
 960        }
 961
 962        return rc;
 963}
 964
 965int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 966                      loff_t zero_to_size)
 967{
 968        int ret = 0;
 969        u64 zero_start, range_start = 0, range_end = 0;
 970        struct super_block *sb = inode->i_sb;
 971
 972        zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
 973        trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
 974                                (unsigned long long)zero_start,
 975                                (unsigned long long)i_size_read(inode));
 976        while (zero_start < zero_to_size) {
 977                ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
 978                                                  zero_to_size,
 979                                                  &range_start,
 980                                                  &range_end);
 981                if (ret) {
 982                        mlog_errno(ret);
 983                        break;
 984                }
 985                if (!range_end)
 986                        break;
 987                /* Trim the ends */
 988                if (range_start < zero_start)
 989                        range_start = zero_start;
 990                if (range_end > zero_to_size)
 991                        range_end = zero_to_size;
 992
 993                ret = ocfs2_zero_extend_range(inode, range_start,
 994                                              range_end, di_bh);
 995                if (ret) {
 996                        mlog_errno(ret);
 997                        break;
 998                }
 999                zero_start = range_end;
1000        }
1001
1002        return ret;
1003}
1004
1005int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1006                          u64 new_i_size, u64 zero_to)
1007{
1008        int ret;
1009        u32 clusters_to_add;
1010        struct ocfs2_inode_info *oi = OCFS2_I(inode);
1011
1012        /*
1013         * Only quota files call this without a bh, and they can't be
1014         * refcounted.
1015         */
1016        BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
1017        BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1018
1019        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
1020        if (clusters_to_add < oi->ip_clusters)
1021                clusters_to_add = 0;
1022        else
1023                clusters_to_add -= oi->ip_clusters;
1024
1025        if (clusters_to_add) {
1026                ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
1027                                              clusters_to_add, 0);
1028                if (ret) {
1029                        mlog_errno(ret);
1030                        goto out;
1031                }
1032        }
1033
1034        /*
1035         * Call this even if we don't add any clusters to the tree. We
1036         * still need to zero the area between the old i_size and the
1037         * new i_size.
1038         */
1039        ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1040        if (ret < 0)
1041                mlog_errno(ret);
1042
1043out:
1044        return ret;
1045}
1046
1047static int ocfs2_extend_file(struct inode *inode,
1048                             struct buffer_head *di_bh,
1049                             u64 new_i_size)
1050{
1051        int ret = 0;
1052        struct ocfs2_inode_info *oi = OCFS2_I(inode);
1053
1054        BUG_ON(!di_bh);
1055
1056        /* setattr sometimes calls us like this. */
1057        if (new_i_size == 0)
1058                goto out;
1059
1060        if (i_size_read(inode) == new_i_size)
1061                goto out;
1062        BUG_ON(new_i_size < i_size_read(inode));
1063
1064        /*
1065         * The alloc sem blocks people in read/write from reading our
1066         * allocation until we're done changing it. We depend on
1067         * i_mutex to block other extend/truncate calls while we're
1068         * here.  We even have to hold it for sparse files because there
1069         * might be some tail zeroing.
1070         */
1071        down_write(&oi->ip_alloc_sem);
1072
1073        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1074                /*
1075                 * We can optimize small extends by keeping the inodes
1076                 * inline data.
1077                 */
1078                if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1079                        up_write(&oi->ip_alloc_sem);
1080                        goto out_update_size;
1081                }
1082
1083                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1084                if (ret) {
1085                        up_write(&oi->ip_alloc_sem);
1086                        mlog_errno(ret);
1087                        goto out;
1088                }
1089        }
1090
1091        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1092                ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1093        else
1094                ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1095                                            new_i_size);
1096
1097        up_write(&oi->ip_alloc_sem);
1098
1099        if (ret < 0) {
1100                mlog_errno(ret);
1101                goto out;
1102        }
1103
1104out_update_size:
1105        ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1106        if (ret < 0)
1107                mlog_errno(ret);
1108
1109out:
1110        return ret;
1111}
1112
1113int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
1114                  struct iattr *attr)
1115{
1116        int status = 0, size_change;
1117        int inode_locked = 0;
1118        struct inode *inode = d_inode(dentry);
1119        struct super_block *sb = inode->i_sb;
1120        struct ocfs2_super *osb = OCFS2_SB(sb);
1121        struct buffer_head *bh = NULL;
1122        handle_t *handle = NULL;
1123        struct dquot *transfer_to[MAXQUOTAS] = { };
1124        int qtype;
1125        int had_lock;
1126        struct ocfs2_lock_holder oh;
1127
1128        trace_ocfs2_setattr(inode, dentry,
1129                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
1130                            dentry->d_name.len, dentry->d_name.name,
1131                            attr->ia_valid, attr->ia_mode,
1132                            from_kuid(&init_user_ns, attr->ia_uid),
1133                            from_kgid(&init_user_ns, attr->ia_gid));
1134
1135        /* ensuring we don't even attempt to truncate a symlink */
1136        if (S_ISLNK(inode->i_mode))
1137                attr->ia_valid &= ~ATTR_SIZE;
1138
1139#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1140                           | ATTR_GID | ATTR_UID | ATTR_MODE)
1141        if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1142                return 0;
1143
1144        status = setattr_prepare(&init_user_ns, dentry, attr);
1145        if (status)
1146                return status;
1147
1148        if (is_quota_modification(inode, attr)) {
1149                status = dquot_initialize(inode);
1150                if (status)
1151                        return status;
1152        }
1153        size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1154        if (size_change) {
1155                /*
1156                 * Here we should wait dio to finish before inode lock
1157                 * to avoid a deadlock between ocfs2_setattr() and
1158                 * ocfs2_dio_end_io_write()
1159                 */
1160                inode_dio_wait(inode);
1161
1162                status = ocfs2_rw_lock(inode, 1);
1163                if (status < 0) {
1164                        mlog_errno(status);
1165                        goto bail;
1166                }
1167        }
1168
1169        had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
1170        if (had_lock < 0) {
1171                status = had_lock;
1172                goto bail_unlock_rw;
1173        } else if (had_lock) {
1174                /*
1175                 * As far as we know, ocfs2_setattr() could only be the first
1176                 * VFS entry point in the call chain of recursive cluster
1177                 * locking issue.
1178                 *
1179                 * For instance:
1180                 * chmod_common()
1181                 *  notify_change()
1182                 *   ocfs2_setattr()
1183                 *    posix_acl_chmod()
1184                 *     ocfs2_iop_get_acl()
1185                 *
1186                 * But, we're not 100% sure if it's always true, because the
1187                 * ordering of the VFS entry points in the call chain is out
1188                 * of our control. So, we'd better dump the stack here to
1189                 * catch the other cases of recursive locking.
1190                 */
1191                mlog(ML_ERROR, "Another case of recursive locking:\n");
1192                dump_stack();
1193        }
1194        inode_locked = 1;
1195
1196        if (size_change) {
1197                status = inode_newsize_ok(inode, attr->ia_size);
1198                if (status)
1199                        goto bail_unlock;
1200
1201                if (i_size_read(inode) >= attr->ia_size) {
1202                        if (ocfs2_should_order_data(inode)) {
1203                                status = ocfs2_begin_ordered_truncate(inode,
1204                                                                      attr->ia_size);
1205                                if (status)
1206                                        goto bail_unlock;
1207                        }
1208                        status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1209                } else
1210                        status = ocfs2_extend_file(inode, bh, attr->ia_size);
1211                if (status < 0) {
1212                        if (status != -ENOSPC)
1213                                mlog_errno(status);
1214                        status = -ENOSPC;
1215                        goto bail_unlock;
1216                }
1217        }
1218
1219        if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
1220            (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
1221                /*
1222                 * Gather pointers to quota structures so that allocation /
1223                 * freeing of quota structures happens here and not inside
1224                 * dquot_transfer() where we have problems with lock ordering
1225                 */
1226                if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
1227                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1228                    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1229                        transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1230                        if (IS_ERR(transfer_to[USRQUOTA])) {
1231                                status = PTR_ERR(transfer_to[USRQUOTA]);
1232                                transfer_to[USRQUOTA] = NULL;
1233                                goto bail_unlock;
1234                        }
1235                }
1236                if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
1237                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1238                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1239                        transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1240                        if (IS_ERR(transfer_to[GRPQUOTA])) {
1241                                status = PTR_ERR(transfer_to[GRPQUOTA]);
1242                                transfer_to[GRPQUOTA] = NULL;
1243                                goto bail_unlock;
1244                        }
1245                }
1246                down_write(&OCFS2_I(inode)->ip_alloc_sem);
1247                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1248                                           2 * ocfs2_quota_trans_credits(sb));
1249                if (IS_ERR(handle)) {
1250                        status = PTR_ERR(handle);
1251                        mlog_errno(status);
1252                        goto bail_unlock_alloc;
1253                }
1254                status = __dquot_transfer(inode, transfer_to);
1255                if (status < 0)
1256                        goto bail_commit;
1257        } else {
1258                down_write(&OCFS2_I(inode)->ip_alloc_sem);
1259                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1260                if (IS_ERR(handle)) {
1261                        status = PTR_ERR(handle);
1262                        mlog_errno(status);
1263                        goto bail_unlock_alloc;
1264                }
1265        }
1266
1267        setattr_copy(&init_user_ns, inode, attr);
1268        mark_inode_dirty(inode);
1269
1270        status = ocfs2_mark_inode_dirty(handle, inode, bh);
1271        if (status < 0)
1272                mlog_errno(status);
1273
1274bail_commit:
1275        ocfs2_commit_trans(osb, handle);
1276bail_unlock_alloc:
1277        up_write(&OCFS2_I(inode)->ip_alloc_sem);
1278bail_unlock:
1279        if (status && inode_locked) {
1280                ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1281                inode_locked = 0;
1282        }
1283bail_unlock_rw:
1284        if (size_change)
1285                ocfs2_rw_unlock(inode, 1);
1286bail:
1287
1288        /* Release quota pointers in case we acquired them */
1289        for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
1290                dqput(transfer_to[qtype]);
1291
1292        if (!status && attr->ia_valid & ATTR_MODE) {
1293                status = ocfs2_acl_chmod(inode, bh);
1294                if (status < 0)
1295                        mlog_errno(status);
1296        }
1297        if (inode_locked)
1298                ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1299
1300        brelse(bh);
1301        return status;
1302}
1303
1304int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path,
1305                  struct kstat *stat, u32 request_mask, unsigned int flags)
1306{
1307        struct inode *inode = d_inode(path->dentry);
1308        struct super_block *sb = path->dentry->d_sb;
1309        struct ocfs2_super *osb = sb->s_fs_info;
1310        int err;
1311
1312        err = ocfs2_inode_revalidate(path->dentry);
1313        if (err) {
1314                if (err != -ENOENT)
1315                        mlog_errno(err);
1316                goto bail;
1317        }
1318
1319        generic_fillattr(&init_user_ns, inode, stat);
1320        /*
1321         * If there is inline data in the inode, the inode will normally not
1322         * have data blocks allocated (it may have an external xattr block).
1323         * Report at least one sector for such files, so tools like tar, rsync,
1324         * others don't incorrectly think the file is completely sparse.
1325         */
1326        if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1327                stat->blocks += (stat->size + 511)>>9;
1328
1329        /* We set the blksize from the cluster size for performance */
1330        stat->blksize = osb->s_clustersize;
1331
1332bail:
1333        return err;
1334}
1335
1336int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode,
1337                     int mask)
1338{
1339        int ret, had_lock;
1340        struct ocfs2_lock_holder oh;
1341
1342        if (mask & MAY_NOT_BLOCK)
1343                return -ECHILD;
1344
1345        had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
1346        if (had_lock < 0) {
1347                ret = had_lock;
1348                goto out;
1349        } else if (had_lock) {
1350                /* See comments in ocfs2_setattr() for details.
1351                 * The call chain of this case could be:
1352                 * do_sys_open()
1353                 *  may_open()
1354                 *   inode_permission()
1355                 *    ocfs2_permission()
1356                 *     ocfs2_iop_get_acl()
1357                 */
1358                mlog(ML_ERROR, "Another case of recursive locking:\n");
1359                dump_stack();
1360        }
1361
1362        ret = generic_permission(&init_user_ns, inode, mask);
1363
1364        ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
1365out:
1366        return ret;
1367}
1368
1369static int __ocfs2_write_remove_suid(struct inode *inode,
1370                                     struct buffer_head *bh)
1371{
1372        int ret;
1373        handle_t *handle;
1374        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1375        struct ocfs2_dinode *di;
1376
1377        trace_ocfs2_write_remove_suid(
1378                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
1379                        inode->i_mode);
1380
1381        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1382        if (IS_ERR(handle)) {
1383                ret = PTR_ERR(handle);
1384                mlog_errno(ret);
1385                goto out;
1386        }
1387
1388        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1389                                      OCFS2_JOURNAL_ACCESS_WRITE);
1390        if (ret < 0) {
1391                mlog_errno(ret);
1392                goto out_trans;
1393        }
1394
1395        inode->i_mode &= ~S_ISUID;
1396        if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1397                inode->i_mode &= ~S_ISGID;
1398
1399        di = (struct ocfs2_dinode *) bh->b_data;
1400        di->i_mode = cpu_to_le16(inode->i_mode);
1401        ocfs2_update_inode_fsync_trans(handle, inode, 0);
1402
1403        ocfs2_journal_dirty(handle, bh);
1404
1405out_trans:
1406        ocfs2_commit_trans(osb, handle);
1407out:
1408        return ret;
1409}
1410
1411static int ocfs2_write_remove_suid(struct inode *inode)
1412{
1413        int ret;
1414        struct buffer_head *bh = NULL;
1415
1416        ret = ocfs2_read_inode_block(inode, &bh);
1417        if (ret < 0) {
1418                mlog_errno(ret);
1419                goto out;
1420        }
1421
1422        ret =  __ocfs2_write_remove_suid(inode, bh);
1423out:
1424        brelse(bh);
1425        return ret;
1426}
1427
1428/*
1429 * Allocate enough extents to cover the region starting at byte offset
1430 * start for len bytes. Existing extents are skipped, any extents
1431 * added are marked as "unwritten".
1432 */
1433static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1434                                            u64 start, u64 len)
1435{
1436        int ret;
1437        u32 cpos, phys_cpos, clusters, alloc_size;
1438        u64 end = start + len;
1439        struct buffer_head *di_bh = NULL;
1440
1441        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1442                ret = ocfs2_read_inode_block(inode, &di_bh);
1443                if (ret) {
1444                        mlog_errno(ret);
1445                        goto out;
1446                }
1447
1448                /*
1449                 * Nothing to do if the requested reservation range
1450                 * fits within the inode.
1451                 */
1452                if (ocfs2_size_fits_inline_data(di_bh, end))
1453                        goto out;
1454
1455                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1456                if (ret) {
1457                        mlog_errno(ret);
1458                        goto out;
1459                }
1460        }
1461
1462        /*
1463         * We consider both start and len to be inclusive.
1464         */
1465        cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1466        clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1467        clusters -= cpos;
1468
1469        while (clusters) {
1470                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1471                                         &alloc_size, NULL);
1472                if (ret) {
1473                        mlog_errno(ret);
1474                        goto out;
1475                }
1476
1477                /*
1478                 * Hole or existing extent len can be arbitrary, so
1479                 * cap it to our own allocation request.
1480                 */
1481                if (alloc_size > clusters)
1482                        alloc_size = clusters;
1483
1484                if (phys_cpos) {
1485                        /*
1486                         * We already have an allocation at this
1487                         * region so we can safely skip it.
1488                         */
1489                        goto next;
1490                }
1491
1492                ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1493                if (ret) {
1494                        if (ret != -ENOSPC)
1495                                mlog_errno(ret);
1496                        goto out;
1497                }
1498
1499next:
1500                cpos += alloc_size;
1501                clusters -= alloc_size;
1502        }
1503
1504        ret = 0;
1505out:
1506
1507        brelse(di_bh);
1508        return ret;
1509}
1510
1511/*
1512 * Truncate a byte range, avoiding pages within partial clusters. This
1513 * preserves those pages for the zeroing code to write to.
1514 */
1515static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1516                                         u64 byte_len)
1517{
1518        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1519        loff_t start, end;
1520        struct address_space *mapping = inode->i_mapping;
1521
1522        start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1523        end = byte_start + byte_len;
1524        end = end & ~(osb->s_clustersize - 1);
1525
1526        if (start < end) {
1527                unmap_mapping_range(mapping, start, end - start, 0);
1528                truncate_inode_pages_range(mapping, start, end - 1);
1529        }
1530}
1531
1532/*
1533 * zero out partial blocks of one cluster.
1534 *
1535 * start: file offset where zero starts, will be made upper block aligned.
1536 * len: it will be trimmed to the end of current cluster if "start + len"
1537 *      is bigger than it.
1538 */
1539static int ocfs2_zeroout_partial_cluster(struct inode *inode,
1540                                        u64 start, u64 len)
1541{
1542        int ret;
1543        u64 start_block, end_block, nr_blocks;
1544        u64 p_block, offset;
1545        u32 cluster, p_cluster, nr_clusters;
1546        struct super_block *sb = inode->i_sb;
1547        u64 end = ocfs2_align_bytes_to_clusters(sb, start);
1548
1549        if (start + len < end)
1550                end = start + len;
1551
1552        start_block = ocfs2_blocks_for_bytes(sb, start);
1553        end_block = ocfs2_blocks_for_bytes(sb, end);
1554        nr_blocks = end_block - start_block;
1555        if (!nr_blocks)
1556                return 0;
1557
1558        cluster = ocfs2_bytes_to_clusters(sb, start);
1559        ret = ocfs2_get_clusters(inode, cluster, &p_cluster,
1560                                &nr_clusters, NULL);
1561        if (ret)
1562                return ret;
1563        if (!p_cluster)
1564                return 0;
1565
1566        offset = start_block - ocfs2_clusters_to_blocks(sb, cluster);
1567        p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset;
1568        return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS);
1569}
1570
1571static int ocfs2_zero_partial_clusters(struct inode *inode,
1572                                       u64 start, u64 len)
1573{
1574        int ret = 0;
1575        u64 tmpend = 0;
1576        u64 end = start + len;
1577        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578        unsigned int csize = osb->s_clustersize;
1579        handle_t *handle;
1580        loff_t isize = i_size_read(inode);
1581
1582        /*
1583         * The "start" and "end" values are NOT necessarily part of
1584         * the range whose allocation is being deleted. Rather, this
1585         * is what the user passed in with the request. We must zero
1586         * partial clusters here. There's no need to worry about
1587         * physical allocation - the zeroing code knows to skip holes.
1588         */
1589        trace_ocfs2_zero_partial_clusters(
1590                (unsigned long long)OCFS2_I(inode)->ip_blkno,
1591                (unsigned long long)start, (unsigned long long)end);
1592
1593        /*
1594         * If both edges are on a cluster boundary then there's no
1595         * zeroing required as the region is part of the allocation to
1596         * be truncated.
1597         */
1598        if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1599                goto out;
1600
1601        /* No page cache for EOF blocks, issue zero out to disk. */
1602        if (end > isize) {
1603                /*
1604                 * zeroout eof blocks in last cluster starting from
1605                 * "isize" even "start" > "isize" because it is
1606                 * complicated to zeroout just at "start" as "start"
1607                 * may be not aligned with block size, buffer write
1608                 * would be required to do that, but out of eof buffer
1609                 * write is not supported.
1610                 */
1611                ret = ocfs2_zeroout_partial_cluster(inode, isize,
1612                                        end - isize);
1613                if (ret) {
1614                        mlog_errno(ret);
1615                        goto out;
1616                }
1617                if (start >= isize)
1618                        goto out;
1619                end = isize;
1620        }
1621        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1622        if (IS_ERR(handle)) {
1623                ret = PTR_ERR(handle);
1624                mlog_errno(ret);
1625                goto out;
1626        }
1627
1628        /*
1629         * If start is on a cluster boundary and end is somewhere in another
1630         * cluster, we have not COWed the cluster starting at start, unless
1631         * end is also within the same cluster. So, in this case, we skip this
1632         * first call to ocfs2_zero_range_for_truncate() truncate and move on
1633         * to the next one.
1634         */
1635        if ((start & (csize - 1)) != 0) {
1636                /*
1637                 * We want to get the byte offset of the end of the 1st
1638                 * cluster.
1639                 */
1640                tmpend = (u64)osb->s_clustersize +
1641                        (start & ~(osb->s_clustersize - 1));
1642                if (tmpend > end)
1643                        tmpend = end;
1644
1645                trace_ocfs2_zero_partial_clusters_range1(
1646                        (unsigned long long)start,
1647                        (unsigned long long)tmpend);
1648
1649                ret = ocfs2_zero_range_for_truncate(inode, handle, start,
1650                                                    tmpend);
1651                if (ret)
1652                        mlog_errno(ret);
1653        }
1654
1655        if (tmpend < end) {
1656                /*
1657                 * This may make start and end equal, but the zeroing
1658                 * code will skip any work in that case so there's no
1659                 * need to catch it up here.
1660                 */
1661                start = end & ~(osb->s_clustersize - 1);
1662
1663                trace_ocfs2_zero_partial_clusters_range2(
1664                        (unsigned long long)start, (unsigned long long)end);
1665
1666                ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1667                if (ret)
1668                        mlog_errno(ret);
1669        }
1670        ocfs2_update_inode_fsync_trans(handle, inode, 1);
1671
1672        ocfs2_commit_trans(osb, handle);
1673out:
1674        return ret;
1675}
1676
1677static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1678{
1679        int i;
1680        struct ocfs2_extent_rec *rec = NULL;
1681
1682        for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1683
1684                rec = &el->l_recs[i];
1685
1686                if (le32_to_cpu(rec->e_cpos) < pos)
1687                        break;
1688        }
1689
1690        return i;
1691}
1692
1693/*
1694 * Helper to calculate the punching pos and length in one run, we handle the
1695 * following three cases in order:
1696 *
1697 * - remove the entire record
1698 * - remove a partial record
1699 * - no record needs to be removed (hole-punching completed)
1700*/
1701static void ocfs2_calc_trunc_pos(struct inode *inode,
1702                                 struct ocfs2_extent_list *el,
1703                                 struct ocfs2_extent_rec *rec,
1704                                 u32 trunc_start, u32 *trunc_cpos,
1705                                 u32 *trunc_len, u32 *trunc_end,
1706                                 u64 *blkno, int *done)
1707{
1708        int ret = 0;
1709        u32 coff, range;
1710
1711        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1712
1713        if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1714                /*
1715                 * remove an entire extent record.
1716                 */
1717                *trunc_cpos = le32_to_cpu(rec->e_cpos);
1718                /*
1719                 * Skip holes if any.
1720                 */
1721                if (range < *trunc_end)
1722                        *trunc_end = range;
1723                *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1724                *blkno = le64_to_cpu(rec->e_blkno);
1725                *trunc_end = le32_to_cpu(rec->e_cpos);
1726        } else if (range > trunc_start) {
1727                /*
1728                 * remove a partial extent record, which means we're
1729                 * removing the last extent record.
1730                 */
1731                *trunc_cpos = trunc_start;
1732                /*
1733                 * skip hole if any.
1734                 */
1735                if (range < *trunc_end)
1736                        *trunc_end = range;
1737                *trunc_len = *trunc_end - trunc_start;
1738                coff = trunc_start - le32_to_cpu(rec->e_cpos);
1739                *blkno = le64_to_cpu(rec->e_blkno) +
1740                                ocfs2_clusters_to_blocks(inode->i_sb, coff);
1741                *trunc_end = trunc_start;
1742        } else {
1743                /*
1744                 * It may have two following possibilities:
1745                 *
1746                 * - last record has been removed
1747                 * - trunc_start was within a hole
1748                 *
1749                 * both two cases mean the completion of hole punching.
1750                 */
1751                ret = 1;
1752        }
1753
1754        *done = ret;
1755}
1756
1757int ocfs2_remove_inode_range(struct inode *inode,
1758                             struct buffer_head *di_bh, u64 byte_start,
1759                             u64 byte_len)
1760{
1761        int ret = 0, flags = 0, done = 0, i;
1762        u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1763        u32 cluster_in_el;
1764        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1765        struct ocfs2_cached_dealloc_ctxt dealloc;
1766        struct address_space *mapping = inode->i_mapping;
1767        struct ocfs2_extent_tree et;
1768        struct ocfs2_path *path = NULL;
1769        struct ocfs2_extent_list *el = NULL;
1770        struct ocfs2_extent_rec *rec = NULL;
1771        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1772        u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1773
1774        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1775        ocfs2_init_dealloc_ctxt(&dealloc);
1776
1777        trace_ocfs2_remove_inode_range(
1778                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
1779                        (unsigned long long)byte_start,
1780                        (unsigned long long)byte_len);
1781
1782        if (byte_len == 0)
1783                return 0;
1784
1785        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1786                ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1787                                            byte_start + byte_len, 0);
1788                if (ret) {
1789                        mlog_errno(ret);
1790                        goto out;
1791                }
1792                /*
1793                 * There's no need to get fancy with the page cache
1794                 * truncate of an inline-data inode. We're talking
1795                 * about less than a page here, which will be cached
1796                 * in the dinode buffer anyway.
1797                 */
1798                unmap_mapping_range(mapping, 0, 0, 0);
1799                truncate_inode_pages(mapping, 0);
1800                goto out;
1801        }
1802
1803        /*
1804         * For reflinks, we may need to CoW 2 clusters which might be
1805         * partially zero'd later, if hole's start and end offset were
1806         * within one cluster(means is not exactly aligned to clustersize).
1807         */
1808
1809        if (ocfs2_is_refcount_inode(inode)) {
1810                ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1811                if (ret) {
1812                        mlog_errno(ret);
1813                        goto out;
1814                }
1815
1816                ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1817                if (ret) {
1818                        mlog_errno(ret);
1819                        goto out;
1820                }
1821        }
1822
1823        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1824        trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1825        cluster_in_el = trunc_end;
1826
1827        ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1828        if (ret) {
1829                mlog_errno(ret);
1830                goto out;
1831        }
1832
1833        path = ocfs2_new_path_from_et(&et);
1834        if (!path) {
1835                ret = -ENOMEM;
1836                mlog_errno(ret);
1837                goto out;
1838        }
1839
1840        while (trunc_end > trunc_start) {
1841
1842                ret = ocfs2_find_path(INODE_CACHE(inode), path,
1843                                      cluster_in_el);
1844                if (ret) {
1845                        mlog_errno(ret);
1846                        goto out;
1847                }
1848
1849                el = path_leaf_el(path);
1850
1851                i = ocfs2_find_rec(el, trunc_end);
1852                /*
1853                 * Need to go to previous extent block.
1854                 */
1855                if (i < 0) {
1856                        if (path->p_tree_depth == 0)
1857                                break;
1858
1859                        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1860                                                            path,
1861                                                            &cluster_in_el);
1862                        if (ret) {
1863                                mlog_errno(ret);
1864                                goto out;
1865                        }
1866
1867                        /*
1868                         * We've reached the leftmost extent block,
1869                         * it's safe to leave.
1870                         */
1871                        if (cluster_in_el == 0)
1872                                break;
1873
1874                        /*
1875                         * The 'pos' searched for previous extent block is
1876                         * always one cluster less than actual trunc_end.
1877                         */
1878                        trunc_end = cluster_in_el + 1;
1879
1880                        ocfs2_reinit_path(path, 1);
1881
1882                        continue;
1883
1884                } else
1885                        rec = &el->l_recs[i];
1886
1887                ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1888                                     &trunc_len, &trunc_end, &blkno, &done);
1889                if (done)
1890                        break;
1891
1892                flags = rec->e_flags;
1893                phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1894
1895                ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1896                                               phys_cpos, trunc_len, flags,
1897                                               &dealloc, refcount_loc, false);
1898                if (ret < 0) {
1899                        mlog_errno(ret);
1900                        goto out;
1901                }
1902
1903                cluster_in_el = trunc_end;
1904
1905                ocfs2_reinit_path(path, 1);
1906        }
1907
1908        ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1909
1910out:
1911        ocfs2_free_path(path);
1912        ocfs2_schedule_truncate_log_flush(osb, 1);
1913        ocfs2_run_deallocs(osb, &dealloc);
1914
1915        return ret;
1916}
1917
1918/*
1919 * Parts of this function taken from xfs_change_file_space()
1920 */
1921static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1922                                     loff_t f_pos, unsigned int cmd,
1923                                     struct ocfs2_space_resv *sr,
1924                                     int change_size)
1925{
1926        int ret;
1927        s64 llen;
1928        loff_t size, orig_isize;
1929        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1930        struct buffer_head *di_bh = NULL;
1931        handle_t *handle;
1932        unsigned long long max_off = inode->i_sb->s_maxbytes;
1933
1934        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1935                return -EROFS;
1936
1937        inode_lock(inode);
1938
1939        /*
1940         * This prevents concurrent writes on other nodes
1941         */
1942        ret = ocfs2_rw_lock(inode, 1);
1943        if (ret) {
1944                mlog_errno(ret);
1945                goto out;
1946        }
1947
1948        ret = ocfs2_inode_lock(inode, &di_bh, 1);
1949        if (ret) {
1950                mlog_errno(ret);
1951                goto out_rw_unlock;
1952        }
1953
1954        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1955                ret = -EPERM;
1956                goto out_inode_unlock;
1957        }
1958
1959        switch (sr->l_whence) {
1960        case 0: /*SEEK_SET*/
1961                break;
1962        case 1: /*SEEK_CUR*/
1963                sr->l_start += f_pos;
1964                break;
1965        case 2: /*SEEK_END*/
1966                sr->l_start += i_size_read(inode);
1967                break;
1968        default:
1969                ret = -EINVAL;
1970                goto out_inode_unlock;
1971        }
1972        sr->l_whence = 0;
1973
1974        llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1975
1976        if (sr->l_start < 0
1977            || sr->l_start > max_off
1978            || (sr->l_start + llen) < 0
1979            || (sr->l_start + llen) > max_off) {
1980                ret = -EINVAL;
1981                goto out_inode_unlock;
1982        }
1983        size = sr->l_start + sr->l_len;
1984
1985        if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
1986            cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
1987                if (sr->l_len <= 0) {
1988                        ret = -EINVAL;
1989                        goto out_inode_unlock;
1990                }
1991        }
1992
1993        if (file && should_remove_suid(file->f_path.dentry)) {
1994                ret = __ocfs2_write_remove_suid(inode, di_bh);
1995                if (ret) {
1996                        mlog_errno(ret);
1997                        goto out_inode_unlock;
1998                }
1999        }
2000
2001        down_write(&OCFS2_I(inode)->ip_alloc_sem);
2002        switch (cmd) {
2003        case OCFS2_IOC_RESVSP:
2004        case OCFS2_IOC_RESVSP64:
2005                /*
2006                 * This takes unsigned offsets, but the signed ones we
2007                 * pass have been checked against overflow above.
2008                 */
2009                ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
2010                                                       sr->l_len);
2011                break;
2012        case OCFS2_IOC_UNRESVSP:
2013        case OCFS2_IOC_UNRESVSP64:
2014                ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
2015                                               sr->l_len);
2016                break;
2017        default:
2018                ret = -EINVAL;
2019        }
2020
2021        orig_isize = i_size_read(inode);
2022        /* zeroout eof blocks in the cluster. */
2023        if (!ret && change_size && orig_isize < size) {
2024                ret = ocfs2_zeroout_partial_cluster(inode, orig_isize,
2025                                        size - orig_isize);
2026                if (!ret)
2027                        i_size_write(inode, size);
2028        }
2029        up_write(&OCFS2_I(inode)->ip_alloc_sem);
2030        if (ret) {
2031                mlog_errno(ret);
2032                goto out_inode_unlock;
2033        }
2034
2035        /*
2036         * We update c/mtime for these changes
2037         */
2038        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
2039        if (IS_ERR(handle)) {
2040                ret = PTR_ERR(handle);
2041                mlog_errno(ret);
2042                goto out_inode_unlock;
2043        }
2044
2045        inode->i_ctime = inode->i_mtime = current_time(inode);
2046        ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
2047        if (ret < 0)
2048                mlog_errno(ret);
2049
2050        if (file && (file->f_flags & O_SYNC))
2051                handle->h_sync = 1;
2052
2053        ocfs2_commit_trans(osb, handle);
2054
2055out_inode_unlock:
2056        brelse(di_bh);
2057        ocfs2_inode_unlock(inode, 1);
2058out_rw_unlock:
2059        ocfs2_rw_unlock(inode, 1);
2060
2061out:
2062        inode_unlock(inode);
2063        return ret;
2064}
2065
2066int ocfs2_change_file_space(struct file *file, unsigned int cmd,
2067                            struct ocfs2_space_resv *sr)
2068{
2069        struct inode *inode = file_inode(file);
2070        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2071        int ret;
2072
2073        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
2074            !ocfs2_writes_unwritten_extents(osb))
2075                return -ENOTTY;
2076        else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
2077                 !ocfs2_sparse_alloc(osb))
2078                return -ENOTTY;
2079
2080        if (!S_ISREG(inode->i_mode))
2081                return -EINVAL;
2082
2083        if (!(file->f_mode & FMODE_WRITE))
2084                return -EBADF;
2085
2086        ret = mnt_want_write_file(file);
2087        if (ret)
2088                return ret;
2089        ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
2090        mnt_drop_write_file(file);
2091        return ret;
2092}
2093
2094static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
2095                            loff_t len)
2096{
2097        struct inode *inode = file_inode(file);
2098        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2099        struct ocfs2_space_resv sr;
2100        int change_size = 1;
2101        int cmd = OCFS2_IOC_RESVSP64;
2102
2103        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2104                return -EOPNOTSUPP;
2105        if (!ocfs2_writes_unwritten_extents(osb))
2106                return -EOPNOTSUPP;
2107
2108        if (mode & FALLOC_FL_KEEP_SIZE)
2109                change_size = 0;
2110
2111        if (mode & FALLOC_FL_PUNCH_HOLE)
2112                cmd = OCFS2_IOC_UNRESVSP64;
2113
2114        sr.l_whence = 0;
2115        sr.l_start = (s64)offset;
2116        sr.l_len = (s64)len;
2117
2118        return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2119                                         change_size);
2120}
2121
2122int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2123                                   size_t count)
2124{
2125        int ret = 0;
2126        unsigned int extent_flags;
2127        u32 cpos, clusters, extent_len, phys_cpos;
2128        struct super_block *sb = inode->i_sb;
2129
2130        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2131            !ocfs2_is_refcount_inode(inode) ||
2132            OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2133                return 0;
2134
2135        cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2136        clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2137
2138        while (clusters) {
2139                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2140                                         &extent_flags);
2141                if (ret < 0) {
2142                        mlog_errno(ret);
2143                        goto out;
2144                }
2145
2146                if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2147                        ret = 1;
2148                        break;
2149                }
2150
2151                if (extent_len > clusters)
2152                        extent_len = clusters;
2153
2154                clusters -= extent_len;
2155                cpos += extent_len;
2156        }
2157out:
2158        return ret;
2159}
2160
2161static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2162{
2163        int blockmask = inode->i_sb->s_blocksize - 1;
2164        loff_t final_size = pos + count;
2165
2166        if ((pos & blockmask) || (final_size & blockmask))
2167                return 1;
2168        return 0;
2169}
2170
2171static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
2172                                            struct buffer_head **di_bh,
2173                                            int meta_level,
2174                                            int write_sem,
2175                                            int wait)
2176{
2177        int ret = 0;
2178
2179        if (wait)
2180                ret = ocfs2_inode_lock(inode, di_bh, meta_level);
2181        else
2182                ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
2183        if (ret < 0)
2184                goto out;
2185
2186        if (wait) {
2187                if (write_sem)
2188                        down_write(&OCFS2_I(inode)->ip_alloc_sem);
2189                else
2190                        down_read(&OCFS2_I(inode)->ip_alloc_sem);
2191        } else {
2192                if (write_sem)
2193                        ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2194                else
2195                        ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2196
2197                if (!ret) {
2198                        ret = -EAGAIN;
2199                        goto out_unlock;
2200                }
2201        }
2202
2203        return ret;
2204
2205out_unlock:
2206        brelse(*di_bh);
2207        *di_bh = NULL;
2208        ocfs2_inode_unlock(inode, meta_level);
2209out:
2210        return ret;
2211}
2212
2213static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
2214                                               struct buffer_head **di_bh,
2215                                               int meta_level,
2216                                               int write_sem)
2217{
2218        if (write_sem)
2219                up_write(&OCFS2_I(inode)->ip_alloc_sem);
2220        else
2221                up_read(&OCFS2_I(inode)->ip_alloc_sem);
2222
2223        brelse(*di_bh);
2224        *di_bh = NULL;
2225
2226        if (meta_level >= 0)
2227                ocfs2_inode_unlock(inode, meta_level);
2228}
2229
2230static int ocfs2_prepare_inode_for_write(struct file *file,
2231                                         loff_t pos, size_t count, int wait)
2232{
2233        int ret = 0, meta_level = 0, overwrite_io = 0;
2234        int write_sem = 0;
2235        struct dentry *dentry = file->f_path.dentry;
2236        struct inode *inode = d_inode(dentry);
2237        struct buffer_head *di_bh = NULL;
2238        u32 cpos;
2239        u32 clusters;
2240
2241        /*
2242         * We start with a read level meta lock and only jump to an ex
2243         * if we need to make modifications here.
2244         */
2245        for(;;) {
2246                ret = ocfs2_inode_lock_for_extent_tree(inode,
2247                                                       &di_bh,
2248                                                       meta_level,
2249                                                       write_sem,
2250                                                       wait);
2251                if (ret < 0) {
2252                        if (ret != -EAGAIN)
2253                                mlog_errno(ret);
2254                        goto out;
2255                }
2256
2257                /*
2258                 * Check if IO will overwrite allocated blocks in case
2259                 * IOCB_NOWAIT flag is set.
2260                 */
2261                if (!wait && !overwrite_io) {
2262                        overwrite_io = 1;
2263
2264                        ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
2265                        if (ret < 0) {
2266                                if (ret != -EAGAIN)
2267                                        mlog_errno(ret);
2268                                goto out_unlock;
2269                        }
2270                }
2271
2272                /* Clear suid / sgid if necessary. We do this here
2273                 * instead of later in the write path because
2274                 * remove_suid() calls ->setattr without any hint that
2275                 * we may have already done our cluster locking. Since
2276                 * ocfs2_setattr() *must* take cluster locks to
2277                 * proceed, this will lead us to recursively lock the
2278                 * inode. There's also the dinode i_size state which
2279                 * can be lost via setattr during extending writes (we
2280                 * set inode->i_size at the end of a write. */
2281                if (should_remove_suid(dentry)) {
2282                        if (meta_level == 0) {
2283                                ocfs2_inode_unlock_for_extent_tree(inode,
2284                                                                   &di_bh,
2285                                                                   meta_level,
2286                                                                   write_sem);
2287                                meta_level = 1;
2288                                continue;
2289                        }
2290
2291                        ret = ocfs2_write_remove_suid(inode);
2292                        if (ret < 0) {
2293                                mlog_errno(ret);
2294                                goto out_unlock;
2295                        }
2296                }
2297
2298                ret = ocfs2_check_range_for_refcount(inode, pos, count);
2299                if (ret == 1) {
2300                        ocfs2_inode_unlock_for_extent_tree(inode,
2301                                                           &di_bh,
2302                                                           meta_level,
2303                                                           write_sem);
2304                        meta_level = 1;
2305                        write_sem = 1;
2306                        ret = ocfs2_inode_lock_for_extent_tree(inode,
2307                                                               &di_bh,
2308                                                               meta_level,
2309                                                               write_sem,
2310                                                               wait);
2311                        if (ret < 0) {
2312                                if (ret != -EAGAIN)
2313                                        mlog_errno(ret);
2314                                goto out;
2315                        }
2316
2317                        cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2318                        clusters =
2319                                ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2320                        ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2321                }
2322
2323                if (ret < 0) {
2324                        if (ret != -EAGAIN)
2325                                mlog_errno(ret);
2326                        goto out_unlock;
2327                }
2328
2329                break;
2330        }
2331
2332out_unlock:
2333        trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2334                                            pos, count, wait);
2335
2336        ocfs2_inode_unlock_for_extent_tree(inode,
2337                                           &di_bh,
2338                                           meta_level,
2339                                           write_sem);
2340
2341out:
2342        return ret;
2343}
2344
2345static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2346                                    struct iov_iter *from)
2347{
2348        int rw_level;
2349        ssize_t written = 0;
2350        ssize_t ret;
2351        size_t count = iov_iter_count(from);
2352        struct file *file = iocb->ki_filp;
2353        struct inode *inode = file_inode(file);
2354        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2355        int full_coherency = !(osb->s_mount_opt &
2356                               OCFS2_MOUNT_COHERENCY_BUFFERED);
2357        void *saved_ki_complete = NULL;
2358        int append_write = ((iocb->ki_pos + count) >=
2359                        i_size_read(inode) ? 1 : 0);
2360        int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2361        int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2362
2363        trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
2364                (unsigned long long)OCFS2_I(inode)->ip_blkno,
2365                file->f_path.dentry->d_name.len,
2366                file->f_path.dentry->d_name.name,
2367                (unsigned int)from->nr_segs);   /* GRRRRR */
2368
2369        if (!direct_io && nowait)
2370                return -EOPNOTSUPP;
2371
2372        if (count == 0)
2373                return 0;
2374
2375        if (nowait) {
2376                if (!inode_trylock(inode))
2377                        return -EAGAIN;
2378        } else
2379                inode_lock(inode);
2380
2381        /*
2382         * Concurrent O_DIRECT writes are allowed with
2383         * mount_option "coherency=buffered".
2384         * For append write, we must take rw EX.
2385         */
2386        rw_level = (!direct_io || full_coherency || append_write);
2387
2388        if (nowait)
2389                ret = ocfs2_try_rw_lock(inode, rw_level);
2390        else
2391                ret = ocfs2_rw_lock(inode, rw_level);
2392        if (ret < 0) {
2393                if (ret != -EAGAIN)
2394                        mlog_errno(ret);
2395                goto out_mutex;
2396        }
2397
2398        /*
2399         * O_DIRECT writes with "coherency=full" need to take EX cluster
2400         * inode_lock to guarantee coherency.
2401         */
2402        if (direct_io && full_coherency) {
2403                /*
2404                 * We need to take and drop the inode lock to force
2405                 * other nodes to drop their caches.  Buffered I/O
2406                 * already does this in write_begin().
2407                 */
2408                if (nowait)
2409                        ret = ocfs2_try_inode_lock(inode, NULL, 1);
2410                else
2411                        ret = ocfs2_inode_lock(inode, NULL, 1);
2412                if (ret < 0) {
2413                        if (ret != -EAGAIN)
2414                                mlog_errno(ret);
2415                        goto out;
2416                }
2417
2418                ocfs2_inode_unlock(inode, 1);
2419        }
2420
2421        ret = generic_write_checks(iocb, from);
2422        if (ret <= 0) {
2423                if (ret)
2424                        mlog_errno(ret);
2425                goto out;
2426        }
2427        count = ret;
2428
2429        ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
2430        if (ret < 0) {
2431                if (ret != -EAGAIN)
2432                        mlog_errno(ret);
2433                goto out;
2434        }
2435
2436        if (direct_io && !is_sync_kiocb(iocb) &&
2437            ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2438                /*
2439                 * Make it a sync io if it's an unaligned aio.
2440                 */
2441                saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2442        }
2443
2444        /* communicate with ocfs2_dio_end_io */
2445        ocfs2_iocb_set_rw_locked(iocb, rw_level);
2446
2447        written = __generic_file_write_iter(iocb, from);
2448        /* buffered aio wouldn't have proper lock coverage today */
2449        BUG_ON(written == -EIOCBQUEUED && !direct_io);
2450
2451        /*
2452         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2453         * function pointer which is called when o_direct io completes so that
2454         * it can unlock our rw lock.
2455         * Unfortunately there are error cases which call end_io and others
2456         * that don't.  so we don't have to unlock the rw_lock if either an
2457         * async dio is going to do it in the future or an end_io after an
2458         * error has already done it.
2459         */
2460        if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2461                rw_level = -1;
2462        }
2463
2464        if (unlikely(written <= 0))
2465                goto out;
2466
2467        if (((file->f_flags & O_DSYNC) && !direct_io) ||
2468            IS_SYNC(inode)) {
2469                ret = filemap_fdatawrite_range(file->f_mapping,
2470                                               iocb->ki_pos - written,
2471                                               iocb->ki_pos - 1);
2472                if (ret < 0)
2473                        written = ret;
2474
2475                if (!ret) {
2476                        ret = jbd2_journal_force_commit(osb->journal->j_journal);
2477                        if (ret < 0)
2478                                written = ret;
2479                }
2480
2481                if (!ret)
2482                        ret = filemap_fdatawait_range(file->f_mapping,
2483                                                      iocb->ki_pos - written,
2484                                                      iocb->ki_pos - 1);
2485        }
2486
2487out:
2488        if (saved_ki_complete)
2489                xchg(&iocb->ki_complete, saved_ki_complete);
2490
2491        if (rw_level != -1)
2492                ocfs2_rw_unlock(inode, rw_level);
2493
2494out_mutex:
2495        inode_unlock(inode);
2496
2497        if (written)
2498                ret = written;
2499        return ret;
2500}
2501
2502static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2503                                   struct iov_iter *to)
2504{
2505        int ret = 0, rw_level = -1, lock_level = 0;
2506        struct file *filp = iocb->ki_filp;
2507        struct inode *inode = file_inode(filp);
2508        int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2509        int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2510
2511        trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
2512                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
2513                        filp->f_path.dentry->d_name.len,
2514                        filp->f_path.dentry->d_name.name,
2515                        to->nr_segs);   /* GRRRRR */
2516
2517
2518        if (!inode) {
2519                ret = -EINVAL;
2520                mlog_errno(ret);
2521                goto bail;
2522        }
2523
2524        if (!direct_io && nowait)
2525                return -EOPNOTSUPP;
2526
2527        /*
2528         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
2529         * need locks to protect pending reads from racing with truncate.
2530         */
2531        if (direct_io) {
2532                if (nowait)
2533                        ret = ocfs2_try_rw_lock(inode, 0);
2534                else
2535                        ret = ocfs2_rw_lock(inode, 0);
2536
2537                if (ret < 0) {
2538                        if (ret != -EAGAIN)
2539                                mlog_errno(ret);
2540                        goto bail;
2541                }
2542                rw_level = 0;
2543                /* communicate with ocfs2_dio_end_io */
2544                ocfs2_iocb_set_rw_locked(iocb, rw_level);
2545        }
2546
2547        /*
2548         * We're fine letting folks race truncates and extending
2549         * writes with read across the cluster, just like they can
2550         * locally. Hence no rw_lock during read.
2551         *
2552         * Take and drop the meta data lock to update inode fields
2553         * like i_size. This allows the checks down below
2554         * generic_file_read_iter() a chance of actually working.
2555         */
2556        ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
2557                                     !nowait);
2558        if (ret < 0) {
2559                if (ret != -EAGAIN)
2560                        mlog_errno(ret);
2561                goto bail;
2562        }
2563        ocfs2_inode_unlock(inode, lock_level);
2564
2565        ret = generic_file_read_iter(iocb, to);
2566        trace_generic_file_read_iter_ret(ret);
2567
2568        /* buffered aio wouldn't have proper lock coverage today */
2569        BUG_ON(ret == -EIOCBQUEUED && !direct_io);
2570
2571        /* see ocfs2_file_write_iter */
2572        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2573                rw_level = -1;
2574        }
2575
2576bail:
2577        if (rw_level != -1)
2578                ocfs2_rw_unlock(inode, rw_level);
2579
2580        return ret;
2581}
2582
2583/* Refer generic_file_llseek_unlocked() */
2584static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2585{
2586        struct inode *inode = file->f_mapping->host;
2587        int ret = 0;
2588
2589        inode_lock(inode);
2590
2591        switch (whence) {
2592        case SEEK_SET:
2593                break;
2594        case SEEK_END:
2595                /* SEEK_END requires the OCFS2 inode lock for the file
2596                 * because it references the file's size.
2597                 */
2598                ret = ocfs2_inode_lock(inode, NULL, 0);
2599                if (ret < 0) {
2600                        mlog_errno(ret);
2601                        goto out;
2602                }
2603                offset += i_size_read(inode);
2604                ocfs2_inode_unlock(inode, 0);
2605                break;
2606        case SEEK_CUR:
2607                if (offset == 0) {
2608                        offset = file->f_pos;
2609                        goto out;
2610                }
2611                offset += file->f_pos;
2612                break;
2613        case SEEK_DATA:
2614        case SEEK_HOLE:
2615                ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2616                if (ret)
2617                        goto out;
2618                break;
2619        default:
2620                ret = -EINVAL;
2621                goto out;
2622        }
2623
2624        offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2625
2626out:
2627        inode_unlock(inode);
2628        if (ret)
2629                return ret;
2630        return offset;
2631}
2632
2633static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
2634                                     struct file *file_out, loff_t pos_out,
2635                                     loff_t len, unsigned int remap_flags)
2636{
2637        struct inode *inode_in = file_inode(file_in);
2638        struct inode *inode_out = file_inode(file_out);
2639        struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
2640        struct buffer_head *in_bh = NULL, *out_bh = NULL;
2641        bool same_inode = (inode_in == inode_out);
2642        loff_t remapped = 0;
2643        ssize_t ret;
2644
2645        if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
2646                return -EINVAL;
2647        if (!ocfs2_refcount_tree(osb))
2648                return -EOPNOTSUPP;
2649        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
2650                return -EROFS;
2651
2652        /* Lock both files against IO */
2653        ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
2654        if (ret)
2655                return ret;
2656
2657        /* Check file eligibility and prepare for block sharing. */
2658        ret = -EINVAL;
2659        if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
2660            (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
2661                goto out_unlock;
2662
2663        ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
2664                        &len, remap_flags);
2665        if (ret < 0 || len == 0)
2666                goto out_unlock;
2667
2668        /* Lock out changes to the allocation maps and remap. */
2669        down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2670        if (!same_inode)
2671                down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
2672                                  SINGLE_DEPTH_NESTING);
2673
2674        /* Zap any page cache for the destination file's range. */
2675        truncate_inode_pages_range(&inode_out->i_data,
2676                                   round_down(pos_out, PAGE_SIZE),
2677                                   round_up(pos_out + len, PAGE_SIZE) - 1);
2678
2679        remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
2680                        inode_out, out_bh, pos_out, len);
2681        up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2682        if (!same_inode)
2683                up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
2684        if (remapped < 0) {
2685                ret = remapped;
2686                mlog_errno(ret);
2687                goto out_unlock;
2688        }
2689
2690        /*
2691         * Empty the extent map so that we may get the right extent
2692         * record from the disk.
2693         */
2694        ocfs2_extent_map_trunc(inode_in, 0);
2695        ocfs2_extent_map_trunc(inode_out, 0);
2696
2697        ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
2698        if (ret) {
2699                mlog_errno(ret);
2700                goto out_unlock;
2701        }
2702
2703out_unlock:
2704        ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
2705        return remapped > 0 ? remapped : ret;
2706}
2707
2708const struct inode_operations ocfs2_file_iops = {
2709        .setattr        = ocfs2_setattr,
2710        .getattr        = ocfs2_getattr,
2711        .permission     = ocfs2_permission,
2712        .listxattr      = ocfs2_listxattr,
2713        .fiemap         = ocfs2_fiemap,
2714        .get_acl        = ocfs2_iop_get_acl,
2715        .set_acl        = ocfs2_iop_set_acl,
2716        .fileattr_get   = ocfs2_fileattr_get,
2717        .fileattr_set   = ocfs2_fileattr_set,
2718};
2719
2720const struct inode_operations ocfs2_special_file_iops = {
2721        .setattr        = ocfs2_setattr,
2722        .getattr        = ocfs2_getattr,
2723        .permission     = ocfs2_permission,
2724        .get_acl        = ocfs2_iop_get_acl,
2725        .set_acl        = ocfs2_iop_set_acl,
2726};
2727
2728/*
2729 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2730 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2731 */
2732const struct file_operations ocfs2_fops = {
2733        .llseek         = ocfs2_file_llseek,
2734        .mmap           = ocfs2_mmap,
2735        .fsync          = ocfs2_sync_file,
2736        .release        = ocfs2_file_release,
2737        .open           = ocfs2_file_open,
2738        .read_iter      = ocfs2_file_read_iter,
2739        .write_iter     = ocfs2_file_write_iter,
2740        .unlocked_ioctl = ocfs2_ioctl,
2741#ifdef CONFIG_COMPAT
2742        .compat_ioctl   = ocfs2_compat_ioctl,
2743#endif
2744        .lock           = ocfs2_lock,
2745        .flock          = ocfs2_flock,
2746        .splice_read    = generic_file_splice_read,
2747        .splice_write   = iter_file_splice_write,
2748        .fallocate      = ocfs2_fallocate,
2749        .remap_file_range = ocfs2_remap_file_range,
2750};
2751
2752const struct file_operations ocfs2_dops = {
2753        .llseek         = generic_file_llseek,
2754        .read           = generic_read_dir,
2755        .iterate        = ocfs2_readdir,
2756        .fsync          = ocfs2_sync_file,
2757        .release        = ocfs2_dir_release,
2758        .open           = ocfs2_dir_open,
2759        .unlocked_ioctl = ocfs2_ioctl,
2760#ifdef CONFIG_COMPAT
2761        .compat_ioctl   = ocfs2_compat_ioctl,
2762#endif
2763        .lock           = ocfs2_lock,
2764        .flock          = ocfs2_flock,
2765};
2766
2767/*
2768 * POSIX-lockless variants of our file_operations.
2769 *
2770 * These will be used if the underlying cluster stack does not support
2771 * posix file locking, if the user passes the "localflocks" mount
2772 * option, or if we have a local-only fs.
2773 *
2774 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2775 * so we still want it in the case of no stack support for
2776 * plocks. Internally, it will do the right thing when asked to ignore
2777 * the cluster.
2778 */
2779const struct file_operations ocfs2_fops_no_plocks = {
2780        .llseek         = ocfs2_file_llseek,
2781        .mmap           = ocfs2_mmap,
2782        .fsync          = ocfs2_sync_file,
2783        .release        = ocfs2_file_release,
2784        .open           = ocfs2_file_open,
2785        .read_iter      = ocfs2_file_read_iter,
2786        .write_iter     = ocfs2_file_write_iter,
2787        .unlocked_ioctl = ocfs2_ioctl,
2788#ifdef CONFIG_COMPAT
2789        .compat_ioctl   = ocfs2_compat_ioctl,
2790#endif
2791        .flock          = ocfs2_flock,
2792        .splice_read    = generic_file_splice_read,
2793        .splice_write   = iter_file_splice_write,
2794        .fallocate      = ocfs2_fallocate,
2795        .remap_file_range = ocfs2_remap_file_range,
2796};
2797
2798const struct file_operations ocfs2_dops_no_plocks = {
2799        .llseek         = generic_file_llseek,
2800        .read           = generic_read_dir,
2801        .iterate        = ocfs2_readdir,
2802        .fsync          = ocfs2_sync_file,
2803        .release        = ocfs2_dir_release,
2804        .open           = ocfs2_dir_open,
2805        .unlocked_ioctl = ocfs2_ioctl,
2806#ifdef CONFIG_COMPAT
2807        .compat_ioctl   = ocfs2_compat_ioctl,
2808#endif
2809        .flock          = ocfs2_flock,
2810};
2811