linux/fs/ocfs2/file.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/* -*- mode: c; c-basic-offset: 8; -*-
   3 * vim: noexpandtab sw=8 ts=8 sts=0:
   4 *
   5 * file.c
   6 *
   7 * File open, close, extend, truncate
   8 *
   9 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
  10 */
  11
  12#include <linux/capability.h>
  13#include <linux/fs.h>
  14#include <linux/types.h>
  15#include <linux/slab.h>
  16#include <linux/highmem.h>
  17#include <linux/pagemap.h>
  18#include <linux/uio.h>
  19#include <linux/sched.h>
  20#include <linux/splice.h>
  21#include <linux/mount.h>
  22#include <linux/writeback.h>
  23#include <linux/falloc.h>
  24#include <linux/quotaops.h>
  25#include <linux/blkdev.h>
  26#include <linux/backing-dev.h>
  27
  28#include <cluster/masklog.h>
  29
  30#include "ocfs2.h"
  31
  32#include "alloc.h"
  33#include "aops.h"
  34#include "dir.h"
  35#include "dlmglue.h"
  36#include "extent_map.h"
  37#include "file.h"
  38#include "sysfile.h"
  39#include "inode.h"
  40#include "ioctl.h"
  41#include "journal.h"
  42#include "locks.h"
  43#include "mmap.h"
  44#include "suballoc.h"
  45#include "super.h"
  46#include "xattr.h"
  47#include "acl.h"
  48#include "quota.h"
  49#include "refcounttree.h"
  50#include "ocfs2_trace.h"
  51
  52#include "buffer_head_io.h"
  53
  54static int ocfs2_init_file_private(struct inode *inode, struct file *file)
  55{
  56        struct ocfs2_file_private *fp;
  57
  58        fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
  59        if (!fp)
  60                return -ENOMEM;
  61
  62        fp->fp_file = file;
  63        mutex_init(&fp->fp_mutex);
  64        ocfs2_file_lock_res_init(&fp->fp_flock, fp);
  65        file->private_data = fp;
  66
  67        return 0;
  68}
  69
  70static void ocfs2_free_file_private(struct inode *inode, struct file *file)
  71{
  72        struct ocfs2_file_private *fp = file->private_data;
  73        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  74
  75        if (fp) {
  76                ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
  77                ocfs2_lock_res_free(&fp->fp_flock);
  78                kfree(fp);
  79                file->private_data = NULL;
  80        }
  81}
  82
  83static int ocfs2_file_open(struct inode *inode, struct file *file)
  84{
  85        int status;
  86        int mode = file->f_flags;
  87        struct ocfs2_inode_info *oi = OCFS2_I(inode);
  88
  89        trace_ocfs2_file_open(inode, file, file->f_path.dentry,
  90                              (unsigned long long)oi->ip_blkno,
  91                              file->f_path.dentry->d_name.len,
  92                              file->f_path.dentry->d_name.name, mode);
  93
  94        if (file->f_mode & FMODE_WRITE) {
  95                status = dquot_initialize(inode);
  96                if (status)
  97                        goto leave;
  98        }
  99
 100        spin_lock(&oi->ip_lock);
 101
 102        /* Check that the inode hasn't been wiped from disk by another
 103         * node. If it hasn't then we're safe as long as we hold the
 104         * spin lock until our increment of open count. */
 105        if (oi->ip_flags & OCFS2_INODE_DELETED) {
 106                spin_unlock(&oi->ip_lock);
 107
 108                status = -ENOENT;
 109                goto leave;
 110        }
 111
 112        if (mode & O_DIRECT)
 113                oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
 114
 115        oi->ip_open_count++;
 116        spin_unlock(&oi->ip_lock);
 117
 118        status = ocfs2_init_file_private(inode, file);
 119        if (status) {
 120                /*
 121                 * We want to set open count back if we're failing the
 122                 * open.
 123                 */
 124                spin_lock(&oi->ip_lock);
 125                oi->ip_open_count--;
 126                spin_unlock(&oi->ip_lock);
 127        }
 128
 129        file->f_mode |= FMODE_NOWAIT;
 130
 131leave:
 132        return status;
 133}
 134
 135static int ocfs2_file_release(struct inode *inode, struct file *file)
 136{
 137        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 138
 139        spin_lock(&oi->ip_lock);
 140        if (!--oi->ip_open_count)
 141                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
 142
 143        trace_ocfs2_file_release(inode, file, file->f_path.dentry,
 144                                 oi->ip_blkno,
 145                                 file->f_path.dentry->d_name.len,
 146                                 file->f_path.dentry->d_name.name,
 147                                 oi->ip_open_count);
 148        spin_unlock(&oi->ip_lock);
 149
 150        ocfs2_free_file_private(inode, file);
 151
 152        return 0;
 153}
 154
 155static int ocfs2_dir_open(struct inode *inode, struct file *file)
 156{
 157        return ocfs2_init_file_private(inode, file);
 158}
 159
 160static int ocfs2_dir_release(struct inode *inode, struct file *file)
 161{
 162        ocfs2_free_file_private(inode, file);
 163        return 0;
 164}
 165
 166static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 167                           int datasync)
 168{
 169        int err = 0;
 170        struct inode *inode = file->f_mapping->host;
 171        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 172        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 173        journal_t *journal = osb->journal->j_journal;
 174        int ret;
 175        tid_t commit_tid;
 176        bool needs_barrier = false;
 177
 178        trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
 179                              oi->ip_blkno,
 180                              file->f_path.dentry->d_name.len,
 181                              file->f_path.dentry->d_name.name,
 182                              (unsigned long long)datasync);
 183
 184        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 185                return -EROFS;
 186
 187        err = file_write_and_wait_range(file, start, end);
 188        if (err)
 189                return err;
 190
 191        commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
 192        if (journal->j_flags & JBD2_BARRIER &&
 193            !jbd2_trans_will_send_data_barrier(journal, commit_tid))
 194                needs_barrier = true;
 195        err = jbd2_complete_transaction(journal, commit_tid);
 196        if (needs_barrier) {
 197                ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 198                if (!err)
 199                        err = ret;
 200        }
 201
 202        if (err)
 203                mlog_errno(err);
 204
 205        return (err < 0) ? -EIO : 0;
 206}
 207
 208int ocfs2_should_update_atime(struct inode *inode,
 209                              struct vfsmount *vfsmnt)
 210{
 211        struct timespec64 now;
 212        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 213
 214        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 215                return 0;
 216
 217        if ((inode->i_flags & S_NOATIME) ||
 218            ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)))
 219                return 0;
 220
 221        /*
 222         * We can be called with no vfsmnt structure - NFSD will
 223         * sometimes do this.
 224         *
 225         * Note that our action here is different than touch_atime() -
 226         * if we can't tell whether this is a noatime mount, then we
 227         * don't know whether to trust the value of s_atime_quantum.
 228         */
 229        if (vfsmnt == NULL)
 230                return 0;
 231
 232        if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
 233            ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 234                return 0;
 235
 236        if (vfsmnt->mnt_flags & MNT_RELATIME) {
 237                if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
 238                    (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
 239                        return 1;
 240
 241                return 0;
 242        }
 243
 244        now = current_time(inode);
 245        if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
 246                return 0;
 247        else
 248                return 1;
 249}
 250
 251int ocfs2_update_inode_atime(struct inode *inode,
 252                             struct buffer_head *bh)
 253{
 254        int ret;
 255        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 256        handle_t *handle;
 257        struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 258
 259        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 260        if (IS_ERR(handle)) {
 261                ret = PTR_ERR(handle);
 262                mlog_errno(ret);
 263                goto out;
 264        }
 265
 266        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 267                                      OCFS2_JOURNAL_ACCESS_WRITE);
 268        if (ret) {
 269                mlog_errno(ret);
 270                goto out_commit;
 271        }
 272
 273        /*
 274         * Don't use ocfs2_mark_inode_dirty() here as we don't always
 275         * have i_mutex to guard against concurrent changes to other
 276         * inode fields.
 277         */
 278        inode->i_atime = current_time(inode);
 279        di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 280        di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
 281        ocfs2_update_inode_fsync_trans(handle, inode, 0);
 282        ocfs2_journal_dirty(handle, bh);
 283
 284out_commit:
 285        ocfs2_commit_trans(osb, handle);
 286out:
 287        return ret;
 288}
 289
 290int ocfs2_set_inode_size(handle_t *handle,
 291                                struct inode *inode,
 292                                struct buffer_head *fe_bh,
 293                                u64 new_i_size)
 294{
 295        int status;
 296
 297        i_size_write(inode, new_i_size);
 298        inode->i_blocks = ocfs2_inode_sector_count(inode);
 299        inode->i_ctime = inode->i_mtime = current_time(inode);
 300
 301        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 302        if (status < 0) {
 303                mlog_errno(status);
 304                goto bail;
 305        }
 306
 307bail:
 308        return status;
 309}
 310
 311int ocfs2_simple_size_update(struct inode *inode,
 312                             struct buffer_head *di_bh,
 313                             u64 new_i_size)
 314{
 315        int ret;
 316        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 317        handle_t *handle = NULL;
 318
 319        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 320        if (IS_ERR(handle)) {
 321                ret = PTR_ERR(handle);
 322                mlog_errno(ret);
 323                goto out;
 324        }
 325
 326        ret = ocfs2_set_inode_size(handle, inode, di_bh,
 327                                   new_i_size);
 328        if (ret < 0)
 329                mlog_errno(ret);
 330
 331        ocfs2_update_inode_fsync_trans(handle, inode, 0);
 332        ocfs2_commit_trans(osb, handle);
 333out:
 334        return ret;
 335}
 336
 337static int ocfs2_cow_file_pos(struct inode *inode,
 338                              struct buffer_head *fe_bh,
 339                              u64 offset)
 340{
 341        int status;
 342        u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 343        unsigned int num_clusters = 0;
 344        unsigned int ext_flags = 0;
 345
 346        /*
 347         * If the new offset is aligned to the range of the cluster, there is
 348         * no space for ocfs2_zero_range_for_truncate to fill, so no need to
 349         * CoW either.
 350         */
 351        if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
 352                return 0;
 353
 354        status = ocfs2_get_clusters(inode, cpos, &phys,
 355                                    &num_clusters, &ext_flags);
 356        if (status) {
 357                mlog_errno(status);
 358                goto out;
 359        }
 360
 361        if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
 362                goto out;
 363
 364        return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
 365
 366out:
 367        return status;
 368}
 369
 370static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 371                                     struct inode *inode,
 372                                     struct buffer_head *fe_bh,
 373                                     u64 new_i_size)
 374{
 375        int status;
 376        handle_t *handle;
 377        struct ocfs2_dinode *di;
 378        u64 cluster_bytes;
 379
 380        /*
 381         * We need to CoW the cluster contains the offset if it is reflinked
 382         * since we will call ocfs2_zero_range_for_truncate later which will
 383         * write "0" from offset to the end of the cluster.
 384         */
 385        status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
 386        if (status) {
 387                mlog_errno(status);
 388                return status;
 389        }
 390
 391        /* TODO: This needs to actually orphan the inode in this
 392         * transaction. */
 393
 394        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 395        if (IS_ERR(handle)) {
 396                status = PTR_ERR(handle);
 397                mlog_errno(status);
 398                goto out;
 399        }
 400
 401        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
 402                                         OCFS2_JOURNAL_ACCESS_WRITE);
 403        if (status < 0) {
 404                mlog_errno(status);
 405                goto out_commit;
 406        }
 407
 408        /*
 409         * Do this before setting i_size.
 410         */
 411        cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
 412        status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
 413                                               cluster_bytes);
 414        if (status) {
 415                mlog_errno(status);
 416                goto out_commit;
 417        }
 418
 419        i_size_write(inode, new_i_size);
 420        inode->i_ctime = inode->i_mtime = current_time(inode);
 421
 422        di = (struct ocfs2_dinode *) fe_bh->b_data;
 423        di->i_size = cpu_to_le64(new_i_size);
 424        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 425        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 426        ocfs2_update_inode_fsync_trans(handle, inode, 0);
 427
 428        ocfs2_journal_dirty(handle, fe_bh);
 429
 430out_commit:
 431        ocfs2_commit_trans(osb, handle);
 432out:
 433        return status;
 434}
 435
 436int ocfs2_truncate_file(struct inode *inode,
 437                               struct buffer_head *di_bh,
 438                               u64 new_i_size)
 439{
 440        int status = 0;
 441        struct ocfs2_dinode *fe = NULL;
 442        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 443
 444        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
 445         * already validated it */
 446        fe = (struct ocfs2_dinode *) di_bh->b_data;
 447
 448        trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
 449                                  (unsigned long long)le64_to_cpu(fe->i_size),
 450                                  (unsigned long long)new_i_size);
 451
 452        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 453                        "Inode %llu, inode i_size = %lld != di "
 454                        "i_size = %llu, i_flags = 0x%x\n",
 455                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 456                        i_size_read(inode),
 457                        (unsigned long long)le64_to_cpu(fe->i_size),
 458                        le32_to_cpu(fe->i_flags));
 459
 460        if (new_i_size > le64_to_cpu(fe->i_size)) {
 461                trace_ocfs2_truncate_file_error(
 462                        (unsigned long long)le64_to_cpu(fe->i_size),
 463                        (unsigned long long)new_i_size);
 464                status = -EINVAL;
 465                mlog_errno(status);
 466                goto bail;
 467        }
 468
 469        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 470
 471        ocfs2_resv_discard(&osb->osb_la_resmap,
 472                           &OCFS2_I(inode)->ip_la_data_resv);
 473
 474        /*
 475         * The inode lock forced other nodes to sync and drop their
 476         * pages, which (correctly) happens even if we have a truncate
 477         * without allocation change - ocfs2 cluster sizes can be much
 478         * greater than page size, so we have to truncate them
 479         * anyway.
 480         */
 481        unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
 482        truncate_inode_pages(inode->i_mapping, new_i_size);
 483
 484        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 485                status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
 486                                               i_size_read(inode), 1);
 487                if (status)
 488                        mlog_errno(status);
 489
 490                goto bail_unlock_sem;
 491        }
 492
 493        /* alright, we're going to need to do a full blown alloc size
 494         * change. Orphan the inode so that recovery can complete the
 495         * truncate if necessary. This does the task of marking
 496         * i_size. */
 497        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 498        if (status < 0) {
 499                mlog_errno(status);
 500                goto bail_unlock_sem;
 501        }
 502
 503        status = ocfs2_commit_truncate(osb, inode, di_bh);
 504        if (status < 0) {
 505                mlog_errno(status);
 506                goto bail_unlock_sem;
 507        }
 508
 509        /* TODO: orphan dir cleanup here. */
 510bail_unlock_sem:
 511        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 512
 513bail:
 514        if (!status && OCFS2_I(inode)->ip_clusters == 0)
 515                status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 516
 517        return status;
 518}
 519
 520/*
 521 * extend file allocation only here.
 522 * we'll update all the disk stuff, and oip->alloc_size
 523 *
 524 * expect stuff to be locked, a transaction started and enough data /
 525 * metadata reservations in the contexts.
 526 *
 527 * Will return -EAGAIN, and a reason if a restart is needed.
 528 * If passed in, *reason will always be set, even in error.
 529 */
 530int ocfs2_add_inode_data(struct ocfs2_super *osb,
 531                         struct inode *inode,
 532                         u32 *logical_offset,
 533                         u32 clusters_to_add,
 534                         int mark_unwritten,
 535                         struct buffer_head *fe_bh,
 536                         handle_t *handle,
 537                         struct ocfs2_alloc_context *data_ac,
 538                         struct ocfs2_alloc_context *meta_ac,
 539                         enum ocfs2_alloc_restarted *reason_ret)
 540{
 541        int ret;
 542        struct ocfs2_extent_tree et;
 543
 544        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
 545        ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
 546                                          clusters_to_add, mark_unwritten,
 547                                          data_ac, meta_ac, reason_ret);
 548
 549        return ret;
 550}
 551
 552static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 553                                   u32 clusters_to_add, int mark_unwritten)
 554{
 555        int status = 0;
 556        int restart_func = 0;
 557        int credits;
 558        u32 prev_clusters;
 559        struct buffer_head *bh = NULL;
 560        struct ocfs2_dinode *fe = NULL;
 561        handle_t *handle = NULL;
 562        struct ocfs2_alloc_context *data_ac = NULL;
 563        struct ocfs2_alloc_context *meta_ac = NULL;
 564        enum ocfs2_alloc_restarted why = RESTART_NONE;
 565        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 566        struct ocfs2_extent_tree et;
 567        int did_quota = 0;
 568
 569        /*
 570         * Unwritten extent only exists for file systems which
 571         * support holes.
 572         */
 573        BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 574
 575        status = ocfs2_read_inode_block(inode, &bh);
 576        if (status < 0) {
 577                mlog_errno(status);
 578                goto leave;
 579        }
 580        fe = (struct ocfs2_dinode *) bh->b_data;
 581
 582restart_all:
 583        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 584
 585        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
 586        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 587                                       &data_ac, &meta_ac);
 588        if (status) {
 589                mlog_errno(status);
 590                goto leave;
 591        }
 592
 593        credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
 594        handle = ocfs2_start_trans(osb, credits);
 595        if (IS_ERR(handle)) {
 596                status = PTR_ERR(handle);
 597                handle = NULL;
 598                mlog_errno(status);
 599                goto leave;
 600        }
 601
 602restarted_transaction:
 603        trace_ocfs2_extend_allocation(
 604                (unsigned long long)OCFS2_I(inode)->ip_blkno,
 605                (unsigned long long)i_size_read(inode),
 606                le32_to_cpu(fe->i_clusters), clusters_to_add,
 607                why, restart_func);
 608
 609        status = dquot_alloc_space_nodirty(inode,
 610                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 611        if (status)
 612                goto leave;
 613        did_quota = 1;
 614
 615        /* reserve a write to the file entry early on - that we if we
 616         * run out of credits in the allocation path, we can still
 617         * update i_size. */
 618        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 619                                         OCFS2_JOURNAL_ACCESS_WRITE);
 620        if (status < 0) {
 621                mlog_errno(status);
 622                goto leave;
 623        }
 624
 625        prev_clusters = OCFS2_I(inode)->ip_clusters;
 626
 627        status = ocfs2_add_inode_data(osb,
 628                                      inode,
 629                                      &logical_start,
 630                                      clusters_to_add,
 631                                      mark_unwritten,
 632                                      bh,
 633                                      handle,
 634                                      data_ac,
 635                                      meta_ac,
 636                                      &why);
 637        if ((status < 0) && (status != -EAGAIN)) {
 638                if (status != -ENOSPC)
 639                        mlog_errno(status);
 640                goto leave;
 641        }
 642        ocfs2_update_inode_fsync_trans(handle, inode, 1);
 643        ocfs2_journal_dirty(handle, bh);
 644
 645        spin_lock(&OCFS2_I(inode)->ip_lock);
 646        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 647        spin_unlock(&OCFS2_I(inode)->ip_lock);
 648        /* Release unused quota reservation */
 649        dquot_free_space(inode,
 650                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 651        did_quota = 0;
 652
 653        if (why != RESTART_NONE && clusters_to_add) {
 654                if (why == RESTART_META) {
 655                        restart_func = 1;
 656                        status = 0;
 657                } else {
 658                        BUG_ON(why != RESTART_TRANS);
 659
 660                        status = ocfs2_allocate_extend_trans(handle, 1);
 661                        if (status < 0) {
 662                                /* handle still has to be committed at
 663                                 * this point. */
 664                                status = -ENOMEM;
 665                                mlog_errno(status);
 666                                goto leave;
 667                        }
 668                        goto restarted_transaction;
 669                }
 670        }
 671
 672        trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
 673             le32_to_cpu(fe->i_clusters),
 674             (unsigned long long)le64_to_cpu(fe->i_size),
 675             OCFS2_I(inode)->ip_clusters,
 676             (unsigned long long)i_size_read(inode));
 677
 678leave:
 679        if (status < 0 && did_quota)
 680                dquot_free_space(inode,
 681                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 682        if (handle) {
 683                ocfs2_commit_trans(osb, handle);
 684                handle = NULL;
 685        }
 686        if (data_ac) {
 687                ocfs2_free_alloc_context(data_ac);
 688                data_ac = NULL;
 689        }
 690        if (meta_ac) {
 691                ocfs2_free_alloc_context(meta_ac);
 692                meta_ac = NULL;
 693        }
 694        if ((!status) && restart_func) {
 695                restart_func = 0;
 696                goto restart_all;
 697        }
 698        brelse(bh);
 699        bh = NULL;
 700
 701        return status;
 702}
 703
 704/*
 705 * While a write will already be ordering the data, a truncate will not.
 706 * Thus, we need to explicitly order the zeroed pages.
 707 */
 708static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
 709                                                struct buffer_head *di_bh)
 710{
 711        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 712        handle_t *handle = NULL;
 713        int ret = 0;
 714
 715        if (!ocfs2_should_order_data(inode))
 716                goto out;
 717
 718        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 719        if (IS_ERR(handle)) {
 720                ret = -ENOMEM;
 721                mlog_errno(ret);
 722                goto out;
 723        }
 724
 725        ret = ocfs2_jbd2_file_inode(handle, inode);
 726        if (ret < 0) {
 727                mlog_errno(ret);
 728                goto out;
 729        }
 730
 731        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 732                                      OCFS2_JOURNAL_ACCESS_WRITE);
 733        if (ret)
 734                mlog_errno(ret);
 735        ocfs2_update_inode_fsync_trans(handle, inode, 1);
 736
 737out:
 738        if (ret) {
 739                if (!IS_ERR(handle))
 740                        ocfs2_commit_trans(osb, handle);
 741                handle = ERR_PTR(ret);
 742        }
 743        return handle;
 744}
 745
 746/* Some parts of this taken from generic_cont_expand, which turned out
 747 * to be too fragile to do exactly what we need without us having to
 748 * worry about recursive locking in ->write_begin() and ->write_end(). */
 749static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 750                                 u64 abs_to, struct buffer_head *di_bh)
 751{
 752        struct address_space *mapping = inode->i_mapping;
 753        struct page *page;
 754        unsigned long index = abs_from >> PAGE_SHIFT;
 755        handle_t *handle;
 756        int ret = 0;
 757        unsigned zero_from, zero_to, block_start, block_end;
 758        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 759
 760        BUG_ON(abs_from >= abs_to);
 761        BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
 762        BUG_ON(abs_from & (inode->i_blkbits - 1));
 763
 764        handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
 765        if (IS_ERR(handle)) {
 766                ret = PTR_ERR(handle);
 767                goto out;
 768        }
 769
 770        page = find_or_create_page(mapping, index, GFP_NOFS);
 771        if (!page) {
 772                ret = -ENOMEM;
 773                mlog_errno(ret);
 774                goto out_commit_trans;
 775        }
 776
 777        /* Get the offsets within the page that we want to zero */
 778        zero_from = abs_from & (PAGE_SIZE - 1);
 779        zero_to = abs_to & (PAGE_SIZE - 1);
 780        if (!zero_to)
 781                zero_to = PAGE_SIZE;
 782
 783        trace_ocfs2_write_zero_page(
 784                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 785                        (unsigned long long)abs_from,
 786                        (unsigned long long)abs_to,
 787                        index, zero_from, zero_to);
 788
 789        /* We know that zero_from is block aligned */
 790        for (block_start = zero_from; block_start < zero_to;
 791             block_start = block_end) {
 792                block_end = block_start + i_blocksize(inode);
 793
 794                /*
 795                 * block_start is block-aligned.  Bump it by one to force
 796                 * __block_write_begin and block_commit_write to zero the
 797                 * whole block.
 798                 */
 799                ret = __block_write_begin(page, block_start + 1, 0,
 800                                          ocfs2_get_block);
 801                if (ret < 0) {
 802                        mlog_errno(ret);
 803                        goto out_unlock;
 804                }
 805
 806
 807                /* must not update i_size! */
 808                ret = block_commit_write(page, block_start + 1,
 809                                         block_start + 1);
 810                if (ret < 0)
 811                        mlog_errno(ret);
 812                else
 813                        ret = 0;
 814        }
 815
 816        /*
 817         * fs-writeback will release the dirty pages without page lock
 818         * whose offset are over inode size, the release happens at
 819         * block_write_full_page().
 820         */
 821        i_size_write(inode, abs_to);
 822        inode->i_blocks = ocfs2_inode_sector_count(inode);
 823        di->i_size = cpu_to_le64((u64)i_size_read(inode));
 824        inode->i_mtime = inode->i_ctime = current_time(inode);
 825        di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 826        di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 827        di->i_mtime_nsec = di->i_ctime_nsec;
 828        if (handle) {
 829                ocfs2_journal_dirty(handle, di_bh);
 830                ocfs2_update_inode_fsync_trans(handle, inode, 1);
 831        }
 832
 833out_unlock:
 834        unlock_page(page);
 835        put_page(page);
 836out_commit_trans:
 837        if (handle)
 838                ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 839out:
 840        return ret;
 841}
 842
 843/*
 844 * Find the next range to zero.  We do this in terms of bytes because
 845 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 846 * pagecache.  We may return multiple extents.
 847 *
 848 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
 849 * needs to be zeroed.  range_start and range_end return the next zeroing
 850 * range.  A subsequent call should pass the previous range_end as its
 851 * zero_start.  If range_end is 0, there's nothing to do.
 852 *
 853 * Unwritten extents are skipped over.  Refcounted extents are CoWd.
 854 */
 855static int ocfs2_zero_extend_get_range(struct inode *inode,
 856                                       struct buffer_head *di_bh,
 857                                       u64 zero_start, u64 zero_end,
 858                                       u64 *range_start, u64 *range_end)
 859{
 860        int rc = 0, needs_cow = 0;
 861        u32 p_cpos, zero_clusters = 0;
 862        u32 zero_cpos =
 863                zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 864        u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
 865        unsigned int num_clusters = 0;
 866        unsigned int ext_flags = 0;
 867
 868        while (zero_cpos < last_cpos) {
 869                rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
 870                                        &num_clusters, &ext_flags);
 871                if (rc) {
 872                        mlog_errno(rc);
 873                        goto out;
 874                }
 875
 876                if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
 877                        zero_clusters = num_clusters;
 878                        if (ext_flags & OCFS2_EXT_REFCOUNTED)
 879                                needs_cow = 1;
 880                        break;
 881                }
 882
 883                zero_cpos += num_clusters;
 884        }
 885        if (!zero_clusters) {
 886                *range_end = 0;
 887                goto out;
 888        }
 889
 890        while ((zero_cpos + zero_clusters) < last_cpos) {
 891                rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
 892                                        &p_cpos, &num_clusters,
 893                                        &ext_flags);
 894                if (rc) {
 895                        mlog_errno(rc);
 896                        goto out;
 897                }
 898
 899                if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
 900                        break;
 901                if (ext_flags & OCFS2_EXT_REFCOUNTED)
 902                        needs_cow = 1;
 903                zero_clusters += num_clusters;
 904        }
 905        if ((zero_cpos + zero_clusters) > last_cpos)
 906                zero_clusters = last_cpos - zero_cpos;
 907
 908        if (needs_cow) {
 909                rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos,
 910                                        zero_clusters, UINT_MAX);
 911                if (rc) {
 912                        mlog_errno(rc);
 913                        goto out;
 914                }
 915        }
 916
 917        *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
 918        *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
 919                                             zero_cpos + zero_clusters);
 920
 921out:
 922        return rc;
 923}
 924
 925/*
 926 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 927 * has made sure that the entire range needs zeroing.
 928 */
 929static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
 930                                   u64 range_end, struct buffer_head *di_bh)
 931{
 932        int rc = 0;
 933        u64 next_pos;
 934        u64 zero_pos = range_start;
 935
 936        trace_ocfs2_zero_extend_range(
 937                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 938                        (unsigned long long)range_start,
 939                        (unsigned long long)range_end);
 940        BUG_ON(range_start >= range_end);
 941
 942        while (zero_pos < range_end) {
 943                next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
 944                if (next_pos > range_end)
 945                        next_pos = range_end;
 946                rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
 947                if (rc < 0) {
 948                        mlog_errno(rc);
 949                        break;
 950                }
 951                zero_pos = next_pos;
 952
 953                /*
 954                 * Very large extends have the potential to lock up
 955                 * the cpu for extended periods of time.
 956                 */
 957                cond_resched();
 958        }
 959
 960        return rc;
 961}
 962
 963int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 964                      loff_t zero_to_size)
 965{
 966        int ret = 0;
 967        u64 zero_start, range_start = 0, range_end = 0;
 968        struct super_block *sb = inode->i_sb;
 969
 970        zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
 971        trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
 972                                (unsigned long long)zero_start,
 973                                (unsigned long long)i_size_read(inode));
 974        while (zero_start < zero_to_size) {
 975                ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
 976                                                  zero_to_size,
 977                                                  &range_start,
 978                                                  &range_end);
 979                if (ret) {
 980                        mlog_errno(ret);
 981                        break;
 982                }
 983                if (!range_end)
 984                        break;
 985                /* Trim the ends */
 986                if (range_start < zero_start)
 987                        range_start = zero_start;
 988                if (range_end > zero_to_size)
 989                        range_end = zero_to_size;
 990
 991                ret = ocfs2_zero_extend_range(inode, range_start,
 992                                              range_end, di_bh);
 993                if (ret) {
 994                        mlog_errno(ret);
 995                        break;
 996                }
 997                zero_start = range_end;
 998        }
 999
1000        return ret;
1001}
1002
1003int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
1004                          u64 new_i_size, u64 zero_to)
1005{
1006        int ret;
1007        u32 clusters_to_add;
1008        struct ocfs2_inode_info *oi = OCFS2_I(inode);
1009
1010        /*
1011         * Only quota files call this without a bh, and they can't be
1012         * refcounted.
1013         */
1014        BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode));
1015        BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1016
1017        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
1018        if (clusters_to_add < oi->ip_clusters)
1019                clusters_to_add = 0;
1020        else
1021                clusters_to_add -= oi->ip_clusters;
1022
1023        if (clusters_to_add) {
1024                ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
1025                                              clusters_to_add, 0);
1026                if (ret) {
1027                        mlog_errno(ret);
1028                        goto out;
1029                }
1030        }
1031
1032        /*
1033         * Call this even if we don't add any clusters to the tree. We
1034         * still need to zero the area between the old i_size and the
1035         * new i_size.
1036         */
1037        ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1038        if (ret < 0)
1039                mlog_errno(ret);
1040
1041out:
1042        return ret;
1043}
1044
1045static int ocfs2_extend_file(struct inode *inode,
1046                             struct buffer_head *di_bh,
1047                             u64 new_i_size)
1048{
1049        int ret = 0;
1050        struct ocfs2_inode_info *oi = OCFS2_I(inode);
1051
1052        BUG_ON(!di_bh);
1053
1054        /* setattr sometimes calls us like this. */
1055        if (new_i_size == 0)
1056                goto out;
1057
1058        if (i_size_read(inode) == new_i_size)
1059                goto out;
1060        BUG_ON(new_i_size < i_size_read(inode));
1061
1062        /*
1063         * The alloc sem blocks people in read/write from reading our
1064         * allocation until we're done changing it. We depend on
1065         * i_mutex to block other extend/truncate calls while we're
1066         * here.  We even have to hold it for sparse files because there
1067         * might be some tail zeroing.
1068         */
1069        down_write(&oi->ip_alloc_sem);
1070
1071        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1072                /*
1073                 * We can optimize small extends by keeping the inodes
1074                 * inline data.
1075                 */
1076                if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1077                        up_write(&oi->ip_alloc_sem);
1078                        goto out_update_size;
1079                }
1080
1081                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1082                if (ret) {
1083                        up_write(&oi->ip_alloc_sem);
1084                        mlog_errno(ret);
1085                        goto out;
1086                }
1087        }
1088
1089        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1090                ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1091        else
1092                ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1093                                            new_i_size);
1094
1095        up_write(&oi->ip_alloc_sem);
1096
1097        if (ret < 0) {
1098                mlog_errno(ret);
1099                goto out;
1100        }
1101
1102out_update_size:
1103        ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1104        if (ret < 0)
1105                mlog_errno(ret);
1106
1107out:
1108        return ret;
1109}
1110
1111int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1112{
1113        int status = 0, size_change;
1114        int inode_locked = 0;
1115        struct inode *inode = d_inode(dentry);
1116        struct super_block *sb = inode->i_sb;
1117        struct ocfs2_super *osb = OCFS2_SB(sb);
1118        struct buffer_head *bh = NULL;
1119        handle_t *handle = NULL;
1120        struct dquot *transfer_to[MAXQUOTAS] = { };
1121        int qtype;
1122        int had_lock;
1123        struct ocfs2_lock_holder oh;
1124
1125        trace_ocfs2_setattr(inode, dentry,
1126                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
1127                            dentry->d_name.len, dentry->d_name.name,
1128                            attr->ia_valid, attr->ia_mode,
1129                            from_kuid(&init_user_ns, attr->ia_uid),
1130                            from_kgid(&init_user_ns, attr->ia_gid));
1131
1132        /* ensuring we don't even attempt to truncate a symlink */
1133        if (S_ISLNK(inode->i_mode))
1134                attr->ia_valid &= ~ATTR_SIZE;
1135
1136#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1137                           | ATTR_GID | ATTR_UID | ATTR_MODE)
1138        if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1139                return 0;
1140
1141        status = setattr_prepare(dentry, attr);
1142        if (status)
1143                return status;
1144
1145        if (is_quota_modification(inode, attr)) {
1146                status = dquot_initialize(inode);
1147                if (status)
1148                        return status;
1149        }
1150        size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1151        if (size_change) {
1152                /*
1153                 * Here we should wait dio to finish before inode lock
1154                 * to avoid a deadlock between ocfs2_setattr() and
1155                 * ocfs2_dio_end_io_write()
1156                 */
1157                inode_dio_wait(inode);
1158
1159                status = ocfs2_rw_lock(inode, 1);
1160                if (status < 0) {
1161                        mlog_errno(status);
1162                        goto bail;
1163                }
1164        }
1165
1166        had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
1167        if (had_lock < 0) {
1168                status = had_lock;
1169                goto bail_unlock_rw;
1170        } else if (had_lock) {
1171                /*
1172                 * As far as we know, ocfs2_setattr() could only be the first
1173                 * VFS entry point in the call chain of recursive cluster
1174                 * locking issue.
1175                 *
1176                 * For instance:
1177                 * chmod_common()
1178                 *  notify_change()
1179                 *   ocfs2_setattr()
1180                 *    posix_acl_chmod()
1181                 *     ocfs2_iop_get_acl()
1182                 *
1183                 * But, we're not 100% sure if it's always true, because the
1184                 * ordering of the VFS entry points in the call chain is out
1185                 * of our control. So, we'd better dump the stack here to
1186                 * catch the other cases of recursive locking.
1187                 */
1188                mlog(ML_ERROR, "Another case of recursive locking:\n");
1189                dump_stack();
1190        }
1191        inode_locked = 1;
1192
1193        if (size_change) {
1194                status = inode_newsize_ok(inode, attr->ia_size);
1195                if (status)
1196                        goto bail_unlock;
1197
1198                if (i_size_read(inode) >= attr->ia_size) {
1199                        if (ocfs2_should_order_data(inode)) {
1200                                status = ocfs2_begin_ordered_truncate(inode,
1201                                                                      attr->ia_size);
1202                                if (status)
1203                                        goto bail_unlock;
1204                        }
1205                        status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1206                } else
1207                        status = ocfs2_extend_file(inode, bh, attr->ia_size);
1208                if (status < 0) {
1209                        if (status != -ENOSPC)
1210                                mlog_errno(status);
1211                        status = -ENOSPC;
1212                        goto bail_unlock;
1213                }
1214        }
1215
1216        if ((attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
1217            (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
1218                /*
1219                 * Gather pointers to quota structures so that allocation /
1220                 * freeing of quota structures happens here and not inside
1221                 * dquot_transfer() where we have problems with lock ordering
1222                 */
1223                if (attr->ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)
1224                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1225                    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1226                        transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid));
1227                        if (IS_ERR(transfer_to[USRQUOTA])) {
1228                                status = PTR_ERR(transfer_to[USRQUOTA]);
1229                                goto bail_unlock;
1230                        }
1231                }
1232                if (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid)
1233                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1234                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1235                        transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid));
1236                        if (IS_ERR(transfer_to[GRPQUOTA])) {
1237                                status = PTR_ERR(transfer_to[GRPQUOTA]);
1238                                goto bail_unlock;
1239                        }
1240                }
1241                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1242                                           2 * ocfs2_quota_trans_credits(sb));
1243                if (IS_ERR(handle)) {
1244                        status = PTR_ERR(handle);
1245                        mlog_errno(status);
1246                        goto bail_unlock;
1247                }
1248                status = __dquot_transfer(inode, transfer_to);
1249                if (status < 0)
1250                        goto bail_commit;
1251        } else {
1252                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1253                if (IS_ERR(handle)) {
1254                        status = PTR_ERR(handle);
1255                        mlog_errno(status);
1256                        goto bail_unlock;
1257                }
1258        }
1259
1260        setattr_copy(inode, attr);
1261        mark_inode_dirty(inode);
1262
1263        status = ocfs2_mark_inode_dirty(handle, inode, bh);
1264        if (status < 0)
1265                mlog_errno(status);
1266
1267bail_commit:
1268        ocfs2_commit_trans(osb, handle);
1269bail_unlock:
1270        if (status && inode_locked) {
1271                ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1272                inode_locked = 0;
1273        }
1274bail_unlock_rw:
1275        if (size_change)
1276                ocfs2_rw_unlock(inode, 1);
1277bail:
1278
1279        /* Release quota pointers in case we acquired them */
1280        for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
1281                dqput(transfer_to[qtype]);
1282
1283        if (!status && attr->ia_valid & ATTR_MODE) {
1284                status = ocfs2_acl_chmod(inode, bh);
1285                if (status < 0)
1286                        mlog_errno(status);
1287        }
1288        if (inode_locked)
1289                ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
1290
1291        brelse(bh);
1292        return status;
1293}
1294
1295int ocfs2_getattr(const struct path *path, struct kstat *stat,
1296                  u32 request_mask, unsigned int flags)
1297{
1298        struct inode *inode = d_inode(path->dentry);
1299        struct super_block *sb = path->dentry->d_sb;
1300        struct ocfs2_super *osb = sb->s_fs_info;
1301        int err;
1302
1303        err = ocfs2_inode_revalidate(path->dentry);
1304        if (err) {
1305                if (err != -ENOENT)
1306                        mlog_errno(err);
1307                goto bail;
1308        }
1309
1310        generic_fillattr(inode, stat);
1311        /*
1312         * If there is inline data in the inode, the inode will normally not
1313         * have data blocks allocated (it may have an external xattr block).
1314         * Report at least one sector for such files, so tools like tar, rsync,
1315         * others don't incorrectly think the file is completely sparse.
1316         */
1317        if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
1318                stat->blocks += (stat->size + 511)>>9;
1319
1320        /* We set the blksize from the cluster size for performance */
1321        stat->blksize = osb->s_clustersize;
1322
1323bail:
1324        return err;
1325}
1326
1327int ocfs2_permission(struct inode *inode, int mask)
1328{
1329        int ret, had_lock;
1330        struct ocfs2_lock_holder oh;
1331
1332        if (mask & MAY_NOT_BLOCK)
1333                return -ECHILD;
1334
1335        had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
1336        if (had_lock < 0) {
1337                ret = had_lock;
1338                goto out;
1339        } else if (had_lock) {
1340                /* See comments in ocfs2_setattr() for details.
1341                 * The call chain of this case could be:
1342                 * do_sys_open()
1343                 *  may_open()
1344                 *   inode_permission()
1345                 *    ocfs2_permission()
1346                 *     ocfs2_iop_get_acl()
1347                 */
1348                mlog(ML_ERROR, "Another case of recursive locking:\n");
1349                dump_stack();
1350        }
1351
1352        ret = generic_permission(inode, mask);
1353
1354        ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
1355out:
1356        return ret;
1357}
1358
1359static int __ocfs2_write_remove_suid(struct inode *inode,
1360                                     struct buffer_head *bh)
1361{
1362        int ret;
1363        handle_t *handle;
1364        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1365        struct ocfs2_dinode *di;
1366
1367        trace_ocfs2_write_remove_suid(
1368                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
1369                        inode->i_mode);
1370
1371        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1372        if (IS_ERR(handle)) {
1373                ret = PTR_ERR(handle);
1374                mlog_errno(ret);
1375                goto out;
1376        }
1377
1378        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1379                                      OCFS2_JOURNAL_ACCESS_WRITE);
1380        if (ret < 0) {
1381                mlog_errno(ret);
1382                goto out_trans;
1383        }
1384
1385        inode->i_mode &= ~S_ISUID;
1386        if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1387                inode->i_mode &= ~S_ISGID;
1388
1389        di = (struct ocfs2_dinode *) bh->b_data;
1390        di->i_mode = cpu_to_le16(inode->i_mode);
1391        ocfs2_update_inode_fsync_trans(handle, inode, 0);
1392
1393        ocfs2_journal_dirty(handle, bh);
1394
1395out_trans:
1396        ocfs2_commit_trans(osb, handle);
1397out:
1398        return ret;
1399}
1400
1401static int ocfs2_write_remove_suid(struct inode *inode)
1402{
1403        int ret;
1404        struct buffer_head *bh = NULL;
1405
1406        ret = ocfs2_read_inode_block(inode, &bh);
1407        if (ret < 0) {
1408                mlog_errno(ret);
1409                goto out;
1410        }
1411
1412        ret =  __ocfs2_write_remove_suid(inode, bh);
1413out:
1414        brelse(bh);
1415        return ret;
1416}
1417
1418/*
1419 * Allocate enough extents to cover the region starting at byte offset
1420 * start for len bytes. Existing extents are skipped, any extents
1421 * added are marked as "unwritten".
1422 */
1423static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1424                                            u64 start, u64 len)
1425{
1426        int ret;
1427        u32 cpos, phys_cpos, clusters, alloc_size;
1428        u64 end = start + len;
1429        struct buffer_head *di_bh = NULL;
1430
1431        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1432                ret = ocfs2_read_inode_block(inode, &di_bh);
1433                if (ret) {
1434                        mlog_errno(ret);
1435                        goto out;
1436                }
1437
1438                /*
1439                 * Nothing to do if the requested reservation range
1440                 * fits within the inode.
1441                 */
1442                if (ocfs2_size_fits_inline_data(di_bh, end))
1443                        goto out;
1444
1445                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1446                if (ret) {
1447                        mlog_errno(ret);
1448                        goto out;
1449                }
1450        }
1451
1452        /*
1453         * We consider both start and len to be inclusive.
1454         */
1455        cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1456        clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1457        clusters -= cpos;
1458
1459        while (clusters) {
1460                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1461                                         &alloc_size, NULL);
1462                if (ret) {
1463                        mlog_errno(ret);
1464                        goto out;
1465                }
1466
1467                /*
1468                 * Hole or existing extent len can be arbitrary, so
1469                 * cap it to our own allocation request.
1470                 */
1471                if (alloc_size > clusters)
1472                        alloc_size = clusters;
1473
1474                if (phys_cpos) {
1475                        /*
1476                         * We already have an allocation at this
1477                         * region so we can safely skip it.
1478                         */
1479                        goto next;
1480                }
1481
1482                ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1483                if (ret) {
1484                        if (ret != -ENOSPC)
1485                                mlog_errno(ret);
1486                        goto out;
1487                }
1488
1489next:
1490                cpos += alloc_size;
1491                clusters -= alloc_size;
1492        }
1493
1494        ret = 0;
1495out:
1496
1497        brelse(di_bh);
1498        return ret;
1499}
1500
1501/*
1502 * Truncate a byte range, avoiding pages within partial clusters. This
1503 * preserves those pages for the zeroing code to write to.
1504 */
1505static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1506                                         u64 byte_len)
1507{
1508        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1509        loff_t start, end;
1510        struct address_space *mapping = inode->i_mapping;
1511
1512        start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1513        end = byte_start + byte_len;
1514        end = end & ~(osb->s_clustersize - 1);
1515
1516        if (start < end) {
1517                unmap_mapping_range(mapping, start, end - start, 0);
1518                truncate_inode_pages_range(mapping, start, end - 1);
1519        }
1520}
1521
1522static int ocfs2_zero_partial_clusters(struct inode *inode,
1523                                       u64 start, u64 len)
1524{
1525        int ret = 0;
1526        u64 tmpend = 0;
1527        u64 end = start + len;
1528        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1529        unsigned int csize = osb->s_clustersize;
1530        handle_t *handle;
1531
1532        /*
1533         * The "start" and "end" values are NOT necessarily part of
1534         * the range whose allocation is being deleted. Rather, this
1535         * is what the user passed in with the request. We must zero
1536         * partial clusters here. There's no need to worry about
1537         * physical allocation - the zeroing code knows to skip holes.
1538         */
1539        trace_ocfs2_zero_partial_clusters(
1540                (unsigned long long)OCFS2_I(inode)->ip_blkno,
1541                (unsigned long long)start, (unsigned long long)end);
1542
1543        /*
1544         * If both edges are on a cluster boundary then there's no
1545         * zeroing required as the region is part of the allocation to
1546         * be truncated.
1547         */
1548        if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1549                goto out;
1550
1551        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1552        if (IS_ERR(handle)) {
1553                ret = PTR_ERR(handle);
1554                mlog_errno(ret);
1555                goto out;
1556        }
1557
1558        /*
1559         * If start is on a cluster boundary and end is somewhere in another
1560         * cluster, we have not COWed the cluster starting at start, unless
1561         * end is also within the same cluster. So, in this case, we skip this
1562         * first call to ocfs2_zero_range_for_truncate() truncate and move on
1563         * to the next one.
1564         */
1565        if ((start & (csize - 1)) != 0) {
1566                /*
1567                 * We want to get the byte offset of the end of the 1st
1568                 * cluster.
1569                 */
1570                tmpend = (u64)osb->s_clustersize +
1571                        (start & ~(osb->s_clustersize - 1));
1572                if (tmpend > end)
1573                        tmpend = end;
1574
1575                trace_ocfs2_zero_partial_clusters_range1(
1576                        (unsigned long long)start,
1577                        (unsigned long long)tmpend);
1578
1579                ret = ocfs2_zero_range_for_truncate(inode, handle, start,
1580                                                    tmpend);
1581                if (ret)
1582                        mlog_errno(ret);
1583        }
1584
1585        if (tmpend < end) {
1586                /*
1587                 * This may make start and end equal, but the zeroing
1588                 * code will skip any work in that case so there's no
1589                 * need to catch it up here.
1590                 */
1591                start = end & ~(osb->s_clustersize - 1);
1592
1593                trace_ocfs2_zero_partial_clusters_range2(
1594                        (unsigned long long)start, (unsigned long long)end);
1595
1596                ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1597                if (ret)
1598                        mlog_errno(ret);
1599        }
1600        ocfs2_update_inode_fsync_trans(handle, inode, 1);
1601
1602        ocfs2_commit_trans(osb, handle);
1603out:
1604        return ret;
1605}
1606
1607static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1608{
1609        int i;
1610        struct ocfs2_extent_rec *rec = NULL;
1611
1612        for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1613
1614                rec = &el->l_recs[i];
1615
1616                if (le32_to_cpu(rec->e_cpos) < pos)
1617                        break;
1618        }
1619
1620        return i;
1621}
1622
1623/*
1624 * Helper to calculate the punching pos and length in one run, we handle the
1625 * following three cases in order:
1626 *
1627 * - remove the entire record
1628 * - remove a partial record
1629 * - no record needs to be removed (hole-punching completed)
1630*/
1631static void ocfs2_calc_trunc_pos(struct inode *inode,
1632                                 struct ocfs2_extent_list *el,
1633                                 struct ocfs2_extent_rec *rec,
1634                                 u32 trunc_start, u32 *trunc_cpos,
1635                                 u32 *trunc_len, u32 *trunc_end,
1636                                 u64 *blkno, int *done)
1637{
1638        int ret = 0;
1639        u32 coff, range;
1640
1641        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1642
1643        if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1644                /*
1645                 * remove an entire extent record.
1646                 */
1647                *trunc_cpos = le32_to_cpu(rec->e_cpos);
1648                /*
1649                 * Skip holes if any.
1650                 */
1651                if (range < *trunc_end)
1652                        *trunc_end = range;
1653                *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1654                *blkno = le64_to_cpu(rec->e_blkno);
1655                *trunc_end = le32_to_cpu(rec->e_cpos);
1656        } else if (range > trunc_start) {
1657                /*
1658                 * remove a partial extent record, which means we're
1659                 * removing the last extent record.
1660                 */
1661                *trunc_cpos = trunc_start;
1662                /*
1663                 * skip hole if any.
1664                 */
1665                if (range < *trunc_end)
1666                        *trunc_end = range;
1667                *trunc_len = *trunc_end - trunc_start;
1668                coff = trunc_start - le32_to_cpu(rec->e_cpos);
1669                *blkno = le64_to_cpu(rec->e_blkno) +
1670                                ocfs2_clusters_to_blocks(inode->i_sb, coff);
1671                *trunc_end = trunc_start;
1672        } else {
1673                /*
1674                 * It may have two following possibilities:
1675                 *
1676                 * - last record has been removed
1677                 * - trunc_start was within a hole
1678                 *
1679                 * both two cases mean the completion of hole punching.
1680                 */
1681                ret = 1;
1682        }
1683
1684        *done = ret;
1685}
1686
1687int ocfs2_remove_inode_range(struct inode *inode,
1688                             struct buffer_head *di_bh, u64 byte_start,
1689                             u64 byte_len)
1690{
1691        int ret = 0, flags = 0, done = 0, i;
1692        u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1693        u32 cluster_in_el;
1694        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1695        struct ocfs2_cached_dealloc_ctxt dealloc;
1696        struct address_space *mapping = inode->i_mapping;
1697        struct ocfs2_extent_tree et;
1698        struct ocfs2_path *path = NULL;
1699        struct ocfs2_extent_list *el = NULL;
1700        struct ocfs2_extent_rec *rec = NULL;
1701        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1702        u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1703
1704        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1705        ocfs2_init_dealloc_ctxt(&dealloc);
1706
1707        trace_ocfs2_remove_inode_range(
1708                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
1709                        (unsigned long long)byte_start,
1710                        (unsigned long long)byte_len);
1711
1712        if (byte_len == 0)
1713                return 0;
1714
1715        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1716                ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1717                                            byte_start + byte_len, 0);
1718                if (ret) {
1719                        mlog_errno(ret);
1720                        goto out;
1721                }
1722                /*
1723                 * There's no need to get fancy with the page cache
1724                 * truncate of an inline-data inode. We're talking
1725                 * about less than a page here, which will be cached
1726                 * in the dinode buffer anyway.
1727                 */
1728                unmap_mapping_range(mapping, 0, 0, 0);
1729                truncate_inode_pages(mapping, 0);
1730                goto out;
1731        }
1732
1733        /*
1734         * For reflinks, we may need to CoW 2 clusters which might be
1735         * partially zero'd later, if hole's start and end offset were
1736         * within one cluster(means is not exactly aligned to clustersize).
1737         */
1738
1739        if (ocfs2_is_refcount_inode(inode)) {
1740                ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1741                if (ret) {
1742                        mlog_errno(ret);
1743                        goto out;
1744                }
1745
1746                ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1747                if (ret) {
1748                        mlog_errno(ret);
1749                        goto out;
1750                }
1751        }
1752
1753        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1754        trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1755        cluster_in_el = trunc_end;
1756
1757        ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1758        if (ret) {
1759                mlog_errno(ret);
1760                goto out;
1761        }
1762
1763        path = ocfs2_new_path_from_et(&et);
1764        if (!path) {
1765                ret = -ENOMEM;
1766                mlog_errno(ret);
1767                goto out;
1768        }
1769
1770        while (trunc_end > trunc_start) {
1771
1772                ret = ocfs2_find_path(INODE_CACHE(inode), path,
1773                                      cluster_in_el);
1774                if (ret) {
1775                        mlog_errno(ret);
1776                        goto out;
1777                }
1778
1779                el = path_leaf_el(path);
1780
1781                i = ocfs2_find_rec(el, trunc_end);
1782                /*
1783                 * Need to go to previous extent block.
1784                 */
1785                if (i < 0) {
1786                        if (path->p_tree_depth == 0)
1787                                break;
1788
1789                        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1790                                                            path,
1791                                                            &cluster_in_el);
1792                        if (ret) {
1793                                mlog_errno(ret);
1794                                goto out;
1795                        }
1796
1797                        /*
1798                         * We've reached the leftmost extent block,
1799                         * it's safe to leave.
1800                         */
1801                        if (cluster_in_el == 0)
1802                                break;
1803
1804                        /*
1805                         * The 'pos' searched for previous extent block is
1806                         * always one cluster less than actual trunc_end.
1807                         */
1808                        trunc_end = cluster_in_el + 1;
1809
1810                        ocfs2_reinit_path(path, 1);
1811
1812                        continue;
1813
1814                } else
1815                        rec = &el->l_recs[i];
1816
1817                ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1818                                     &trunc_len, &trunc_end, &blkno, &done);
1819                if (done)
1820                        break;
1821
1822                flags = rec->e_flags;
1823                phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1824
1825                ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1826                                               phys_cpos, trunc_len, flags,
1827                                               &dealloc, refcount_loc, false);
1828                if (ret < 0) {
1829                        mlog_errno(ret);
1830                        goto out;
1831                }
1832
1833                cluster_in_el = trunc_end;
1834
1835                ocfs2_reinit_path(path, 1);
1836        }
1837
1838        ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1839
1840out:
1841        ocfs2_free_path(path);
1842        ocfs2_schedule_truncate_log_flush(osb, 1);
1843        ocfs2_run_deallocs(osb, &dealloc);
1844
1845        return ret;
1846}
1847
1848/*
1849 * Parts of this function taken from xfs_change_file_space()
1850 */
1851static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1852                                     loff_t f_pos, unsigned int cmd,
1853                                     struct ocfs2_space_resv *sr,
1854                                     int change_size)
1855{
1856        int ret;
1857        s64 llen;
1858        loff_t size;
1859        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1860        struct buffer_head *di_bh = NULL;
1861        handle_t *handle;
1862        unsigned long long max_off = inode->i_sb->s_maxbytes;
1863
1864        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1865                return -EROFS;
1866
1867        inode_lock(inode);
1868
1869        /*
1870         * This prevents concurrent writes on other nodes
1871         */
1872        ret = ocfs2_rw_lock(inode, 1);
1873        if (ret) {
1874                mlog_errno(ret);
1875                goto out;
1876        }
1877
1878        ret = ocfs2_inode_lock(inode, &di_bh, 1);
1879        if (ret) {
1880                mlog_errno(ret);
1881                goto out_rw_unlock;
1882        }
1883
1884        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1885                ret = -EPERM;
1886                goto out_inode_unlock;
1887        }
1888
1889        switch (sr->l_whence) {
1890        case 0: /*SEEK_SET*/
1891                break;
1892        case 1: /*SEEK_CUR*/
1893                sr->l_start += f_pos;
1894                break;
1895        case 2: /*SEEK_END*/
1896                sr->l_start += i_size_read(inode);
1897                break;
1898        default:
1899                ret = -EINVAL;
1900                goto out_inode_unlock;
1901        }
1902        sr->l_whence = 0;
1903
1904        llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1905
1906        if (sr->l_start < 0
1907            || sr->l_start > max_off
1908            || (sr->l_start + llen) < 0
1909            || (sr->l_start + llen) > max_off) {
1910                ret = -EINVAL;
1911                goto out_inode_unlock;
1912        }
1913        size = sr->l_start + sr->l_len;
1914
1915        if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
1916            cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
1917                if (sr->l_len <= 0) {
1918                        ret = -EINVAL;
1919                        goto out_inode_unlock;
1920                }
1921        }
1922
1923        if (file && should_remove_suid(file->f_path.dentry)) {
1924                ret = __ocfs2_write_remove_suid(inode, di_bh);
1925                if (ret) {
1926                        mlog_errno(ret);
1927                        goto out_inode_unlock;
1928                }
1929        }
1930
1931        down_write(&OCFS2_I(inode)->ip_alloc_sem);
1932        switch (cmd) {
1933        case OCFS2_IOC_RESVSP:
1934        case OCFS2_IOC_RESVSP64:
1935                /*
1936                 * This takes unsigned offsets, but the signed ones we
1937                 * pass have been checked against overflow above.
1938                 */
1939                ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1940                                                       sr->l_len);
1941                break;
1942        case OCFS2_IOC_UNRESVSP:
1943        case OCFS2_IOC_UNRESVSP64:
1944                ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1945                                               sr->l_len);
1946                break;
1947        default:
1948                ret = -EINVAL;
1949        }
1950        up_write(&OCFS2_I(inode)->ip_alloc_sem);
1951        if (ret) {
1952                mlog_errno(ret);
1953                goto out_inode_unlock;
1954        }
1955
1956        /*
1957         * We update c/mtime for these changes
1958         */
1959        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1960        if (IS_ERR(handle)) {
1961                ret = PTR_ERR(handle);
1962                mlog_errno(ret);
1963                goto out_inode_unlock;
1964        }
1965
1966        if (change_size && i_size_read(inode) < size)
1967                i_size_write(inode, size);
1968
1969        inode->i_ctime = inode->i_mtime = current_time(inode);
1970        ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1971        if (ret < 0)
1972                mlog_errno(ret);
1973
1974        if (file && (file->f_flags & O_SYNC))
1975                handle->h_sync = 1;
1976
1977        ocfs2_commit_trans(osb, handle);
1978
1979out_inode_unlock:
1980        brelse(di_bh);
1981        ocfs2_inode_unlock(inode, 1);
1982out_rw_unlock:
1983        ocfs2_rw_unlock(inode, 1);
1984
1985out:
1986        inode_unlock(inode);
1987        return ret;
1988}
1989
1990int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1991                            struct ocfs2_space_resv *sr)
1992{
1993        struct inode *inode = file_inode(file);
1994        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1995        int ret;
1996
1997        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1998            !ocfs2_writes_unwritten_extents(osb))
1999                return -ENOTTY;
2000        else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
2001                 !ocfs2_sparse_alloc(osb))
2002                return -ENOTTY;
2003
2004        if (!S_ISREG(inode->i_mode))
2005                return -EINVAL;
2006
2007        if (!(file->f_mode & FMODE_WRITE))
2008                return -EBADF;
2009
2010        ret = mnt_want_write_file(file);
2011        if (ret)
2012                return ret;
2013        ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
2014        mnt_drop_write_file(file);
2015        return ret;
2016}
2017
2018static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
2019                            loff_t len)
2020{
2021        struct inode *inode = file_inode(file);
2022        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2023        struct ocfs2_space_resv sr;
2024        int change_size = 1;
2025        int cmd = OCFS2_IOC_RESVSP64;
2026
2027        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2028                return -EOPNOTSUPP;
2029        if (!ocfs2_writes_unwritten_extents(osb))
2030                return -EOPNOTSUPP;
2031
2032        if (mode & FALLOC_FL_KEEP_SIZE)
2033                change_size = 0;
2034
2035        if (mode & FALLOC_FL_PUNCH_HOLE)
2036                cmd = OCFS2_IOC_UNRESVSP64;
2037
2038        sr.l_whence = 0;
2039        sr.l_start = (s64)offset;
2040        sr.l_len = (s64)len;
2041
2042        return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2043                                         change_size);
2044}
2045
2046int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2047                                   size_t count)
2048{
2049        int ret = 0;
2050        unsigned int extent_flags;
2051        u32 cpos, clusters, extent_len, phys_cpos;
2052        struct super_block *sb = inode->i_sb;
2053
2054        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2055            !ocfs2_is_refcount_inode(inode) ||
2056            OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2057                return 0;
2058
2059        cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2060        clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2061
2062        while (clusters) {
2063                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2064                                         &extent_flags);
2065                if (ret < 0) {
2066                        mlog_errno(ret);
2067                        goto out;
2068                }
2069
2070                if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2071                        ret = 1;
2072                        break;
2073                }
2074
2075                if (extent_len > clusters)
2076                        extent_len = clusters;
2077
2078                clusters -= extent_len;
2079                cpos += extent_len;
2080        }
2081out:
2082        return ret;
2083}
2084
2085static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2086{
2087        int blockmask = inode->i_sb->s_blocksize - 1;
2088        loff_t final_size = pos + count;
2089
2090        if ((pos & blockmask) || (final_size & blockmask))
2091                return 1;
2092        return 0;
2093}
2094
2095static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2096                                            struct file *file,
2097                                            loff_t pos, size_t count,
2098                                            int *meta_level)
2099{
2100        int ret;
2101        struct buffer_head *di_bh = NULL;
2102        u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2103        u32 clusters =
2104                ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2105
2106        ret = ocfs2_inode_lock(inode, &di_bh, 1);
2107        if (ret) {
2108                mlog_errno(ret);
2109                goto out;
2110        }
2111
2112        *meta_level = 1;
2113
2114        ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2115        if (ret)
2116                mlog_errno(ret);
2117out:
2118        brelse(di_bh);
2119        return ret;
2120}
2121
2122static int ocfs2_prepare_inode_for_write(struct file *file,
2123                                         loff_t pos, size_t count, int wait)
2124{
2125        int ret = 0, meta_level = 0, overwrite_io = 0;
2126        struct dentry *dentry = file->f_path.dentry;
2127        struct inode *inode = d_inode(dentry);
2128        struct buffer_head *di_bh = NULL;
2129        loff_t end;
2130
2131        /*
2132         * We start with a read level meta lock and only jump to an ex
2133         * if we need to make modifications here.
2134         */
2135        for(;;) {
2136                if (wait)
2137                        ret = ocfs2_inode_lock(inode, NULL, meta_level);
2138                else
2139                        ret = ocfs2_try_inode_lock(inode,
2140                                overwrite_io ? NULL : &di_bh, meta_level);
2141                if (ret < 0) {
2142                        meta_level = -1;
2143                        if (ret != -EAGAIN)
2144                                mlog_errno(ret);
2145                        goto out;
2146                }
2147
2148                /*
2149                 * Check if IO will overwrite allocated blocks in case
2150                 * IOCB_NOWAIT flag is set.
2151                 */
2152                if (!wait && !overwrite_io) {
2153                        overwrite_io = 1;
2154                        if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
2155                                ret = -EAGAIN;
2156                                goto out_unlock;
2157                        }
2158
2159                        ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
2160                        brelse(di_bh);
2161                        di_bh = NULL;
2162                        up_read(&OCFS2_I(inode)->ip_alloc_sem);
2163                        if (ret < 0) {
2164                                if (ret != -EAGAIN)
2165                                        mlog_errno(ret);
2166                                goto out_unlock;
2167                        }
2168                }
2169
2170                /* Clear suid / sgid if necessary. We do this here
2171                 * instead of later in the write path because
2172                 * remove_suid() calls ->setattr without any hint that
2173                 * we may have already done our cluster locking. Since
2174                 * ocfs2_setattr() *must* take cluster locks to
2175                 * proceed, this will lead us to recursively lock the
2176                 * inode. There's also the dinode i_size state which
2177                 * can be lost via setattr during extending writes (we
2178                 * set inode->i_size at the end of a write. */
2179                if (should_remove_suid(dentry)) {
2180                        if (meta_level == 0) {
2181                                ocfs2_inode_unlock(inode, meta_level);
2182                                meta_level = 1;
2183                                continue;
2184                        }
2185
2186                        ret = ocfs2_write_remove_suid(inode);
2187                        if (ret < 0) {
2188                                mlog_errno(ret);
2189                                goto out_unlock;
2190                        }
2191                }
2192
2193                end = pos + count;
2194
2195                ret = ocfs2_check_range_for_refcount(inode, pos, count);
2196                if (ret == 1) {
2197                        ocfs2_inode_unlock(inode, meta_level);
2198                        meta_level = -1;
2199
2200                        ret = ocfs2_prepare_inode_for_refcount(inode,
2201                                                               file,
2202                                                               pos,
2203                                                               count,
2204                                                               &meta_level);
2205                }
2206
2207                if (ret < 0) {
2208                        mlog_errno(ret);
2209                        goto out_unlock;
2210                }
2211
2212                break;
2213        }
2214
2215out_unlock:
2216        trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2217                                            pos, count, wait);
2218
2219        brelse(di_bh);
2220
2221        if (meta_level >= 0)
2222                ocfs2_inode_unlock(inode, meta_level);
2223
2224out:
2225        return ret;
2226}
2227
2228static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2229                                    struct iov_iter *from)
2230{
2231        int rw_level;
2232        ssize_t written = 0;
2233        ssize_t ret;
2234        size_t count = iov_iter_count(from);
2235        struct file *file = iocb->ki_filp;
2236        struct inode *inode = file_inode(file);
2237        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2238        int full_coherency = !(osb->s_mount_opt &
2239                               OCFS2_MOUNT_COHERENCY_BUFFERED);
2240        void *saved_ki_complete = NULL;
2241        int append_write = ((iocb->ki_pos + count) >=
2242                        i_size_read(inode) ? 1 : 0);
2243        int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2244        int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2245
2246        trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
2247                (unsigned long long)OCFS2_I(inode)->ip_blkno,
2248                file->f_path.dentry->d_name.len,
2249                file->f_path.dentry->d_name.name,
2250                (unsigned int)from->nr_segs);   /* GRRRRR */
2251
2252        if (!direct_io && nowait)
2253                return -EOPNOTSUPP;
2254
2255        if (count == 0)
2256                return 0;
2257
2258        if (nowait) {
2259                if (!inode_trylock(inode))
2260                        return -EAGAIN;
2261        } else
2262                inode_lock(inode);
2263
2264        /*
2265         * Concurrent O_DIRECT writes are allowed with
2266         * mount_option "coherency=buffered".
2267         * For append write, we must take rw EX.
2268         */
2269        rw_level = (!direct_io || full_coherency || append_write);
2270
2271        if (nowait)
2272                ret = ocfs2_try_rw_lock(inode, rw_level);
2273        else
2274                ret = ocfs2_rw_lock(inode, rw_level);
2275        if (ret < 0) {
2276                if (ret != -EAGAIN)
2277                        mlog_errno(ret);
2278                goto out_mutex;
2279        }
2280
2281        /*
2282         * O_DIRECT writes with "coherency=full" need to take EX cluster
2283         * inode_lock to guarantee coherency.
2284         */
2285        if (direct_io && full_coherency) {
2286                /*
2287                 * We need to take and drop the inode lock to force
2288                 * other nodes to drop their caches.  Buffered I/O
2289                 * already does this in write_begin().
2290                 */
2291                if (nowait)
2292                        ret = ocfs2_try_inode_lock(inode, NULL, 1);
2293                else
2294                        ret = ocfs2_inode_lock(inode, NULL, 1);
2295                if (ret < 0) {
2296                        if (ret != -EAGAIN)
2297                                mlog_errno(ret);
2298                        goto out;
2299                }
2300
2301                ocfs2_inode_unlock(inode, 1);
2302        }
2303
2304        ret = generic_write_checks(iocb, from);
2305        if (ret <= 0) {
2306                if (ret)
2307                        mlog_errno(ret);
2308                goto out;
2309        }
2310        count = ret;
2311
2312        ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
2313        if (ret < 0) {
2314                if (ret != -EAGAIN)
2315                        mlog_errno(ret);
2316                goto out;
2317        }
2318
2319        if (direct_io && !is_sync_kiocb(iocb) &&
2320            ocfs2_is_io_unaligned(inode, count, iocb->ki_pos)) {
2321                /*
2322                 * Make it a sync io if it's an unaligned aio.
2323                 */
2324                saved_ki_complete = xchg(&iocb->ki_complete, NULL);
2325        }
2326
2327        /* communicate with ocfs2_dio_end_io */
2328        ocfs2_iocb_set_rw_locked(iocb, rw_level);
2329
2330        written = __generic_file_write_iter(iocb, from);
2331        /* buffered aio wouldn't have proper lock coverage today */
2332        BUG_ON(written == -EIOCBQUEUED && !direct_io);
2333
2334        /*
2335         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2336         * function pointer which is called when o_direct io completes so that
2337         * it can unlock our rw lock.
2338         * Unfortunately there are error cases which call end_io and others
2339         * that don't.  so we don't have to unlock the rw_lock if either an
2340         * async dio is going to do it in the future or an end_io after an
2341         * error has already done it.
2342         */
2343        if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2344                rw_level = -1;
2345        }
2346
2347        if (unlikely(written <= 0))
2348                goto out;
2349
2350        if (((file->f_flags & O_DSYNC) && !direct_io) ||
2351            IS_SYNC(inode)) {
2352                ret = filemap_fdatawrite_range(file->f_mapping,
2353                                               iocb->ki_pos - written,
2354                                               iocb->ki_pos - 1);
2355                if (ret < 0)
2356                        written = ret;
2357
2358                if (!ret) {
2359                        ret = jbd2_journal_force_commit(osb->journal->j_journal);
2360                        if (ret < 0)
2361                                written = ret;
2362                }
2363
2364                if (!ret)
2365                        ret = filemap_fdatawait_range(file->f_mapping,
2366                                                      iocb->ki_pos - written,
2367                                                      iocb->ki_pos - 1);
2368        }
2369
2370out:
2371        if (saved_ki_complete)
2372                xchg(&iocb->ki_complete, saved_ki_complete);
2373
2374        if (rw_level != -1)
2375                ocfs2_rw_unlock(inode, rw_level);
2376
2377out_mutex:
2378        inode_unlock(inode);
2379
2380        if (written)
2381                ret = written;
2382        return ret;
2383}
2384
2385static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2386                                   struct iov_iter *to)
2387{
2388        int ret = 0, rw_level = -1, lock_level = 0;
2389        struct file *filp = iocb->ki_filp;
2390        struct inode *inode = file_inode(filp);
2391        int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2392        int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2393
2394        trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
2395                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
2396                        filp->f_path.dentry->d_name.len,
2397                        filp->f_path.dentry->d_name.name,
2398                        to->nr_segs);   /* GRRRRR */
2399
2400
2401        if (!inode) {
2402                ret = -EINVAL;
2403                mlog_errno(ret);
2404                goto bail;
2405        }
2406
2407        if (!direct_io && nowait)
2408                return -EOPNOTSUPP;
2409
2410        /*
2411         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
2412         * need locks to protect pending reads from racing with truncate.
2413         */
2414        if (direct_io) {
2415                if (nowait)
2416                        ret = ocfs2_try_rw_lock(inode, 0);
2417                else
2418                        ret = ocfs2_rw_lock(inode, 0);
2419
2420                if (ret < 0) {
2421                        if (ret != -EAGAIN)
2422                                mlog_errno(ret);
2423                        goto bail;
2424                }
2425                rw_level = 0;
2426                /* communicate with ocfs2_dio_end_io */
2427                ocfs2_iocb_set_rw_locked(iocb, rw_level);
2428        }
2429
2430        /*
2431         * We're fine letting folks race truncates and extending
2432         * writes with read across the cluster, just like they can
2433         * locally. Hence no rw_lock during read.
2434         *
2435         * Take and drop the meta data lock to update inode fields
2436         * like i_size. This allows the checks down below
2437         * generic_file_read_iter() a chance of actually working.
2438         */
2439        ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
2440                                     !nowait);
2441        if (ret < 0) {
2442                if (ret != -EAGAIN)
2443                        mlog_errno(ret);
2444                goto bail;
2445        }
2446        ocfs2_inode_unlock(inode, lock_level);
2447
2448        ret = generic_file_read_iter(iocb, to);
2449        trace_generic_file_read_iter_ret(ret);
2450
2451        /* buffered aio wouldn't have proper lock coverage today */
2452        BUG_ON(ret == -EIOCBQUEUED && !direct_io);
2453
2454        /* see ocfs2_file_write_iter */
2455        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2456                rw_level = -1;
2457        }
2458
2459bail:
2460        if (rw_level != -1)
2461                ocfs2_rw_unlock(inode, rw_level);
2462
2463        return ret;
2464}
2465
2466/* Refer generic_file_llseek_unlocked() */
2467static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
2468{
2469        struct inode *inode = file->f_mapping->host;
2470        int ret = 0;
2471
2472        inode_lock(inode);
2473
2474        switch (whence) {
2475        case SEEK_SET:
2476                break;
2477        case SEEK_END:
2478                /* SEEK_END requires the OCFS2 inode lock for the file
2479                 * because it references the file's size.
2480                 */
2481                ret = ocfs2_inode_lock(inode, NULL, 0);
2482                if (ret < 0) {
2483                        mlog_errno(ret);
2484                        goto out;
2485                }
2486                offset += i_size_read(inode);
2487                ocfs2_inode_unlock(inode, 0);
2488                break;
2489        case SEEK_CUR:
2490                if (offset == 0) {
2491                        offset = file->f_pos;
2492                        goto out;
2493                }
2494                offset += file->f_pos;
2495                break;
2496        case SEEK_DATA:
2497        case SEEK_HOLE:
2498                ret = ocfs2_seek_data_hole_offset(file, &offset, whence);
2499                if (ret)
2500                        goto out;
2501                break;
2502        default:
2503                ret = -EINVAL;
2504                goto out;
2505        }
2506
2507        offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
2508
2509out:
2510        inode_unlock(inode);
2511        if (ret)
2512                return ret;
2513        return offset;
2514}
2515
2516static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in,
2517                                     struct file *file_out, loff_t pos_out,
2518                                     loff_t len, unsigned int remap_flags)
2519{
2520        struct inode *inode_in = file_inode(file_in);
2521        struct inode *inode_out = file_inode(file_out);
2522        struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb);
2523        struct buffer_head *in_bh = NULL, *out_bh = NULL;
2524        bool same_inode = (inode_in == inode_out);
2525        loff_t remapped = 0;
2526        ssize_t ret;
2527
2528        if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
2529                return -EINVAL;
2530        if (!ocfs2_refcount_tree(osb))
2531                return -EOPNOTSUPP;
2532        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
2533                return -EROFS;
2534
2535        /* Lock both files against IO */
2536        ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh);
2537        if (ret)
2538                return ret;
2539
2540        /* Check file eligibility and prepare for block sharing. */
2541        ret = -EINVAL;
2542        if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) ||
2543            (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE))
2544                goto out_unlock;
2545
2546        ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
2547                        &len, remap_flags);
2548        if (ret < 0 || len == 0)
2549                goto out_unlock;
2550
2551        /* Lock out changes to the allocation maps and remap. */
2552        down_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2553        if (!same_inode)
2554                down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem,
2555                                  SINGLE_DEPTH_NESTING);
2556
2557        /* Zap any page cache for the destination file's range. */
2558        truncate_inode_pages_range(&inode_out->i_data,
2559                                   round_down(pos_out, PAGE_SIZE),
2560                                   round_up(pos_out + len, PAGE_SIZE) - 1);
2561
2562        remapped = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in,
2563                        inode_out, out_bh, pos_out, len);
2564        up_write(&OCFS2_I(inode_in)->ip_alloc_sem);
2565        if (!same_inode)
2566                up_write(&OCFS2_I(inode_out)->ip_alloc_sem);
2567        if (remapped < 0) {
2568                ret = remapped;
2569                mlog_errno(ret);
2570                goto out_unlock;
2571        }
2572
2573        /*
2574         * Empty the extent map so that we may get the right extent
2575         * record from the disk.
2576         */
2577        ocfs2_extent_map_trunc(inode_in, 0);
2578        ocfs2_extent_map_trunc(inode_out, 0);
2579
2580        ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len);
2581        if (ret) {
2582                mlog_errno(ret);
2583                goto out_unlock;
2584        }
2585
2586out_unlock:
2587        ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh);
2588        return remapped > 0 ? remapped : ret;
2589}
2590
2591const struct inode_operations ocfs2_file_iops = {
2592        .setattr        = ocfs2_setattr,
2593        .getattr        = ocfs2_getattr,
2594        .permission     = ocfs2_permission,
2595        .listxattr      = ocfs2_listxattr,
2596        .fiemap         = ocfs2_fiemap,
2597        .get_acl        = ocfs2_iop_get_acl,
2598        .set_acl        = ocfs2_iop_set_acl,
2599};
2600
2601const struct inode_operations ocfs2_special_file_iops = {
2602        .setattr        = ocfs2_setattr,
2603        .getattr        = ocfs2_getattr,
2604        .permission     = ocfs2_permission,
2605        .get_acl        = ocfs2_iop_get_acl,
2606        .set_acl        = ocfs2_iop_set_acl,
2607};
2608
2609/*
2610 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2611 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2612 */
2613const struct file_operations ocfs2_fops = {
2614        .llseek         = ocfs2_file_llseek,
2615        .mmap           = ocfs2_mmap,
2616        .fsync          = ocfs2_sync_file,
2617        .release        = ocfs2_file_release,
2618        .open           = ocfs2_file_open,
2619        .read_iter      = ocfs2_file_read_iter,
2620        .write_iter     = ocfs2_file_write_iter,
2621        .unlocked_ioctl = ocfs2_ioctl,
2622#ifdef CONFIG_COMPAT
2623        .compat_ioctl   = ocfs2_compat_ioctl,
2624#endif
2625        .lock           = ocfs2_lock,
2626        .flock          = ocfs2_flock,
2627        .splice_read    = generic_file_splice_read,
2628        .splice_write   = iter_file_splice_write,
2629        .fallocate      = ocfs2_fallocate,
2630        .remap_file_range = ocfs2_remap_file_range,
2631};
2632
2633const struct file_operations ocfs2_dops = {
2634        .llseek         = generic_file_llseek,
2635        .read           = generic_read_dir,
2636        .iterate        = ocfs2_readdir,
2637        .fsync          = ocfs2_sync_file,
2638        .release        = ocfs2_dir_release,
2639        .open           = ocfs2_dir_open,
2640        .unlocked_ioctl = ocfs2_ioctl,
2641#ifdef CONFIG_COMPAT
2642        .compat_ioctl   = ocfs2_compat_ioctl,
2643#endif
2644        .lock           = ocfs2_lock,
2645        .flock          = ocfs2_flock,
2646};
2647
2648/*
2649 * POSIX-lockless variants of our file_operations.
2650 *
2651 * These will be used if the underlying cluster stack does not support
2652 * posix file locking, if the user passes the "localflocks" mount
2653 * option, or if we have a local-only fs.
2654 *
2655 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2656 * so we still want it in the case of no stack support for
2657 * plocks. Internally, it will do the right thing when asked to ignore
2658 * the cluster.
2659 */
2660const struct file_operations ocfs2_fops_no_plocks = {
2661        .llseek         = ocfs2_file_llseek,
2662        .mmap           = ocfs2_mmap,
2663        .fsync          = ocfs2_sync_file,
2664        .release        = ocfs2_file_release,
2665        .open           = ocfs2_file_open,
2666        .read_iter      = ocfs2_file_read_iter,
2667        .write_iter     = ocfs2_file_write_iter,
2668        .unlocked_ioctl = ocfs2_ioctl,
2669#ifdef CONFIG_COMPAT
2670        .compat_ioctl   = ocfs2_compat_ioctl,
2671#endif
2672        .flock          = ocfs2_flock,
2673        .splice_read    = generic_file_splice_read,
2674        .splice_write   = iter_file_splice_write,
2675        .fallocate      = ocfs2_fallocate,
2676        .remap_file_range = ocfs2_remap_file_range,
2677};
2678
2679const struct file_operations ocfs2_dops_no_plocks = {
2680        .llseek         = generic_file_llseek,
2681        .read           = generic_read_dir,
2682        .iterate        = ocfs2_readdir,
2683        .fsync          = ocfs2_sync_file,
2684        .release        = ocfs2_dir_release,
2685        .open           = ocfs2_dir_open,
2686        .unlocked_ioctl = ocfs2_ioctl,
2687#ifdef CONFIG_COMPAT
2688        .compat_ioctl   = ocfs2_compat_ioctl,
2689#endif
2690        .flock          = ocfs2_flock,
2691};
2692