LXR linux/fs/ocfs2/file.c

   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * file.c
   5 *
   6 * File open, close, extend, truncate
   7 *
   8 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   9 *
  10 * This program is free software; you can redistribute it and/or
  11 * modify it under the terms of the GNU General Public
  12 * License as published by the Free Software Foundation; either
  13 * version 2 of the License, or (at your option) any later version.
  14 *
  15 * This program is distributed in the hope that it will be useful,
  16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 * General Public License for more details.
  19 *
  20 * You should have received a copy of the GNU General Public
  21 * License along with this program; if not, write to the
  22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23 * Boston, MA 02111-1307, USA.
  24 */
  25
  26#include <linux/capability.h>
  27#include <linux/fs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/highmem.h>
  31#include <linux/pagemap.h>
  32#include <linux/uio.h>
  33#include <linux/sched.h>
  34#include <linux/splice.h>
  35#include <linux/mount.h>
  36#include <linux/writeback.h>
  37#include <linux/falloc.h>
  38#include <linux/quotaops.h>
  39#include <linux/blkdev.h>
  40
  41#define MLOG_MASK_PREFIX ML_INODE
  42#include <cluster/masklog.h>
  43
  44#include "ocfs2.h"
  45
  46#include "alloc.h"
  47#include "aops.h"
  48#include "dir.h"
  49#include "dlmglue.h"
  50#include "extent_map.h"
  51#include "file.h"
  52#include "sysfile.h"
  53#include "inode.h"
  54#include "ioctl.h"
  55#include "journal.h"
  56#include "locks.h"
  57#include "mmap.h"
  58#include "suballoc.h"
  59#include "super.h"
  60#include "xattr.h"
  61#include "acl.h"
  62#include "quota.h"
  63#include "refcounttree.h"
  64
  65#include "buffer_head_io.h"
  66
  67static int ocfs2_init_file_private(struct inode *inode, struct file *file)
  68{
  69        struct ocfs2_file_private *fp;
  70
  71        fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
  72        if (!fp)
  73                return -ENOMEM;
  74
  75        fp->fp_file = file;
  76        mutex_init(&fp->fp_mutex);
  77        ocfs2_file_lock_res_init(&fp->fp_flock, fp);
  78        file->private_data = fp;
  79
  80        return 0;
  81}
  82
  83static void ocfs2_free_file_private(struct inode *inode, struct file *file)
  84{
  85        struct ocfs2_file_private *fp = file->private_data;
  86        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  87
  88        if (fp) {
  89                ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
  90                ocfs2_lock_res_free(&fp->fp_flock);
  91                kfree(fp);
  92                file->private_data = NULL;
  93        }
  94}
  95
  96static int ocfs2_file_open(struct inode *inode, struct file *file)
  97{
  98        int status;
  99        int mode = file->f_flags;
 100        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 101
 102        mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
 103                   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
 104
 105        if (file->f_mode & FMODE_WRITE)
 106                dquot_initialize(inode);
 107
 108        spin_lock(&oi->ip_lock);
 109
 110        /* Check that the inode hasn't been wiped from disk by another
 111         * node. If it hasn't then we're safe as long as we hold the
 112         * spin lock until our increment of open count. */
 113        if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 114                spin_unlock(&oi->ip_lock);
 115
 116                status = -ENOENT;
 117                goto leave;
 118        }
 119
 120        if (mode & O_DIRECT)
 121                oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
 122
 123        oi->ip_open_count++;
 124        spin_unlock(&oi->ip_lock);
 125
 126        status = ocfs2_init_file_private(inode, file);
 127        if (status) {
 128                /*
 129                 * We want to set open count back if we're failing the
 130                 * open.
 131                 */
 132                spin_lock(&oi->ip_lock);
 133                oi->ip_open_count--;
 134                spin_unlock(&oi->ip_lock);
 135        }
 136
 137leave:
 138        mlog_exit(status);
 139        return status;
 140}
 141
 142static int ocfs2_file_release(struct inode *inode, struct file *file)
 143{
 144        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 145
 146        mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
 147                       file->f_path.dentry->d_name.len,
 148                       file->f_path.dentry->d_name.name);
 149
 150        spin_lock(&oi->ip_lock);
 151        if (!--oi->ip_open_count)
 152                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
 153        spin_unlock(&oi->ip_lock);
 154
 155        ocfs2_free_file_private(inode, file);
 156
 157        mlog_exit(0);
 158
 159        return 0;
 160}
 161
 162static int ocfs2_dir_open(struct inode *inode, struct file *file)
 163{
 164        return ocfs2_init_file_private(inode, file);
 165}
 166
 167static int ocfs2_dir_release(struct inode *inode, struct file *file)
 168{
 169        ocfs2_free_file_private(inode, file);
 170        return 0;
 171}
 172
 173static int ocfs2_sync_file(struct file *file, int datasync)
 174{
 175        int err = 0;
 176        journal_t *journal;
 177        struct inode *inode = file->f_mapping->host;
 178        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 179
 180        mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
 181                   file->f_path.dentry, file->f_path.dentry->d_name.len,
 182                   file->f_path.dentry->d_name.name);
 183
 184        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
 185                /*
 186                 * We still have to flush drive's caches to get data to the
 187                 * platter
 188                 */
 189                if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
 190                        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 191                goto bail;
 192        }
 193
 194        journal = osb->journal->j_journal;
 195        err = jbd2_journal_force_commit(journal);
 196
 197bail:
 198        mlog_exit(err);
 199
 200        return (err < 0) ? -EIO : 0;
 201}
 202
 203int ocfs2_should_update_atime(struct inode *inode,
 204                              struct vfsmount *vfsmnt)
 205{
 206        struct timespec now;
 207        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 208
 209        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 210                return 0;
 211
 212        if ((inode->i_flags & S_NOATIME) ||
 213            ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
 214                return 0;
 215
 216        /*
 217         * We can be called with no vfsmnt structure - NFSD will
 218         * sometimes do this.
 219         *
 220         * Note that our action here is different than touch_atime() -
 221         * if we can't tell whether this is a noatime mount, then we
 222         * don't know whether to trust the value of s_atime_quantum.
 223         */
 224        if (vfsmnt == NULL)
 225                return 0;
 226
 227        if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
 228            ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 229                return 0;
 230
 231        if (vfsmnt->mnt_flags & MNT_RELATIME) {
 232                if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
 233                    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
 234                        return 1;
 235
 236                return 0;
 237        }
 238
 239        now = CURRENT_TIME;
 240        if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
 241                return 0;
 242        else
 243                return 1;
 244}
 245
 246int ocfs2_update_inode_atime(struct inode *inode,
 247                             struct buffer_head *bh)
 248{
 249        int ret;
 250        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 251        handle_t *handle;
 252        struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 253
 254        mlog_entry_void();
 255
 256        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 257        if (IS_ERR(handle)) {
 258                ret = PTR_ERR(handle);
 259                mlog_errno(ret);
 260                goto out;
 261        }
 262
 263        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 264                                      OCFS2_JOURNAL_ACCESS_WRITE);
 265        if (ret) {
 266                mlog_errno(ret);
 267                goto out_commit;
 268        }
 269
 270        /*
 271         * Don't use ocfs2_mark_inode_dirty() here as we don't always
 272         * have i_mutex to guard against concurrent changes to other
 273         * inode fields.
 274         */
 275        inode->i_atime = CURRENT_TIME;
 276        di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 277        di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
 278        ocfs2_journal_dirty(handle, bh);
 279
 280out_commit:
 281        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 282out:
 283        mlog_exit(ret);
 284        return ret;
 285}
 286
 287static int ocfs2_set_inode_size(handle_t *handle,
 288                                struct inode *inode,
 289                                struct buffer_head *fe_bh,
 290                                u64 new_i_size)
 291{
 292        int status;
 293
 294        mlog_entry_void();
 295        i_size_write(inode, new_i_size);
 296        inode->i_blocks = ocfs2_inode_sector_count(inode);
 297        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 298
 299        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 300        if (status < 0) {
 301                mlog_errno(status);
 302                goto bail;
 303        }
 304
 305bail:
 306        mlog_exit(status);
 307        return status;
 308}
 309
 310int ocfs2_simple_size_update(struct inode *inode,
 311                             struct buffer_head *di_bh,
 312                             u64 new_i_size)
 313{
 314        int ret;
 315        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 316        handle_t *handle = NULL;
 317
 318        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 319        if (IS_ERR(handle)) {
 320                ret = PTR_ERR(handle);
 321                mlog_errno(ret);
 322                goto out;
 323        }
 324
 325        ret = ocfs2_set_inode_size(handle, inode, di_bh,
 326                                   new_i_size);
 327        if (ret < 0)
 328                mlog_errno(ret);
 329
 330        ocfs2_commit_trans(osb, handle);
 331out:
 332        return ret;
 333}
 334
 335static int ocfs2_cow_file_pos(struct inode *inode,
 336                              struct buffer_head *fe_bh,
 337                              u64 offset)
 338{
 339        int status;
 340        u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 341        unsigned int num_clusters = 0;
 342        unsigned int ext_flags = 0;
 343
 344        /*
 345         * If the new offset is aligned to the range of the cluster, there is
 346         * no space for ocfs2_zero_range_for_truncate to fill, so no need to
 347         * CoW either.
 348         */
 349        if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
 350                return 0;
 351
 352        status = ocfs2_get_clusters(inode, cpos, &phys,
 353                                    &num_clusters, &ext_flags);
 354        if (status) {
 355                mlog_errno(status);
 356                goto out;
 357        }
 358
 359        if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
 360                goto out;
 361
 362        return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1);
 363
 364out:
 365        return status;
 366}
 367
 368static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 369                                     struct inode *inode,
 370                                     struct buffer_head *fe_bh,
 371                                     u64 new_i_size)
 372{
 373        int status;
 374        handle_t *handle;
 375        struct ocfs2_dinode *di;
 376        u64 cluster_bytes;
 377
 378        mlog_entry_void();
 379
 380        /*
 381         * We need to CoW the cluster contains the offset if it is reflinked
 382         * since we will call ocfs2_zero_range_for_truncate later which will
 383         * write "0" from offset to the end of the cluster.
 384         */
 385        status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
 386        if (status) {
 387                mlog_errno(status);
 388                return status;
 389        }
 390
 391        /* TODO: This needs to actually orphan the inode in this
 392         * transaction. */
 393
 394        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 395        if (IS_ERR(handle)) {
 396                status = PTR_ERR(handle);
 397                mlog_errno(status);
 398                goto out;
 399        }
 400
 401        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
 402                                         OCFS2_JOURNAL_ACCESS_WRITE);
 403        if (status < 0) {
 404                mlog_errno(status);
 405                goto out_commit;
 406        }
 407
 408        /*
 409         * Do this before setting i_size.
 410         */
 411        cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
 412        status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
 413                                               cluster_bytes);
 414        if (status) {
 415                mlog_errno(status);
 416                goto out_commit;
 417        }
 418
 419        i_size_write(inode, new_i_size);
 420        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 421
 422        di = (struct ocfs2_dinode *) fe_bh->b_data;
 423        di->i_size = cpu_to_le64(new_i_size);
 424        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 425        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 426
 427        ocfs2_journal_dirty(handle, fe_bh);
 428
 429out_commit:
 430        ocfs2_commit_trans(osb, handle);
 431out:
 432
 433        mlog_exit(status);
 434        return status;
 435}
 436
 437static int ocfs2_truncate_file(struct inode *inode,
 438                               struct buffer_head *di_bh,
 439                               u64 new_i_size)
 440{
 441        int status = 0;
 442        struct ocfs2_dinode *fe = NULL;
 443        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 444
 445        mlog_entry("(inode = %llu, new_i_size = %llu\n",
 446                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 447                   (unsigned long long)new_i_size);
 448
 449        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
 450         * already validated it */
 451        fe = (struct ocfs2_dinode *) di_bh->b_data;
 452
 453        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 454                        "Inode %llu, inode i_size = %lld != di "
 455                        "i_size = %llu, i_flags = 0x%x\n",
 456                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 457                        i_size_read(inode),
 458                        (unsigned long long)le64_to_cpu(fe->i_size),
 459                        le32_to_cpu(fe->i_flags));
 460
 461        if (new_i_size > le64_to_cpu(fe->i_size)) {
 462                mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
 463                     (unsigned long long)le64_to_cpu(fe->i_size),
 464                     (unsigned long long)new_i_size);
 465                status = -EINVAL;
 466                mlog_errno(status);
 467                goto bail;
 468        }
 469
 470        mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
 471             (unsigned long long)le64_to_cpu(fe->i_blkno),
 472             (unsigned long long)le64_to_cpu(fe->i_size),
 473             (unsigned long long)new_i_size);
 474
 475        /* lets handle the simple truncate cases before doing any more
 476         * cluster locking. */
 477        if (new_i_size == le64_to_cpu(fe->i_size))
 478                goto bail;
 479
 480        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 481
 482        ocfs2_resv_discard(&osb->osb_la_resmap,
 483                           &OCFS2_I(inode)->ip_la_data_resv);
 484
 485        /*
 486         * The inode lock forced other nodes to sync and drop their
 487         * pages, which (correctly) happens even if we have a truncate
 488         * without allocation change - ocfs2 cluster sizes can be much
 489         * greater than page size, so we have to truncate them
 490         * anyway.
 491         */
 492        unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
 493        truncate_inode_pages(inode->i_mapping, new_i_size);
 494
 495        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 496                status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
 497                                               i_size_read(inode), 1);
 498                if (status)
 499                        mlog_errno(status);
 500
 501                goto bail_unlock_sem;
 502        }
 503
 504        /* alright, we're going to need to do a full blown alloc size
 505         * change. Orphan the inode so that recovery can complete the
 506         * truncate if necessary. This does the task of marking
 507         * i_size. */
 508        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 509        if (status < 0) {
 510                mlog_errno(status);
 511                goto bail_unlock_sem;
 512        }
 513
 514        status = ocfs2_commit_truncate(osb, inode, di_bh);
 515        if (status < 0) {
 516                mlog_errno(status);
 517                goto bail_unlock_sem;
 518        }
 519
 520        /* TODO: orphan dir cleanup here. */
 521bail_unlock_sem:
 522        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 523
 524bail:
 525        if (!status && OCFS2_I(inode)->ip_clusters == 0)
 526                status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 527
 528        mlog_exit(status);
 529        return status;
 530}
 531
 532/*
 533 * extend file allocation only here.
 534 * we'll update all the disk stuff, and oip->alloc_size
 535 *
 536 * expect stuff to be locked, a transaction started and enough data /
 537 * metadata reservations in the contexts.
 538 *
 539 * Will return -EAGAIN, and a reason if a restart is needed.
 540 * If passed in, *reason will always be set, even in error.
 541 */
 542int ocfs2_add_inode_data(struct ocfs2_super *osb,
 543                         struct inode *inode,
 544                         u32 *logical_offset,
 545                         u32 clusters_to_add,
 546                         int mark_unwritten,
 547                         struct buffer_head *fe_bh,
 548                         handle_t *handle,
 549                         struct ocfs2_alloc_context *data_ac,
 550                         struct ocfs2_alloc_context *meta_ac,
 551                         enum ocfs2_alloc_restarted *reason_ret)
 552{
 553        int ret;
 554        struct ocfs2_extent_tree et;
 555
 556        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
 557        ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
 558                                          clusters_to_add, mark_unwritten,
 559                                          data_ac, meta_ac, reason_ret);
 560
 561        return ret;
 562}
 563
 564static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 565                                     u32 clusters_to_add, int mark_unwritten)
 566{
 567        int status = 0;
 568        int restart_func = 0;
 569        int credits;
 570        u32 prev_clusters;
 571        struct buffer_head *bh = NULL;
 572        struct ocfs2_dinode *fe = NULL;
 573        handle_t *handle = NULL;
 574        struct ocfs2_alloc_context *data_ac = NULL;
 575        struct ocfs2_alloc_context *meta_ac = NULL;
 576        enum ocfs2_alloc_restarted why;
 577        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 578        struct ocfs2_extent_tree et;
 579        int did_quota = 0;
 580
 581        mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 582
 583        /*
 584         * This function only exists for file systems which don't
 585         * support holes.
 586         */
 587        BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 588
 589        status = ocfs2_read_inode_block(inode, &bh);
 590        if (status < 0) {
 591                mlog_errno(status);
 592                goto leave;
 593        }
 594        fe = (struct ocfs2_dinode *) bh->b_data;
 595
 596restart_all:
 597        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 598
 599        mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
 600             "clusters_to_add = %u\n",
 601             (unsigned long long)OCFS2_I(inode)->ip_blkno,
 602             (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
 603             clusters_to_add);
 604        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
 605        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 606                                       &data_ac, &meta_ac);
 607        if (status) {
 608                mlog_errno(status);
 609                goto leave;
 610        }
 611
 612        credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
 613                                            clusters_to_add);
 614        handle = ocfs2_start_trans(osb, credits);
 615        if (IS_ERR(handle)) {
 616                status = PTR_ERR(handle);
 617                handle = NULL;
 618                mlog_errno(status);
 619                goto leave;
 620        }
 621
 622restarted_transaction:
 623        status = dquot_alloc_space_nodirty(inode,
 624                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 625        if (status)
 626                goto leave;
 627        did_quota = 1;
 628
 629        /* reserve a write to the file entry early on - that we if we
 630         * run out of credits in the allocation path, we can still
 631         * update i_size. */
 632        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 633                                         OCFS2_JOURNAL_ACCESS_WRITE);
 634        if (status < 0) {
 635                mlog_errno(status);
 636                goto leave;
 637        }
 638
 639        prev_clusters = OCFS2_I(inode)->ip_clusters;
 640
 641        status = ocfs2_add_inode_data(osb,
 642                                      inode,
 643                                      &logical_start,
 644                                      clusters_to_add,
 645                                      mark_unwritten,
 646                                      bh,
 647                                      handle,
 648                                      data_ac,
 649                                      meta_ac,
 650                                      &why);
 651        if ((status < 0) && (status != -EAGAIN)) {
 652                if (status != -ENOSPC)
 653                        mlog_errno(status);
 654                goto leave;
 655        }
 656
 657        ocfs2_journal_dirty(handle, bh);
 658
 659        spin_lock(&OCFS2_I(inode)->ip_lock);
 660        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 661        spin_unlock(&OCFS2_I(inode)->ip_lock);
 662        /* Release unused quota reservation */
 663        dquot_free_space(inode,
 664                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 665        did_quota = 0;
 666
 667        if (why != RESTART_NONE && clusters_to_add) {
 668                if (why == RESTART_META) {
 669                        mlog(0, "restarting function.\n");
 670                        restart_func = 1;
 671                        status = 0;
 672                } else {
 673                        BUG_ON(why != RESTART_TRANS);
 674
 675                        mlog(0, "restarting transaction.\n");
 676                        /* TODO: This can be more intelligent. */
 677                        credits = ocfs2_calc_extend_credits(osb->sb,
 678                                                            &fe->id2.i_list,
 679                                                            clusters_to_add);
 680                        status = ocfs2_extend_trans(handle, credits);
 681                        if (status < 0) {
 682                                /* handle still has to be committed at
 683                                 * this point. */
 684                                status = -ENOMEM;
 685                                mlog_errno(status);
 686                                goto leave;
 687                        }
 688                        goto restarted_transaction;
 689                }
 690        }
 691
 692        mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
 693             le32_to_cpu(fe->i_clusters),
 694             (unsigned long long)le64_to_cpu(fe->i_size));
 695        mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
 696             OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 697
 698leave:
 699        if (status < 0 && did_quota)
 700                dquot_free_space(inode,
 701                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 702        if (handle) {
 703                ocfs2_commit_trans(osb, handle);
 704                handle = NULL;
 705        }
 706        if (data_ac) {
 707                ocfs2_free_alloc_context(data_ac);
 708                data_ac = NULL;
 709        }
 710        if (meta_ac) {
 711                ocfs2_free_alloc_context(meta_ac);
 712                meta_ac = NULL;
 713        }
 714        if ((!status) && restart_func) {
 715                restart_func = 0;
 716                goto restart_all;
 717        }
 718        brelse(bh);
 719        bh = NULL;
 720
 721        mlog_exit(status);
 722        return status;
 723}
 724
 725/*
 726 * While a write will already be ordering the data, a truncate will not.
 727 * Thus, we need to explicitly order the zeroed pages.
 728 */
 729static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
 730{
 731        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 732        handle_t *handle = NULL;
 733        int ret = 0;
 734
 735        if (!ocfs2_should_order_data(inode))
 736                goto out;
 737
 738        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 739        if (IS_ERR(handle)) {
 740                ret = -ENOMEM;
 741                mlog_errno(ret);
 742                goto out;
 743        }
 744
 745        ret = ocfs2_jbd2_file_inode(handle, inode);
 746        if (ret < 0)
 747                mlog_errno(ret);
 748
 749out:
 750        if (ret) {
 751                if (!IS_ERR(handle))
 752                        ocfs2_commit_trans(osb, handle);
 753                handle = ERR_PTR(ret);
 754        }
 755        return handle;
 756}
 757
 758/* Some parts of this taken from generic_cont_expand, which turned out
 759 * to be too fragile to do exactly what we need without us having to
 760 * worry about recursive locking in ->write_begin() and ->write_end(). */
 761static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 762                                 u64 abs_to)
 763{
 764        struct address_space *mapping = inode->i_mapping;
 765        struct page *page;
 766        unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
 767        handle_t *handle = NULL;
 768        int ret = 0;
 769        unsigned zero_from, zero_to, block_start, block_end;
 770
 771        BUG_ON(abs_from >= abs_to);
 772        BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
 773        BUG_ON(abs_from & (inode->i_blkbits - 1));
 774
 775        page = find_or_create_page(mapping, index, GFP_NOFS);
 776        if (!page) {
 777                ret = -ENOMEM;
 778                mlog_errno(ret);
 779                goto out;
 780        }
 781
 782        /* Get the offsets within the page that we want to zero */
 783        zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
 784        zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
 785        if (!zero_to)
 786                zero_to = PAGE_CACHE_SIZE;
 787
 788        mlog(0,
 789             "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
 790             (unsigned long long)abs_from, (unsigned long long)abs_to,
 791             index, zero_from, zero_to);
 792
 793        /* We know that zero_from is block aligned */
 794        for (block_start = zero_from; block_start < zero_to;
 795             block_start = block_end) {
 796                block_end = block_start + (1 << inode->i_blkbits);
 797
 798                /*
 799                 * block_start is block-aligned.  Bump it by one to force
 800                 * __block_write_begin and block_commit_write to zero the
 801                 * whole block.
 802                 */
 803                ret = __block_write_begin(page, block_start + 1, 0,
 804                                          ocfs2_get_block);
 805                if (ret < 0) {
 806                        mlog_errno(ret);
 807                        goto out_unlock;
 808                }
 809
 810                if (!handle) {
 811                        handle = ocfs2_zero_start_ordered_transaction(inode);
 812                        if (IS_ERR(handle)) {
 813                                ret = PTR_ERR(handle);
 814                                handle = NULL;
 815                                break;
 816                        }
 817                }
 818
 819                /* must not update i_size! */
 820                ret = block_commit_write(page, block_start + 1,
 821                                         block_start + 1);
 822                if (ret < 0)
 823                        mlog_errno(ret);
 824                else
 825                        ret = 0;
 826        }
 827
 828        if (handle)
 829                ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 830
 831out_unlock:
 832        unlock_page(page);
 833        page_cache_release(page);
 834out:
 835        return ret;
 836}
 837
 838/*
 839 * Find the next range to zero.  We do this in terms of bytes because
 840 * that's what ocfs2_zero_extend() wants, and it is dealing with the
 841 * pagecache.  We may return multiple extents.
 842 *
 843 * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
 844 * needs to be zeroed.  range_start and range_end return the next zeroing
 845 * range.  A subsequent call should pass the previous range_end as its
 846 * zero_start.  If range_end is 0, there's nothing to do.
 847 *
 848 * Unwritten extents are skipped over.  Refcounted extents are CoWd.
 849 */
 850static int ocfs2_zero_extend_get_range(struct inode *inode,
 851                                       struct buffer_head *di_bh,
 852                                       u64 zero_start, u64 zero_end,
 853                                       u64 *range_start, u64 *range_end)
 854{
 855        int rc = 0, needs_cow = 0;
 856        u32 p_cpos, zero_clusters = 0;
 857        u32 zero_cpos =
 858                zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 859        u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
 860        unsigned int num_clusters = 0;
 861        unsigned int ext_flags = 0;
 862
 863        while (zero_cpos < last_cpos) {
 864                rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
 865                                        &num_clusters, &ext_flags);
 866                if (rc) {
 867                        mlog_errno(rc);
 868                        goto out;
 869                }
 870
 871                if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
 872                        zero_clusters = num_clusters;
 873                        if (ext_flags & OCFS2_EXT_REFCOUNTED)
 874                                needs_cow = 1;
 875                        break;
 876                }
 877
 878                zero_cpos += num_clusters;
 879        }
 880        if (!zero_clusters) {
 881                *range_end = 0;
 882                goto out;
 883        }
 884
 885        while ((zero_cpos + zero_clusters) < last_cpos) {
 886                rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
 887                                        &p_cpos, &num_clusters,
 888                                        &ext_flags);
 889                if (rc) {
 890                        mlog_errno(rc);
 891                        goto out;
 892                }
 893
 894                if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
 895                        break;
 896                if (ext_flags & OCFS2_EXT_REFCOUNTED)
 897                        needs_cow = 1;
 898                zero_clusters += num_clusters;
 899        }
 900        if ((zero_cpos + zero_clusters) > last_cpos)
 901                zero_clusters = last_cpos - zero_cpos;
 902
 903        if (needs_cow) {
 904                rc = ocfs2_refcount_cow(inode, NULL, di_bh, zero_cpos,
 905                                        zero_clusters, UINT_MAX);
 906                if (rc) {
 907                        mlog_errno(rc);
 908                        goto out;
 909                }
 910        }
 911
 912        *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
 913        *range_end = ocfs2_clusters_to_bytes(inode->i_sb,
 914                                             zero_cpos + zero_clusters);
 915
 916out:
 917        return rc;
 918}
 919
 920/*
 921 * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
 922 * has made sure that the entire range needs zeroing.
 923 */
 924static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
 925                                   u64 range_end)
 926{
 927        int rc = 0;
 928        u64 next_pos;
 929        u64 zero_pos = range_start;
 930
 931        mlog(0, "range_start = %llu, range_end = %llu\n",
 932             (unsigned long long)range_start,
 933             (unsigned long long)range_end);
 934        BUG_ON(range_start >= range_end);
 935
 936        while (zero_pos < range_end) {
 937                next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
 938                if (next_pos > range_end)
 939                        next_pos = range_end;
 940                rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
 941                if (rc < 0) {
 942                        mlog_errno(rc);
 943                        break;
 944                }
 945                zero_pos = next_pos;
 946
 947                /*
 948                 * Very large extends have the potential to lock up
 949                 * the cpu for extended periods of time.
 950                 */
 951                cond_resched();
 952        }
 953
 954        return rc;
 955}
 956
 957int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 958                      loff_t zero_to_size)
 959{
 960        int ret = 0;
 961        u64 zero_start, range_start = 0, range_end = 0;
 962        struct super_block *sb = inode->i_sb;
 963
 964        zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
 965        mlog(0, "zero_start %llu for i_size %llu\n",
 966             (unsigned long long)zero_start,
 967             (unsigned long long)i_size_read(inode));
 968        while (zero_start < zero_to_size) {
 969                ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
 970                                                  zero_to_size,
 971                                                  &range_start,
 972                                                  &range_end);
 973                if (ret) {
 974                        mlog_errno(ret);
 975                        break;
 976                }
 977                if (!range_end)
 978                        break;
 979                /* Trim the ends */
 980                if (range_start < zero_start)
 981                        range_start = zero_start;
 982                if (range_end > zero_to_size)
 983                        range_end = zero_to_size;
 984
 985                ret = ocfs2_zero_extend_range(inode, range_start,
 986                                              range_end);
 987                if (ret) {
 988                        mlog_errno(ret);
 989                        break;
 990                }
 991                zero_start = range_end;
 992        }
 993
 994        return ret;
 995}
 996
 997int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
 998                          u64 new_i_size, u64 zero_to)
 999{
1000        int ret;

1001        u32 clusters_to_add;
1002        struct ocfs2_inode_info *oi = OCFS2_I(inode);
1003
1004        /*
1005         * Only quota files call this without a bh, and they can't be
1006         * refcounted.
1007         */
1008        BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
1009        BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
1010
1011        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
1012        if (clusters_to_add < oi->ip_clusters)
1013                clusters_to_add = 0;
1014        else
1015                clusters_to_add -= oi->ip_clusters;
1016
1017        if (clusters_to_add) {
1018                ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
1019                                                clusters_to_add, 0);
1020                if (ret) {
1021                        mlog_errno(ret);
1022                        goto out;
1023                }
1024        }
1025
1026        /*
1027         * Call this even if we don't add any clusters to the tree. We
1028         * still need to zero the area between the old i_size and the
1029         * new i_size.
1030         */
1031        ret = ocfs2_zero_extend(inode, di_bh, zero_to);
1032        if (ret < 0)
1033                mlog_errno(ret);
1034
1035out:
1036        return ret;
1037}
1038
1039static int ocfs2_extend_file(struct inode *inode,
1040                             struct buffer_head *di_bh,
1041                             u64 new_i_size)
1042{
1043        int ret = 0;
1044        struct ocfs2_inode_info *oi = OCFS2_I(inode);
1045
1046        BUG_ON(!di_bh);
1047
1048        /* setattr sometimes calls us like this. */
1049        if (new_i_size == 0)
1050                goto out;
1051
1052        if (i_size_read(inode) == new_i_size)
1053                goto out;
1054        BUG_ON(new_i_size < i_size_read(inode));
1055
1056        /*
1057         * The alloc sem blocks people in read/write from reading our
1058         * allocation until we're done changing it. We depend on
1059         * i_mutex to block other extend/truncate calls while we're
1060         * here.  We even have to hold it for sparse files because there
1061         * might be some tail zeroing.
1062         */
1063        down_write(&oi->ip_alloc_sem);
1064
1065        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1066                /*
1067                 * We can optimize small extends by keeping the inodes
1068                 * inline data.
1069                 */
1070                if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
1071                        up_write(&oi->ip_alloc_sem);
1072                        goto out_update_size;
1073                }
1074
1075                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1076                if (ret) {
1077                        up_write(&oi->ip_alloc_sem);
1078                        mlog_errno(ret);
1079                        goto out;
1080                }
1081        }
1082
1083        if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1084                ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
1085        else
1086                ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
1087                                            new_i_size);
1088
1089        up_write(&oi->ip_alloc_sem);
1090
1091        if (ret < 0) {
1092                mlog_errno(ret);
1093                goto out;
1094        }
1095
1096out_update_size:
1097        ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
1098        if (ret < 0)
1099                mlog_errno(ret);
1100
1101out:
1102        return ret;
1103}
1104
/*
 * Apply the attribute changes described by @attr to the inode behind
 * @dentry.
 *
 * Outline of what the code below does:
 *  - ATTR_SIZE is dropped for symlinks, and the call returns 0 right away
 *    when none of the OCFS2_VALID_ATTRS bits are set.
 *  - After inode_change_ok(), a size change on a regular file takes the
 *    rw lock, then the inode lock is taken in both cases.
 *  - A size change is carried out via ocfs2_truncate_file() or
 *    ocfs2_extend_file() before any transaction is started here.
 *  - A uid/gid change collects the target dquots and transfers quota
 *    inside the transaction.
 *  - The in-memory inode is updated with setattr_copy() and written out
 *    with ocfs2_mark_inode_dirty().
 *  - Once all locks are dropped, ocfs2_acl_chmod() runs if the mode was
 *    changed and no error occurred.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE(review): the two early returns before "bail" happen after
 * mlog_entry() but never reach mlog_exit().
 */
1105int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1106{
1107        int status = 0, size_change;
1108        struct inode *inode = dentry->d_inode;
1109        struct super_block *sb = inode->i_sb;
1110        struct ocfs2_super *osb = OCFS2_SB(sb);
1111        struct buffer_head *bh = NULL;
1112        handle_t *handle = NULL;
1113        struct dquot *transfer_to[MAXQUOTAS] = { };
1114        int qtype;
1115
1116        mlog_entry("(0x%p, '%.*s')\n", dentry,
1117                   dentry->d_name.len, dentry->d_name.name);
1118
1119        /* ensuring we don't even attempt to truncate a symlink */
1120        if (S_ISLNK(inode->i_mode))
1121                attr->ia_valid &= ~ATTR_SIZE;
1122
1123        if (attr->ia_valid & ATTR_MODE)
1124                mlog(0, "mode change: %d\n", attr->ia_mode);
1125        if (attr->ia_valid & ATTR_UID)
1126                mlog(0, "uid change: %d\n", attr->ia_uid);
1127        if (attr->ia_valid & ATTR_GID)
1128                mlog(0, "gid change: %d\n", attr->ia_gid);
1129        if (attr->ia_valid & ATTR_SIZE)
1130                mlog(0, "size change...\n");
1131        if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
1132                mlog(0, "time change...\n");
1133
            /* Requests touching none of these attributes are ignored (return 0). */
1134#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1135                           | ATTR_GID | ATTR_UID | ATTR_MODE)
1136        if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
1137                mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
1138                return 0;
1139        }
1140
1141        status = inode_change_ok(inode, attr);
1142        if (status)
1143                return status;
1144
1145        if (is_quota_modification(inode, attr))
1146                dquot_initialize(inode);
            /* Only regular files take the size-change path and the rw lock. */
1147        size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
1148        if (size_change) {
1149                status = ocfs2_rw_lock(inode, 1);
1150                if (status < 0) {
1151                        mlog_errno(status);
1152                        goto bail;
1153                }
1154        }
1155
            /* On success bh holds the inode's buffer; it is released at "bail". */
1156        status = ocfs2_inode_lock(inode, &bh, 1);
1157        if (status < 0) {
1158                if (status != -ENOENT)
1159                        mlog_errno(status);
1160                goto bail_unlock_rw;
1161        }
1162
1163        if (size_change && attr->ia_size != i_size_read(inode)) {
1164                status = inode_newsize_ok(inode, attr->ia_size);
1165                if (status)
1166                        goto bail_unlock;
1167
                    /* Shrinking goes through truncate, growing through extend. */
1168                if (i_size_read(inode) > attr->ia_size) {
1169                        if (ocfs2_should_order_data(inode)) {
1170                                status = ocfs2_begin_ordered_truncate(inode,
1171                                                                      attr->ia_size);
1172                                if (status)
1173                                        goto bail_unlock;
1174                        }
1175                        status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1176                } else
1177                        status = ocfs2_extend_file(inode, bh, attr->ia_size);
                    /*
                     * NOTE(review): every negative status from truncate/extend is
                     * reported to the caller as -ENOSPC; the real error is only
                     * logged.  Confirm this masking is intended.
                     */
1178                if (status < 0) {
1179                        if (status != -ENOSPC)
1180                                mlog_errno(status);
1181                        status = -ENOSPC;
1182                        goto bail_unlock;
1183                }
1184        }
1185
1186        if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
1187            (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
1188                /*
1189                 * Gather pointers to quota structures so that allocation /
1190                 * freeing of quota structures happens here and not inside
1191                 * dquot_transfer() where we have problems with lock ordering
1192                 */
1193                if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1194                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1195                    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1196                        transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1197                                                      USRQUOTA);
1198                        if (!transfer_to[USRQUOTA]) {
1199                                status = -ESRCH;
1200                                goto bail_unlock;
1201                        }
1202                }
1203                if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
1204                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1205                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1206                        transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1207                                                      GRPQUOTA);
1208                        if (!transfer_to[GRPQUOTA]) {
1209                                status = -ESRCH;
1210                                goto bail_unlock;
1211                        }
1212                }
                    /* Ownership change: reserve quota credits on top of the inode update. */
1213                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1214                                           2 * ocfs2_quota_trans_credits(sb));
1215                if (IS_ERR(handle)) {
1216                        status = PTR_ERR(handle);
1217                        mlog_errno(status);
1218                        goto bail_unlock;
1219                }
1220                status = __dquot_transfer(inode, transfer_to);
1221                if (status < 0)
1222                        goto bail_commit;
1223        } else {
1224                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1225                if (IS_ERR(handle)) {
1226                        status = PTR_ERR(handle);
1227                        mlog_errno(status);
1228                        goto bail_unlock;
1229                }
1230        }
1231
1232        /*
1233         * This will intentionally not wind up calling truncate_setsize(),
1234         * since all the work for a size change has been done above.
1235         * Otherwise, we could get into problems with truncate as
1236         * ip_alloc_sem is used there to protect against i_size
1237         * changes.
1238         *
1239         * XXX: this means the conditional below can probably be removed.
1240         */
1241        if ((attr->ia_valid & ATTR_SIZE) &&
1242            attr->ia_size != i_size_read(inode)) {
1243                status = vmtruncate(inode, attr->ia_size);
1244                if (status) {
1245                        mlog_errno(status);
1246                        goto bail_commit;
1247                }
1248        }
1249
            /* Update the in-memory inode, then journal the change to the dinode. */
1250        setattr_copy(inode, attr);
1251        mark_inode_dirty(inode);
1252
1253        status = ocfs2_mark_inode_dirty(handle, inode, bh);
1254        if (status < 0)
1255                mlog_errno(status);
1256
            /* Unwind in reverse order of acquisition. */
1257bail_commit:
1258        ocfs2_commit_trans(osb, handle);
1259bail_unlock:
1260        ocfs2_inode_unlock(inode, 1);
1261bail_unlock_rw:
1262        if (size_change)
1263                ocfs2_rw_unlock(inode, 1);
1264bail:
1265        brelse(bh);
1266
1267        /* Release quota pointers in case we acquired them */
1268        for (qtype = 0; qtype < MAXQUOTAS; qtype++)
1269                dqput(transfer_to[qtype]);
1270
            /* The ACL update runs only on success, after the locks above are dropped. */
1271        if (!status && attr->ia_valid & ATTR_MODE) {
1272                status = ocfs2_acl_chmod(inode);
1273                if (status < 0)
1274                        mlog_errno(status);
1275        }
1276
1277        mlog_exit(status);
1278        return status;
1279}
1280
1281int ocfs2_getattr(struct vfsmount *mnt,
1282                  struct dentry *dentry,
1283                  struct kstat *stat)
1284{
1285        struct inode *inode = dentry->d_inode;
1286        struct super_block *sb = dentry->d_inode->i_sb;
1287        struct ocfs2_super *osb = sb->s_fs_info;
1288        int err;
1289
1290        mlog_entry_void();
1291
1292        err = ocfs2_inode_revalidate(dentry);
1293        if (err) {
1294                if (err != -ENOENT)
1295                        mlog_errno(err);
1296                goto bail;
1297        }
1298
1299        generic_fillattr(inode, stat);
1300
1301        /* We set the blksize from the cluster size for performance */
1302        stat->blksize = osb->s_clustersize;
1303
1304bail:
1305        mlog_exit(err);
1306
1307        return err;
1308}
1309
1310int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1311{
1312        int ret;
1313
1314        if (flags & IPERM_FLAG_RCU)
1315                return -ECHILD;
1316
1317        mlog_entry_void();
1318
1319        ret = ocfs2_inode_lock(inode, NULL, 0);
1320        if (ret) {
1321                if (ret != -ENOENT)
1322                        mlog_errno(ret);
1323                goto out;
1324        }
1325
1326        ret = generic_permission(inode, mask, flags, ocfs2_check_acl);
1327
1328        ocfs2_inode_unlock(inode, 0);
1329out:
1330        mlog_exit(ret);
1331        return ret;
1332}
1333
1334static int __ocfs2_write_remove_suid(struct inode *inode,
1335                                     struct buffer_head *bh)
1336{
1337        int ret;
1338        handle_t *handle;
1339        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1340        struct ocfs2_dinode *di;
1341
1342        mlog_entry("(Inode %llu, mode 0%o)\n",
1343                   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
1344
1345        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1346        if (IS_ERR(handle)) {
1347                ret = PTR_ERR(handle);
1348                mlog_errno(ret);
1349                goto out;
1350        }
1351
1352        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1353                                      OCFS2_JOURNAL_ACCESS_WRITE);
1354        if (ret < 0) {
1355                mlog_errno(ret);
1356                goto out_trans;
1357        }
1358
1359        inode->i_mode &= ~S_ISUID;
1360        if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1361                inode->i_mode &= ~S_ISGID;
1362
1363        di = (struct ocfs2_dinode *) bh->b_data;
1364        di->i_mode = cpu_to_le16(inode->i_mode);
1365
1366        ocfs2_journal_dirty(handle, bh);
1367
1368out_trans:
1369        ocfs2_commit_trans(osb, handle);
1370out:
1371        mlog_exit(ret);
1372        return ret;
1373}
1374
1375/*
1376 * Will look for holes and unwritten extents in the range starting at
1377 * pos for count bytes (inclusive).
1378 */
1379static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1380                                       size_t count)
1381{
1382        int ret = 0;
1383        unsigned int extent_flags;
1384        u32 cpos, clusters, extent_len, phys_cpos;
1385        struct super_block *sb = inode->i_sb;
1386
1387        cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1388        clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1389
1390        while (clusters) {
1391                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1392                                         &extent_flags);
1393                if (ret < 0) {
1394                        mlog_errno(ret);
1395                        goto out;
1396                }
1397
1398                if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1399                        ret = 1;
1400                        break;
1401                }
1402
1403                if (extent_len > clusters)
1404                        extent_len = clusters;
1405
1406                clusters -= extent_len;
1407                cpos += extent_len;
1408        }
1409out:
1410        return ret;
1411}
1412
1413static int ocfs2_write_remove_suid(struct inode *inode)
1414{
1415        int ret;
1416        struct buffer_head *bh = NULL;
1417
1418        ret = ocfs2_read_inode_block(inode, &bh);
1419        if (ret < 0) {
1420                mlog_errno(ret);
1421                goto out;
1422        }
1423
1424        ret =  __ocfs2_write_remove_suid(inode, bh);
1425out:
1426        brelse(bh);
1427        return ret;
1428}
1429
1430/*
1431 * Allocate enough extents to cover the region starting at byte offset
1432 * start for len bytes. Existing extents are skipped, any extents
1433 * added are marked as "unwritten".
1434 */
1435static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1436                                            u64 start, u64 len)
1437{
1438        int ret;
1439        u32 cpos, phys_cpos, clusters, alloc_size;
1440        u64 end = start + len;
1441        struct buffer_head *di_bh = NULL;
1442
1443        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1444                ret = ocfs2_read_inode_block(inode, &di_bh);
1445                if (ret) {
1446                        mlog_errno(ret);
1447                        goto out;
1448                }
1449
1450                /*
1451                 * Nothing to do if the requested reservation range
1452                 * fits within the inode.
1453                 */
1454                if (ocfs2_size_fits_inline_data(di_bh, end))
1455                        goto out;
1456
1457                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1458                if (ret) {
1459                        mlog_errno(ret);
1460                        goto out;
1461                }
1462        }
1463
1464        /*
1465         * We consider both start and len to be inclusive.
1466         */
1467        cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1468        clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1469        clusters -= cpos;
1470
1471        while (clusters) {
1472                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1473                                         &alloc_size, NULL);
1474                if (ret) {
1475                        mlog_errno(ret);
1476                        goto out;
1477                }
1478
1479                /*
1480                 * Hole or existing extent len can be arbitrary, so
1481                 * cap it to our own allocation request.
1482                 */
1483                if (alloc_size > clusters)
1484                        alloc_size = clusters;
1485
1486                if (phys_cpos) {
1487                        /*
1488                         * We already have an allocation at this
1489                         * region so we can safely skip it.
1490                         */
1491                        goto next;
1492                }
1493
1494                ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1495                if (ret) {
1496                        if (ret != -ENOSPC)
1497                                mlog_errno(ret);
1498                        goto out;
1499                }
1500
1501next:
1502                cpos += alloc_size;
1503                clusters -= alloc_size;
1504        }
1505
1506        ret = 0;
1507out:
1508
1509        brelse(di_bh);
1510        return ret;
1511}
1512
1513/*
1514 * Truncate a byte range, avoiding pages within partial clusters. This
1515 * preserves those pages for the zeroing code to write to.
1516 */
1517static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1518                                         u64 byte_len)
1519{
1520        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1521        loff_t start, end;
1522        struct address_space *mapping = inode->i_mapping;
1523
1524        start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1525        end = byte_start + byte_len;
1526        end = end & ~(osb->s_clustersize - 1);
1527
1528        if (start < end) {
1529                unmap_mapping_range(mapping, start, end - start, 0);
1530                truncate_inode_pages_range(mapping, start, end - 1);
1531        }
1532}
1533
1534static int ocfs2_zero_partial_clusters(struct inode *inode,
1535                                       u64 start, u64 len)
1536{
1537        int ret = 0;
1538        u64 tmpend, end = start + len;
1539        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1540        unsigned int csize = osb->s_clustersize;
1541        handle_t *handle;
1542
1543        /*
1544         * The "start" and "end" values are NOT necessarily part of
1545         * the range whose allocation is being deleted. Rather, this
1546         * is what the user passed in with the request. We must zero
1547         * partial clusters here. There's no need to worry about
1548         * physical allocation - the zeroing code knows to skip holes.
1549         */
1550        mlog(0, "byte start: %llu, end: %llu\n",
1551             (unsigned long long)start, (unsigned long long)end);
1552
1553        /*
1554         * If both edges are on a cluster boundary then there's no
1555         * zeroing required as the region is part of the allocation to
1556         * be truncated.
1557         */
1558        if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1559                goto out;
1560
1561        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1562        if (IS_ERR(handle)) {
1563                ret = PTR_ERR(handle);
1564                mlog_errno(ret);
1565                goto out;
1566        }
1567
1568        /*
1569         * We want to get the byte offset of the end of the 1st cluster.
1570         */
1571        tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1572        if (tmpend > end)
1573                tmpend = end;
1574
1575        mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1576             (unsigned long long)start, (unsigned long long)tmpend);
1577
1578        ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1579        if (ret)
1580                mlog_errno(ret);
1581
1582        if (tmpend < end) {
1583                /*
1584                 * This may make start and end equal, but the zeroing
1585                 * code will skip any work in that case so there's no
1586                 * need to catch it up here.
1587                 */
1588                start = end & ~(osb->s_clustersize - 1);
1589
1590                mlog(0, "2nd range: start: %llu, end: %llu\n",
1591                     (unsigned long long)start, (unsigned long long)end);
1592
1593                ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1594                if (ret)
1595                        mlog_errno(ret);
1596        }
1597
1598        ocfs2_commit_trans(osb, handle);
1599out:
1600        return ret;
1601}
1602
1603static int ocfs2_find_rec(struct ocfs2_extent_list *el, u32 pos)
1604{
1605        int i;
1606        struct ocfs2_extent_rec *rec = NULL;
1607
1608        for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1609
1610                rec = &el->l_recs[i];
1611
1612                if (le32_to_cpu(rec->e_cpos) < pos)
1613                        break;
1614        }
1615
1616        return i;
1617}
1618
1619/*
1620 * Helper to calculate the punching pos and length in one run, we handle the
1621 * following three cases in order:
1622 *
1623 * - remove the entire record
1624 * - remove a partial record
1625 * - no record needs to be removed (hole-punching completed)
1626*/
1627static void ocfs2_calc_trunc_pos(struct inode *inode,
1628                                 struct ocfs2_extent_list *el,
1629                                 struct ocfs2_extent_rec *rec,
1630                                 u32 trunc_start, u32 *trunc_cpos,
1631                                 u32 *trunc_len, u32 *trunc_end,
1632                                 u64 *blkno, int *done)
1633{
1634        int ret = 0;
1635        u32 coff, range;
1636
1637        range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1638
1639        if (le32_to_cpu(rec->e_cpos) >= trunc_start) {
1640                *trunc_cpos = le32_to_cpu(rec->e_cpos);
1641                /*
1642                 * Skip holes if any.
1643                 */
1644                if (range < *trunc_end)
1645                        *trunc_end = range;
1646                *trunc_len = *trunc_end - le32_to_cpu(rec->e_cpos);
1647                *blkno = le64_to_cpu(rec->e_blkno);
1648                *trunc_end = le32_to_cpu(rec->e_cpos);
1649        } else if (range > trunc_start) {
1650                *trunc_cpos = trunc_start;
1651                *trunc_len = *trunc_end - trunc_start;
1652                coff = trunc_start - le32_to_cpu(rec->e_cpos);
1653                *blkno = le64_to_cpu(rec->e_blkno) +
1654                                ocfs2_clusters_to_blocks(inode->i_sb, coff);
1655                *trunc_end = trunc_start;
1656        } else {
1657                /*
1658                 * It may have two following possibilities:
1659                 *
1660                 * - last record has been removed
1661                 * - trunc_start was within a hole
1662                 *
1663                 * both two cases mean the completion of hole punching.
1664                 */
1665                ret = 1;
1666        }
1667
1668        *done = ret;
1669}
1670
1671static int ocfs2_remove_inode_range(struct inode *inode,
1672                                    struct buffer_head *di_bh, u64 byte_start,
1673                                    u64 byte_len)
1674{
1675        int ret = 0, flags = 0, done = 0, i;
1676        u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos;
1677        u32 cluster_in_el;
1678        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1679        struct ocfs2_cached_dealloc_ctxt dealloc;
1680        struct address_space *mapping = inode->i_mapping;
1681        struct ocfs2_extent_tree et;
1682        struct ocfs2_path *path = NULL;
1683        struct ocfs2_extent_list *el = NULL;
1684        struct ocfs2_extent_rec *rec = NULL;
1685        struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1686        u64 blkno, refcount_loc = le64_to_cpu(di->i_refcount_loc);
1687
1688        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1689        ocfs2_init_dealloc_ctxt(&dealloc);
1690
1691        if (byte_len == 0)
1692                return 0;
1693
1694        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1695                ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1696                                            byte_start + byte_len, 0);
1697                if (ret) {
1698                        mlog_errno(ret);
1699                        goto out;
1700                }
1701                /*
1702                 * There's no need to get fancy with the page cache
1703                 * truncate of an inline-data inode. We're talking
1704                 * about less than a page here, which will be cached
1705                 * in the dinode buffer anyway.
1706                 */
1707                unmap_mapping_range(mapping, 0, 0, 0);
1708                truncate_inode_pages(mapping, 0);
1709                goto out;
1710        }
1711
1712        /*
1713         * For reflinks, we may need to CoW 2 clusters which might be
1714         * partially zero'd later, if hole's start and end offset were
1715         * within one cluster(means is not exactly aligned to clustersize).
1716         */
1717
1718        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
1719
1720                ret = ocfs2_cow_file_pos(inode, di_bh, byte_start);
1721                if (ret) {
1722                        mlog_errno(ret);
1723                        goto out;
1724                }
1725
1726                ret = ocfs2_cow_file_pos(inode, di_bh, byte_start + byte_len);
1727                if (ret) {
1728                        mlog_errno(ret);
1729                        goto out;
1730                }
1731        }
1732
1733        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1734        trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1735        cluster_in_el = trunc_end;
1736
1737        mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1738             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1739             (unsigned long long)byte_start,
1740             (unsigned long long)byte_len, trunc_start, trunc_end);
1741
1742        ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1743        if (ret) {
1744                mlog_errno(ret);
1745                goto out;
1746        }
1747
1748        path = ocfs2_new_path_from_et(&et);
1749        if (!path) {
1750                ret = -ENOMEM;
1751                mlog_errno(ret);
1752                goto out;
1753        }
1754
1755        while (trunc_end > trunc_start) {
1756
1757                ret = ocfs2_find_path(INODE_CACHE(inode), path,
1758                                      cluster_in_el);
1759                if (ret) {
1760                        mlog_errno(ret);
1761                        goto out;
1762                }
1763
1764                el = path_leaf_el(path);
1765
1766                i = ocfs2_find_rec(el, trunc_end);
1767                /*
1768                 * Need to go to previous extent block.
1769                 */
1770                if (i < 0) {
1771                        if (path->p_tree_depth == 0)
1772                                break;
1773
1774                        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
1775                                                            path,
1776                                                            &cluster_in_el);
1777                        if (ret) {
1778                                mlog_errno(ret);
1779                                goto out;
1780                        }
1781
1782                        /*
1783                         * We've reached the leftmost extent block,
1784                         * it's safe to leave.
1785                         */
1786                        if (cluster_in_el == 0)
1787                                break;
1788
1789                        /*
1790                         * The 'pos' searched for previous extent block is
1791                         * always one cluster less than actual trunc_end.
1792                         */
1793                        trunc_end = cluster_in_el + 1;
1794
1795                        ocfs2_reinit_path(path, 1);
1796
1797                        continue;
1798
1799                } else
1800                        rec = &el->l_recs[i];
1801
1802                ocfs2_calc_trunc_pos(inode, el, rec, trunc_start, &trunc_cpos,
1803                                     &trunc_len, &trunc_end, &blkno, &done);
1804                if (done)
1805                        break;
1806
1807                flags = rec->e_flags;
1808                phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno);
1809
1810                ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos,
1811                                               phys_cpos, trunc_len, flags,
1812                                               &dealloc, refcount_loc);
1813                if (ret < 0) {
1814                        mlog_errno(ret);
1815                        goto out;
1816                }
1817
1818                cluster_in_el = trunc_end;
1819
1820                ocfs2_reinit_path(path, 1);
1821        }
1822
1823        ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1824
1825out:
1826        ocfs2_schedule_truncate_log_flush(osb, 1);
1827        ocfs2_run_deallocs(osb, &dealloc);
1828
1829        return ret;
1830}
1831
1832/*
1833 * Parts of this function taken from xfs_change_file_space()
1834 */
1835static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1836                                     loff_t f_pos, unsigned int cmd,
1837                                     struct ocfs2_space_resv *sr,
1838                                     int change_size)
1839{
1840        int ret;
1841        s64 llen;
1842        loff_t size;
1843        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1844        struct buffer_head *di_bh = NULL;
1845        handle_t *handle;
1846        unsigned long long max_off = inode->i_sb->s_maxbytes;
1847
1848        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1849                return -EROFS;
1850
1851        mutex_lock(&inode->i_mutex);
1852
1853        /*
1854         * This prevents concurrent writes on other nodes
1855         */
1856        ret = ocfs2_rw_lock(inode, 1);
1857        if (ret) {
1858                mlog_errno(ret);
1859                goto out;
1860        }
1861
1862        ret = ocfs2_inode_lock(inode, &di_bh, 1);
1863        if (ret) {
1864                mlog_errno(ret);
1865                goto out_rw_unlock;
1866        }
1867
1868        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1869                ret = -EPERM;
1870                goto out_inode_unlock;
1871        }
1872
1873        switch (sr->l_whence) {
1874        case 0: /*SEEK_SET*/
1875                break;
1876        case 1: /*SEEK_CUR*/
1877                sr->l_start += f_pos;
1878                break;
1879        case 2: /*SEEK_END*/
1880                sr->l_start += i_size_read(inode);
1881                break;
1882        default:
1883                ret = -EINVAL;
1884                goto out_inode_unlock;
1885        }
1886        sr->l_whence = 0;
1887
1888        llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1889
1890        if (sr->l_start < 0
1891            || sr->l_start > max_off
1892            || (sr->l_start + llen) < 0
1893            || (sr->l_start + llen) > max_off) {
1894                ret = -EINVAL;
1895                goto out_inode_unlock;
1896        }
1897        size = sr->l_start + sr->l_len;
1898
1899        if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1900                if (sr->l_len <= 0) {
1901                        ret = -EINVAL;
1902                        goto out_inode_unlock;
1903                }
1904        }
1905
1906        if (file && should_remove_suid(file->f_path.dentry)) {
1907                ret = __ocfs2_write_remove_suid(inode, di_bh);
1908                if (ret) {
1909                        mlog_errno(ret);
1910                        goto out_inode_unlock;
1911                }
1912        }
1913
1914        down_write(&OCFS2_I(inode)->ip_alloc_sem);
1915        switch (cmd) {
1916        case OCFS2_IOC_RESVSP:
1917        case OCFS2_IOC_RESVSP64:
1918                /*
1919                 * This takes unsigned offsets, but the signed ones we
1920                 * pass have been checked against overflow above.
1921                 */
1922                ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1923                                                       sr->l_len);
1924                break;
1925        case OCFS2_IOC_UNRESVSP:
1926        case OCFS2_IOC_UNRESVSP64:
1927                ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1928                                               sr->l_len);
1929                break;
1930        default:
1931                ret = -EINVAL;
1932        }
1933        up_write(&OCFS2_I(inode)->ip_alloc_sem);
1934        if (ret) {
1935                mlog_errno(ret);
1936                goto out_inode_unlock;
1937        }
1938
1939        /*
1940         * We update c/mtime for these changes
1941         */
1942        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1943        if (IS_ERR(handle)) {
1944                ret = PTR_ERR(handle);
1945                mlog_errno(ret);
1946                goto out_inode_unlock;
1947        }
1948
1949        if (change_size && i_size_read(inode) < size)
1950                i_size_write(inode, size);
1951
1952        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1953        ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1954        if (ret < 0)
1955                mlog_errno(ret);
1956
1957        ocfs2_commit_trans(osb, handle);
1958
1959out_inode_unlock:
1960        brelse(di_bh);
1961        ocfs2_inode_unlock(inode, 1);
1962out_rw_unlock:
1963        ocfs2_rw_unlock(inode, 1);
1964
1965out:
1966        mutex_unlock(&inode->i_mutex);
1967        return ret;
1968}
1969
1970int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1971                            struct ocfs2_space_resv *sr)
1972{
1973        struct inode *inode = file->f_path.dentry->d_inode;
1974        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1975
1976        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1977            !ocfs2_writes_unwritten_extents(osb))
1978                return -ENOTTY;
1979        else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1980                 !ocfs2_sparse_alloc(osb))
1981                return -ENOTTY;
1982
1983        if (!S_ISREG(inode->i_mode))
1984                return -EINVAL;
1985
1986        if (!(file->f_mode & FMODE_WRITE))
1987                return -EBADF;
1988
1989        return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1990}
1991
1992static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
1993                            loff_t len)
1994{
1995        struct inode *inode = file->f_path.dentry->d_inode;
1996        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1997        struct ocfs2_space_resv sr;
1998        int change_size = 1;
1999        int cmd = OCFS2_IOC_RESVSP64;
2000

2001        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
2002                return -EOPNOTSUPP;
2003        if (!ocfs2_writes_unwritten_extents(osb))
2004                return -EOPNOTSUPP;
2005
2006        if (mode & FALLOC_FL_KEEP_SIZE)
2007                change_size = 0;
2008
2009        if (mode & FALLOC_FL_PUNCH_HOLE)
2010                cmd = OCFS2_IOC_UNRESVSP64;
2011
2012        sr.l_whence = 0;
2013        sr.l_start = (s64)offset;
2014        sr.l_len = (s64)len;
2015
2016        return __ocfs2_change_file_space(NULL, inode, offset, cmd, &sr,
2017                                         change_size);
2018}
2019
2020int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
2021                                   size_t count)
2022{
2023        int ret = 0;
2024        unsigned int extent_flags;
2025        u32 cpos, clusters, extent_len, phys_cpos;
2026        struct super_block *sb = inode->i_sb;
2027
2028        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
2029            !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
2030            OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
2031                return 0;
2032
2033        cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
2034        clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
2035
2036        while (clusters) {
2037                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
2038                                         &extent_flags);
2039                if (ret < 0) {
2040                        mlog_errno(ret);
2041                        goto out;
2042                }
2043
2044                if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
2045                        ret = 1;
2046                        break;
2047                }
2048
2049                if (extent_len > clusters)
2050                        extent_len = clusters;
2051
2052                clusters -= extent_len;
2053                cpos += extent_len;
2054        }
2055out:
2056        return ret;
2057}
2058
2059static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
2060                                            struct file *file,
2061                                            loff_t pos, size_t count,
2062                                            int *meta_level)
2063{
2064        int ret;
2065        struct buffer_head *di_bh = NULL;
2066        u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2067        u32 clusters =
2068                ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2069
2070        ret = ocfs2_inode_lock(inode, &di_bh, 1);
2071        if (ret) {
2072                mlog_errno(ret);
2073                goto out;
2074        }
2075
2076        *meta_level = 1;
2077
2078        ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX);
2079        if (ret)
2080                mlog_errno(ret);
2081out:
2082        brelse(di_bh);
2083        return ret;
2084}
2085
2086static int ocfs2_prepare_inode_for_write(struct file *file,
2087                                         loff_t *ppos,
2088                                         size_t count,
2089                                         int appending,
2090                                         int *direct_io,
2091                                         int *has_refcount)
2092{
2093        int ret = 0, meta_level = 0;
2094        struct dentry *dentry = file->f_path.dentry;
2095        struct inode *inode = dentry->d_inode;
2096        loff_t saved_pos, end;
2097
2098        /*
2099         * We start with a read level meta lock and only jump to an ex
2100         * if we need to make modifications here.
2101         */
2102        for(;;) {
2103                ret = ocfs2_inode_lock(inode, NULL, meta_level);
2104                if (ret < 0) {
2105                        meta_level = -1;
2106                        mlog_errno(ret);
2107                        goto out;
2108                }
2109
2110                /* Clear suid / sgid if necessary. We do this here
2111                 * instead of later in the write path because
2112                 * remove_suid() calls ->setattr without any hint that
2113                 * we may have already done our cluster locking. Since
2114                 * ocfs2_setattr() *must* take cluster locks to
2115                 * proceeed, this will lead us to recursively lock the
2116                 * inode. There's also the dinode i_size state which
2117                 * can be lost via setattr during extending writes (we
2118                 * set inode->i_size at the end of a write. */
2119                if (should_remove_suid(dentry)) {
2120                        if (meta_level == 0) {
2121                                ocfs2_inode_unlock(inode, meta_level);
2122                                meta_level = 1;
2123                                continue;
2124                        }
2125
2126                        ret = ocfs2_write_remove_suid(inode);
2127                        if (ret < 0) {
2128                                mlog_errno(ret);
2129                                goto out_unlock;
2130                        }
2131                }
2132
2133                /* work on a copy of ppos until we're sure that we won't have
2134                 * to recalculate it due to relocking. */
2135                if (appending) {
2136                        saved_pos = i_size_read(inode);
2137                        mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
2138                } else {
2139                        saved_pos = *ppos;
2140                }
2141
2142                end = saved_pos + count;
2143
2144                ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
2145                if (ret == 1) {
2146                        ocfs2_inode_unlock(inode, meta_level);
2147                        meta_level = -1;
2148
2149                        ret = ocfs2_prepare_inode_for_refcount(inode,
2150                                                               file,
2151                                                               saved_pos,
2152                                                               count,
2153                                                               &meta_level);
2154                        if (has_refcount)
2155                                *has_refcount = 1;
2156                        if (direct_io)
2157                                *direct_io = 0;
2158                }
2159
2160                if (ret < 0) {
2161                        mlog_errno(ret);
2162                        goto out_unlock;
2163                }
2164
2165                /*
2166                 * Skip the O_DIRECT checks if we don't need
2167                 * them.
2168                 */
2169                if (!direct_io || !(*direct_io))
2170                        break;
2171
2172                /*
2173                 * There's no sane way to do direct writes to an inode
2174                 * with inline data.
2175                 */
2176                if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
2177                        *direct_io = 0;
2178                        break;
2179                }
2180
2181                /*
2182                 * Allowing concurrent direct writes means
2183                 * i_size changes wouldn't be synchronized, so
2184                 * one node could wind up truncating another
2185                 * nodes writes.
2186                 */
2187                if (end > i_size_read(inode)) {
2188                        *direct_io = 0;
2189                        break;
2190                }
2191
2192                /*
2193                 * We don't fill holes during direct io, so
2194                 * check for them here. If any are found, the
2195                 * caller will have to retake some cluster
2196                 * locks and initiate the io as buffered.
2197                 */
2198                ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
2199                if (ret == 1) {
2200                        *direct_io = 0;
2201                        ret = 0;
2202                } else if (ret < 0)
2203                        mlog_errno(ret);
2204                break;
2205        }
2206
2207        if (appending)
2208                *ppos = saved_pos;
2209
2210out_unlock:
2211        if (meta_level >= 0)
2212                ocfs2_inode_unlock(inode, meta_level);
2213
2214out:
2215        return ret;
2216}
2217
2218static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2219                                    const struct iovec *iov,
2220                                    unsigned long nr_segs,
2221                                    loff_t pos)
2222{
2223        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
2224        int can_do_direct, has_refcount = 0;
2225        ssize_t written = 0;
2226        size_t ocount;          /* original count */
2227        size_t count;           /* after file limit checks */
2228        loff_t old_size, *ppos = &iocb->ki_pos;
2229        u32 old_clusters;
2230        struct file *file = iocb->ki_filp;
2231        struct inode *inode = file->f_path.dentry->d_inode;
2232        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2233        int full_coherency = !(osb->s_mount_opt &
2234                               OCFS2_MOUNT_COHERENCY_BUFFERED);
2235
2236        mlog_entry("(0x%p, %u, '%.*s')\n", file,
2237                   (unsigned int)nr_segs,
2238                   file->f_path.dentry->d_name.len,
2239                   file->f_path.dentry->d_name.name);
2240
2241        if (iocb->ki_left == 0)
2242                return 0;
2243
2244        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2245
2246        appending = file->f_flags & O_APPEND ? 1 : 0;
2247        direct_io = file->f_flags & O_DIRECT ? 1 : 0;
2248
2249        mutex_lock(&inode->i_mutex);
2250
2251        ocfs2_iocb_clear_sem_locked(iocb);
2252
2253relock:
2254        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
2255        if (direct_io) {
2256                down_read(&inode->i_alloc_sem);
2257                have_alloc_sem = 1;
2258                /* communicate with ocfs2_dio_end_io */
2259                ocfs2_iocb_set_sem_locked(iocb);
2260        }
2261
2262        /*
2263         * Concurrent O_DIRECT writes are allowed with
2264         * mount_option "coherency=buffered".
2265         */
2266        rw_level = (!direct_io || full_coherency);
2267
2268        ret = ocfs2_rw_lock(inode, rw_level);
2269        if (ret < 0) {
2270                mlog_errno(ret);
2271                goto out_sems;
2272        }
2273
2274        /*
2275         * O_DIRECT writes with "coherency=full" need to take EX cluster
2276         * inode_lock to guarantee coherency.
2277         */
2278        if (direct_io && full_coherency) {
2279                /*
2280                 * We need to take and drop the inode lock to force
2281                 * other nodes to drop their caches.  Buffered I/O
2282                 * already does this in write_begin().
2283                 */
2284                ret = ocfs2_inode_lock(inode, NULL, 1);
2285                if (ret < 0) {
2286                        mlog_errno(ret);
2287                        goto out_sems;
2288                }
2289
2290                ocfs2_inode_unlock(inode, 1);
2291        }
2292
2293        can_do_direct = direct_io;
2294        ret = ocfs2_prepare_inode_for_write(file, ppos,
2295                                            iocb->ki_left, appending,
2296                                            &can_do_direct, &has_refcount);
2297        if (ret < 0) {
2298                mlog_errno(ret);
2299                goto out;
2300        }
2301
2302        /*
2303         * We can't complete the direct I/O as requested, fall back to
2304         * buffered I/O.
2305         */
2306        if (direct_io && !can_do_direct) {
2307                ocfs2_rw_unlock(inode, rw_level);
2308                up_read(&inode->i_alloc_sem);
2309
2310                have_alloc_sem = 0;
2311                rw_level = -1;
2312
2313                direct_io = 0;
2314                goto relock;
2315        }
2316
2317        /*
2318         * To later detect whether a journal commit for sync writes is
2319         * necessary, we sample i_size, and cluster count here.
2320         */
2321        old_size = i_size_read(inode);
2322        old_clusters = OCFS2_I(inode)->ip_clusters;
2323
2324        /* communicate with ocfs2_dio_end_io */
2325        ocfs2_iocb_set_rw_locked(iocb, rw_level);
2326
2327        ret = generic_segment_checks(iov, &nr_segs, &ocount,
2328                                     VERIFY_READ);
2329        if (ret)
2330                goto out_dio;
2331
2332        count = ocount;
2333        ret = generic_write_checks(file, ppos, &count,
2334                                   S_ISBLK(inode->i_mode));
2335        if (ret)
2336                goto out_dio;
2337
2338        if (direct_io) {
2339                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
2340                                                    ppos, count, ocount);
2341                if (written < 0) {
2342                        ret = written;
2343                        goto out_dio;
2344                }
2345        } else {
2346                current->backing_dev_info = file->f_mapping->backing_dev_info;
2347                written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2348                                                      ppos, count, 0);
2349                current->backing_dev_info = NULL;
2350        }
2351
2352out_dio:
2353        /* buffered aio wouldn't have proper lock coverage today */
2354        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2355
2356        if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
2357            ((file->f_flags & O_DIRECT) && !direct_io)) {
2358                ret = filemap_fdatawrite_range(file->f_mapping, pos,
2359                                               pos + count - 1);
2360                if (ret < 0)
2361                        written = ret;
2362
2363                if (!ret && ((old_size != i_size_read(inode)) ||
2364                             (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2365                             has_refcount)) {
2366                        ret = jbd2_journal_force_commit(osb->journal->j_journal);
2367                        if (ret < 0)
2368                                written = ret;
2369                }
2370
2371                if (!ret)
2372                        ret = filemap_fdatawait_range(file->f_mapping, pos,
2373                                                      pos + count - 1);
2374        }
2375
2376        /*
2377         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2378         * function pointer which is called when o_direct io completes so that
2379         * it can unlock our rw lock.  (it's the clustered equivalent of
2380         * i_alloc_sem; protects truncate from racing with pending ios).
2381         * Unfortunately there are error cases which call end_io and others
2382         * that don't.  so we don't have to unlock the rw_lock if either an
2383         * async dio is going to do it in the future or an end_io after an
2384         * error has already done it.
2385         */
2386        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
2387                rw_level = -1;
2388                have_alloc_sem = 0;
2389        }
2390
2391out:
2392        if (rw_level != -1)
2393                ocfs2_rw_unlock(inode, rw_level);
2394
2395out_sems:
2396        if (have_alloc_sem) {
2397                up_read(&inode->i_alloc_sem);
2398                ocfs2_iocb_clear_sem_locked(iocb);
2399        }
2400
2401        mutex_unlock(&inode->i_mutex);
2402
2403        if (written)
2404                ret = written;
2405        mlog_exit(ret);
2406        return ret;
2407}
2408
2409static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2410                                struct file *out,
2411                                struct splice_desc *sd)
2412{
2413        int ret;
2414
2415        ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
2416                                            sd->total_len, 0, NULL, NULL);
2417        if (ret < 0) {
2418                mlog_errno(ret);
2419                return ret;
2420        }
2421
2422        return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2423}
2424
2425static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2426                                       struct file *out,
2427                                       loff_t *ppos,
2428                                       size_t len,
2429                                       unsigned int flags)
2430{
2431        int ret;
2432        struct address_space *mapping = out->f_mapping;
2433        struct inode *inode = mapping->host;
2434        struct splice_desc sd = {
2435                .total_len = len,
2436                .flags = flags,
2437                .pos = *ppos,
2438                .u.file = out,
2439        };
2440
2441        mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
2442                   (unsigned int)len,
2443                   out->f_path.dentry->d_name.len,
2444                   out->f_path.dentry->d_name.name);
2445
2446        if (pipe->inode)
2447                mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
2448
2449        splice_from_pipe_begin(&sd);
2450        do {
2451                ret = splice_from_pipe_next(pipe, &sd);
2452                if (ret <= 0)
2453                        break;
2454
2455                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2456                ret = ocfs2_rw_lock(inode, 1);
2457                if (ret < 0)
2458                        mlog_errno(ret);
2459                else {
2460                        ret = ocfs2_splice_to_file(pipe, out, &sd);
2461                        ocfs2_rw_unlock(inode, 1);
2462                }
2463                mutex_unlock(&inode->i_mutex);
2464        } while (ret > 0);
2465        splice_from_pipe_end(pipe, &sd);
2466
2467        if (pipe->inode)
2468                mutex_unlock(&pipe->inode->i_mutex);
2469
2470        if (sd.num_spliced)
2471                ret = sd.num_spliced;
2472
2473        if (ret > 0) {
2474                unsigned long nr_pages;
2475                int err;
2476
2477                nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2478
2479                err = generic_write_sync(out, *ppos, ret);
2480                if (err)
2481                        ret = err;
2482                else
2483                        *ppos += ret;
2484
2485                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2486        }
2487
2488        mlog_exit(ret);
2489        return ret;
2490}
2491
2492static ssize_t ocfs2_file_splice_read(struct file *in,
2493                                      loff_t *ppos,
2494                                      struct pipe_inode_info *pipe,
2495                                      size_t len,
2496                                      unsigned int flags)
2497{
2498        int ret = 0, lock_level = 0;
2499        struct inode *inode = in->f_path.dentry->d_inode;
2500
2501        mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
2502                   (unsigned int)len,
2503                   in->f_path.dentry->d_name.len,
2504                   in->f_path.dentry->d_name.name);
2505
2506        /*
2507         * See the comment in ocfs2_file_aio_read()
2508         */
2509        ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2510        if (ret < 0) {
2511                mlog_errno(ret);
2512                goto bail;
2513        }
2514        ocfs2_inode_unlock(inode, lock_level);
2515
2516        ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2517
2518bail:
2519        mlog_exit(ret);
2520        return ret;
2521}
2522
2523static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2524                                   const struct iovec *iov,
2525                                   unsigned long nr_segs,
2526                                   loff_t pos)
2527{
2528        int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2529        struct file *filp = iocb->ki_filp;
2530        struct inode *inode = filp->f_path.dentry->d_inode;
2531
2532        mlog_entry("(0x%p, %u, '%.*s')\n", filp,
2533                   (unsigned int)nr_segs,
2534                   filp->f_path.dentry->d_name.len,
2535                   filp->f_path.dentry->d_name.name);
2536
2537        if (!inode) {
2538                ret = -EINVAL;
2539                mlog_errno(ret);
2540                goto bail;
2541        }
2542
2543        ocfs2_iocb_clear_sem_locked(iocb);
2544
2545        /*
2546         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
2547         * need locks to protect pending reads from racing with truncate.
2548         */
2549        if (filp->f_flags & O_DIRECT) {
2550                down_read(&inode->i_alloc_sem);
2551                have_alloc_sem = 1;
2552                ocfs2_iocb_set_sem_locked(iocb);
2553
2554                ret = ocfs2_rw_lock(inode, 0);
2555                if (ret < 0) {
2556                        mlog_errno(ret);
2557                        goto bail;
2558                }
2559                rw_level = 0;
2560                /* communicate with ocfs2_dio_end_io */
2561                ocfs2_iocb_set_rw_locked(iocb, rw_level);
2562        }
2563
2564        /*
2565         * We're fine letting folks race truncates and extending
2566         * writes with read across the cluster, just like they can
2567         * locally. Hence no rw_lock during read.
2568         *
2569         * Take and drop the meta data lock to update inode fields
2570         * like i_size. This allows the checks down below
2571         * generic_file_aio_read() a chance of actually working.
2572         */
2573        ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2574        if (ret < 0) {
2575                mlog_errno(ret);
2576                goto bail;
2577        }
2578        ocfs2_inode_unlock(inode, lock_level);
2579
2580        ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2581        if (ret == -EINVAL)
2582                mlog(0, "generic_file_aio_read returned -EINVAL\n");
2583
2584        /* buffered aio wouldn't have proper lock coverage today */
2585        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2586
2587        /* see ocfs2_file_aio_write */
2588        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2589                rw_level = -1;
2590                have_alloc_sem = 0;
2591        }
2592
2593bail:
2594        if (have_alloc_sem) {
2595                up_read(&inode->i_alloc_sem);
2596                ocfs2_iocb_clear_sem_locked(iocb);
2597        }
2598        if (rw_level != -1)
2599                ocfs2_rw_unlock(inode, rw_level);
2600        mlog_exit(ret);
2601
2602        return ret;
2603}
2604
2605const struct inode_operations ocfs2_file_iops = {
2606        .setattr        = ocfs2_setattr,
2607        .getattr        = ocfs2_getattr,
2608        .permission     = ocfs2_permission,
2609        .setxattr       = generic_setxattr,
2610        .getxattr       = generic_getxattr,
2611        .listxattr      = ocfs2_listxattr,
2612        .removexattr    = generic_removexattr,
2613        .fiemap         = ocfs2_fiemap,
2614};
2615
2616const struct inode_operations ocfs2_special_file_iops = {
2617        .setattr        = ocfs2_setattr,
2618        .getattr        = ocfs2_getattr,
2619        .permission     = ocfs2_permission,
2620};
2621
2622/*
2623 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2624 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2625 */
2626const struct file_operations ocfs2_fops = {
2627        .llseek         = generic_file_llseek,
2628        .read           = do_sync_read,
2629        .write          = do_sync_write,
2630        .mmap           = ocfs2_mmap,
2631        .fsync          = ocfs2_sync_file,
2632        .release        = ocfs2_file_release,
2633        .open           = ocfs2_file_open,
2634        .aio_read       = ocfs2_file_aio_read,
2635        .aio_write      = ocfs2_file_aio_write,
2636        .unlocked_ioctl = ocfs2_ioctl,
2637#ifdef CONFIG_COMPAT
2638        .compat_ioctl   = ocfs2_compat_ioctl,
2639#endif
2640        .lock           = ocfs2_lock,
2641        .flock          = ocfs2_flock,
2642        .splice_read    = ocfs2_file_splice_read,
2643        .splice_write   = ocfs2_file_splice_write,
2644        .fallocate      = ocfs2_fallocate,
2645};
2646
2647const struct file_operations ocfs2_dops = {
2648        .llseek         = generic_file_llseek,
2649        .read           = generic_read_dir,
2650        .readdir        = ocfs2_readdir,
2651        .fsync          = ocfs2_sync_file,
2652        .release        = ocfs2_dir_release,
2653        .open           = ocfs2_dir_open,
2654        .unlocked_ioctl = ocfs2_ioctl,
2655#ifdef CONFIG_COMPAT
2656        .compat_ioctl   = ocfs2_compat_ioctl,
2657#endif
2658        .lock           = ocfs2_lock,
2659        .flock          = ocfs2_flock,
2660};
2661
2662/*
2663 * POSIX-lockless variants of our file_operations.
2664 *
2665 * These will be used if the underlying cluster stack does not support
2666 * posix file locking, if the user passes the "localflocks" mount
2667 * option, or if we have a local-only fs.
2668 *
2669 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2670 * so we still want it in the case of no stack support for
2671 * plocks. Internally, it will do the right thing when asked to ignore
2672 * the cluster.
2673 */
2674const struct file_operations ocfs2_fops_no_plocks = {
2675        .llseek         = generic_file_llseek,
2676        .read           = do_sync_read,
2677        .write          = do_sync_write,
2678        .mmap           = ocfs2_mmap,
2679        .fsync          = ocfs2_sync_file,
2680        .release        = ocfs2_file_release,
2681        .open           = ocfs2_file_open,
2682        .aio_read       = ocfs2_file_aio_read,
2683        .aio_write      = ocfs2_file_aio_write,
2684        .unlocked_ioctl = ocfs2_ioctl,
2685#ifdef CONFIG_COMPAT
2686        .compat_ioctl   = ocfs2_compat_ioctl,
2687#endif
2688        .flock          = ocfs2_flock,
2689        .splice_read    = ocfs2_file_splice_read,
2690        .splice_write   = ocfs2_file_splice_write,
2691};
2692
2693const struct file_operations ocfs2_dops_no_plocks = {
2694        .llseek         = generic_file_llseek,
2695        .read           = generic_read_dir,
2696        .readdir        = ocfs2_readdir,
2697        .fsync          = ocfs2_sync_file,
2698        .release        = ocfs2_dir_release,
2699        .open           = ocfs2_dir_open,
2700        .unlocked_ioctl = ocfs2_ioctl,
2701#ifdef CONFIG_COMPAT
2702        .compat_ioctl   = ocfs2_compat_ioctl,
2703#endif
2704        .flock          = ocfs2_flock,
2705};
2706