LXR linux/fs/ocfs2/file.c

   1/* -*- mode: c; c-basic-offset: 8; -*-
   2 * vim: noexpandtab sw=8 ts=8 sts=0:
   3 *
   4 * file.c
   5 *
   6 * File open, close, extend, truncate
   7 *
   8 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   9 *
  10 * This program is free software; you can redistribute it and/or
  11 * modify it under the terms of the GNU General Public
  12 * License as published by the Free Software Foundation; either
  13 * version 2 of the License, or (at your option) any later version.
  14 *
  15 * This program is distributed in the hope that it will be useful,
  16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 * General Public License for more details.
  19 *
  20 * You should have received a copy of the GNU General Public
  21 * License along with this program; if not, write to the
  22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23 * Boston, MA 02111-1307, USA.
  24 */
  25
  26#include <linux/capability.h>
  27#include <linux/fs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/highmem.h>
  31#include <linux/pagemap.h>
  32#include <linux/uio.h>
  33#include <linux/sched.h>
  34#include <linux/splice.h>
  35#include <linux/mount.h>
  36#include <linux/writeback.h>
  37#include <linux/falloc.h>
  38#include <linux/quotaops.h>
  39
  40#define MLOG_MASK_PREFIX ML_INODE
  41#include <cluster/masklog.h>
  42
  43#include "ocfs2.h"
  44
  45#include "alloc.h"
  46#include "aops.h"
  47#include "dir.h"
  48#include "dlmglue.h"
  49#include "extent_map.h"
  50#include "file.h"
  51#include "sysfile.h"
  52#include "inode.h"
  53#include "ioctl.h"
  54#include "journal.h"
  55#include "locks.h"
  56#include "mmap.h"
  57#include "suballoc.h"
  58#include "super.h"
  59#include "xattr.h"
  60#include "acl.h"
  61#include "quota.h"
  62#include "refcounttree.h"
  63
  64#include "buffer_head_io.h"
  65
  66static int ocfs2_sync_inode(struct inode *inode)
  67{
  68        filemap_fdatawrite(inode->i_mapping);
  69        return sync_mapping_buffers(inode->i_mapping);
  70}
  71
  72static int ocfs2_init_file_private(struct inode *inode, struct file *file)
  73{
  74        struct ocfs2_file_private *fp;
  75
  76        fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
  77        if (!fp)
  78                return -ENOMEM;
  79
  80        fp->fp_file = file;
  81        mutex_init(&fp->fp_mutex);
  82        ocfs2_file_lock_res_init(&fp->fp_flock, fp);
  83        file->private_data = fp;
  84
  85        return 0;
  86}
  87
  88static void ocfs2_free_file_private(struct inode *inode, struct file *file)
  89{
  90        struct ocfs2_file_private *fp = file->private_data;
  91        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  92
  93        if (fp) {
  94                ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
  95                ocfs2_lock_res_free(&fp->fp_flock);
  96                kfree(fp);
  97                file->private_data = NULL;
  98        }
  99}
 100
 101static int ocfs2_file_open(struct inode *inode, struct file *file)
 102{
 103        int status;
 104        int mode = file->f_flags;
 105        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 106
 107        mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
 108                   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
 109
 110        spin_lock(&oi->ip_lock);
 111
 112        /* Check that the inode hasn't been wiped from disk by another
 113         * node. If it hasn't then we're safe as long as we hold the
 114         * spin lock until our increment of open count. */
 115        if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 116                spin_unlock(&oi->ip_lock);
 117
 118                status = -ENOENT;
 119                goto leave;
 120        }
 121
 122        if (mode & O_DIRECT)
 123                oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
 124
 125        oi->ip_open_count++;
 126        spin_unlock(&oi->ip_lock);
 127
 128        status = ocfs2_init_file_private(inode, file);
 129        if (status) {
 130                /*
 131                 * We want to set open count back if we're failing the
 132                 * open.
 133                 */
 134                spin_lock(&oi->ip_lock);
 135                oi->ip_open_count--;
 136                spin_unlock(&oi->ip_lock);
 137        }
 138
 139leave:
 140        mlog_exit(status);
 141        return status;
 142}
 143
 144static int ocfs2_file_release(struct inode *inode, struct file *file)
 145{
 146        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 147
 148        mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
 149                       file->f_path.dentry->d_name.len,
 150                       file->f_path.dentry->d_name.name);
 151
 152        spin_lock(&oi->ip_lock);
 153        if (!--oi->ip_open_count)
 154                oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
 155        spin_unlock(&oi->ip_lock);
 156
 157        ocfs2_free_file_private(inode, file);
 158
 159        mlog_exit(0);
 160
 161        return 0;
 162}
 163
 164static int ocfs2_dir_open(struct inode *inode, struct file *file)
 165{
 166        return ocfs2_init_file_private(inode, file);
 167}
 168
 169static int ocfs2_dir_release(struct inode *inode, struct file *file)
 170{
 171        ocfs2_free_file_private(inode, file);
 172        return 0;
 173}
 174
 175static int ocfs2_sync_file(struct file *file,
 176                           struct dentry *dentry,
 177                           int datasync)
 178{
 179        int err = 0;
 180        journal_t *journal;
 181        struct inode *inode = dentry->d_inode;
 182        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 183
 184        mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
 185                   dentry->d_name.len, dentry->d_name.name);
 186
 187        err = ocfs2_sync_inode(dentry->d_inode);
 188        if (err)
 189                goto bail;
 190
 191        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 192                goto bail;
 193
 194        journal = osb->journal->j_journal;
 195        err = jbd2_journal_force_commit(journal);
 196
 197bail:
 198        mlog_exit(err);
 199
 200        return (err < 0) ? -EIO : 0;
 201}
 202
 203int ocfs2_should_update_atime(struct inode *inode,
 204                              struct vfsmount *vfsmnt)
 205{
 206        struct timespec now;
 207        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 208
 209        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 210                return 0;
 211
 212        if ((inode->i_flags & S_NOATIME) ||
 213            ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
 214                return 0;
 215
 216        /*
 217         * We can be called with no vfsmnt structure - NFSD will
 218         * sometimes do this.
 219         *
 220         * Note that our action here is different than touch_atime() -
 221         * if we can't tell whether this is a noatime mount, then we
 222         * don't know whether to trust the value of s_atime_quantum.
 223         */
 224        if (vfsmnt == NULL)
 225                return 0;
 226
 227        if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
 228            ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 229                return 0;
 230
 231        if (vfsmnt->mnt_flags & MNT_RELATIME) {
 232                if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
 233                    (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
 234                        return 1;
 235
 236                return 0;
 237        }
 238
 239        now = CURRENT_TIME;
 240        if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
 241                return 0;
 242        else
 243                return 1;
 244}
 245
 246int ocfs2_update_inode_atime(struct inode *inode,
 247                             struct buffer_head *bh)
 248{
 249        int ret;
 250        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 251        handle_t *handle;
 252        struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 253
 254        mlog_entry_void();
 255
 256        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 257        if (IS_ERR(handle)) {
 258                ret = PTR_ERR(handle);
 259                mlog_errno(ret);
 260                goto out;
 261        }
 262
 263        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 264                                      OCFS2_JOURNAL_ACCESS_WRITE);
 265        if (ret) {
 266                mlog_errno(ret);
 267                goto out_commit;
 268        }
 269
 270        /*
 271         * Don't use ocfs2_mark_inode_dirty() here as we don't always
 272         * have i_mutex to guard against concurrent changes to other
 273         * inode fields.
 274         */
 275        inode->i_atime = CURRENT_TIME;
 276        di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 277        di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
 278
 279        ret = ocfs2_journal_dirty(handle, bh);
 280        if (ret < 0)
 281                mlog_errno(ret);
 282
 283out_commit:
 284        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 285out:
 286        mlog_exit(ret);
 287        return ret;
 288}
 289
 290static int ocfs2_set_inode_size(handle_t *handle,
 291                                struct inode *inode,
 292                                struct buffer_head *fe_bh,
 293                                u64 new_i_size)
 294{
 295        int status;
 296
 297        mlog_entry_void();
 298        i_size_write(inode, new_i_size);
 299        inode->i_blocks = ocfs2_inode_sector_count(inode);
 300        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 301
 302        status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 303        if (status < 0) {
 304                mlog_errno(status);
 305                goto bail;
 306        }
 307
 308bail:
 309        mlog_exit(status);
 310        return status;
 311}
 312
 313int ocfs2_simple_size_update(struct inode *inode,
 314                             struct buffer_head *di_bh,
 315                             u64 new_i_size)
 316{
 317        int ret;
 318        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 319        handle_t *handle = NULL;
 320
 321        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 322        if (IS_ERR(handle)) {
 323                ret = PTR_ERR(handle);
 324                mlog_errno(ret);
 325                goto out;
 326        }
 327
 328        ret = ocfs2_set_inode_size(handle, inode, di_bh,
 329                                   new_i_size);
 330        if (ret < 0)
 331                mlog_errno(ret);
 332
 333        ocfs2_commit_trans(osb, handle);
 334out:
 335        return ret;
 336}
 337
 338static int ocfs2_cow_file_pos(struct inode *inode,
 339                              struct buffer_head *fe_bh,
 340                              u64 offset)
 341{
 342        int status;
 343        u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
 344        unsigned int num_clusters = 0;
 345        unsigned int ext_flags = 0;
 346
 347        /*
 348         * If the new offset is aligned to the range of the cluster, there is
 349         * no space for ocfs2_zero_range_for_truncate to fill, so no need to
 350         * CoW either.
 351         */
 352        if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
 353                return 0;
 354
 355        status = ocfs2_get_clusters(inode, cpos, &phys,
 356                                    &num_clusters, &ext_flags);
 357        if (status) {
 358                mlog_errno(status);
 359                goto out;
 360        }
 361
 362        if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
 363                goto out;
 364
 365        return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
 366
 367out:
 368        return status;
 369}
 370
 371static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 372                                     struct inode *inode,
 373                                     struct buffer_head *fe_bh,
 374                                     u64 new_i_size)
 375{
 376        int status;
 377        handle_t *handle;
 378        struct ocfs2_dinode *di;
 379        u64 cluster_bytes;
 380
 381        mlog_entry_void();
 382
 383        /*
 384         * We need to CoW the cluster contains the offset if it is reflinked
 385         * since we will call ocfs2_zero_range_for_truncate later which will
 386         * write "0" from offset to the end of the cluster.
 387         */
 388        status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
 389        if (status) {
 390                mlog_errno(status);
 391                return status;
 392        }
 393
 394        /* TODO: This needs to actually orphan the inode in this
 395         * transaction. */
 396
 397        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 398        if (IS_ERR(handle)) {
 399                status = PTR_ERR(handle);
 400                mlog_errno(status);
 401                goto out;
 402        }
 403
 404        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
 405                                         OCFS2_JOURNAL_ACCESS_WRITE);
 406        if (status < 0) {
 407                mlog_errno(status);
 408                goto out_commit;
 409        }
 410
 411        /*
 412         * Do this before setting i_size.
 413         */
 414        cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
 415        status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
 416                                               cluster_bytes);
 417        if (status) {
 418                mlog_errno(status);
 419                goto out_commit;
 420        }
 421
 422        i_size_write(inode, new_i_size);
 423        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 424
 425        di = (struct ocfs2_dinode *) fe_bh->b_data;
 426        di->i_size = cpu_to_le64(new_i_size);
 427        di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 428        di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 429
 430        status = ocfs2_journal_dirty(handle, fe_bh);
 431        if (status < 0)
 432                mlog_errno(status);
 433
 434out_commit:
 435        ocfs2_commit_trans(osb, handle);
 436out:
 437
 438        mlog_exit(status);
 439        return status;
 440}
 441
 442static int ocfs2_truncate_file(struct inode *inode,
 443                               struct buffer_head *di_bh,
 444                               u64 new_i_size)
 445{
 446        int status = 0;
 447        struct ocfs2_dinode *fe = NULL;
 448        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 449        struct ocfs2_truncate_context *tc = NULL;
 450
 451        mlog_entry("(inode = %llu, new_i_size = %llu\n",
 452                   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 453                   (unsigned long long)new_i_size);
 454
 455        /* We trust di_bh because it comes from ocfs2_inode_lock(), which
 456         * already validated it */
 457        fe = (struct ocfs2_dinode *) di_bh->b_data;
 458
 459        mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 460                        "Inode %llu, inode i_size = %lld != di "
 461                        "i_size = %llu, i_flags = 0x%x\n",
 462                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
 463                        i_size_read(inode),
 464                        (unsigned long long)le64_to_cpu(fe->i_size),
 465                        le32_to_cpu(fe->i_flags));
 466
 467        if (new_i_size > le64_to_cpu(fe->i_size)) {
 468                mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
 469                     (unsigned long long)le64_to_cpu(fe->i_size),
 470                     (unsigned long long)new_i_size);
 471                status = -EINVAL;
 472                mlog_errno(status);
 473                goto bail;
 474        }
 475
 476        mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
 477             (unsigned long long)le64_to_cpu(fe->i_blkno),
 478             (unsigned long long)le64_to_cpu(fe->i_size),
 479             (unsigned long long)new_i_size);
 480
 481        /* lets handle the simple truncate cases before doing any more
 482         * cluster locking. */
 483        if (new_i_size == le64_to_cpu(fe->i_size))
 484                goto bail;
 485
 486        down_write(&OCFS2_I(inode)->ip_alloc_sem);
 487
 488        /*
 489         * The inode lock forced other nodes to sync and drop their
 490         * pages, which (correctly) happens even if we have a truncate
 491         * without allocation change - ocfs2 cluster sizes can be much
 492         * greater than page size, so we have to truncate them
 493         * anyway.
 494         */
 495        unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
 496        truncate_inode_pages(inode->i_mapping, new_i_size);
 497
 498        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 499                status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
 500                                               i_size_read(inode), 1);
 501                if (status)
 502                        mlog_errno(status);
 503
 504                goto bail_unlock_sem;
 505        }
 506
 507        /* alright, we're going to need to do a full blown alloc size
 508         * change. Orphan the inode so that recovery can complete the
 509         * truncate if necessary. This does the task of marking
 510         * i_size. */
 511        status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 512        if (status < 0) {
 513                mlog_errno(status);
 514                goto bail_unlock_sem;
 515        }
 516
 517        status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 518        if (status < 0) {
 519                mlog_errno(status);
 520                goto bail_unlock_sem;
 521        }
 522
 523        status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 524        if (status < 0) {
 525                mlog_errno(status);
 526                goto bail_unlock_sem;
 527        }
 528
 529        /* TODO: orphan dir cleanup here. */
 530bail_unlock_sem:
 531        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 532
 533bail:
 534        if (!status && OCFS2_I(inode)->ip_clusters == 0)
 535                status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 536
 537        mlog_exit(status);
 538        return status;
 539}
 540
 541/*
 542 * extend file allocation only here.
 543 * we'll update all the disk stuff, and oip->alloc_size
 544 *
 545 * expect stuff to be locked, a transaction started and enough data /
 546 * metadata reservations in the contexts.
 547 *
 548 * Will return -EAGAIN, and a reason if a restart is needed.
 549 * If passed in, *reason will always be set, even in error.
 550 */
 551int ocfs2_add_inode_data(struct ocfs2_super *osb,
 552                         struct inode *inode,
 553                         u32 *logical_offset,
 554                         u32 clusters_to_add,
 555                         int mark_unwritten,
 556                         struct buffer_head *fe_bh,
 557                         handle_t *handle,
 558                         struct ocfs2_alloc_context *data_ac,
 559                         struct ocfs2_alloc_context *meta_ac,
 560                         enum ocfs2_alloc_restarted *reason_ret)
 561{
 562        int ret;
 563        struct ocfs2_extent_tree et;
 564
 565        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
 566        ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
 567                                          clusters_to_add, mark_unwritten,
 568                                          data_ac, meta_ac, reason_ret);
 569
 570        return ret;
 571}
 572
 573static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 574                                     u32 clusters_to_add, int mark_unwritten)
 575{
 576        int status = 0;
 577        int restart_func = 0;
 578        int credits;
 579        u32 prev_clusters;
 580        struct buffer_head *bh = NULL;
 581        struct ocfs2_dinode *fe = NULL;
 582        handle_t *handle = NULL;
 583        struct ocfs2_alloc_context *data_ac = NULL;
 584        struct ocfs2_alloc_context *meta_ac = NULL;
 585        enum ocfs2_alloc_restarted why;
 586        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 587        struct ocfs2_extent_tree et;
 588        int did_quota = 0;
 589
 590        mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 591
 592        /*
 593         * This function only exists for file systems which don't
 594         * support holes.
 595         */
 596        BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 597
 598        status = ocfs2_read_inode_block(inode, &bh);
 599        if (status < 0) {
 600                mlog_errno(status);
 601                goto leave;
 602        }
 603        fe = (struct ocfs2_dinode *) bh->b_data;
 604
 605restart_all:
 606        BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 607
 608        mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
 609             "clusters_to_add = %u\n",
 610             (unsigned long long)OCFS2_I(inode)->ip_blkno,
 611             (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
 612             clusters_to_add);
 613        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
 614        status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 615                                       &data_ac, &meta_ac);
 616        if (status) {
 617                mlog_errno(status);
 618                goto leave;
 619        }
 620
 621        credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
 622                                            clusters_to_add);
 623        handle = ocfs2_start_trans(osb, credits);
 624        if (IS_ERR(handle)) {
 625                status = PTR_ERR(handle);
 626                handle = NULL;
 627                mlog_errno(status);
 628                goto leave;
 629        }
 630
 631restarted_transaction:
 632        if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
 633            clusters_to_add))) {
 634                status = -EDQUOT;
 635                goto leave;
 636        }
 637        did_quota = 1;
 638
 639        /* reserve a write to the file entry early on - that we if we
 640         * run out of credits in the allocation path, we can still
 641         * update i_size. */
 642        status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 643                                         OCFS2_JOURNAL_ACCESS_WRITE);
 644        if (status < 0) {
 645                mlog_errno(status);
 646                goto leave;
 647        }
 648
 649        prev_clusters = OCFS2_I(inode)->ip_clusters;
 650
 651        status = ocfs2_add_inode_data(osb,
 652                                      inode,
 653                                      &logical_start,
 654                                      clusters_to_add,
 655                                      mark_unwritten,
 656                                      bh,
 657                                      handle,
 658                                      data_ac,
 659                                      meta_ac,
 660                                      &why);
 661        if ((status < 0) && (status != -EAGAIN)) {
 662                if (status != -ENOSPC)
 663                        mlog_errno(status);
 664                goto leave;
 665        }
 666
 667        status = ocfs2_journal_dirty(handle, bh);
 668        if (status < 0) {
 669                mlog_errno(status);
 670                goto leave;
 671        }
 672
 673        spin_lock(&OCFS2_I(inode)->ip_lock);
 674        clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 675        spin_unlock(&OCFS2_I(inode)->ip_lock);
 676        /* Release unused quota reservation */
 677        vfs_dq_free_space(inode,
 678                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 679        did_quota = 0;
 680
 681        if (why != RESTART_NONE && clusters_to_add) {
 682                if (why == RESTART_META) {
 683                        mlog(0, "restarting function.\n");
 684                        restart_func = 1;
 685                } else {
 686                        BUG_ON(why != RESTART_TRANS);
 687
 688                        mlog(0, "restarting transaction.\n");
 689                        /* TODO: This can be more intelligent. */
 690                        credits = ocfs2_calc_extend_credits(osb->sb,
 691                                                            &fe->id2.i_list,
 692                                                            clusters_to_add);
 693                        status = ocfs2_extend_trans(handle, credits);
 694                        if (status < 0) {
 695                                /* handle still has to be committed at
 696                                 * this point. */
 697                                status = -ENOMEM;
 698                                mlog_errno(status);
 699                                goto leave;
 700                        }
 701                        goto restarted_transaction;
 702                }
 703        }
 704
 705        mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
 706             le32_to_cpu(fe->i_clusters),
 707             (unsigned long long)le64_to_cpu(fe->i_size));
 708        mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
 709             OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 710
 711leave:
 712        if (status < 0 && did_quota)
 713                vfs_dq_free_space(inode,
 714                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 715        if (handle) {
 716                ocfs2_commit_trans(osb, handle);
 717                handle = NULL;
 718        }
 719        if (data_ac) {
 720                ocfs2_free_alloc_context(data_ac);
 721                data_ac = NULL;
 722        }
 723        if (meta_ac) {
 724                ocfs2_free_alloc_context(meta_ac);
 725                meta_ac = NULL;
 726        }
 727        if ((!status) && restart_func) {
 728                restart_func = 0;
 729                goto restart_all;
 730        }
 731        brelse(bh);
 732        bh = NULL;
 733
 734        mlog_exit(status);
 735        return status;
 736}
 737
 738/* Some parts of this taken from generic_cont_expand, which turned out
 739 * to be too fragile to do exactly what we need without us having to
 740 * worry about recursive locking in ->write_begin() and ->write_end(). */
 741static int ocfs2_write_zero_page(struct inode *inode,
 742                                 u64 size)
 743{
 744        struct address_space *mapping = inode->i_mapping;
 745        struct page *page;
 746        unsigned long index;
 747        unsigned int offset;
 748        handle_t *handle = NULL;
 749        int ret;
 750
 751        offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
 752        /* ugh.  in prepare/commit_write, if from==to==start of block, we 
 753        ** skip the prepare.  make sure we never send an offset for the start
 754        ** of a block
 755        */
 756        if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
 757                offset++;
 758        }
 759        index = size >> PAGE_CACHE_SHIFT;
 760
 761        page = grab_cache_page(mapping, index);
 762        if (!page) {
 763                ret = -ENOMEM;
 764                mlog_errno(ret);
 765                goto out;
 766        }
 767
 768        ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
 769        if (ret < 0) {
 770                mlog_errno(ret);
 771                goto out_unlock;
 772        }
 773
 774        if (ocfs2_should_order_data(inode)) {
 775                handle = ocfs2_start_walk_page_trans(inode, page, offset,
 776                                                     offset);
 777                if (IS_ERR(handle)) {
 778                        ret = PTR_ERR(handle);
 779                        handle = NULL;
 780                        goto out_unlock;
 781                }
 782        }
 783
 784        /* must not update i_size! */
 785        ret = block_commit_write(page, offset, offset);
 786        if (ret < 0)
 787                mlog_errno(ret);
 788        else
 789                ret = 0;
 790
 791        if (handle)
 792                ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 793out_unlock:
 794        unlock_page(page);
 795        page_cache_release(page);
 796out:
 797        return ret;
 798}
 799
 800static int ocfs2_zero_extend(struct inode *inode,
 801                             u64 zero_to_size)
 802{
 803        int ret = 0;
 804        u64 start_off;
 805        struct super_block *sb = inode->i_sb;
 806
 807        start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
 808        while (start_off < zero_to_size) {
 809                ret = ocfs2_write_zero_page(inode, start_off);
 810                if (ret < 0) {
 811                        mlog_errno(ret);
 812                        goto out;
 813                }
 814
 815                start_off += sb->s_blocksize;
 816
 817                /*
 818                 * Very large extends have the potential to lock up
 819                 * the cpu for extended periods of time.
 820                 */
 821                cond_resched();
 822        }
 823
 824out:
 825        return ret;
 826}
 827
 828int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
 829{
 830        int ret;
 831        u32 clusters_to_add;
 832        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 833
 834        clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
 835        if (clusters_to_add < oi->ip_clusters)
 836                clusters_to_add = 0;
 837        else
 838                clusters_to_add -= oi->ip_clusters;
 839
 840        if (clusters_to_add) {
 841                ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
 842                                                clusters_to_add, 0);
 843                if (ret) {
 844                        mlog_errno(ret);
 845                        goto out;
 846                }
 847        }
 848
 849        /*
 850         * Call this even if we don't add any clusters to the tree. We
 851         * still need to zero the area between the old i_size and the
 852         * new i_size.
 853         */
 854        ret = ocfs2_zero_extend(inode, zero_to);
 855        if (ret < 0)
 856                mlog_errno(ret);
 857
 858out:
 859        return ret;
 860}
 861
 862static int ocfs2_extend_file(struct inode *inode,
 863                             struct buffer_head *di_bh,
 864                             u64 new_i_size)
 865{
 866        int ret = 0;
 867        struct ocfs2_inode_info *oi = OCFS2_I(inode);
 868
 869        BUG_ON(!di_bh);
 870
 871        /* setattr sometimes calls us like this. */
 872        if (new_i_size == 0)
 873                goto out;
 874
 875        if (i_size_read(inode) == new_i_size)
 876                goto out;
 877        BUG_ON(new_i_size < i_size_read(inode));
 878
 879        /*
 880         * Fall through for converting inline data, even if the fs
 881         * supports sparse files.
 882         *
 883         * The check for inline data here is legal - nobody can add
 884         * the feature since we have i_mutex. We must check it again
 885         * after acquiring ip_alloc_sem though, as paths like mmap
 886         * might have raced us to converting the inode to extents.
 887         */
 888        if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 889            && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
 890                goto out_update_size;
 891
 892        /*
 893         * The alloc sem blocks people in read/write from reading our
 894         * allocation until we're done changing it. We depend on
 895         * i_mutex to block other extend/truncate calls while we're
 896         * here.
 897         */
 898        down_write(&oi->ip_alloc_sem);
 899
 900        if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 901                /*
 902                 * We can optimize small extends by keeping the inodes
 903                 * inline data.
 904                 */
 905                if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
 906                        up_write(&oi->ip_alloc_sem);
 907                        goto out_update_size;
 908                }
 909
 910                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
 911                if (ret) {
 912                        up_write(&oi->ip_alloc_sem);
 913
 914                        mlog_errno(ret);
 915                        goto out;
 916                }
 917        }
 918
 919        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
 920                ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
 921
 922        up_write(&oi->ip_alloc_sem);
 923
 924        if (ret < 0) {
 925                mlog_errno(ret);
 926                goto out;
 927        }
 928
 929out_update_size:
 930        ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
 931        if (ret < 0)
 932                mlog_errno(ret);
 933
 934out:
 935        return ret;
 936}
 937
 938int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 939{
 940        int status = 0, size_change;
 941        struct inode *inode = dentry->d_inode;
 942        struct super_block *sb = inode->i_sb;
 943        struct ocfs2_super *osb = OCFS2_SB(sb);
 944        struct buffer_head *bh = NULL;
 945        handle_t *handle = NULL;
 946        int qtype;
 947        struct dquot *transfer_from[MAXQUOTAS] = { };
 948        struct dquot *transfer_to[MAXQUOTAS] = { };
 949
 950        mlog_entry("(0x%p, '%.*s')\n", dentry,
 951                   dentry->d_name.len, dentry->d_name.name);
 952
 953        /* ensuring we don't even attempt to truncate a symlink */
 954        if (S_ISLNK(inode->i_mode))
 955                attr->ia_valid &= ~ATTR_SIZE;
 956
 957        if (attr->ia_valid & ATTR_MODE)
 958                mlog(0, "mode change: %d\n", attr->ia_mode);
 959        if (attr->ia_valid & ATTR_UID)
 960                mlog(0, "uid change: %d\n", attr->ia_uid);
 961        if (attr->ia_valid & ATTR_GID)
 962                mlog(0, "gid change: %d\n", attr->ia_gid);
 963        if (attr->ia_valid & ATTR_SIZE)
 964                mlog(0, "size change...\n");
 965        if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
 966                mlog(0, "time change...\n");
 967
 968#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
 969                           | ATTR_GID | ATTR_UID | ATTR_MODE)
 970        if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
 971                mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
 972                return 0;
 973        }
 974
 975        status = inode_change_ok(inode, attr);
 976        if (status)
 977                return status;
 978
 979        size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
 980        if (size_change) {
 981                status = ocfs2_rw_lock(inode, 1);
 982                if (status < 0) {
 983                        mlog_errno(status);
 984                        goto bail;
 985                }
 986        }
 987
 988        status = ocfs2_inode_lock(inode, &bh, 1);
 989        if (status < 0) {
 990                if (status != -ENOENT)
 991                        mlog_errno(status);
 992                goto bail_unlock_rw;
 993        }
 994
 995        if (size_change && attr->ia_size != i_size_read(inode)) {
 996                if (attr->ia_size > sb->s_maxbytes) {
 997                        status = -EFBIG;
 998                        goto bail_unlock;
 999                }
1000

1001                if (i_size_read(inode) > attr->ia_size) {
1002                        if (ocfs2_should_order_data(inode)) {
1003                                status = ocfs2_begin_ordered_truncate(inode,
1004                                                                      attr->ia_size);
1005                                if (status)
1006                                        goto bail_unlock;
1007                        }
1008                        status = ocfs2_truncate_file(inode, bh, attr->ia_size);
1009                } else
1010                        status = ocfs2_extend_file(inode, bh, attr->ia_size);
1011                if (status < 0) {
1012                        if (status != -ENOSPC)
1013                                mlog_errno(status);
1014                        status = -ENOSPC;
1015                        goto bail_unlock;
1016                }
1017        }
1018
1019        if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
1020            (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
1021                /*
1022                 * Gather pointers to quota structures so that allocation /
1023                 * freeing of quota structures happens here and not inside
1024                 * vfs_dq_transfer() where we have problems with lock ordering
1025                 */
1026                if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
1027                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1028                    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1029                        transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
1030                                                      USRQUOTA);
1031                        transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
1032                                                        USRQUOTA);
1033                        if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
1034                                status = -ESRCH;
1035                                goto bail_unlock;
1036                        }
1037                }
1038                if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
1039                    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1040                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1041                        transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
1042                                                      GRPQUOTA);
1043                        transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
1044                                                        GRPQUOTA);
1045                        if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
1046                                status = -ESRCH;
1047                                goto bail_unlock;
1048                        }
1049                }
1050                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
1051                                           2 * ocfs2_quota_trans_credits(sb));
1052                if (IS_ERR(handle)) {
1053                        status = PTR_ERR(handle);
1054                        mlog_errno(status);
1055                        goto bail_unlock;
1056                }
1057                status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
1058                if (status < 0)
1059                        goto bail_commit;
1060        } else {
1061                handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1062                if (IS_ERR(handle)) {
1063                        status = PTR_ERR(handle);
1064                        mlog_errno(status);
1065                        goto bail_unlock;
1066                }
1067        }
1068
1069        /*
1070         * This will intentionally not wind up calling vmtruncate(),
1071         * since all the work for a size change has been done above.
1072         * Otherwise, we could get into problems with truncate as
1073         * ip_alloc_sem is used there to protect against i_size
1074         * changes.
1075         */
1076        status = inode_setattr(inode, attr);
1077        if (status < 0) {
1078                mlog_errno(status);
1079                goto bail_commit;
1080        }
1081
1082        status = ocfs2_mark_inode_dirty(handle, inode, bh);
1083        if (status < 0)
1084                mlog_errno(status);
1085
1086bail_commit:
1087        ocfs2_commit_trans(osb, handle);
1088bail_unlock:
1089        ocfs2_inode_unlock(inode, 1);
1090bail_unlock_rw:
1091        if (size_change)
1092                ocfs2_rw_unlock(inode, 1);
1093bail:
1094        brelse(bh);
1095
1096        /* Release quota pointers in case we acquired them */
1097        for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1098                dqput(transfer_to[qtype]);
1099                dqput(transfer_from[qtype]);
1100        }
1101
1102        if (!status && attr->ia_valid & ATTR_MODE) {
1103                status = ocfs2_acl_chmod(inode);
1104                if (status < 0)
1105                        mlog_errno(status);
1106        }
1107
1108        mlog_exit(status);
1109        return status;
1110}
1111
1112int ocfs2_getattr(struct vfsmount *mnt,
1113                  struct dentry *dentry,
1114                  struct kstat *stat)
1115{
1116        struct inode *inode = dentry->d_inode;
1117        struct super_block *sb = dentry->d_inode->i_sb;
1118        struct ocfs2_super *osb = sb->s_fs_info;
1119        int err;
1120
1121        mlog_entry_void();
1122
1123        err = ocfs2_inode_revalidate(dentry);
1124        if (err) {
1125                if (err != -ENOENT)
1126                        mlog_errno(err);
1127                goto bail;
1128        }
1129
1130        generic_fillattr(inode, stat);
1131
1132        /* We set the blksize from the cluster size for performance */
1133        stat->blksize = osb->s_clustersize;
1134
1135bail:
1136        mlog_exit(err);
1137
1138        return err;
1139}
1140
1141int ocfs2_permission(struct inode *inode, int mask)
1142{
1143        int ret;
1144
1145        mlog_entry_void();
1146
1147        ret = ocfs2_inode_lock(inode, NULL, 0);
1148        if (ret) {
1149                if (ret != -ENOENT)
1150                        mlog_errno(ret);
1151                goto out;
1152        }
1153
1154        ret = generic_permission(inode, mask, ocfs2_check_acl);
1155
1156        ocfs2_inode_unlock(inode, 0);
1157out:
1158        mlog_exit(ret);
1159        return ret;
1160}
1161
1162static int __ocfs2_write_remove_suid(struct inode *inode,
1163                                     struct buffer_head *bh)
1164{
1165        int ret;
1166        handle_t *handle;
1167        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1168        struct ocfs2_dinode *di;
1169
1170        mlog_entry("(Inode %llu, mode 0%o)\n",
1171                   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
1172
1173        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1174        if (IS_ERR(handle)) {
1175                ret = PTR_ERR(handle);
1176                mlog_errno(ret);
1177                goto out;
1178        }
1179
1180        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1181                                      OCFS2_JOURNAL_ACCESS_WRITE);
1182        if (ret < 0) {
1183                mlog_errno(ret);
1184                goto out_trans;
1185        }
1186
1187        inode->i_mode &= ~S_ISUID;
1188        if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1189                inode->i_mode &= ~S_ISGID;
1190
1191        di = (struct ocfs2_dinode *) bh->b_data;
1192        di->i_mode = cpu_to_le16(inode->i_mode);
1193
1194        ret = ocfs2_journal_dirty(handle, bh);
1195        if (ret < 0)
1196                mlog_errno(ret);
1197
1198out_trans:
1199        ocfs2_commit_trans(osb, handle);
1200out:
1201        mlog_exit(ret);
1202        return ret;
1203}
1204
1205/*
1206 * Will look for holes and unwritten extents in the range starting at
1207 * pos for count bytes (inclusive).
1208 */
1209static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1210                                       size_t count)
1211{
1212        int ret = 0;
1213        unsigned int extent_flags;
1214        u32 cpos, clusters, extent_len, phys_cpos;
1215        struct super_block *sb = inode->i_sb;
1216
1217        cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1218        clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1219
1220        while (clusters) {
1221                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1222                                         &extent_flags);
1223                if (ret < 0) {
1224                        mlog_errno(ret);
1225                        goto out;
1226                }
1227
1228                if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1229                        ret = 1;
1230                        break;
1231                }
1232
1233                if (extent_len > clusters)
1234                        extent_len = clusters;
1235
1236                clusters -= extent_len;
1237                cpos += extent_len;
1238        }
1239out:
1240        return ret;
1241}
1242
1243static int ocfs2_write_remove_suid(struct inode *inode)
1244{
1245        int ret;
1246        struct buffer_head *bh = NULL;
1247
1248        ret = ocfs2_read_inode_block(inode, &bh);
1249        if (ret < 0) {
1250                mlog_errno(ret);
1251                goto out;
1252        }
1253
1254        ret =  __ocfs2_write_remove_suid(inode, bh);
1255out:
1256        brelse(bh);
1257        return ret;
1258}
1259
1260/*
1261 * Allocate enough extents to cover the region starting at byte offset
1262 * start for len bytes. Existing extents are skipped, any extents
1263 * added are marked as "unwritten".
1264 */
1265static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1266                                            u64 start, u64 len)
1267{
1268        int ret;
1269        u32 cpos, phys_cpos, clusters, alloc_size;
1270        u64 end = start + len;
1271        struct buffer_head *di_bh = NULL;
1272
1273        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1274                ret = ocfs2_read_inode_block(inode, &di_bh);
1275                if (ret) {
1276                        mlog_errno(ret);
1277                        goto out;
1278                }
1279
1280                /*
1281                 * Nothing to do if the requested reservation range
1282                 * fits within the inode.
1283                 */
1284                if (ocfs2_size_fits_inline_data(di_bh, end))
1285                        goto out;
1286
1287                ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
1288                if (ret) {
1289                        mlog_errno(ret);
1290                        goto out;
1291                }
1292        }
1293
1294        /*
1295         * We consider both start and len to be inclusive.
1296         */
1297        cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1298        clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
1299        clusters -= cpos;
1300
1301        while (clusters) {
1302                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1303                                         &alloc_size, NULL);
1304                if (ret) {
1305                        mlog_errno(ret);
1306                        goto out;
1307                }
1308
1309                /*
1310                 * Hole or existing extent len can be arbitrary, so
1311                 * cap it to our own allocation request.
1312                 */
1313                if (alloc_size > clusters)
1314                        alloc_size = clusters;
1315
1316                if (phys_cpos) {
1317                        /*
1318                         * We already have an allocation at this
1319                         * region so we can safely skip it.
1320                         */
1321                        goto next;
1322                }
1323
1324                ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
1325                if (ret) {
1326                        if (ret != -ENOSPC)
1327                                mlog_errno(ret);
1328                        goto out;
1329                }
1330
1331next:
1332                cpos += alloc_size;
1333                clusters -= alloc_size;
1334        }
1335
1336        ret = 0;
1337out:
1338
1339        brelse(di_bh);
1340        return ret;
1341}
1342
1343/*
1344 * Truncate a byte range, avoiding pages within partial clusters. This
1345 * preserves those pages for the zeroing code to write to.
1346 */
1347static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
1348                                         u64 byte_len)
1349{
1350        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1351        loff_t start, end;
1352        struct address_space *mapping = inode->i_mapping;
1353
1354        start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
1355        end = byte_start + byte_len;
1356        end = end & ~(osb->s_clustersize - 1);
1357
1358        if (start < end) {
1359                unmap_mapping_range(mapping, start, end - start, 0);
1360                truncate_inode_pages_range(mapping, start, end - 1);
1361        }
1362}
1363
1364static int ocfs2_zero_partial_clusters(struct inode *inode,
1365                                       u64 start, u64 len)
1366{
1367        int ret = 0;
1368        u64 tmpend, end = start + len;
1369        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1370        unsigned int csize = osb->s_clustersize;
1371        handle_t *handle;
1372
1373        /*
1374         * The "start" and "end" values are NOT necessarily part of
1375         * the range whose allocation is being deleted. Rather, this
1376         * is what the user passed in with the request. We must zero
1377         * partial clusters here. There's no need to worry about
1378         * physical allocation - the zeroing code knows to skip holes.
1379         */
1380        mlog(0, "byte start: %llu, end: %llu\n",
1381             (unsigned long long)start, (unsigned long long)end);
1382
1383        /*
1384         * If both edges are on a cluster boundary then there's no
1385         * zeroing required as the region is part of the allocation to
1386         * be truncated.
1387         */
1388        if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
1389                goto out;
1390
1391        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1392        if (IS_ERR(handle)) {
1393                ret = PTR_ERR(handle);
1394                mlog_errno(ret);
1395                goto out;
1396        }
1397
1398        /*
1399         * We want to get the byte offset of the end of the 1st cluster.
1400         */
1401        tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
1402        if (tmpend > end)
1403                tmpend = end;
1404
1405        mlog(0, "1st range: start: %llu, tmpend: %llu\n",
1406             (unsigned long long)start, (unsigned long long)tmpend);
1407
1408        ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1409        if (ret)
1410                mlog_errno(ret);
1411
1412        if (tmpend < end) {
1413                /*
1414                 * This may make start and end equal, but the zeroing
1415                 * code will skip any work in that case so there's no
1416                 * need to catch it up here.
1417                 */
1418                start = end & ~(osb->s_clustersize - 1);
1419
1420                mlog(0, "2nd range: start: %llu, end: %llu\n",
1421                     (unsigned long long)start, (unsigned long long)end);
1422
1423                ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1424                if (ret)
1425                        mlog_errno(ret);
1426        }
1427
1428        ocfs2_commit_trans(osb, handle);
1429out:
1430        return ret;
1431}
1432
1433static int ocfs2_remove_inode_range(struct inode *inode,
1434                                    struct buffer_head *di_bh, u64 byte_start,
1435                                    u64 byte_len)
1436{
1437        int ret = 0;
1438        u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
1439        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1440        struct ocfs2_cached_dealloc_ctxt dealloc;
1441        struct address_space *mapping = inode->i_mapping;
1442        struct ocfs2_extent_tree et;
1443
1444        ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1445        ocfs2_init_dealloc_ctxt(&dealloc);
1446
1447        if (byte_len == 0)
1448                return 0;
1449
1450        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1451                ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
1452                                            byte_start + byte_len, 0);
1453                if (ret) {
1454                        mlog_errno(ret);
1455                        goto out;
1456                }
1457                /*
1458                 * There's no need to get fancy with the page cache
1459                 * truncate of an inline-data inode. We're talking
1460                 * about less than a page here, which will be cached
1461                 * in the dinode buffer anyway.
1462                 */
1463                unmap_mapping_range(mapping, 0, 0, 0);
1464                truncate_inode_pages(mapping, 0);
1465                goto out;
1466        }
1467
1468        trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
1469        trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
1470        if (trunc_len >= trunc_start)
1471                trunc_len -= trunc_start;
1472        else
1473                trunc_len = 0;
1474
1475        mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
1476             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1477             (unsigned long long)byte_start,
1478             (unsigned long long)byte_len, trunc_start, trunc_len);
1479
1480        ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1481        if (ret) {
1482                mlog_errno(ret);
1483                goto out;
1484        }
1485
1486        cpos = trunc_start;
1487        while (trunc_len) {
1488                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
1489                                         &alloc_size, NULL);
1490                if (ret) {
1491                        mlog_errno(ret);
1492                        goto out;
1493                }
1494
1495                if (alloc_size > trunc_len)
1496                        alloc_size = trunc_len;
1497
1498                /* Only do work for non-holes */
1499                if (phys_cpos != 0) {
1500                        ret = ocfs2_remove_btree_range(inode, &et, cpos,
1501                                                       phys_cpos, alloc_size,
1502                                                       &dealloc);
1503                        if (ret) {
1504                                mlog_errno(ret);
1505                                goto out;
1506                        }
1507                }
1508
1509                cpos += alloc_size;
1510                trunc_len -= alloc_size;
1511        }
1512
1513        ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
1514
1515out:
1516        ocfs2_schedule_truncate_log_flush(osb, 1);
1517        ocfs2_run_deallocs(osb, &dealloc);
1518
1519        return ret;
1520}
1521
1522/*
1523 * Parts of this function taken from xfs_change_file_space()
1524 */
1525static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
1526                                     loff_t f_pos, unsigned int cmd,
1527                                     struct ocfs2_space_resv *sr,
1528                                     int change_size)
1529{
1530        int ret;
1531        s64 llen;
1532        loff_t size;
1533        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1534        struct buffer_head *di_bh = NULL;
1535        handle_t *handle;
1536        unsigned long long max_off = inode->i_sb->s_maxbytes;
1537
1538        if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
1539                return -EROFS;
1540
1541        mutex_lock(&inode->i_mutex);
1542
1543        /*
1544         * This prevents concurrent writes on other nodes
1545         */
1546        ret = ocfs2_rw_lock(inode, 1);
1547        if (ret) {
1548                mlog_errno(ret);
1549                goto out;
1550        }
1551
1552        ret = ocfs2_inode_lock(inode, &di_bh, 1);
1553        if (ret) {
1554                mlog_errno(ret);
1555                goto out_rw_unlock;
1556        }
1557
1558        if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
1559                ret = -EPERM;
1560                goto out_inode_unlock;
1561        }
1562
1563        switch (sr->l_whence) {
1564        case 0: /*SEEK_SET*/
1565                break;
1566        case 1: /*SEEK_CUR*/
1567                sr->l_start += f_pos;
1568                break;
1569        case 2: /*SEEK_END*/
1570                sr->l_start += i_size_read(inode);
1571                break;
1572        default:
1573                ret = -EINVAL;
1574                goto out_inode_unlock;
1575        }
1576        sr->l_whence = 0;
1577
1578        llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
1579
1580        if (sr->l_start < 0
1581            || sr->l_start > max_off
1582            || (sr->l_start + llen) < 0
1583            || (sr->l_start + llen) > max_off) {
1584                ret = -EINVAL;
1585                goto out_inode_unlock;
1586        }
1587        size = sr->l_start + sr->l_len;
1588
1589        if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
1590                if (sr->l_len <= 0) {
1591                        ret = -EINVAL;
1592                        goto out_inode_unlock;
1593                }
1594        }
1595
1596        if (file && should_remove_suid(file->f_path.dentry)) {
1597                ret = __ocfs2_write_remove_suid(inode, di_bh);
1598                if (ret) {
1599                        mlog_errno(ret);
1600                        goto out_inode_unlock;
1601                }
1602        }
1603
1604        down_write(&OCFS2_I(inode)->ip_alloc_sem);
1605        switch (cmd) {
1606        case OCFS2_IOC_RESVSP:
1607        case OCFS2_IOC_RESVSP64:
1608                /*
1609                 * This takes unsigned offsets, but the signed ones we
1610                 * pass have been checked against overflow above.
1611                 */
1612                ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
1613                                                       sr->l_len);
1614                break;
1615        case OCFS2_IOC_UNRESVSP:
1616        case OCFS2_IOC_UNRESVSP64:
1617                ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
1618                                               sr->l_len);
1619                break;
1620        default:
1621                ret = -EINVAL;
1622        }
1623        up_write(&OCFS2_I(inode)->ip_alloc_sem);
1624        if (ret) {
1625                mlog_errno(ret);
1626                goto out_inode_unlock;
1627        }
1628
1629        /*
1630         * We update c/mtime for these changes
1631         */
1632        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1633        if (IS_ERR(handle)) {
1634                ret = PTR_ERR(handle);
1635                mlog_errno(ret);
1636                goto out_inode_unlock;
1637        }
1638
1639        if (change_size && i_size_read(inode) < size)
1640                i_size_write(inode, size);
1641
1642        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1643        ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
1644        if (ret < 0)
1645                mlog_errno(ret);
1646
1647        ocfs2_commit_trans(osb, handle);
1648
1649out_inode_unlock:
1650        brelse(di_bh);
1651        ocfs2_inode_unlock(inode, 1);
1652out_rw_unlock:
1653        ocfs2_rw_unlock(inode, 1);
1654
1655out:
1656        mutex_unlock(&inode->i_mutex);
1657        return ret;
1658}
1659
1660int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1661                            struct ocfs2_space_resv *sr)
1662{
1663        struct inode *inode = file->f_path.dentry->d_inode;
1664        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1665
1666        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1667            !ocfs2_writes_unwritten_extents(osb))
1668                return -ENOTTY;
1669        else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
1670                 !ocfs2_sparse_alloc(osb))
1671                return -ENOTTY;
1672
1673        if (!S_ISREG(inode->i_mode))
1674                return -EINVAL;
1675
1676        if (!(file->f_mode & FMODE_WRITE))
1677                return -EBADF;
1678
1679        return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
1680}
1681
1682static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1683                            loff_t len)
1684{
1685        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1686        struct ocfs2_space_resv sr;
1687        int change_size = 1;
1688
1689        if (!ocfs2_writes_unwritten_extents(osb))
1690                return -EOPNOTSUPP;
1691
1692        if (S_ISDIR(inode->i_mode))
1693                return -ENODEV;
1694
1695        if (mode & FALLOC_FL_KEEP_SIZE)
1696                change_size = 0;
1697
1698        sr.l_whence = 0;
1699        sr.l_start = (s64)offset;
1700        sr.l_len = (s64)len;
1701
1702        return __ocfs2_change_file_space(NULL, inode, offset,
1703                                         OCFS2_IOC_RESVSP64, &sr, change_size);
1704}
1705
1706int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
1707                                   size_t count)
1708{
1709        int ret = 0;
1710        unsigned int extent_flags;
1711        u32 cpos, clusters, extent_len, phys_cpos;
1712        struct super_block *sb = inode->i_sb;
1713
1714        if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
1715            !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
1716            OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1717                return 0;
1718
1719        cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1720        clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1721
1722        while (clusters) {
1723                ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1724                                         &extent_flags);
1725                if (ret < 0) {
1726                        mlog_errno(ret);
1727                        goto out;
1728                }
1729
1730                if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
1731                        ret = 1;
1732                        break;
1733                }
1734
1735                if (extent_len > clusters)
1736                        extent_len = clusters;
1737
1738                clusters -= extent_len;
1739                cpos += extent_len;
1740        }
1741out:
1742        return ret;
1743}
1744
1745static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
1746                                            loff_t pos, size_t count,
1747                                            int *meta_level)
1748{
1749        int ret;
1750        struct buffer_head *di_bh = NULL;
1751        u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
1752        u32 clusters =
1753                ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
1754
1755        ret = ocfs2_inode_lock(inode, &di_bh, 1);
1756        if (ret) {
1757                mlog_errno(ret);
1758                goto out;
1759        }
1760
1761        *meta_level = 1;
1762
1763        ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
1764        if (ret)
1765                mlog_errno(ret);
1766out:
1767        brelse(di_bh);
1768        return ret;
1769}
1770
1771static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1772                                         loff_t *ppos,
1773                                         size_t count,
1774                                         int appending,
1775                                         int *direct_io)
1776{
1777        int ret = 0, meta_level = 0;
1778        struct inode *inode = dentry->d_inode;
1779        loff_t saved_pos, end;
1780
1781        /* 
1782         * We start with a read level meta lock and only jump to an ex
1783         * if we need to make modifications here.
1784         */
1785        for(;;) {
1786                ret = ocfs2_inode_lock(inode, NULL, meta_level);
1787                if (ret < 0) {
1788                        meta_level = -1;
1789                        mlog_errno(ret);
1790                        goto out;
1791                }
1792
1793                /* Clear suid / sgid if necessary. We do this here
1794                 * instead of later in the write path because
1795                 * remove_suid() calls ->setattr without any hint that
1796                 * we may have already done our cluster locking. Since
1797                 * ocfs2_setattr() *must* take cluster locks to
1798                 * proceeed, this will lead us to recursively lock the
1799                 * inode. There's also the dinode i_size state which
1800                 * can be lost via setattr during extending writes (we
1801                 * set inode->i_size at the end of a write. */
1802                if (should_remove_suid(dentry)) {
1803                        if (meta_level == 0) {
1804                                ocfs2_inode_unlock(inode, meta_level);
1805                                meta_level = 1;
1806                                continue;
1807                        }
1808
1809                        ret = ocfs2_write_remove_suid(inode);
1810                        if (ret < 0) {
1811                                mlog_errno(ret);
1812                                goto out_unlock;
1813                        }
1814                }
1815
1816                /* work on a copy of ppos until we're sure that we won't have
1817                 * to recalculate it due to relocking. */
1818                if (appending) {
1819                        saved_pos = i_size_read(inode);
1820                        mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1821                } else {
1822                        saved_pos = *ppos;
1823                }
1824
1825                end = saved_pos + count;
1826
1827                ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
1828                if (ret == 1) {
1829                        ocfs2_inode_unlock(inode, meta_level);
1830                        meta_level = -1;
1831
1832                        ret = ocfs2_prepare_inode_for_refcount(inode,
1833                                                               saved_pos,
1834                                                               count,
1835                                                               &meta_level);
1836                }
1837
1838                if (ret < 0) {
1839                        mlog_errno(ret);
1840                        goto out_unlock;
1841                }
1842
1843                /*
1844                 * Skip the O_DIRECT checks if we don't need
1845                 * them.
1846                 */
1847                if (!direct_io || !(*direct_io))
1848                        break;
1849
1850                /*
1851                 * There's no sane way to do direct writes to an inode
1852                 * with inline data.
1853                 */
1854                if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1855                        *direct_io = 0;
1856                        break;
1857                }
1858
1859                /*
1860                 * Allowing concurrent direct writes means
1861                 * i_size changes wouldn't be synchronized, so
1862                 * one node could wind up truncating another
1863                 * nodes writes.
1864                 */
1865                if (end > i_size_read(inode)) {
1866                        *direct_io = 0;
1867                        break;
1868                }
1869
1870                /*
1871                 * We don't fill holes during direct io, so
1872                 * check for them here. If any are found, the
1873                 * caller will have to retake some cluster
1874                 * locks and initiate the io as buffered.
1875                 */
1876                ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
1877                if (ret == 1) {
1878                        *direct_io = 0;
1879                        ret = 0;
1880                } else if (ret < 0)
1881                        mlog_errno(ret);
1882                break;
1883        }
1884
1885        if (appending)
1886                *ppos = saved_pos;
1887
1888out_unlock:
1889        if (meta_level >= 0)
1890                ocfs2_inode_unlock(inode, meta_level);
1891
1892out:
1893        return ret;
1894}
1895
1896static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1897                                    const struct iovec *iov,
1898                                    unsigned long nr_segs,
1899                                    loff_t pos)
1900{
1901        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
1902        int can_do_direct;
1903        ssize_t written = 0;
1904        size_t ocount;          /* original count */
1905        size_t count;           /* after file limit checks */
1906        loff_t old_size, *ppos = &iocb->ki_pos;
1907        u32 old_clusters;
1908        struct file *file = iocb->ki_filp;
1909        struct inode *inode = file->f_path.dentry->d_inode;
1910        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1911
1912        mlog_entry("(0x%p, %u, '%.*s')\n", file,
1913                   (unsigned int)nr_segs,
1914                   file->f_path.dentry->d_name.len,
1915                   file->f_path.dentry->d_name.name);
1916
1917        if (iocb->ki_left == 0)
1918                return 0;
1919
1920        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1921
1922        appending = file->f_flags & O_APPEND ? 1 : 0;
1923        direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1924
1925        mutex_lock(&inode->i_mutex);
1926
1927relock:
1928        /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1929        if (direct_io) {
1930                down_read(&inode->i_alloc_sem);
1931                have_alloc_sem = 1;
1932        }
1933
1934        /* concurrent O_DIRECT writes are allowed */
1935        rw_level = !direct_io;
1936        ret = ocfs2_rw_lock(inode, rw_level);
1937        if (ret < 0) {
1938                mlog_errno(ret);
1939                goto out_sems;
1940        }
1941
1942        can_do_direct = direct_io;
1943        ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1944                                            iocb->ki_left, appending,
1945                                            &can_do_direct);
1946        if (ret < 0) {
1947                mlog_errno(ret);
1948                goto out;
1949        }
1950
1951        /*
1952         * We can't complete the direct I/O as requested, fall back to
1953         * buffered I/O.
1954         */
1955        if (direct_io && !can_do_direct) {
1956                ocfs2_rw_unlock(inode, rw_level);
1957                up_read(&inode->i_alloc_sem);
1958
1959                have_alloc_sem = 0;
1960                rw_level = -1;
1961
1962                direct_io = 0;
1963                goto relock;
1964        }
1965
1966        /*
1967         * To later detect whether a journal commit for sync writes is
1968         * necessary, we sample i_size, and cluster count here.
1969         */
1970        old_size = i_size_read(inode);
1971        old_clusters = OCFS2_I(inode)->ip_clusters;
1972
1973        /* communicate with ocfs2_dio_end_io */
1974        ocfs2_iocb_set_rw_locked(iocb, rw_level);
1975
1976        if (direct_io) {
1977                ret = generic_segment_checks(iov, &nr_segs, &ocount,
1978                                             VERIFY_READ);
1979                if (ret)
1980                        goto out_dio;
1981
1982                count = ocount;
1983                ret = generic_write_checks(file, ppos, &count,
1984                                           S_ISBLK(inode->i_mode));
1985                if (ret)
1986                        goto out_dio;
1987
1988                written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1989                                                    ppos, count, ocount);
1990                if (written < 0) {
1991                        /*
1992                         * direct write may have instantiated a few
1993                         * blocks outside i_size. Trim these off again.
1994                         * Don't need i_size_read because we hold i_mutex.
1995                         */
1996                        if (*ppos + count > inode->i_size)
1997                                vmtruncate(inode, inode->i_size);
1998                        ret = written;
1999                        goto out_dio;
2000                }

2001        } else {
2002                written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
2003        }
2004
2005out_dio:
2006        /* buffered aio wouldn't have proper lock coverage today */
2007        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
2008
2009        if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
2010                ret = filemap_fdatawrite_range(file->f_mapping, pos,
2011                                               pos + count - 1);
2012                if (ret < 0)
2013                        written = ret;
2014
2015                if (!ret && (old_size != i_size_read(inode) ||
2016                    old_clusters != OCFS2_I(inode)->ip_clusters)) {
2017                        ret = jbd2_journal_force_commit(osb->journal->j_journal);
2018                        if (ret < 0)
2019                                written = ret;
2020                }
2021
2022                if (!ret)
2023                        ret = filemap_fdatawait_range(file->f_mapping, pos,
2024                                                      pos + count - 1);
2025        }
2026
2027        /* 
2028         * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2029         * function pointer which is called when o_direct io completes so that
2030         * it can unlock our rw lock.  (it's the clustered equivalent of
2031         * i_alloc_sem; protects truncate from racing with pending ios).
2032         * Unfortunately there are error cases which call end_io and others
2033         * that don't.  so we don't have to unlock the rw_lock if either an
2034         * async dio is going to do it in the future or an end_io after an
2035         * error has already done it.
2036         */
2037        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2038                rw_level = -1;
2039                have_alloc_sem = 0;
2040        }
2041
2042out:
2043        if (rw_level != -1)
2044                ocfs2_rw_unlock(inode, rw_level);
2045
2046out_sems:
2047        if (have_alloc_sem)
2048                up_read(&inode->i_alloc_sem);
2049
2050        mutex_unlock(&inode->i_mutex);
2051
2052        if (written)
2053                ret = written;
2054        mlog_exit(ret);
2055        return ret;
2056}
2057
2058static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
2059                                struct file *out,
2060                                struct splice_desc *sd)
2061{
2062        int ret;
2063
2064        ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
2065                                            sd->total_len, 0, NULL);
2066        if (ret < 0) {
2067                mlog_errno(ret);
2068                return ret;
2069        }
2070
2071        return splice_from_pipe_feed(pipe, sd, pipe_to_file);
2072}
2073
2074static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2075                                       struct file *out,
2076                                       loff_t *ppos,
2077                                       size_t len,
2078                                       unsigned int flags)
2079{
2080        int ret;
2081        struct address_space *mapping = out->f_mapping;
2082        struct inode *inode = mapping->host;
2083        struct splice_desc sd = {
2084                .total_len = len,
2085                .flags = flags,
2086                .pos = *ppos,
2087                .u.file = out,
2088        };
2089
2090        mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
2091                   (unsigned int)len,
2092                   out->f_path.dentry->d_name.len,
2093                   out->f_path.dentry->d_name.name);
2094
2095        if (pipe->inode)
2096                mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
2097
2098        splice_from_pipe_begin(&sd);
2099        do {
2100                ret = splice_from_pipe_next(pipe, &sd);
2101                if (ret <= 0)
2102                        break;
2103
2104                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
2105                ret = ocfs2_rw_lock(inode, 1);
2106                if (ret < 0)
2107                        mlog_errno(ret);
2108                else {
2109                        ret = ocfs2_splice_to_file(pipe, out, &sd);
2110                        ocfs2_rw_unlock(inode, 1);
2111                }
2112                mutex_unlock(&inode->i_mutex);
2113        } while (ret > 0);
2114        splice_from_pipe_end(pipe, &sd);
2115
2116        if (pipe->inode)
2117                mutex_unlock(&pipe->inode->i_mutex);
2118
2119        if (sd.num_spliced)
2120                ret = sd.num_spliced;
2121
2122        if (ret > 0) {
2123                unsigned long nr_pages;
2124                int err;
2125
2126                nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2127
2128                err = generic_write_sync(out, *ppos, ret);
2129                if (err)
2130                        ret = err;
2131                else
2132                        *ppos += ret;
2133
2134                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2135        }
2136
2137        mlog_exit(ret);
2138        return ret;
2139}
2140
2141static ssize_t ocfs2_file_splice_read(struct file *in,
2142                                      loff_t *ppos,
2143                                      struct pipe_inode_info *pipe,
2144                                      size_t len,
2145                                      unsigned int flags)
2146{
2147        int ret = 0, lock_level = 0;
2148        struct inode *inode = in->f_path.dentry->d_inode;
2149
2150        mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
2151                   (unsigned int)len,
2152                   in->f_path.dentry->d_name.len,
2153                   in->f_path.dentry->d_name.name);
2154
2155        /*
2156         * See the comment in ocfs2_file_aio_read()
2157         */
2158        ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
2159        if (ret < 0) {
2160                mlog_errno(ret);
2161                goto bail;
2162        }
2163        ocfs2_inode_unlock(inode, lock_level);
2164
2165        ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2166
2167bail:
2168        mlog_exit(ret);
2169        return ret;
2170}
2171
2172static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2173                                   const struct iovec *iov,
2174                                   unsigned long nr_segs,
2175                                   loff_t pos)
2176{
2177        int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
2178        struct file *filp = iocb->ki_filp;
2179        struct inode *inode = filp->f_path.dentry->d_inode;
2180
2181        mlog_entry("(0x%p, %u, '%.*s')\n", filp,
2182                   (unsigned int)nr_segs,
2183                   filp->f_path.dentry->d_name.len,
2184                   filp->f_path.dentry->d_name.name);
2185
2186        if (!inode) {
2187                ret = -EINVAL;
2188                mlog_errno(ret);
2189                goto bail;
2190        }
2191
2192        /* 
2193         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
2194         * need locks to protect pending reads from racing with truncate.
2195         */
2196        if (filp->f_flags & O_DIRECT) {
2197                down_read(&inode->i_alloc_sem);
2198                have_alloc_sem = 1;
2199
2200                ret = ocfs2_rw_lock(inode, 0);
2201                if (ret < 0) {
2202                        mlog_errno(ret);
2203                        goto bail;
2204                }
2205                rw_level = 0;
2206                /* communicate with ocfs2_dio_end_io */
2207                ocfs2_iocb_set_rw_locked(iocb, rw_level);
2208        }
2209
2210        /*
2211         * We're fine letting folks race truncates and extending
2212         * writes with read across the cluster, just like they can
2213         * locally. Hence no rw_lock during read.
2214         * 
2215         * Take and drop the meta data lock to update inode fields
2216         * like i_size. This allows the checks down below
2217         * generic_file_aio_read() a chance of actually working. 
2218         */
2219        ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2220        if (ret < 0) {
2221                mlog_errno(ret);
2222                goto bail;
2223        }
2224        ocfs2_inode_unlock(inode, lock_level);
2225
2226        ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2227        if (ret == -EINVAL)
2228                mlog(0, "generic_file_aio_read returned -EINVAL\n");
2229
2230        /* buffered aio wouldn't have proper lock coverage today */
2231        BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
2232
2233        /* see ocfs2_file_aio_write */
2234        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
2235                rw_level = -1;
2236                have_alloc_sem = 0;
2237        }
2238
2239bail:
2240        if (have_alloc_sem)
2241                up_read(&inode->i_alloc_sem);
2242        if (rw_level != -1) 
2243                ocfs2_rw_unlock(inode, rw_level);
2244        mlog_exit(ret);
2245
2246        return ret;
2247}
2248
2249const struct inode_operations ocfs2_file_iops = {
2250        .setattr        = ocfs2_setattr,
2251        .getattr        = ocfs2_getattr,
2252        .permission     = ocfs2_permission,
2253        .setxattr       = generic_setxattr,
2254        .getxattr       = generic_getxattr,
2255        .listxattr      = ocfs2_listxattr,
2256        .removexattr    = generic_removexattr,
2257        .fallocate      = ocfs2_fallocate,
2258        .fiemap         = ocfs2_fiemap,
2259};
2260
2261const struct inode_operations ocfs2_special_file_iops = {
2262        .setattr        = ocfs2_setattr,
2263        .getattr        = ocfs2_getattr,
2264        .permission     = ocfs2_permission,
2265};
2266
2267/*
2268 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
2269 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2270 */
2271const struct file_operations ocfs2_fops = {
2272        .llseek         = generic_file_llseek,
2273        .read           = do_sync_read,
2274        .write          = do_sync_write,
2275        .mmap           = ocfs2_mmap,
2276        .fsync          = ocfs2_sync_file,
2277        .release        = ocfs2_file_release,
2278        .open           = ocfs2_file_open,
2279        .aio_read       = ocfs2_file_aio_read,
2280        .aio_write      = ocfs2_file_aio_write,
2281        .unlocked_ioctl = ocfs2_ioctl,
2282#ifdef CONFIG_COMPAT
2283        .compat_ioctl   = ocfs2_compat_ioctl,
2284#endif
2285        .lock           = ocfs2_lock,
2286        .flock          = ocfs2_flock,
2287        .splice_read    = ocfs2_file_splice_read,
2288        .splice_write   = ocfs2_file_splice_write,
2289};
2290
2291const struct file_operations ocfs2_dops = {
2292        .llseek         = generic_file_llseek,
2293        .read           = generic_read_dir,
2294        .readdir        = ocfs2_readdir,
2295        .fsync          = ocfs2_sync_file,
2296        .release        = ocfs2_dir_release,
2297        .open           = ocfs2_dir_open,
2298        .unlocked_ioctl = ocfs2_ioctl,
2299#ifdef CONFIG_COMPAT
2300        .compat_ioctl   = ocfs2_compat_ioctl,
2301#endif
2302        .lock           = ocfs2_lock,
2303        .flock          = ocfs2_flock,
2304};
2305
2306/*
2307 * POSIX-lockless variants of our file_operations.
2308 *
2309 * These will be used if the underlying cluster stack does not support
2310 * posix file locking, if the user passes the "localflocks" mount
2311 * option, or if we have a local-only fs.
2312 *
2313 * ocfs2_flock is in here because all stacks handle UNIX file locks,
2314 * so we still want it in the case of no stack support for
2315 * plocks. Internally, it will do the right thing when asked to ignore
2316 * the cluster.
2317 */
2318const struct file_operations ocfs2_fops_no_plocks = {
2319        .llseek         = generic_file_llseek,
2320        .read           = do_sync_read,
2321        .write          = do_sync_write,
2322        .mmap           = ocfs2_mmap,
2323        .fsync          = ocfs2_sync_file,
2324        .release        = ocfs2_file_release,
2325        .open           = ocfs2_file_open,
2326        .aio_read       = ocfs2_file_aio_read,
2327        .aio_write      = ocfs2_file_aio_write,
2328        .unlocked_ioctl = ocfs2_ioctl,
2329#ifdef CONFIG_COMPAT
2330        .compat_ioctl   = ocfs2_compat_ioctl,
2331#endif
2332        .flock          = ocfs2_flock,
2333        .splice_read    = ocfs2_file_splice_read,
2334        .splice_write   = ocfs2_file_splice_write,
2335};
2336
2337const struct file_operations ocfs2_dops_no_plocks = {
2338        .llseek         = generic_file_llseek,
2339        .read           = generic_read_dir,
2340        .readdir        = ocfs2_readdir,
2341        .fsync          = ocfs2_sync_file,
2342        .release        = ocfs2_dir_release,
2343        .open           = ocfs2_dir_open,
2344        .unlocked_ioctl = ocfs2_ioctl,
2345#ifdef CONFIG_COMPAT
2346        .compat_ioctl   = ocfs2_compat_ioctl,
2347#endif
2348        .flock          = ocfs2_flock,
2349};
2350