linux/drivers/staging/lustre/lustre/llite/file.c
<<
>>
Prefs
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 *
  36 * lustre/llite/file.c
  37 *
  38 * Author: Peter Braam <braam@clusterfs.com>
  39 * Author: Phil Schwan <phil@clusterfs.com>
  40 * Author: Andreas Dilger <adilger@clusterfs.com>
  41 */
  42
  43#define DEBUG_SUBSYSTEM S_LLITE
  44#include "../include/lustre_dlm.h"
  45#include "../include/lustre_lite.h"
  46#include <linux/pagemap.h>
  47#include <linux/file.h>
  48#include "llite_internal.h"
  49#include "../include/lustre/ll_fiemap.h"
  50
  51#include "../include/cl_object.h"
  52
  53static int
  54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  55
  56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  57                          bool *lease_broken);
  58
  59static enum llioc_iter
  60ll_iocontrol_call(struct inode *inode, struct file *file,
  61                  unsigned int cmd, unsigned long arg, int *rcp);
  62
  63static struct ll_file_data *ll_file_data_get(void)
  64{
  65        struct ll_file_data *fd;
  66
  67        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  68        if (fd == NULL)
  69                return NULL;
  70        fd->fd_write_failed = false;
  71        return fd;
  72}
  73
  74static void ll_file_data_put(struct ll_file_data *fd)
  75{
  76        if (fd != NULL)
  77                OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  78}
  79
  80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  81                          struct lustre_handle *fh)
  82{
  83        op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  84        op_data->op_attr.ia_mode = inode->i_mode;
  85        op_data->op_attr.ia_atime = inode->i_atime;
  86        op_data->op_attr.ia_mtime = inode->i_mtime;
  87        op_data->op_attr.ia_ctime = inode->i_ctime;
  88        op_data->op_attr.ia_size = i_size_read(inode);
  89        op_data->op_attr_blocks = inode->i_blocks;
  90        ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  91                                        ll_inode_to_ext_flags(inode->i_flags);
  92        op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  93        if (fh)
  94                op_data->op_handle = *fh;
  95        op_data->op_capa1 = ll_mdscapa_get(inode);
  96
  97        if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  98                op_data->op_bias |= MDS_DATA_MODIFIED;
  99}
 100
 101/**
 102 * Closes the IO epoch and packs all the attributes into @op_data for
 103 * the CLOSE rpc.
 104 */
 105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
 106                             struct obd_client_handle *och)
 107{
 108        op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 109                                        ATTR_MTIME | ATTR_MTIME_SET |
 110                                        ATTR_CTIME | ATTR_CTIME_SET;
 111
 112        if (!(och->och_flags & FMODE_WRITE))
 113                goto out;
 114
 115        if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 116                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 117        else
 118                ll_ioepoch_close(inode, op_data, &och, 0);
 119
 120out:
 121        ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 122        ll_prep_md_op_data(op_data, inode, NULL, NULL,
 123                           0, 0, LUSTRE_OPC_ANY, NULL);
 124}
 125
 126static int ll_close_inode_openhandle(struct obd_export *md_exp,
 127                                     struct inode *inode,
 128                                     struct obd_client_handle *och,
 129                                     const __u64 *data_version)
 130{
 131        struct obd_export *exp = ll_i2mdexp(inode);
 132        struct md_op_data *op_data;
 133        struct ptlrpc_request *req = NULL;
 134        struct obd_device *obd = class_exp2obd(exp);
 135        int epoch_close = 1;
 136        int rc;
 137
 138        if (obd == NULL) {
 139                /*
 140                 * XXX: in case of LMV, is this correct to access
 141                 * ->exp_handle?
 142                 */
 143                CERROR("Invalid MDC connection handle %#llx\n",
 144                       ll_i2mdexp(inode)->exp_handle.h_cookie);
 145                rc = 0;
 146                goto out;
 147        }
 148
 149        op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
 150        if (!op_data) {
 151                /* XXX We leak openhandle and request here. */
 152                rc = -ENOMEM;
 153                goto out;
 154        }
 155
 156        ll_prepare_close(inode, op_data, och);
 157        if (data_version != NULL) {
 158                /* Pass in data_version implies release. */
 159                op_data->op_bias |= MDS_HSM_RELEASE;
 160                op_data->op_data_version = *data_version;
 161                op_data->op_lease_handle = och->och_lease_handle;
 162                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 163        }
 164        epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 165        rc = md_close(md_exp, op_data, och->och_mod, &req);
 166        if (rc == -EAGAIN) {
 167                /* This close must have the epoch closed. */
 168                LASSERT(epoch_close);
 169                /* MDS has instructed us to obtain Size-on-MDS attribute from
 170                 * OSTs and send setattr to back to MDS. */
 171                rc = ll_som_update(inode, op_data);
 172                if (rc) {
 173                        CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
 174                               inode->i_ino, rc);
 175                        rc = 0;
 176                }
 177        } else if (rc) {
 178                CERROR("inode %lu mdc close failed: rc = %d\n",
 179                       inode->i_ino, rc);
 180        }
 181
 182        /* DATA_MODIFIED flag was successfully sent on close, cancel data
 183         * modification flag. */
 184        if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 185                struct ll_inode_info *lli = ll_i2info(inode);
 186
 187                spin_lock(&lli->lli_lock);
 188                lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 189                spin_unlock(&lli->lli_lock);
 190        }
 191
 192        if (rc == 0) {
 193                rc = ll_objects_destroy(req, inode);
 194                if (rc)
 195                        CERROR("inode %lu ll_objects destroy: rc = %d\n",
 196                               inode->i_ino, rc);
 197        }
 198        if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
 199                struct mdt_body *body;
 200                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 201                if (!(body->valid & OBD_MD_FLRELEASED))
 202                        rc = -EBUSY;
 203        }
 204
 205        ll_finish_md_op_data(op_data);
 206
 207out:
 208        if (exp_connect_som(exp) && !epoch_close &&
 209            S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 210                ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 211        } else {
 212                md_clear_open_replay_data(md_exp, och);
 213                /* Free @och if it is not waiting for DONE_WRITING. */
 214                och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 215                OBD_FREE_PTR(och);
 216        }
 217        if (req) /* This is close request */
 218                ptlrpc_req_finished(req);
 219        return rc;
 220}
 221
 222int ll_md_real_close(struct inode *inode, fmode_t fmode)
 223{
 224        struct ll_inode_info *lli = ll_i2info(inode);
 225        struct obd_client_handle **och_p;
 226        struct obd_client_handle *och;
 227        __u64 *och_usecount;
 228        int rc = 0;
 229
 230        if (fmode & FMODE_WRITE) {
 231                och_p = &lli->lli_mds_write_och;
 232                och_usecount = &lli->lli_open_fd_write_count;
 233        } else if (fmode & FMODE_EXEC) {
 234                och_p = &lli->lli_mds_exec_och;
 235                och_usecount = &lli->lli_open_fd_exec_count;
 236        } else {
 237                LASSERT(fmode & FMODE_READ);
 238                och_p = &lli->lli_mds_read_och;
 239                och_usecount = &lli->lli_open_fd_read_count;
 240        }
 241
 242        mutex_lock(&lli->lli_och_mutex);
 243        if (*och_usecount > 0) {
 244                /* There are still users of this handle, so skip
 245                 * freeing it. */
 246                mutex_unlock(&lli->lli_och_mutex);
 247                return 0;
 248        }
 249
 250        och = *och_p;
 251        *och_p = NULL;
 252        mutex_unlock(&lli->lli_och_mutex);
 253
 254        if (och != NULL) {
 255                /* There might be a race and this handle may already
 256                   be closed. */
 257                rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 258                                               inode, och, NULL);
 259        }
 260
 261        return rc;
 262}
 263
 264static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 265                       struct file *file)
 266{
 267        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 268        struct ll_inode_info *lli = ll_i2info(inode);
 269        int lockmode;
 270        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 271        struct lustre_handle lockh;
 272        ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 273        int rc = 0;
 274
 275        /* clear group lock, if present */
 276        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 277                ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 278
 279        if (fd->fd_lease_och != NULL) {
 280                bool lease_broken;
 281
 282                /* Usually the lease is not released when the
 283                 * application crashed, we need to release here. */
 284                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 285                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 286                        PFID(&lli->lli_fid), rc, lease_broken);
 287
 288                fd->fd_lease_och = NULL;
 289        }
 290
 291        if (fd->fd_och != NULL) {
 292                rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
 293                fd->fd_och = NULL;
 294                goto out;
 295        }
 296
 297        /* Let's see if we have good enough OPEN lock on the file and if
 298           we can skip talking to MDS */
 299
 300        mutex_lock(&lli->lli_och_mutex);
 301        if (fd->fd_omode & FMODE_WRITE) {
 302                lockmode = LCK_CW;
 303                LASSERT(lli->lli_open_fd_write_count);
 304                lli->lli_open_fd_write_count--;
 305        } else if (fd->fd_omode & FMODE_EXEC) {
 306                lockmode = LCK_PR;
 307                LASSERT(lli->lli_open_fd_exec_count);
 308                lli->lli_open_fd_exec_count--;
 309        } else {
 310                lockmode = LCK_CR;
 311                LASSERT(lli->lli_open_fd_read_count);
 312                lli->lli_open_fd_read_count--;
 313        }
 314        mutex_unlock(&lli->lli_och_mutex);
 315
 316        if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 317                           LDLM_IBITS, &policy, lockmode, &lockh))
 318                rc = ll_md_real_close(inode, fd->fd_omode);
 319
 320out:
 321        LUSTRE_FPRIVATE(file) = NULL;
 322        ll_file_data_put(fd);
 323        ll_capa_close(inode);
 324
 325        return rc;
 326}
 327
 328/* While this returns an error code, fput() the caller does not, so we need
 329 * to make every effort to clean up all of our state here.  Also, applications
 330 * rarely check close errors and even if an error is returned they will not
 331 * re-try the close call.
 332 */
 333int ll_file_release(struct inode *inode, struct file *file)
 334{
 335        struct ll_file_data *fd;
 336        struct ll_sb_info *sbi = ll_i2sbi(inode);
 337        struct ll_inode_info *lli = ll_i2info(inode);
 338        int rc;
 339
 340        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 341               inode->i_generation, inode);
 342
 343#ifdef CONFIG_FS_POSIX_ACL
 344        if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
 345                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 346
 347                LASSERT(fd != NULL);
 348                if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 349                        fd->fd_flags &= ~LL_FILE_RMTACL;
 350                        rct_del(&sbi->ll_rct, current_pid());
 351                        et_search_free(&sbi->ll_et, current_pid());
 352                }
 353        }
 354#endif
 355
 356        if (!is_root_inode(inode))
 357                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 358        fd = LUSTRE_FPRIVATE(file);
 359        LASSERT(fd != NULL);
 360
 361        /* The last ref on @file, maybe not the owner pid of statahead.
 362         * Different processes can open the same dir, "ll_opendir_key" means:
 363         * it is me that should stop the statahead thread. */
 364        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 365            lli->lli_opendir_pid != 0)
 366                ll_stop_statahead(inode, lli->lli_opendir_key);
 367
 368        if (is_root_inode(inode)) {
 369                LUSTRE_FPRIVATE(file) = NULL;
 370                ll_file_data_put(fd);
 371                return 0;
 372        }
 373
 374        if (!S_ISDIR(inode->i_mode)) {
 375                lov_read_and_clear_async_rc(lli->lli_clob);
 376                lli->lli_async_rc = 0;
 377        }
 378
 379        rc = ll_md_close(sbi->ll_md_exp, inode, file);
 380
 381        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 382                libcfs_debug_dumplog();
 383
 384        return rc;
 385}
 386
 387static int ll_intent_file_open(struct dentry *dentry, void *lmm,
 388                               int lmmsize, struct lookup_intent *itp)
 389{
 390        struct inode *inode = dentry->d_inode;
 391        struct ll_sb_info *sbi = ll_i2sbi(inode);
 392        struct dentry *parent = dentry->d_parent;
 393        const char *name = dentry->d_name.name;
 394        const int len = dentry->d_name.len;
 395        struct md_op_data *op_data;
 396        struct ptlrpc_request *req;
 397        __u32 opc = LUSTRE_OPC_ANY;
 398        int rc;
 399
 400        /* Usually we come here only for NFSD, and we want open lock.
 401           But we can also get here with pre 2.6.15 patchless kernels, and in
 402           that case that lock is also ok */
 403        /* We can also get here if there was cached open handle in revalidate_it
 404         * but it disappeared while we were getting from there to ll_file_open.
 405         * But this means this file was closed and immediately opened which
 406         * makes a good candidate for using OPEN lock */
 407        /* If lmmsize & lmm are not 0, we are just setting stripe info
 408         * parameters. No need for the open lock */
 409        if (lmm == NULL && lmmsize == 0) {
 410                itp->it_flags |= MDS_OPEN_LOCK;
 411                if (itp->it_flags & FMODE_WRITE)
 412                        opc = LUSTRE_OPC_CREATE;
 413        }
 414
 415        op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 416                                      inode, name, len,
 417                                      O_RDWR, opc, NULL);
 418        if (IS_ERR(op_data))
 419                return PTR_ERR(op_data);
 420
 421        itp->it_flags |= MDS_OPEN_BY_FID;
 422        rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 423                            0 /*unused */, &req, ll_md_blocking_ast, 0);
 424        ll_finish_md_op_data(op_data);
 425        if (rc == -ESTALE) {
 426                /* reason for keep own exit path - don`t flood log
 427                * with messages with -ESTALE errors.
 428                */
 429                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 430                     it_open_error(DISP_OPEN_OPEN, itp))
 431                        goto out;
 432                ll_release_openhandle(inode, itp);
 433                goto out;
 434        }
 435
 436        if (it_disposition(itp, DISP_LOOKUP_NEG)) {
 437                rc = -ENOENT;
 438                goto out;
 439        }
 440
 441        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 442                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 443                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 444                goto out;
 445        }
 446
 447        rc = ll_prep_inode(&inode, req, NULL, itp);
 448        if (!rc && itp->d.lustre.it_lock_mode)
 449                ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
 450
 451out:
 452        ptlrpc_req_finished(req);
 453        ll_intent_drop_lock(itp);
 454
 455        return rc;
 456}
 457
 458/**
 459 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 460 * not believe attributes if a few ioepoch holders exist. Attributes for
 461 * previous ioepoch if new one is opened are also skipped by MDS.
 462 */
 463void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 464{
 465        if (ioepoch && lli->lli_ioepoch != ioepoch) {
 466                lli->lli_ioepoch = ioepoch;
 467                CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
 468                       ioepoch, PFID(&lli->lli_fid));
 469        }
 470}
 471
 472static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 473                       struct obd_client_handle *och)
 474{
 475        struct ptlrpc_request *req = it->d.lustre.it_data;
 476        struct mdt_body *body;
 477
 478        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 479        och->och_fh = body->handle;
 480        och->och_fid = body->fid1;
 481        och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
 482        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 483        och->och_flags = it->it_flags;
 484
 485        return md_set_open_replay_data(md_exp, och, it);
 486}
 487
 488static int ll_local_open(struct file *file, struct lookup_intent *it,
 489                         struct ll_file_data *fd, struct obd_client_handle *och)
 490{
 491        struct inode *inode = file_inode(file);
 492        struct ll_inode_info *lli = ll_i2info(inode);
 493
 494        LASSERT(!LUSTRE_FPRIVATE(file));
 495
 496        LASSERT(fd != NULL);
 497
 498        if (och) {
 499                struct ptlrpc_request *req = it->d.lustre.it_data;
 500                struct mdt_body *body;
 501                int rc;
 502
 503                rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 504                if (rc != 0)
 505                        return rc;
 506
 507                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 508                ll_ioepoch_open(lli, body->ioepoch);
 509        }
 510
 511        LUSTRE_FPRIVATE(file) = fd;
 512        ll_readahead_init(inode, &fd->fd_ras);
 513        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 514        return 0;
 515}
 516
 517/* Open a file, and (for the very first open) create objects on the OSTs at
 518 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 519 * creation or open until ll_lov_setstripe() ioctl is called.
 520 *
 521 * If we already have the stripe MD locally then we don't request it in
 522 * md_open(), by passing a lmm_size = 0.
 523 *
 524 * It is up to the application to ensure no other processes open this file
 525 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 526 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 527 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 528 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 529 */
 530int ll_file_open(struct inode *inode, struct file *file)
 531{
 532        struct ll_inode_info *lli = ll_i2info(inode);
 533        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 534                                          .it_flags = file->f_flags };
 535        struct obd_client_handle **och_p = NULL;
 536        __u64 *och_usecount = NULL;
 537        struct ll_file_data *fd;
 538        int rc = 0, opendir_set = 0;
 539
 540        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 541               inode->i_generation, inode, file->f_flags);
 542
 543        it = file->private_data; /* XXX: compat macro */
 544        file->private_data = NULL; /* prevent ll_local_open assertion */
 545
 546        fd = ll_file_data_get();
 547        if (fd == NULL) {
 548                rc = -ENOMEM;
 549                goto out_openerr;
 550        }
 551
 552        fd->fd_file = file;
 553        if (S_ISDIR(inode->i_mode)) {
 554                spin_lock(&lli->lli_sa_lock);
 555                if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
 556                    lli->lli_opendir_pid == 0) {
 557                        lli->lli_opendir_key = fd;
 558                        lli->lli_opendir_pid = current_pid();
 559                        opendir_set = 1;
 560                }
 561                spin_unlock(&lli->lli_sa_lock);
 562        }
 563
 564        if (is_root_inode(inode)) {
 565                LUSTRE_FPRIVATE(file) = fd;
 566                return 0;
 567        }
 568
 569        if (!it || !it->d.lustre.it_disposition) {
 570                /* Convert f_flags into access mode. We cannot use file->f_mode,
 571                 * because everything but O_ACCMODE mask was stripped from
 572                 * there */
 573                if ((oit.it_flags + 1) & O_ACCMODE)
 574                        oit.it_flags++;
 575                if (file->f_flags & O_TRUNC)
 576                        oit.it_flags |= FMODE_WRITE;
 577
 578                /* kernel only call f_op->open in dentry_open.  filp_open calls
 579                 * dentry_open after call to open_namei that checks permissions.
 580                 * Only nfsd_open call dentry_open directly without checking
 581                 * permissions and because of that this code below is safe. */
 582                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 583                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 584
 585                /* We do not want O_EXCL here, presumably we opened the file
 586                 * already? XXX - NFS implications? */
 587                oit.it_flags &= ~O_EXCL;
 588
 589                /* bug20584, if "it_flags" contains O_CREAT, the file will be
 590                 * created if necessary, then "IT_CREAT" should be set to keep
 591                 * consistent with it */
 592                if (oit.it_flags & O_CREAT)
 593                        oit.it_op |= IT_CREAT;
 594
 595                it = &oit;
 596        }
 597
 598restart:
 599        /* Let's see if we have file open on MDS already. */
 600        if (it->it_flags & FMODE_WRITE) {
 601                och_p = &lli->lli_mds_write_och;
 602                och_usecount = &lli->lli_open_fd_write_count;
 603        } else if (it->it_flags & FMODE_EXEC) {
 604                och_p = &lli->lli_mds_exec_och;
 605                och_usecount = &lli->lli_open_fd_exec_count;
 606         } else {
 607                och_p = &lli->lli_mds_read_och;
 608                och_usecount = &lli->lli_open_fd_read_count;
 609        }
 610
 611        mutex_lock(&lli->lli_och_mutex);
 612        if (*och_p) { /* Open handle is present */
 613                if (it_disposition(it, DISP_OPEN_OPEN)) {
 614                        /* Well, there's extra open request that we do not need,
 615                           let's close it somehow. This will decref request. */
 616                        rc = it_open_error(DISP_OPEN_OPEN, it);
 617                        if (rc) {
 618                                mutex_unlock(&lli->lli_och_mutex);
 619                                goto out_openerr;
 620                        }
 621
 622                        ll_release_openhandle(inode, it);
 623                }
 624                (*och_usecount)++;
 625
 626                rc = ll_local_open(file, it, fd, NULL);
 627                if (rc) {
 628                        (*och_usecount)--;
 629                        mutex_unlock(&lli->lli_och_mutex);
 630                        goto out_openerr;
 631                }
 632        } else {
 633                LASSERT(*och_usecount == 0);
 634                if (!it->d.lustre.it_disposition) {
 635                        /* We cannot just request lock handle now, new ELC code
 636                           means that one of other OPEN locks for this file
 637                           could be cancelled, and since blocking ast handler
 638                           would attempt to grab och_mutex as well, that would
 639                           result in a deadlock */
 640                        mutex_unlock(&lli->lli_och_mutex);
 641                        it->it_create_mode |= M_CHECK_STALE;
 642                        rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
 643                        it->it_create_mode &= ~M_CHECK_STALE;
 644                        if (rc)
 645                                goto out_openerr;
 646
 647                        goto restart;
 648                }
 649                *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
 650                if (!*och_p) {
 651                        rc = -ENOMEM;
 652                        goto out_och_free;
 653                }
 654
 655                (*och_usecount)++;
 656
 657                /* md_intent_lock() didn't get a request ref if there was an
 658                 * open error, so don't do cleanup on the request here
 659                 * (bug 3430) */
 660                /* XXX (green): Should not we bail out on any error here, not
 661                 * just open error? */
 662                rc = it_open_error(DISP_OPEN_OPEN, it);
 663                if (rc)
 664                        goto out_och_free;
 665
 666                LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 667
 668                rc = ll_local_open(file, it, fd, *och_p);
 669                if (rc)
 670                        goto out_och_free;
 671        }
 672        mutex_unlock(&lli->lli_och_mutex);
 673        fd = NULL;
 674
 675        /* Must do this outside lli_och_mutex lock to prevent deadlock where
 676           different kind of OPEN lock for this same inode gets cancelled
 677           by ldlm_cancel_lru */
 678        if (!S_ISREG(inode->i_mode))
 679                goto out_och_free;
 680
 681        ll_capa_open(inode);
 682
 683        if (!lli->lli_has_smd &&
 684            (cl_is_lov_delay_create(file->f_flags) ||
 685             (file->f_mode & FMODE_WRITE) == 0)) {
 686                CDEBUG(D_INODE, "object creation was delayed\n");
 687                goto out_och_free;
 688        }
 689        cl_lov_delay_create_clear(&file->f_flags);
 690        goto out_och_free;
 691
 692out_och_free:
 693        if (rc) {
 694                if (och_p && *och_p) {
 695                        OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 696                        *och_p = NULL; /* OBD_FREE writes some magic there */
 697                        (*och_usecount)--;
 698                }
 699                mutex_unlock(&lli->lli_och_mutex);
 700
 701out_openerr:
 702                if (opendir_set != 0)
 703                        ll_stop_statahead(inode, lli->lli_opendir_key);
 704                if (fd != NULL)
 705                        ll_file_data_put(fd);
 706        } else {
 707                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 708        }
 709
 710        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 711                ptlrpc_req_finished(it->d.lustre.it_data);
 712                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 713        }
 714
 715        return rc;
 716}
 717
 718static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 719                        struct ldlm_lock_desc *desc, void *data, int flag)
 720{
 721        int rc;
 722        struct lustre_handle lockh;
 723
 724        switch (flag) {
 725        case LDLM_CB_BLOCKING:
 726                ldlm_lock2handle(lock, &lockh);
 727                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 728                if (rc < 0) {
 729                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 730                        return rc;
 731                }
 732                break;
 733        case LDLM_CB_CANCELING:
 734                /* do nothing */
 735                break;
 736        }
 737        return 0;
 738}
 739
 740/**
 741 * Acquire a lease and open the file.
 742 */
 743static struct obd_client_handle *
 744ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 745              __u64 open_flags)
 746{
 747        struct lookup_intent it = { .it_op = IT_OPEN };
 748        struct ll_sb_info *sbi = ll_i2sbi(inode);
 749        struct md_op_data *op_data;
 750        struct ptlrpc_request *req;
 751        struct lustre_handle old_handle = { 0 };
 752        struct obd_client_handle *och = NULL;
 753        int rc;
 754        int rc2;
 755
 756        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 757                return ERR_PTR(-EINVAL);
 758
 759        if (file != NULL) {
 760                struct ll_inode_info *lli = ll_i2info(inode);
 761                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 762                struct obd_client_handle **och_p;
 763                __u64 *och_usecount;
 764
 765                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 766                        return ERR_PTR(-EPERM);
 767
 768                /* Get the openhandle of the file */
 769                rc = -EBUSY;
 770                mutex_lock(&lli->lli_och_mutex);
 771                if (fd->fd_lease_och != NULL) {
 772                        mutex_unlock(&lli->lli_och_mutex);
 773                        return ERR_PTR(rc);
 774                }
 775
 776                if (fd->fd_och == NULL) {
 777                        if (file->f_mode & FMODE_WRITE) {
 778                                LASSERT(lli->lli_mds_write_och != NULL);
 779                                och_p = &lli->lli_mds_write_och;
 780                                och_usecount = &lli->lli_open_fd_write_count;
 781                        } else {
 782                                LASSERT(lli->lli_mds_read_och != NULL);
 783                                och_p = &lli->lli_mds_read_och;
 784                                och_usecount = &lli->lli_open_fd_read_count;
 785                        }
 786                        if (*och_usecount == 1) {
 787                                fd->fd_och = *och_p;
 788                                *och_p = NULL;
 789                                *och_usecount = 0;
 790                                rc = 0;
 791                        }
 792                }
 793                mutex_unlock(&lli->lli_och_mutex);
 794                if (rc < 0) /* more than 1 opener */
 795                        return ERR_PTR(rc);
 796
 797                LASSERT(fd->fd_och != NULL);
 798                old_handle = fd->fd_och->och_fh;
 799        }
 800
 801        och = kzalloc(sizeof(*och), GFP_NOFS);
 802        if (!och)
 803                return ERR_PTR(-ENOMEM);
 804
 805        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 806                                        LUSTRE_OPC_ANY, NULL);
 807        if (IS_ERR(op_data)) {
 808                rc = PTR_ERR(op_data);
 809                goto out;
 810        }
 811
 812        /* To tell the MDT this openhandle is from the same owner */
 813        op_data->op_handle = old_handle;
 814
 815        it.it_flags = fmode | open_flags;
 816        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 817        rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
 818                                ll_md_blocking_lease_ast,
 819        /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 820         * it can be cancelled which may mislead applications that the lease is
 821         * broken;
 822         * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 823         * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 824         * doesn't deal with openhandle, so normal openhandle will be leaked. */
 825                                LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 826        ll_finish_md_op_data(op_data);
 827        ptlrpc_req_finished(req);
 828        if (rc < 0)
 829                goto out_release_it;
 830
 831        if (it_disposition(&it, DISP_LOOKUP_NEG)) {
 832                rc = -ENOENT;
 833                goto out_release_it;
 834        }
 835
 836        rc = it_open_error(DISP_OPEN_OPEN, &it);
 837        if (rc)
 838                goto out_release_it;
 839
 840        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 841        ll_och_fill(sbi->ll_md_exp, &it, och);
 842
 843        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
 844                rc = -EOPNOTSUPP;
 845                goto out_close;
 846        }
 847
 848        /* already get lease, handle lease lock */
 849        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 850        if (it.d.lustre.it_lock_mode == 0 ||
 851            it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
 852                /* open lock must return for lease */
 853                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 854                        PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
 855                        it.d.lustre.it_lock_bits);
 856                rc = -EPROTO;
 857                goto out_close;
 858        }
 859
 860        ll_intent_release(&it);
 861        return och;
 862
 863out_close:
 864        rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
 865        if (rc2)
 866                CERROR("Close openhandle returned %d\n", rc2);
 867
 868        /* cancel open lock */
 869        if (it.d.lustre.it_lock_mode != 0) {
 870                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 871                                                it.d.lustre.it_lock_mode);
 872                it.d.lustre.it_lock_mode = 0;
 873        }
 874out_release_it:
 875        ll_intent_release(&it);
 876out:
 877        OBD_FREE_PTR(och);
 878        return ERR_PTR(rc);
 879}
 880
 881/**
 882 * Release lease and close the file.
 883 * It will check if the lease has ever broken.
 884 */
 885static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 886                          bool *lease_broken)
 887{
 888        struct ldlm_lock *lock;
 889        bool cancelled = true;
 890        int rc;
 891
 892        lock = ldlm_handle2lock(&och->och_lease_handle);
 893        if (lock != NULL) {
 894                lock_res_and_lock(lock);
 895                cancelled = ldlm_is_cancel(lock);
 896                unlock_res_and_lock(lock);
 897                ldlm_lock_put(lock);
 898        }
 899
 900        CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
 901                PFID(&ll_i2info(inode)->lli_fid), cancelled);
 902
 903        if (!cancelled)
 904                ldlm_cli_cancel(&och->och_lease_handle, 0);
 905        if (lease_broken != NULL)
 906                *lease_broken = cancelled;
 907
 908        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
 909                                       NULL);
 910        return rc;
 911}
 912
 913/* Fills the obdo with the attributes for the lsm */
 914static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 915                          struct obd_capa *capa, struct obdo *obdo,
 916                          __u64 ioepoch, int sync)
 917{
 918        struct ptlrpc_request_set *set;
 919        struct obd_info     oinfo = { { { 0 } } };
 920        int                     rc;
 921
 922        LASSERT(lsm != NULL);
 923
 924        oinfo.oi_md = lsm;
 925        oinfo.oi_oa = obdo;
 926        oinfo.oi_oa->o_oi = lsm->lsm_oi;
 927        oinfo.oi_oa->o_mode = S_IFREG;
 928        oinfo.oi_oa->o_ioepoch = ioepoch;
 929        oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 930                               OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 931                               OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 932                               OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 933                               OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 934                               OBD_MD_FLDATAVERSION;
 935        oinfo.oi_capa = capa;
 936        if (sync) {
 937                oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 938                oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 939        }
 940
 941        set = ptlrpc_prep_set();
 942        if (set == NULL) {
 943                CERROR("can't allocate ptlrpc set\n");
 944                rc = -ENOMEM;
 945        } else {
 946                rc = obd_getattr_async(exp, &oinfo, set);
 947                if (rc == 0)
 948                        rc = ptlrpc_set_wait(set);
 949                ptlrpc_set_destroy(set);
 950        }
 951        if (rc == 0)
 952                oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 953                                         OBD_MD_FLATIME | OBD_MD_FLMTIME |
 954                                         OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 955                                         OBD_MD_FLDATAVERSION);
 956        return rc;
 957}
 958
 959/**
 960  * Performs the getattr on the inode and updates its fields.
 961  * If @sync != 0, perform the getattr under the server-side lock.
 962  */
 963int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 964                     __u64 ioepoch, int sync)
 965{
 966        struct obd_capa      *capa = ll_mdscapa_get(inode);
 967        struct lov_stripe_md *lsm;
 968        int rc;
 969
 970        lsm = ccc_inode_lsm_get(inode);
 971        rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 972                            capa, obdo, ioepoch, sync);
 973        capa_put(capa);
 974        if (rc == 0) {
 975                struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
 976
 977                obdo_refresh_inode(inode, obdo, obdo->o_valid);
 978                CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
 979                       POSTID(oi), i_size_read(inode),
 980                       (unsigned long long)inode->i_blocks,
 981                       1UL << inode->i_blkbits);
 982        }
 983        ccc_inode_lsm_put(inode, lsm);
 984        return rc;
 985}
 986
 987int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 988{
 989        struct ll_inode_info *lli = ll_i2info(inode);
 990        struct cl_object *obj = lli->lli_clob;
 991        struct cl_attr *attr = ccc_env_thread_attr(env);
 992        struct ost_lvb lvb;
 993        int rc = 0;
 994
 995        ll_inode_size_lock(inode);
 996        /* merge timestamps the most recently obtained from mds with
 997           timestamps obtained from osts */
 998        LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 999        LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1000        LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1001
1002        lvb.lvb_size = i_size_read(inode);
1003        lvb.lvb_blocks = inode->i_blocks;
1004        lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1005        lvb.lvb_atime = LTIME_S(inode->i_atime);
1006        lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1007
1008        cl_object_attr_lock(obj);
1009        rc = cl_object_attr_get(env, obj, attr);
1010        cl_object_attr_unlock(obj);
1011
1012        if (rc == 0) {
1013                if (lvb.lvb_atime < attr->cat_atime)
1014                        lvb.lvb_atime = attr->cat_atime;
1015                if (lvb.lvb_ctime < attr->cat_ctime)
1016                        lvb.lvb_ctime = attr->cat_ctime;
1017                if (lvb.lvb_mtime < attr->cat_mtime)
1018                        lvb.lvb_mtime = attr->cat_mtime;
1019
1020                CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1021                                PFID(&lli->lli_fid), attr->cat_size);
1022                cl_isize_write_nolock(inode, attr->cat_size);
1023
1024                inode->i_blocks = attr->cat_blocks;
1025
1026                LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1027                LTIME_S(inode->i_atime) = lvb.lvb_atime;
1028                LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1029        }
1030        ll_inode_size_unlock(inode);
1031
1032        return rc;
1033}
1034
1035int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1036                     lstat_t *st)
1037{
1038        struct obdo obdo = { 0 };
1039        int rc;
1040
1041        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1042        if (rc == 0) {
1043                st->st_size   = obdo.o_size;
1044                st->st_blocks = obdo.o_blocks;
1045                st->st_mtime  = obdo.o_mtime;
1046                st->st_atime  = obdo.o_atime;
1047                st->st_ctime  = obdo.o_ctime;
1048        }
1049        return rc;
1050}
1051
1052static bool file_is_noatime(const struct file *file)
1053{
1054        const struct vfsmount *mnt = file->f_path.mnt;
1055        const struct inode *inode = file_inode(file);
1056
1057        /* Adapted from file_accessed() and touch_atime().*/
1058        if (file->f_flags & O_NOATIME)
1059                return true;
1060
1061        if (inode->i_flags & S_NOATIME)
1062                return true;
1063
1064        if (IS_NOATIME(inode))
1065                return true;
1066
1067        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1068                return true;
1069
1070        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1071                return true;
1072
1073        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1074                return true;
1075
1076        return false;
1077}
1078
1079void ll_io_init(struct cl_io *io, const struct file *file, int write)
1080{
1081        struct inode *inode = file_inode(file);
1082
1083        io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1084        if (write) {
1085                io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1086                io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1087                                      file->f_flags & O_DIRECT ||
1088                                      IS_SYNC(inode);
1089        }
1090        io->ci_obj     = ll_i2info(inode)->lli_clob;
1091        io->ci_lockreq = CILR_MAYBE;
1092        if (ll_file_nolock(file)) {
1093                io->ci_lockreq = CILR_NEVER;
1094                io->ci_no_srvlock = 1;
1095        } else if (file->f_flags & O_APPEND) {
1096                io->ci_lockreq = CILR_MANDATORY;
1097        }
1098
1099        io->ci_noatime = file_is_noatime(file);
1100}
1101
1102static ssize_t
1103ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1104                   struct file *file, enum cl_io_type iot,
1105                   loff_t *ppos, size_t count)
1106{
1107        struct ll_inode_info *lli = ll_i2info(file_inode(file));
1108        struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1109        struct cl_io     *io;
1110        ssize_t        result;
1111
1112restart:
1113        io = ccc_env_thread_io(env);
1114        ll_io_init(io, file, iot == CIT_WRITE);
1115
1116        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1117                struct vvp_io *vio = vvp_env_io(env);
1118                struct ccc_io *cio = ccc_env_io(env);
1119                int write_mutex_locked = 0;
1120
1121                cio->cui_fd  = LUSTRE_FPRIVATE(file);
1122                vio->cui_io_subtype = args->via_io_subtype;
1123
1124                switch (vio->cui_io_subtype) {
1125                case IO_NORMAL:
1126                        cio->cui_iter = args->u.normal.via_iter;
1127                        cio->cui_iocb = args->u.normal.via_iocb;
1128                        if ((iot == CIT_WRITE) &&
1129                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1130                                if (mutex_lock_interruptible(&lli->
1131                                                               lli_write_mutex)) {
1132                                        result = -ERESTARTSYS;
1133                                        goto out;
1134                                }
1135                                write_mutex_locked = 1;
1136                        } else if (iot == CIT_READ) {
1137                                down_read(&lli->lli_trunc_sem);
1138                        }
1139                        break;
1140                case IO_SPLICE:
1141                        vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1142                        vio->u.splice.cui_flags = args->u.splice.via_flags;
1143                        break;
1144                default:
1145                        CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1146                        LBUG();
1147                }
1148                result = cl_io_loop(env, io);
1149                if (write_mutex_locked)
1150                        mutex_unlock(&lli->lli_write_mutex);
1151                else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1152                        up_read(&lli->lli_trunc_sem);
1153        } else {
1154                /* cl_io_rw_init() handled IO */
1155                result = io->ci_result;
1156        }
1157
1158        if (io->ci_nob > 0) {
1159                result = io->ci_nob;
1160                *ppos = io->u.ci_wr.wr.crw_pos;
1161        }
1162        goto out;
1163out:
1164        cl_io_fini(env, io);
1165        /* If any bit been read/written (result != 0), we just return
1166         * short read/write instead of restart io. */
1167        if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1168                CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1169                       iot == CIT_READ ? "read" : "write",
1170                       file, *ppos, count);
1171                LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1172                goto restart;
1173        }
1174
1175        if (iot == CIT_READ) {
1176                if (result >= 0)
1177                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1178                                           LPROC_LL_READ_BYTES, result);
1179        } else if (iot == CIT_WRITE) {
1180                if (result >= 0) {
1181                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1182                                           LPROC_LL_WRITE_BYTES, result);
1183                        fd->fd_write_failed = false;
1184                } else if (result != -ERESTARTSYS) {
1185                        fd->fd_write_failed = true;
1186                }
1187        }
1188
1189        return result;
1190}
1191
1192static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1193{
1194        struct lu_env      *env;
1195        struct vvp_io_args *args;
1196        ssize_t      result;
1197        int              refcheck;
1198
1199        env = cl_env_get(&refcheck);
1200        if (IS_ERR(env))
1201                return PTR_ERR(env);
1202
1203        args = vvp_env_args(env, IO_NORMAL);
1204        args->u.normal.via_iter = to;
1205        args->u.normal.via_iocb = iocb;
1206
1207        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1208                                    &iocb->ki_pos, iov_iter_count(to));
1209        cl_env_put(env, &refcheck);
1210        return result;
1211}
1212
1213/*
1214 * Write to a file (through the page cache).
1215 */
1216static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1217{
1218        struct lu_env      *env;
1219        struct vvp_io_args *args;
1220        ssize_t      result;
1221        int              refcheck;
1222
1223        env = cl_env_get(&refcheck);
1224        if (IS_ERR(env))
1225                return PTR_ERR(env);
1226
1227        args = vvp_env_args(env, IO_NORMAL);
1228        args->u.normal.via_iter = from;
1229        args->u.normal.via_iocb = iocb;
1230
1231        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1232                                  &iocb->ki_pos, iov_iter_count(from));
1233        cl_env_put(env, &refcheck);
1234        return result;
1235}
1236
1237/*
1238 * Send file content (through pagecache) somewhere with helper
1239 */
1240static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1241                                   struct pipe_inode_info *pipe, size_t count,
1242                                   unsigned int flags)
1243{
1244        struct lu_env      *env;
1245        struct vvp_io_args *args;
1246        ssize_t      result;
1247        int              refcheck;
1248
1249        env = cl_env_get(&refcheck);
1250        if (IS_ERR(env))
1251                return PTR_ERR(env);
1252
1253        args = vvp_env_args(env, IO_SPLICE);
1254        args->u.splice.via_pipe = pipe;
1255        args->u.splice.via_flags = flags;
1256
1257        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1258        cl_env_put(env, &refcheck);
1259        return result;
1260}
1261
1262static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1263{
1264        struct obd_export *exp = ll_i2dtexp(inode);
1265        struct obd_trans_info oti = { 0 };
1266        struct obdo *oa = NULL;
1267        int lsm_size;
1268        int rc = 0;
1269        struct lov_stripe_md *lsm = NULL, *lsm2;
1270
1271        OBDO_ALLOC(oa);
1272        if (oa == NULL)
1273                return -ENOMEM;
1274
1275        lsm = ccc_inode_lsm_get(inode);
1276        if (!lsm_has_objects(lsm)) {
1277                rc = -ENOENT;
1278                goto out;
1279        }
1280
1281        lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1282                   (lsm->lsm_stripe_count));
1283
1284        OBD_ALLOC_LARGE(lsm2, lsm_size);
1285        if (lsm2 == NULL) {
1286                rc = -ENOMEM;
1287                goto out;
1288        }
1289
1290        oa->o_oi = *oi;
1291        oa->o_nlink = ost_idx;
1292        oa->o_flags |= OBD_FL_RECREATE_OBJS;
1293        oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1294        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1295                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1296        obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1297        memcpy(lsm2, lsm, lsm_size);
1298        ll_inode_size_lock(inode);
1299        rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1300        ll_inode_size_unlock(inode);
1301
1302        OBD_FREE_LARGE(lsm2, lsm_size);
1303        goto out;
1304out:
1305        ccc_inode_lsm_put(inode, lsm);
1306        OBDO_FREE(oa);
1307        return rc;
1308}
1309
1310static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1311{
1312        struct ll_recreate_obj ucreat;
1313        struct ost_id           oi;
1314
1315        if (!capable(CFS_CAP_SYS_ADMIN))
1316                return -EPERM;
1317
1318        if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1319                           sizeof(ucreat)))
1320                return -EFAULT;
1321
1322        ostid_set_seq_mdt0(&oi);
1323        ostid_set_id(&oi, ucreat.lrc_id);
1324        return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1325}
1326
1327static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1328{
1329        struct lu_fid   fid;
1330        struct ost_id   oi;
1331        u32             ost_idx;
1332
1333        if (!capable(CFS_CAP_SYS_ADMIN))
1334                return -EPERM;
1335
1336        if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1337                return -EFAULT;
1338
1339        fid_to_ostid(&fid, &oi);
1340        ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1341        return ll_lov_recreate(inode, &oi, ost_idx);
1342}
1343
1344int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1345                             int flags, struct lov_user_md *lum, int lum_size)
1346{
1347        struct lov_stripe_md *lsm = NULL;
1348        struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1349        int rc = 0;
1350
1351        lsm = ccc_inode_lsm_get(inode);
1352        if (lsm != NULL) {
1353                ccc_inode_lsm_put(inode, lsm);
1354                CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1355                       inode->i_ino);
1356                rc = -EEXIST;
1357                goto out;
1358        }
1359
1360        ll_inode_size_lock(inode);
1361        rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1362        if (rc)
1363                goto out_unlock;
1364        rc = oit.d.lustre.it_status;
1365        if (rc < 0)
1366                goto out_req_free;
1367
1368        ll_release_openhandle(inode, &oit);
1369
1370out_unlock:
1371        ll_inode_size_unlock(inode);
1372        ll_intent_release(&oit);
1373        ccc_inode_lsm_put(inode, lsm);
1374out:
1375        return rc;
1376out_req_free:
1377        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1378        goto out;
1379}
1380
1381int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1382                             struct lov_mds_md **lmmp, int *lmm_size,
1383                             struct ptlrpc_request **request)
1384{
1385        struct ll_sb_info *sbi = ll_i2sbi(inode);
1386        struct mdt_body  *body;
1387        struct lov_mds_md *lmm = NULL;
1388        struct ptlrpc_request *req = NULL;
1389        struct md_op_data *op_data;
1390        int rc, lmmsize;
1391
1392        rc = ll_get_default_mdsize(sbi, &lmmsize);
1393        if (rc)
1394                return rc;
1395
1396        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1397                                     strlen(filename), lmmsize,
1398                                     LUSTRE_OPC_ANY, NULL);
1399        if (IS_ERR(op_data))
1400                return PTR_ERR(op_data);
1401
1402        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1403        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1404        ll_finish_md_op_data(op_data);
1405        if (rc < 0) {
1406                CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1407                       filename, rc);
1408                goto out;
1409        }
1410
1411        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1412        LASSERT(body != NULL); /* checked by mdc_getattr_name */
1413
1414        lmmsize = body->eadatasize;
1415
1416        if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1417                        lmmsize == 0) {
1418                rc = -ENODATA;
1419                goto out;
1420        }
1421
1422        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1423        LASSERT(lmm != NULL);
1424
1425        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1426            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1427                rc = -EPROTO;
1428                goto out;
1429        }
1430
1431        /*
1432         * This is coming from the MDS, so is probably in
1433         * little endian.  We convert it to host endian before
1434         * passing it to userspace.
1435         */
1436        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1437                int stripe_count;
1438
1439                stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440                if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1441                        stripe_count = 0;
1442
1443                /* if function called for directory - we should
1444                 * avoid swab not existent lsm objects */
1445                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447                        if (S_ISREG(body->mode))
1448                                lustre_swab_lov_user_md_objects(
1449                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1450                                 stripe_count);
1451                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453                        if (S_ISREG(body->mode))
1454                                lustre_swab_lov_user_md_objects(
1455                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1456                                 stripe_count);
1457                }
1458        }
1459
1460out:
1461        *lmmp = lmm;
1462        *lmm_size = lmmsize;
1463        *request = req;
1464        return rc;
1465}
1466
1467static int ll_lov_setea(struct inode *inode, struct file *file,
1468                            unsigned long arg)
1469{
1470        int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471        struct lov_user_md      *lump;
1472        int                      lum_size = sizeof(struct lov_user_md) +
1473                                            sizeof(struct lov_user_ost_data);
1474        int                      rc;
1475
1476        if (!capable(CFS_CAP_SYS_ADMIN))
1477                return -EPERM;
1478
1479        OBD_ALLOC_LARGE(lump, lum_size);
1480        if (lump == NULL)
1481                return -ENOMEM;
1482
1483        if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1484                OBD_FREE_LARGE(lump, lum_size);
1485                return -EFAULT;
1486        }
1487
1488        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1489                                     lum_size);
1490        cl_lov_delay_create_clear(&file->f_flags);
1491
1492        OBD_FREE_LARGE(lump, lum_size);
1493        return rc;
1494}
1495
1496static int ll_lov_setstripe(struct inode *inode, struct file *file,
1497                            unsigned long arg)
1498{
1499        struct lov_user_md_v3    lumv3;
1500        struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1501        struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1502        struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1503        int                      lum_size, rc;
1504        int                      flags = FMODE_WRITE;
1505
1506        /* first try with v1 which is smaller than v3 */
1507        lum_size = sizeof(struct lov_user_md_v1);
1508        if (copy_from_user(lumv1, lumv1p, lum_size))
1509                return -EFAULT;
1510
1511        if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512                lum_size = sizeof(struct lov_user_md_v3);
1513                if (copy_from_user(&lumv3, lumv3p, lum_size))
1514                        return -EFAULT;
1515        }
1516
1517        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1518                                      lum_size);
1519        cl_lov_delay_create_clear(&file->f_flags);
1520        if (rc == 0) {
1521                struct lov_stripe_md *lsm;
1522                __u32 gen;
1523
1524                put_user(0, &lumv1p->lmm_stripe_count);
1525
1526                ll_layout_refresh(inode, &gen);
1527                lsm = ccc_inode_lsm_get(inode);
1528                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1529                                   0, lsm, (void *)arg);
1530                ccc_inode_lsm_put(inode, lsm);
1531        }
1532        return rc;
1533}
1534
1535static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1536{
1537        struct lov_stripe_md *lsm;
1538        int rc = -ENODATA;
1539
1540        lsm = ccc_inode_lsm_get(inode);
1541        if (lsm != NULL)
1542                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1543                                   lsm, (void *)arg);
1544        ccc_inode_lsm_put(inode, lsm);
1545        return rc;
1546}
1547
1548static int
1549ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1550{
1551        struct ll_inode_info   *lli = ll_i2info(inode);
1552        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1553        struct ccc_grouplock    grouplock;
1554        int                  rc;
1555
1556        if (arg == 0) {
1557                CWARN("group id for group lock must not be 0\n");
1558                return -EINVAL;
1559        }
1560
1561        if (ll_file_nolock(file))
1562                return -EOPNOTSUPP;
1563
1564        spin_lock(&lli->lli_lock);
1565        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1566                CWARN("group lock already existed with gid %lu\n",
1567                      fd->fd_grouplock.cg_gid);
1568                spin_unlock(&lli->lli_lock);
1569                return -EINVAL;
1570        }
1571        LASSERT(fd->fd_grouplock.cg_lock == NULL);
1572        spin_unlock(&lli->lli_lock);
1573
1574        rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1575                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
1576        if (rc)
1577                return rc;
1578
1579        spin_lock(&lli->lli_lock);
1580        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1581                spin_unlock(&lli->lli_lock);
1582                CERROR("another thread just won the race\n");
1583                cl_put_grouplock(&grouplock);
1584                return -EINVAL;
1585        }
1586
1587        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1588        fd->fd_grouplock = grouplock;
1589        spin_unlock(&lli->lli_lock);
1590
1591        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1592        return 0;
1593}
1594
1595static int ll_put_grouplock(struct inode *inode, struct file *file,
1596                            unsigned long arg)
1597{
1598        struct ll_inode_info   *lli = ll_i2info(inode);
1599        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1600        struct ccc_grouplock    grouplock;
1601
1602        spin_lock(&lli->lli_lock);
1603        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1604                spin_unlock(&lli->lli_lock);
1605                CWARN("no group lock held\n");
1606                return -EINVAL;
1607        }
1608        LASSERT(fd->fd_grouplock.cg_lock != NULL);
1609
1610        if (fd->fd_grouplock.cg_gid != arg) {
1611                CWARN("group lock %lu doesn't match current id %lu\n",
1612                       arg, fd->fd_grouplock.cg_gid);
1613                spin_unlock(&lli->lli_lock);
1614                return -EINVAL;
1615        }
1616
1617        grouplock = fd->fd_grouplock;
1618        memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1619        fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1620        spin_unlock(&lli->lli_lock);
1621
1622        cl_put_grouplock(&grouplock);
1623        CDEBUG(D_INFO, "group lock %lu released\n", arg);
1624        return 0;
1625}
1626
1627/**
1628 * Close inode open handle
1629 *
1630 * \param inode  [in]     inode in question
1631 * \param it     [in,out] intent which contains open info and result
1632 *
1633 * \retval 0     success
1634 * \retval <0    failure
1635 */
1636int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1637{
1638        struct obd_client_handle *och;
1639        int rc;
1640
1641        LASSERT(inode);
1642
1643        /* Root ? Do nothing. */
1644        if (is_root_inode(inode))
1645                return 0;
1646
1647        /* No open handle to close? Move away */
1648        if (!it_disposition(it, DISP_OPEN_OPEN))
1649                return 0;
1650
1651        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1652
1653        och = kzalloc(sizeof(*och), GFP_NOFS);
1654        if (!och) {
1655                rc = -ENOMEM;
1656                goto out;
1657        }
1658
1659        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1660
1661        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1662                                       inode, och, NULL);
1663out:
1664        /* this one is in place of ll_file_open */
1665        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1666                ptlrpc_req_finished(it->d.lustre.it_data);
1667                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1668        }
1669        return rc;
1670}
1671
1672/**
1673 * Get size for inode for which FIEMAP mapping is requested.
1674 * Make the FIEMAP get_info call and returns the result.
1675 */
1676static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1677                        size_t num_bytes)
1678{
1679        struct obd_export *exp = ll_i2dtexp(inode);
1680        struct lov_stripe_md *lsm = NULL;
1681        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1682        __u32 vallen = num_bytes;
1683        int rc;
1684
1685        /* Checks for fiemap flags */
1686        if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1687                fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1688                return -EBADR;
1689        }
1690
1691        /* Check for FIEMAP_FLAG_SYNC */
1692        if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1693                rc = filemap_fdatawrite(inode->i_mapping);
1694                if (rc)
1695                        return rc;
1696        }
1697
1698        lsm = ccc_inode_lsm_get(inode);
1699        if (lsm == NULL)
1700                return -ENOENT;
1701
1702        /* If the stripe_count > 1 and the application does not understand
1703         * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1704         */
1705        if (lsm->lsm_stripe_count > 1 &&
1706            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1707                rc = -EOPNOTSUPP;
1708                goto out;
1709        }
1710
1711        fm_key.oa.o_oi = lsm->lsm_oi;
1712        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1713
1714        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1715        obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1716        /* If filesize is 0, then there would be no objects for mapping */
1717        if (fm_key.oa.o_size == 0) {
1718                fiemap->fm_mapped_extents = 0;
1719                rc = 0;
1720                goto out;
1721        }
1722
1723        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1724
1725        rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1726                          fiemap, lsm);
1727        if (rc)
1728                CERROR("obd_get_info failed: rc = %d\n", rc);
1729
1730out:
1731        ccc_inode_lsm_put(inode, lsm);
1732        return rc;
1733}
1734
1735int ll_fid2path(struct inode *inode, void __user *arg)
1736{
1737        struct obd_export *exp = ll_i2mdexp(inode);
1738        const struct getinfo_fid2path __user *gfin = arg;
1739        struct getinfo_fid2path *gfout;
1740        u32 pathlen;
1741        size_t outsize;
1742        int rc;
1743
1744        if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1745            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1746                return -EPERM;
1747
1748        /* Only need to get the buflen */
1749        if (get_user(pathlen, &gfin->gf_pathlen))
1750                return -EFAULT;
1751
1752        if (pathlen > PATH_MAX)
1753                return -EINVAL;
1754
1755        outsize = sizeof(*gfout) + pathlen;
1756
1757        gfout = kzalloc(outsize, GFP_NOFS);
1758        if (!gfout)
1759                return -ENOMEM;
1760
1761        if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1762                rc = -EFAULT;
1763                goto gf_free;
1764        }
1765
1766        /* Call mdc_iocontrol */
1767        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1768        if (rc != 0)
1769                goto gf_free;
1770
1771        if (copy_to_user(arg, gfout, outsize))
1772                rc = -EFAULT;
1773
1774gf_free:
1775        OBD_FREE(gfout, outsize);
1776        return rc;
1777}
1778
1779static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1780{
1781        struct ll_user_fiemap *fiemap_s;
1782        size_t num_bytes, ret_bytes;
1783        unsigned int extent_count;
1784        int rc = 0;
1785
1786        /* Get the extent count so we can calculate the size of
1787         * required fiemap buffer */
1788        if (get_user(extent_count,
1789            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1790                return -EFAULT;
1791
1792        if (extent_count >=
1793            (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1794                return -EINVAL;
1795        num_bytes = sizeof(*fiemap_s) + (extent_count *
1796                                         sizeof(struct ll_fiemap_extent));
1797
1798        OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1799        if (fiemap_s == NULL)
1800                return -ENOMEM;
1801
1802        /* get the fiemap value */
1803        if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1804                           sizeof(*fiemap_s))) {
1805                rc = -EFAULT;
1806                goto error;
1807        }
1808
1809        /* If fm_extent_count is non-zero, read the first extent since
1810         * it is used to calculate end_offset and device from previous
1811         * fiemap call. */
1812        if (extent_count) {
1813                if (copy_from_user(&fiemap_s->fm_extents[0],
1814                    (char __user *)arg + sizeof(*fiemap_s),
1815                    sizeof(struct ll_fiemap_extent))) {
1816                        rc = -EFAULT;
1817                        goto error;
1818                }
1819        }
1820
1821        rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1822        if (rc)
1823                goto error;
1824
1825        ret_bytes = sizeof(struct ll_user_fiemap);
1826
1827        if (extent_count != 0)
1828                ret_bytes += (fiemap_s->fm_mapped_extents *
1829                                 sizeof(struct ll_fiemap_extent));
1830
1831        if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1832                rc = -EFAULT;
1833
1834error:
1835        OBD_FREE_LARGE(fiemap_s, num_bytes);
1836        return rc;
1837}
1838
1839/*
1840 * Read the data_version for inode.
1841 *
1842 * This value is computed using stripe object version on OST.
1843 * Version is computed using server side locking.
1844 *
1845 * @param extent_lock  Take extent lock. Not needed if a process is already
1846 *                     holding the OST object group locks.
1847 */
1848int ll_data_version(struct inode *inode, __u64 *data_version,
1849                    int extent_lock)
1850{
1851        struct lov_stripe_md    *lsm = NULL;
1852        struct ll_sb_info       *sbi = ll_i2sbi(inode);
1853        struct obdo             *obdo = NULL;
1854        int                      rc;
1855
1856        /* If no stripe, we consider version is 0. */
1857        lsm = ccc_inode_lsm_get(inode);
1858        if (!lsm_has_objects(lsm)) {
1859                *data_version = 0;
1860                CDEBUG(D_INODE, "No object for inode\n");
1861                rc = 0;
1862                goto out;
1863        }
1864
1865        obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1866        if (!obdo) {
1867                rc = -ENOMEM;
1868                goto out;
1869        }
1870
1871        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1872        if (rc == 0) {
1873                if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1874                        rc = -EOPNOTSUPP;
1875                else
1876                        *data_version = obdo->o_data_version;
1877        }
1878
1879        OBD_FREE_PTR(obdo);
1880out:
1881        ccc_inode_lsm_put(inode, lsm);
1882        return rc;
1883}
1884
1885/*
1886 * Trigger a HSM release request for the provided inode.
1887 */
1888int ll_hsm_release(struct inode *inode)
1889{
1890        struct cl_env_nest nest;
1891        struct lu_env *env;
1892        struct obd_client_handle *och = NULL;
1893        __u64 data_version = 0;
1894        int rc;
1895
1896
1897        CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1898               ll_get_fsname(inode->i_sb, NULL, 0),
1899               PFID(&ll_i2info(inode)->lli_fid));
1900
1901        och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1902        if (IS_ERR(och)) {
1903                rc = PTR_ERR(och);
1904                goto out;
1905        }
1906
1907        /* Grab latest data_version and [am]time values */
1908        rc = ll_data_version(inode, &data_version, 1);
1909        if (rc != 0)
1910                goto out;
1911
1912        env = cl_env_nested_get(&nest);
1913        if (IS_ERR(env)) {
1914                rc = PTR_ERR(env);
1915                goto out;
1916        }
1917
1918        ll_merge_lvb(env, inode);
1919        cl_env_nested_put(&nest, env);
1920
1921        /* Release the file.
1922         * NB: lease lock handle is released in mdc_hsm_release_pack() because
1923         * we still need it to pack l_remote_handle to MDT. */
1924        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1925                                       &data_version);
1926        och = NULL;
1927
1928
1929out:
1930        if (och != NULL && !IS_ERR(och)) /* close the file */
1931                ll_lease_close(och, inode, NULL);
1932
1933        return rc;
1934}
1935
1936struct ll_swap_stack {
1937        struct iattr             ia1, ia2;
1938        __u64                    dv1, dv2;
1939        struct inode            *inode1, *inode2;
1940        bool                     check_dv1, check_dv2;
1941};
1942
1943static int ll_swap_layouts(struct file *file1, struct file *file2,
1944                           struct lustre_swap_layouts *lsl)
1945{
1946        struct mdc_swap_layouts  msl;
1947        struct md_op_data       *op_data;
1948        __u32                    gid;
1949        __u64                    dv;
1950        struct ll_swap_stack    *llss = NULL;
1951        int                      rc;
1952
1953        llss = kzalloc(sizeof(*llss), GFP_NOFS);
1954        if (!llss)
1955                return -ENOMEM;
1956
1957        llss->inode1 = file_inode(file1);
1958        llss->inode2 = file_inode(file2);
1959
1960        if (!S_ISREG(llss->inode2->i_mode)) {
1961                rc = -EINVAL;
1962                goto free;
1963        }
1964
1965        if (inode_permission(llss->inode1, MAY_WRITE) ||
1966            inode_permission(llss->inode2, MAY_WRITE)) {
1967                rc = -EPERM;
1968                goto free;
1969        }
1970
1971        if (llss->inode2->i_sb != llss->inode1->i_sb) {
1972                rc = -EXDEV;
1973                goto free;
1974        }
1975
1976        /* we use 2 bool because it is easier to swap than 2 bits */
1977        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1978                llss->check_dv1 = true;
1979
1980        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1981                llss->check_dv2 = true;
1982
1983        /* we cannot use lsl->sl_dvX directly because we may swap them */
1984        llss->dv1 = lsl->sl_dv1;
1985        llss->dv2 = lsl->sl_dv2;
1986
1987        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1988        if (rc == 0) /* same file, done! */ {
1989                rc = 0;
1990                goto free;
1991        }
1992
1993        if (rc < 0) { /* sequentialize it */
1994                swap(llss->inode1, llss->inode2);
1995                swap(file1, file2);
1996                swap(llss->dv1, llss->dv2);
1997                swap(llss->check_dv1, llss->check_dv2);
1998        }
1999
2000        gid = lsl->sl_gid;
2001        if (gid != 0) { /* application asks to flush dirty cache */
2002                rc = ll_get_grouplock(llss->inode1, file1, gid);
2003                if (rc < 0)
2004                        goto free;
2005
2006                rc = ll_get_grouplock(llss->inode2, file2, gid);
2007                if (rc < 0) {
2008                        ll_put_grouplock(llss->inode1, file1, gid);
2009                        goto free;
2010                }
2011        }
2012
2013        /* to be able to restore mtime and atime after swap
2014         * we need to first save them */
2015        if (lsl->sl_flags &
2016            (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2017                llss->ia1.ia_mtime = llss->inode1->i_mtime;
2018                llss->ia1.ia_atime = llss->inode1->i_atime;
2019                llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2020                llss->ia2.ia_mtime = llss->inode2->i_mtime;
2021                llss->ia2.ia_atime = llss->inode2->i_atime;
2022                llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2023        }
2024
2025        /* ultimate check, before swapping the layouts we check if
2026         * dataversion has changed (if requested) */
2027        if (llss->check_dv1) {
2028                rc = ll_data_version(llss->inode1, &dv, 0);
2029                if (rc)
2030                        goto putgl;
2031                if (dv != llss->dv1) {
2032                        rc = -EAGAIN;
2033                        goto putgl;
2034                }
2035        }
2036
2037        if (llss->check_dv2) {
2038                rc = ll_data_version(llss->inode2, &dv, 0);
2039                if (rc)
2040                        goto putgl;
2041                if (dv != llss->dv2) {
2042                        rc = -EAGAIN;
2043                        goto putgl;
2044                }
2045        }
2046
2047        /* struct md_op_data is used to send the swap args to the mdt
2048         * only flags is missing, so we use struct mdc_swap_layouts
2049         * through the md_op_data->op_data */
2050        /* flags from user space have to be converted before they are send to
2051         * server, no flag is sent today, they are only used on the client */
2052        msl.msl_flags = 0;
2053        rc = -ENOMEM;
2054        op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2055                                     0, LUSTRE_OPC_ANY, &msl);
2056        if (IS_ERR(op_data)) {
2057                rc = PTR_ERR(op_data);
2058                goto free;
2059        }
2060
2061        rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2062                           sizeof(*op_data), op_data, NULL);
2063        ll_finish_md_op_data(op_data);
2064
2065putgl:
2066        if (gid != 0) {
2067                ll_put_grouplock(llss->inode2, file2, gid);
2068                ll_put_grouplock(llss->inode1, file1, gid);
2069        }
2070
2071        /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2072        if (rc != 0)
2073                goto free;
2074
2075        /* clear useless flags */
2076        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2077                llss->ia1.ia_valid &= ~ATTR_MTIME;
2078                llss->ia2.ia_valid &= ~ATTR_MTIME;
2079        }
2080
2081        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2082                llss->ia1.ia_valid &= ~ATTR_ATIME;
2083                llss->ia2.ia_valid &= ~ATTR_ATIME;
2084        }
2085
2086        /* update time if requested */
2087        rc = 0;
2088        if (llss->ia2.ia_valid != 0) {
2089                mutex_lock(&llss->inode1->i_mutex);
2090                rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2091                mutex_unlock(&llss->inode1->i_mutex);
2092        }
2093
2094        if (llss->ia1.ia_valid != 0) {
2095                int rc1;
2096
2097                mutex_lock(&llss->inode2->i_mutex);
2098                rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2099                mutex_unlock(&llss->inode2->i_mutex);
2100                if (rc == 0)
2101                        rc = rc1;
2102        }
2103
2104free:
2105        if (llss != NULL)
2106                OBD_FREE_PTR(llss);
2107
2108        return rc;
2109}
2110
2111static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2112{
2113        struct md_op_data       *op_data;
2114        int                      rc;
2115
2116        /* Non-root users are forbidden to set or clear flags which are
2117         * NOT defined in HSM_USER_MASK. */
2118        if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2119            !capable(CFS_CAP_SYS_ADMIN))
2120                return -EPERM;
2121
2122        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2123                                     LUSTRE_OPC_ANY, hss);
2124        if (IS_ERR(op_data))
2125                return PTR_ERR(op_data);
2126
2127        rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2128                           sizeof(*op_data), op_data, NULL);
2129
2130        ll_finish_md_op_data(op_data);
2131
2132        return rc;
2133}
2134
2135static int ll_hsm_import(struct inode *inode, struct file *file,
2136                         struct hsm_user_import *hui)
2137{
2138        struct hsm_state_set    *hss = NULL;
2139        struct iattr            *attr = NULL;
2140        int                      rc;
2141
2142
2143        if (!S_ISREG(inode->i_mode))
2144                return -EINVAL;
2145
2146        /* set HSM flags */
2147        hss = kzalloc(sizeof(*hss), GFP_NOFS);
2148        if (!hss) {
2149                rc = -ENOMEM;
2150                goto out;
2151        }
2152
2153        hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2154        hss->hss_archive_id = hui->hui_archive_id;
2155        hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2156        rc = ll_hsm_state_set(inode, hss);
2157        if (rc != 0)
2158                goto out;
2159
2160        attr = kzalloc(sizeof(*attr), GFP_NOFS);
2161        if (!attr) {
2162                rc = -ENOMEM;
2163                goto out;
2164        }
2165
2166        attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2167        attr->ia_mode |= S_IFREG;
2168        attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2169        attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2170        attr->ia_size = hui->hui_size;
2171        attr->ia_mtime.tv_sec = hui->hui_mtime;
2172        attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2173        attr->ia_atime.tv_sec = hui->hui_atime;
2174        attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2175
2176        attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2177                         ATTR_UID | ATTR_GID |
2178                         ATTR_MTIME | ATTR_MTIME_SET |
2179                         ATTR_ATIME | ATTR_ATIME_SET;
2180
2181        mutex_lock(&inode->i_mutex);
2182
2183        rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2184        if (rc == -ENODATA)
2185                rc = 0;
2186
2187        mutex_unlock(&inode->i_mutex);
2188
2189out:
2190        if (hss != NULL)
2191                OBD_FREE_PTR(hss);
2192
2193        if (attr != NULL)
2194                OBD_FREE_PTR(attr);
2195
2196        return rc;
2197}
2198
2199static long
2200ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2201{
2202        struct inode            *inode = file_inode(file);
2203        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2204        int                      flags, rc;
2205
2206        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2207               inode->i_generation, inode, cmd);
2208        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2209
2210        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2211        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2212                return -ENOTTY;
2213
2214        switch (cmd) {
2215        case LL_IOC_GETFLAGS:
2216                /* Get the current value of the file flags */
2217                return put_user(fd->fd_flags, (int *)arg);
2218        case LL_IOC_SETFLAGS:
2219        case LL_IOC_CLRFLAGS:
2220                /* Set or clear specific file flags */
2221                /* XXX This probably needs checks to ensure the flags are
2222                 *     not abused, and to handle any flag side effects.
2223                 */
2224                if (get_user(flags, (int *) arg))
2225                        return -EFAULT;
2226
2227                if (cmd == LL_IOC_SETFLAGS) {
2228                        if ((flags & LL_FILE_IGNORE_LOCK) &&
2229                            !(file->f_flags & O_DIRECT)) {
2230                                CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2231                                       current->comm);
2232                                return -EINVAL;
2233                        }
2234
2235                        fd->fd_flags |= flags;
2236                } else {
2237                        fd->fd_flags &= ~flags;
2238                }
2239                return 0;
2240        case LL_IOC_LOV_SETSTRIPE:
2241                return ll_lov_setstripe(inode, file, arg);
2242        case LL_IOC_LOV_SETEA:
2243                return ll_lov_setea(inode, file, arg);
2244        case LL_IOC_LOV_SWAP_LAYOUTS: {
2245                struct file *file2;
2246                struct lustre_swap_layouts lsl;
2247
2248                if (copy_from_user(&lsl, (char *)arg,
2249                                       sizeof(struct lustre_swap_layouts)))
2250                        return -EFAULT;
2251
2252                if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2253                        return -EPERM;
2254
2255                file2 = fget(lsl.sl_fd);
2256                if (file2 == NULL)
2257                        return -EBADF;
2258
2259                rc = -EPERM;
2260                if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2261                        rc = ll_swap_layouts(file, file2, &lsl);
2262                fput(file2);
2263                return rc;
2264        }
2265        case LL_IOC_LOV_GETSTRIPE:
2266                return ll_lov_getstripe(inode, arg);
2267        case LL_IOC_RECREATE_OBJ:
2268                return ll_lov_recreate_obj(inode, arg);
2269        case LL_IOC_RECREATE_FID:
2270                return ll_lov_recreate_fid(inode, arg);
2271        case FSFILT_IOC_FIEMAP:
2272                return ll_ioctl_fiemap(inode, arg);
2273        case FSFILT_IOC_GETFLAGS:
2274        case FSFILT_IOC_SETFLAGS:
2275                return ll_iocontrol(inode, file, cmd, arg);
2276        case FSFILT_IOC_GETVERSION_OLD:
2277        case FSFILT_IOC_GETVERSION:
2278                return put_user(inode->i_generation, (int *)arg);
2279        case LL_IOC_GROUP_LOCK:
2280                return ll_get_grouplock(inode, file, arg);
2281        case LL_IOC_GROUP_UNLOCK:
2282                return ll_put_grouplock(inode, file, arg);
2283        case IOC_OBD_STATFS:
2284                return ll_obd_statfs(inode, (void *)arg);
2285
2286        /* We need to special case any other ioctls we want to handle,
2287         * to send them to the MDS/OST as appropriate and to properly
2288         * network encode the arg field.
2289        case FSFILT_IOC_SETVERSION_OLD:
2290        case FSFILT_IOC_SETVERSION:
2291        */
2292        case LL_IOC_FLUSHCTX:
2293                return ll_flush_ctx(inode);
2294        case LL_IOC_PATH2FID: {
2295                if (copy_to_user((void *)arg, ll_inode2fid(inode),
2296                                 sizeof(struct lu_fid)))
2297                        return -EFAULT;
2298
2299                return 0;
2300        }
2301        case OBD_IOC_FID2PATH:
2302                return ll_fid2path(inode, (void *)arg);
2303        case LL_IOC_DATA_VERSION: {
2304                struct ioc_data_version idv;
2305                int                     rc;
2306
2307                if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2308                        return -EFAULT;
2309
2310                rc = ll_data_version(inode, &idv.idv_version,
2311                                !(idv.idv_flags & LL_DV_NOFLUSH));
2312
2313                if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2314                        return -EFAULT;
2315
2316                return rc;
2317        }
2318
2319        case LL_IOC_GET_MDTIDX: {
2320                int mdtidx;
2321
2322                mdtidx = ll_get_mdt_idx(inode);
2323                if (mdtidx < 0)
2324                        return mdtidx;
2325
2326                if (put_user((int)mdtidx, (int *)arg))
2327                        return -EFAULT;
2328
2329                return 0;
2330        }
2331        case OBD_IOC_GETDTNAME:
2332        case OBD_IOC_GETMDNAME:
2333                return ll_get_obd_name(inode, cmd, arg);
2334        case LL_IOC_HSM_STATE_GET: {
2335                struct md_op_data       *op_data;
2336                struct hsm_user_state   *hus;
2337                int                      rc;
2338
2339                hus = kzalloc(sizeof(*hus), GFP_NOFS);
2340                if (!hus)
2341                        return -ENOMEM;
2342
2343                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2344                                             LUSTRE_OPC_ANY, hus);
2345                if (IS_ERR(op_data)) {
2346                        OBD_FREE_PTR(hus);
2347                        return PTR_ERR(op_data);
2348                }
2349
2350                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2351                                   op_data, NULL);
2352
2353                if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2354                        rc = -EFAULT;
2355
2356                ll_finish_md_op_data(op_data);
2357                OBD_FREE_PTR(hus);
2358                return rc;
2359        }
2360        case LL_IOC_HSM_STATE_SET: {
2361                struct hsm_state_set    *hss;
2362                int                      rc;
2363
2364                hss = kzalloc(sizeof(*hss), GFP_NOFS);
2365                if (!hss)
2366                        return -ENOMEM;
2367
2368                if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2369                        OBD_FREE_PTR(hss);
2370                        return -EFAULT;
2371                }
2372
2373                rc = ll_hsm_state_set(inode, hss);
2374
2375                OBD_FREE_PTR(hss);
2376                return rc;
2377        }
2378        case LL_IOC_HSM_ACTION: {
2379                struct md_op_data               *op_data;
2380                struct hsm_current_action       *hca;
2381                int                              rc;
2382
2383                hca = kzalloc(sizeof(*hca), GFP_NOFS);
2384                if (!hca)
2385                        return -ENOMEM;
2386
2387                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2388                                             LUSTRE_OPC_ANY, hca);
2389                if (IS_ERR(op_data)) {
2390                        OBD_FREE_PTR(hca);
2391                        return PTR_ERR(op_data);
2392                }
2393
2394                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2395                                   op_data, NULL);
2396
2397                if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2398                        rc = -EFAULT;
2399
2400                ll_finish_md_op_data(op_data);
2401                OBD_FREE_PTR(hca);
2402                return rc;
2403        }
2404        case LL_IOC_SET_LEASE: {
2405                struct ll_inode_info *lli = ll_i2info(inode);
2406                struct obd_client_handle *och = NULL;
2407                bool lease_broken;
2408                fmode_t mode = 0;
2409
2410                switch (arg) {
2411                case F_WRLCK:
2412                        if (!(file->f_mode & FMODE_WRITE))
2413                                return -EPERM;
2414                        mode = FMODE_WRITE;
2415                        break;
2416                case F_RDLCK:
2417                        if (!(file->f_mode & FMODE_READ))
2418                                return -EPERM;
2419                        mode = FMODE_READ;
2420                        break;
2421                case F_UNLCK:
2422                        mutex_lock(&lli->lli_och_mutex);
2423                        if (fd->fd_lease_och != NULL) {
2424                                och = fd->fd_lease_och;
2425                                fd->fd_lease_och = NULL;
2426                        }
2427                        mutex_unlock(&lli->lli_och_mutex);
2428
2429                        if (och != NULL) {
2430                                mode = och->och_flags &
2431                                       (FMODE_READ|FMODE_WRITE);
2432                                rc = ll_lease_close(och, inode, &lease_broken);
2433                                if (rc == 0 && lease_broken)
2434                                        mode = 0;
2435                        } else {
2436                                rc = -ENOLCK;
2437                        }
2438
2439                        /* return the type of lease or error */
2440                        return rc < 0 ? rc : (int)mode;
2441                default:
2442                        return -EINVAL;
2443                }
2444
2445                CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2446
2447                /* apply for lease */
2448                och = ll_lease_open(inode, file, mode, 0);
2449                if (IS_ERR(och))
2450                        return PTR_ERR(och);
2451
2452                rc = 0;
2453                mutex_lock(&lli->lli_och_mutex);
2454                if (fd->fd_lease_och == NULL) {
2455                        fd->fd_lease_och = och;
2456                        och = NULL;
2457                }
2458                mutex_unlock(&lli->lli_och_mutex);
2459                if (och != NULL) {
2460                        /* impossible now that only excl is supported for now */
2461                        ll_lease_close(och, inode, &lease_broken);
2462                        rc = -EBUSY;
2463                }
2464                return rc;
2465        }
2466        case LL_IOC_GET_LEASE: {
2467                struct ll_inode_info *lli = ll_i2info(inode);
2468                struct ldlm_lock *lock = NULL;
2469
2470                rc = 0;
2471                mutex_lock(&lli->lli_och_mutex);
2472                if (fd->fd_lease_och != NULL) {
2473                        struct obd_client_handle *och = fd->fd_lease_och;
2474
2475                        lock = ldlm_handle2lock(&och->och_lease_handle);
2476                        if (lock != NULL) {
2477                                lock_res_and_lock(lock);
2478                                if (!ldlm_is_cancel(lock))
2479                                        rc = och->och_flags &
2480                                                (FMODE_READ | FMODE_WRITE);
2481                                unlock_res_and_lock(lock);
2482                                ldlm_lock_put(lock);
2483                        }
2484                }
2485                mutex_unlock(&lli->lli_och_mutex);
2486                return rc;
2487        }
2488        case LL_IOC_HSM_IMPORT: {
2489                struct hsm_user_import *hui;
2490
2491                hui = kzalloc(sizeof(*hui), GFP_NOFS);
2492                if (!hui)
2493                        return -ENOMEM;
2494
2495                if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2496                        OBD_FREE_PTR(hui);
2497                        return -EFAULT;
2498                }
2499
2500                rc = ll_hsm_import(inode, file, hui);
2501
2502                OBD_FREE_PTR(hui);
2503                return rc;
2504        }
2505        default: {
2506                int err;
2507
2508                if (LLIOC_STOP ==
2509                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2510                        return err;
2511
2512                return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2513                                     (void *)arg);
2514        }
2515        }
2516}
2517
2518
2519static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2520{
2521        struct inode *inode = file_inode(file);
2522        loff_t retval, eof = 0;
2523
2524        retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2525                           (origin == SEEK_CUR) ? file->f_pos : 0);
2526        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2527               inode->i_ino, inode->i_generation, inode, retval, retval,
2528               origin);
2529        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2530
2531        if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2532                retval = ll_glimpse_size(inode);
2533                if (retval != 0)
2534                        return retval;
2535                eof = i_size_read(inode);
2536        }
2537
2538        retval = generic_file_llseek_size(file, offset, origin,
2539                                          ll_file_maxbytes(inode), eof);
2540        return retval;
2541}
2542
2543static int ll_flush(struct file *file, fl_owner_t id)
2544{
2545        struct inode *inode = file_inode(file);
2546        struct ll_inode_info *lli = ll_i2info(inode);
2547        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2548        int rc, err;
2549
2550        LASSERT(!S_ISDIR(inode->i_mode));
2551
2552        /* catch async errors that were recorded back when async writeback
2553         * failed for pages in this mapping. */
2554        rc = lli->lli_async_rc;
2555        lli->lli_async_rc = 0;
2556        err = lov_read_and_clear_async_rc(lli->lli_clob);
2557        if (rc == 0)
2558                rc = err;
2559
2560        /* The application has been told write failure already.
2561         * Do not report failure again. */
2562        if (fd->fd_write_failed)
2563                return 0;
2564        return rc ? -EIO : 0;
2565}
2566
2567/**
2568 * Called to make sure a portion of file has been written out.
2569 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2570 *
2571 * Return how many pages have been written.
2572 */
2573int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2574                       enum cl_fsync_mode mode, int ignore_layout)
2575{
2576        struct cl_env_nest nest;
2577        struct lu_env *env;
2578        struct cl_io *io;
2579        struct obd_capa *capa = NULL;
2580        struct cl_fsync_io *fio;
2581        int result;
2582
2583        if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2584            mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2585                return -EINVAL;
2586
2587        env = cl_env_nested_get(&nest);
2588        if (IS_ERR(env))
2589                return PTR_ERR(env);
2590
2591        capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2592
2593        io = ccc_env_thread_io(env);
2594        io->ci_obj = cl_i2info(inode)->lli_clob;
2595        io->ci_ignore_layout = ignore_layout;
2596
2597        /* initialize parameters for sync */
2598        fio = &io->u.ci_fsync;
2599        fio->fi_capa = capa;
2600        fio->fi_start = start;
2601        fio->fi_end = end;
2602        fio->fi_fid = ll_inode2fid(inode);
2603        fio->fi_mode = mode;
2604        fio->fi_nr_written = 0;
2605
2606        if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2607                result = cl_io_loop(env, io);
2608        else
2609                result = io->ci_result;
2610        if (result == 0)
2611                result = fio->fi_nr_written;
2612        cl_io_fini(env, io);
2613        cl_env_nested_put(&nest, env);
2614
2615        capa_put(capa);
2616
2617        return result;
2618}
2619
2620int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2621{
2622        struct inode *inode = file_inode(file);
2623        struct ll_inode_info *lli = ll_i2info(inode);
2624        struct ptlrpc_request *req;
2625        struct obd_capa *oc;
2626        int rc, err;
2627
2628        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2629               inode->i_generation, inode);
2630        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2631
2632        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2633        mutex_lock(&inode->i_mutex);
2634
2635        /* catch async errors that were recorded back when async writeback
2636         * failed for pages in this mapping. */
2637        if (!S_ISDIR(inode->i_mode)) {
2638                err = lli->lli_async_rc;
2639                lli->lli_async_rc = 0;
2640                if (rc == 0)
2641                        rc = err;
2642                err = lov_read_and_clear_async_rc(lli->lli_clob);
2643                if (rc == 0)
2644                        rc = err;
2645        }
2646
2647        oc = ll_mdscapa_get(inode);
2648        err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2649                      &req);
2650        capa_put(oc);
2651        if (!rc)
2652                rc = err;
2653        if (!err)
2654                ptlrpc_req_finished(req);
2655
2656        if (S_ISREG(inode->i_mode)) {
2657                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2658
2659                err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2660                if (rc == 0 && err < 0)
2661                        rc = err;
2662                if (rc < 0)
2663                        fd->fd_write_failed = true;
2664                else
2665                        fd->fd_write_failed = false;
2666        }
2667
2668        mutex_unlock(&inode->i_mutex);
2669        return rc;
2670}
2671
2672static int
2673ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2674{
2675        struct inode *inode = file_inode(file);
2676        struct ll_sb_info *sbi = ll_i2sbi(inode);
2677        struct ldlm_enqueue_info einfo = {
2678                .ei_type        = LDLM_FLOCK,
2679                .ei_cb_cp       = ldlm_flock_completion_ast,
2680                .ei_cbdata      = file_lock,
2681        };
2682        struct md_op_data *op_data;
2683        struct lustre_handle lockh = {0};
2684        ldlm_policy_data_t flock = {{0}};
2685        __u64 flags = 0;
2686        int rc;
2687        int rc2 = 0;
2688
2689        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2690               inode->i_ino, file_lock);
2691
2692        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2693
2694        if (file_lock->fl_flags & FL_FLOCK)
2695                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2696        else if (!(file_lock->fl_flags & FL_POSIX))
2697                return -EINVAL;
2698
2699        flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2700        flock.l_flock.pid = file_lock->fl_pid;
2701        flock.l_flock.start = file_lock->fl_start;
2702        flock.l_flock.end = file_lock->fl_end;
2703
2704        /* Somewhat ugly workaround for svc lockd.
2705         * lockd installs custom fl_lmops->lm_compare_owner that checks
2706         * for the fl_owner to be the same (which it always is on local node
2707         * I guess between lockd processes) and then compares pid.
2708         * As such we assign pid to the owner field to make it all work,
2709         * conflict with normal locks is unlikely since pid space and
2710         * pointer space for current->files are not intersecting */
2711        if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2712                flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2713
2714        switch (file_lock->fl_type) {
2715        case F_RDLCK:
2716                einfo.ei_mode = LCK_PR;
2717                break;
2718        case F_UNLCK:
2719                /* An unlock request may or may not have any relation to
2720                 * existing locks so we may not be able to pass a lock handle
2721                 * via a normal ldlm_lock_cancel() request. The request may even
2722                 * unlock a byte range in the middle of an existing lock. In
2723                 * order to process an unlock request we need all of the same
2724                 * information that is given with a normal read or write record
2725                 * lock request. To avoid creating another ldlm unlock (cancel)
2726                 * message we'll treat a LCK_NL flock request as an unlock. */
2727                einfo.ei_mode = LCK_NL;
2728                break;
2729        case F_WRLCK:
2730                einfo.ei_mode = LCK_PW;
2731                break;
2732        default:
2733                CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2734                        file_lock->fl_type);
2735                return -ENOTSUPP;
2736        }
2737
2738        switch (cmd) {
2739        case F_SETLKW:
2740#ifdef F_SETLKW64
2741        case F_SETLKW64:
2742#endif
2743                flags = 0;
2744                break;
2745        case F_SETLK:
2746#ifdef F_SETLK64
2747        case F_SETLK64:
2748#endif
2749                flags = LDLM_FL_BLOCK_NOWAIT;
2750                break;
2751        case F_GETLK:
2752#ifdef F_GETLK64
2753        case F_GETLK64:
2754#endif
2755                flags = LDLM_FL_TEST_LOCK;
2756                /* Save the old mode so that if the mode in the lock changes we
2757                 * can decrement the appropriate reader or writer refcount. */
2758                file_lock->fl_type = einfo.ei_mode;
2759                break;
2760        default:
2761                CERROR("unknown fcntl lock command: %d\n", cmd);
2762                return -EINVAL;
2763        }
2764
2765        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2766                                     LUSTRE_OPC_ANY, NULL);
2767        if (IS_ERR(op_data))
2768                return PTR_ERR(op_data);
2769
2770        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2771               inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2772               flock.l_flock.start, flock.l_flock.end);
2773
2774        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2775                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2776
2777        if ((file_lock->fl_flags & FL_FLOCK) &&
2778            (rc == 0 || file_lock->fl_type == F_UNLCK))
2779                rc2  = flock_lock_file_wait(file, file_lock);
2780        if ((file_lock->fl_flags & FL_POSIX) &&
2781            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2782            !(flags & LDLM_FL_TEST_LOCK))
2783                rc2  = posix_lock_file_wait(file, file_lock);
2784
2785        if (rc2 && file_lock->fl_type != F_UNLCK) {
2786                einfo.ei_mode = LCK_NL;
2787                md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2788                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2789                rc = rc2;
2790        }
2791
2792        ll_finish_md_op_data(op_data);
2793
2794        return rc;
2795}
2796
/**
 * Lock handler that rejects every request.
 *
 * \retval -ENOSYS always; none of the arguments are examined
 */
static int
ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	const int unsupported = -ENOSYS;

	return unsupported;
}
2802
2803/**
2804 * test if some locks matching bits and l_req_mode are acquired
2805 * - bits can be in different locks
2806 * - if found clear the common lock bits in *bits
2807 * - the bits not found, are kept in *bits
2808 * \param inode [IN]
2809 * \param bits [IN] searched lock bits [IN]
2810 * \param l_req_mode [IN] searched lock mode
2811 * \retval boolean, true iff all bits are found
2812 */
2813int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2814{
2815        struct lustre_handle lockh;
2816        ldlm_policy_data_t policy;
2817        ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2818                                (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2819        struct lu_fid *fid;
2820        __u64 flags;
2821        int i;
2822
2823        if (!inode)
2824               return 0;
2825
2826        fid = &ll_i2info(inode)->lli_fid;
2827        CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2828               ldlm_lockname[mode]);
2829
2830        flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2831        for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2832                policy.l_inodebits.bits = *bits & (1 << i);
2833                if (policy.l_inodebits.bits == 0)
2834                        continue;
2835
2836                if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2837                                  &policy, mode, &lockh)) {
2838                        struct ldlm_lock *lock;
2839
2840                        lock = ldlm_handle2lock(&lockh);
2841                        if (lock) {
2842                                *bits &=
2843                                      ~(lock->l_policy_data.l_inodebits.bits);
2844                                LDLM_LOCK_PUT(lock);
2845                        } else {
2846                                *bits &= ~policy.l_inodebits.bits;
2847                        }
2848                }
2849        }
2850        return *bits == 0;
2851}
2852
2853ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2854                            struct lustre_handle *lockh, __u64 flags,
2855                            ldlm_mode_t mode)
2856{
2857        ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2858        struct lu_fid *fid;
2859        ldlm_mode_t rc;
2860
2861        fid = &ll_i2info(inode)->lli_fid;
2862        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2863
2864        rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2865                           fid, LDLM_IBITS, &policy, mode, lockh);
2866
2867        return rc;
2868}
2869
2870static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2871{
2872        /* Already unlinked. Just update nlink and return success */
2873        if (rc == -ENOENT) {
2874                clear_nlink(inode);
2875                /* This path cannot be hit for regular files unless in
2876                 * case of obscure races, so no need to validate size.
2877                 */
2878                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2879                        return 0;
2880        } else if (rc != 0) {
2881                CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2882                             "%s: revalidate FID "DFID" error: rc = %d\n",
2883                             ll_get_fsname(inode->i_sb, NULL, 0),
2884                             PFID(ll_inode2fid(inode)), rc);
2885        }
2886
2887        return rc;
2888}
2889
2890static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2891{
2892        struct inode *inode = dentry->d_inode;
2893        struct ptlrpc_request *req = NULL;
2894        struct obd_export *exp;
2895        int rc = 0;
2896
2897        LASSERT(inode != NULL);
2898
2899        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2900               inode->i_ino, inode->i_generation, inode, dentry);
2901
2902        exp = ll_i2mdexp(inode);
2903
2904        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2905         *      But under CMD case, it caused some lock issues, should be fixed
2906         *      with new CMD ibits lock. See bug 12718 */
2907        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2908                struct lookup_intent oit = { .it_op = IT_GETATTR };
2909                struct md_op_data *op_data;
2910
2911                if (ibits == MDS_INODELOCK_LOOKUP)
2912                        oit.it_op = IT_LOOKUP;
2913
2914                /* Call getattr by fid, so do not provide name at all. */
2915                op_data = ll_prep_md_op_data(NULL, inode,
2916                                             inode, NULL, 0, 0,
2917                                             LUSTRE_OPC_ANY, NULL);
2918                if (IS_ERR(op_data))
2919                        return PTR_ERR(op_data);
2920
2921                oit.it_create_mode |= M_CHECK_STALE;
2922                rc = md_intent_lock(exp, op_data, NULL, 0,
2923                                    /* we are not interested in name
2924                                       based lookup */
2925                                    &oit, 0, &req,
2926                                    ll_md_blocking_ast, 0);
2927                ll_finish_md_op_data(op_data);
2928                oit.it_create_mode &= ~M_CHECK_STALE;
2929                if (rc < 0) {
2930                        rc = ll_inode_revalidate_fini(inode, rc);
2931                        goto out;
2932                }
2933
2934                rc = ll_revalidate_it_finish(req, &oit, inode);
2935                if (rc != 0) {
2936                        ll_intent_release(&oit);
2937                        goto out;
2938                }
2939
2940                /* Unlinked? Unhash dentry, so it is not picked up later by
2941                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2942                   here to preserve get_cwd functionality on 2.6.
2943                   Bug 10503 */
2944                if (!dentry->d_inode->i_nlink)
2945                        d_lustre_invalidate(dentry, 0);
2946
2947                ll_lookup_finish_locks(&oit, inode);
2948        } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2949                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2950                u64 valid = OBD_MD_FLGETATTR;
2951                struct md_op_data *op_data;
2952                int ealen = 0;
2953
2954                if (S_ISREG(inode->i_mode)) {
2955                        rc = ll_get_default_mdsize(sbi, &ealen);
2956                        if (rc)
2957                                return rc;
2958                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2959                }
2960
2961                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2962                                             0, ealen, LUSTRE_OPC_ANY,
2963                                             NULL);
2964                if (IS_ERR(op_data))
2965                        return PTR_ERR(op_data);
2966
2967                op_data->op_valid = valid;
2968                /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2969                 * capa for this inode. Because we only keep capas of dirs
2970                 * fresh. */
2971                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2972                ll_finish_md_op_data(op_data);
2973                if (rc) {
2974                        rc = ll_inode_revalidate_fini(inode, rc);
2975                        return rc;
2976                }
2977
2978                rc = ll_prep_inode(&inode, req, NULL, NULL);
2979        }
2980out:
2981        ptlrpc_req_finished(req);
2982        return rc;
2983}
2984
2985static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2986{
2987        struct inode *inode = dentry->d_inode;
2988        int rc;
2989
2990        rc = __ll_inode_revalidate(dentry, ibits);
2991        if (rc != 0)
2992                return rc;
2993
2994        /* if object isn't regular file, don't validate size */
2995        if (!S_ISREG(inode->i_mode)) {
2996                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2997                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2998                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2999        } else {
3000                /* In case of restore, the MDT has the right size and has
3001                 * already send it back without granting the layout lock,
3002                 * inode is up-to-date so glimpse is useless.
3003                 * Also to glimpse we need the layout, in case of a running
3004                 * restore the MDT holds the layout lock so the glimpse will
3005                 * block up to the end of restore (getattr will block)
3006                 */
3007                if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3008                        rc = ll_glimpse_size(inode);
3009        }
3010        return rc;
3011}
3012
3013int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3014{
3015        struct inode *inode = de->d_inode;
3016        struct ll_sb_info *sbi = ll_i2sbi(inode);
3017        struct ll_inode_info *lli = ll_i2info(inode);
3018        int res = 0;
3019
3020        res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3021                                      MDS_INODELOCK_LOOKUP);
3022        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3023
3024        if (res)
3025                return res;
3026
3027        stat->dev = inode->i_sb->s_dev;
3028        if (ll_need_32bit_api(sbi))
3029                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3030        else
3031                stat->ino = inode->i_ino;
3032        stat->mode = inode->i_mode;
3033        stat->nlink = inode->i_nlink;
3034        stat->uid = inode->i_uid;
3035        stat->gid = inode->i_gid;
3036        stat->rdev = inode->i_rdev;
3037        stat->atime = inode->i_atime;
3038        stat->mtime = inode->i_mtime;
3039        stat->ctime = inode->i_ctime;
3040        stat->blksize = 1 << inode->i_blkbits;
3041
3042        stat->size = i_size_read(inode);
3043        stat->blocks = inode->i_blocks;
3044
3045        return 0;
3046}
3047
3048static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3049                     __u64 start, __u64 len)
3050{
3051        int rc;
3052        size_t num_bytes;
3053        struct ll_user_fiemap *fiemap;
3054        unsigned int extent_count = fieinfo->fi_extents_max;
3055
3056        num_bytes = sizeof(*fiemap) + (extent_count *
3057                                       sizeof(struct ll_fiemap_extent));
3058        OBD_ALLOC_LARGE(fiemap, num_bytes);
3059
3060        if (fiemap == NULL)
3061                return -ENOMEM;
3062
3063        fiemap->fm_flags = fieinfo->fi_flags;
3064        fiemap->fm_extent_count = fieinfo->fi_extents_max;
3065        fiemap->fm_start = start;
3066        fiemap->fm_length = len;
3067        if (extent_count > 0)
3068                memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3069                       sizeof(struct ll_fiemap_extent));
3070
3071        rc = ll_do_fiemap(inode, fiemap, num_bytes);
3072
3073        fieinfo->fi_flags = fiemap->fm_flags;
3074        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3075        if (extent_count > 0)
3076                memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3077                       fiemap->fm_mapped_extents *
3078                       sizeof(struct ll_fiemap_extent));
3079
3080        OBD_FREE_LARGE(fiemap, num_bytes);
3081        return rc;
3082}
3083
3084struct posix_acl *ll_get_acl(struct inode *inode, int type)
3085{
3086        struct ll_inode_info *lli = ll_i2info(inode);
3087        struct posix_acl *acl = NULL;
3088
3089        spin_lock(&lli->lli_lock);
3090        /* VFS' acl_permission_check->check_acl will release the refcount */
3091        acl = posix_acl_dup(lli->lli_posix_acl);
3092        spin_unlock(&lli->lli_lock);
3093
3094        return acl;
3095}
3096
3097
3098int ll_inode_permission(struct inode *inode, int mask)
3099{
3100        int rc = 0;
3101
3102#ifdef MAY_NOT_BLOCK
3103        if (mask & MAY_NOT_BLOCK)
3104                return -ECHILD;
3105#endif
3106
3107       /* as root inode are NOT getting validated in lookup operation,
3108        * need to do it before permission check. */
3109
3110        if (is_root_inode(inode)) {
3111                rc = __ll_inode_revalidate(inode->i_sb->s_root,
3112                                           MDS_INODELOCK_LOOKUP);
3113                if (rc)
3114                        return rc;
3115        }
3116
3117        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3118               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3119
3120        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3121                return lustre_check_remote_perm(inode, mask);
3122
3123        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3124        rc = generic_permission(inode, mask);
3125
3126        return rc;
3127}
3128
3129/* -o localflock - only provides locally consistent flock locks */
3130struct file_operations ll_file_operations = {
3131        .read      = new_sync_read,
3132        .read_iter = ll_file_read_iter,
3133        .write    = new_sync_write,
3134        .write_iter = ll_file_write_iter,
3135        .unlocked_ioctl = ll_file_ioctl,
3136        .open      = ll_file_open,
3137        .release        = ll_file_release,
3138        .mmap      = ll_file_mmap,
3139        .llseek  = ll_file_seek,
3140        .splice_read    = ll_file_splice_read,
3141        .fsync    = ll_fsync,
3142        .flush    = ll_flush
3143};
3144
3145struct file_operations ll_file_operations_flock = {
3146        .read      = new_sync_read,
3147        .read_iter    = ll_file_read_iter,
3148        .write    = new_sync_write,
3149        .write_iter   = ll_file_write_iter,
3150        .unlocked_ioctl = ll_file_ioctl,
3151        .open      = ll_file_open,
3152        .release        = ll_file_release,
3153        .mmap      = ll_file_mmap,
3154        .llseek  = ll_file_seek,
3155        .splice_read    = ll_file_splice_read,
3156        .fsync    = ll_fsync,
3157        .flush    = ll_flush,
3158        .flock    = ll_file_flock,
3159        .lock      = ll_file_flock
3160};
3161
3162/* These are for -o noflock - to return ENOSYS on flock calls */
3163struct file_operations ll_file_operations_noflock = {
3164        .read      = new_sync_read,
3165        .read_iter    = ll_file_read_iter,
3166        .write    = new_sync_write,
3167        .write_iter   = ll_file_write_iter,
3168        .unlocked_ioctl = ll_file_ioctl,
3169        .open      = ll_file_open,
3170        .release        = ll_file_release,
3171        .mmap      = ll_file_mmap,
3172        .llseek  = ll_file_seek,
3173        .splice_read    = ll_file_splice_read,
3174        .fsync    = ll_fsync,
3175        .flush    = ll_flush,
3176        .flock    = ll_file_noflock,
3177        .lock      = ll_file_noflock
3178};
3179
3180struct inode_operations ll_file_inode_operations = {
3181        .setattr        = ll_setattr,
3182        .getattr        = ll_getattr,
3183        .permission     = ll_inode_permission,
3184        .setxattr       = ll_setxattr,
3185        .getxattr       = ll_getxattr,
3186        .listxattr      = ll_listxattr,
3187        .removexattr    = ll_removexattr,
3188        .fiemap         = ll_fiemap,
3189        .get_acl        = ll_get_acl,
3190};
3191
3192/* dynamic ioctl number support routines */
3193static struct llioc_ctl_data {
3194        struct rw_semaphore     ioc_sem;
3195        struct list_head              ioc_head;
3196} llioc = {
3197        __RWSEM_INITIALIZER(llioc.ioc_sem),
3198        LIST_HEAD_INIT(llioc.ioc_head)
3199};
3200
3201
3202struct llioc_data {
3203        struct list_head              iocd_list;
3204        unsigned int        iocd_size;
3205        llioc_callback_t        iocd_cb;
3206        unsigned int        iocd_count;
3207        unsigned int        iocd_cmd[0];
3208};
3209
3210void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3211{
3212        unsigned int size;
3213        struct llioc_data *in_data = NULL;
3214
3215        if (cb == NULL || cmd == NULL ||
3216            count > LLIOC_MAX_CMD || count < 0)
3217                return NULL;
3218
3219        size = sizeof(*in_data) + count * sizeof(unsigned int);
3220        in_data = kzalloc(size, GFP_NOFS);
3221        if (!in_data)
3222                return NULL;
3223
3224        memset(in_data, 0, sizeof(*in_data));
3225        in_data->iocd_size = size;
3226        in_data->iocd_cb = cb;
3227        in_data->iocd_count = count;
3228        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3229
3230        down_write(&llioc.ioc_sem);
3231        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3232        up_write(&llioc.ioc_sem);
3233
3234        return in_data;
3235}
3236
3237void ll_iocontrol_unregister(void *magic)
3238{
3239        struct llioc_data *tmp;
3240
3241        if (magic == NULL)
3242                return;
3243
3244        down_write(&llioc.ioc_sem);
3245        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3246                if (tmp == magic) {
3247                        unsigned int size = tmp->iocd_size;
3248
3249                        list_del(&tmp->iocd_list);
3250                        up_write(&llioc.ioc_sem);
3251
3252                        OBD_FREE(tmp, size);
3253                        return;
3254                }
3255        }
3256        up_write(&llioc.ioc_sem);
3257
3258        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3259}
3260
/* dynamic ioctl registration entry points exported to other modules */
EXPORT_SYMBOL(ll_iocontrol_unregister);
EXPORT_SYMBOL(ll_iocontrol_register);
3263
3264static enum llioc_iter
3265ll_iocontrol_call(struct inode *inode, struct file *file,
3266                  unsigned int cmd, unsigned long arg, int *rcp)
3267{
3268        enum llioc_iter ret = LLIOC_CONT;
3269        struct llioc_data *data;
3270        int rc = -EINVAL, i;
3271
3272        down_read(&llioc.ioc_sem);
3273        list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3274                for (i = 0; i < data->iocd_count; i++) {
3275                        if (cmd != data->iocd_cmd[i])
3276                                continue;
3277
3278                        ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3279                        break;
3280                }
3281
3282                if (ret == LLIOC_STOP)
3283                        break;
3284        }
3285        up_read(&llioc.ioc_sem);
3286
3287        if (rcp)
3288                *rcp = rc;
3289        return ret;
3290}
3291
3292int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3293{
3294        struct ll_inode_info *lli = ll_i2info(inode);
3295        struct cl_env_nest nest;
3296        struct lu_env *env;
3297        int result;
3298
3299        if (lli->lli_clob == NULL)
3300                return 0;
3301
3302        env = cl_env_nested_get(&nest);
3303        if (IS_ERR(env))
3304                return PTR_ERR(env);
3305
3306        result = cl_conf_set(env, lli->lli_clob, conf);
3307        cl_env_nested_put(&nest, env);
3308
3309        if (conf->coc_opc == OBJECT_CONF_SET) {
3310                struct ldlm_lock *lock = conf->coc_lock;
3311
3312                LASSERT(lock != NULL);
3313                LASSERT(ldlm_has_layout(lock));
3314                if (result == 0) {
3315                        /* it can only be allowed to match after layout is
3316                         * applied to inode otherwise false layout would be
3317                         * seen. Applying layout should happen before dropping
3318                         * the intent lock. */
3319                        ldlm_lock_allow_match(lock);
3320                }
3321        }
3322        return result;
3323}
3324
3325/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3326static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3327
3328{
3329        struct ll_sb_info *sbi = ll_i2sbi(inode);
3330        struct obd_capa *oc;
3331        struct ptlrpc_request *req;
3332        struct mdt_body *body;
3333        void *lvbdata;
3334        void *lmm;
3335        int lmmsize;
3336        int rc;
3337
3338        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3339               PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3340               lock->l_lvb_data, lock->l_lvb_len);
3341
3342        if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3343                return 0;
3344
3345        /* if layout lock was granted right away, the layout is returned
3346         * within DLM_LVB of dlm reply; otherwise if the lock was ever
3347         * blocked and then granted via completion ast, we have to fetch
3348         * layout here. Please note that we can't use the LVB buffer in
3349         * completion AST because it doesn't have a large enough buffer */
3350        oc = ll_mdscapa_get(inode);
3351        rc = ll_get_default_mdsize(sbi, &lmmsize);
3352        if (rc == 0)
3353                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3354                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3355                                lmmsize, 0, &req);
3356        capa_put(oc);
3357        if (rc < 0)
3358                return rc;
3359
3360        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3361        if (body == NULL) {
3362                rc = -EPROTO;
3363                goto out;
3364        }
3365
3366        lmmsize = body->eadatasize;
3367        if (lmmsize == 0) /* empty layout */ {
3368                rc = 0;
3369                goto out;
3370        }
3371
3372        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3373        if (lmm == NULL) {
3374                rc = -EFAULT;
3375                goto out;
3376        }
3377
3378        OBD_ALLOC_LARGE(lvbdata, lmmsize);
3379        if (lvbdata == NULL) {
3380                rc = -ENOMEM;
3381                goto out;
3382        }
3383
3384        memcpy(lvbdata, lmm, lmmsize);
3385        lock_res_and_lock(lock);
3386        if (lock->l_lvb_data != NULL)
3387                OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3388
3389        lock->l_lvb_data = lvbdata;
3390        lock->l_lvb_len = lmmsize;
3391        unlock_res_and_lock(lock);
3392
3393out:
3394        ptlrpc_req_finished(req);
3395        return rc;
3396}
3397
3398/**
3399 * Apply the layout to the inode. Layout lock is held and will be released
3400 * in this function.
3401 */
3402static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3403                                struct inode *inode, __u32 *gen, bool reconf)
3404{
3405        struct ll_inode_info *lli = ll_i2info(inode);
3406        struct ll_sb_info    *sbi = ll_i2sbi(inode);
3407        struct ldlm_lock *lock;
3408        struct lustre_md md = { NULL };
3409        struct cl_object_conf conf;
3410        int rc = 0;
3411        bool lvb_ready;
3412        bool wait_layout = false;
3413
3414        LASSERT(lustre_handle_is_used(lockh));
3415
3416        lock = ldlm_handle2lock(lockh);
3417        LASSERT(lock != NULL);
3418        LASSERT(ldlm_has_layout(lock));
3419
3420        LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3421                   inode, PFID(&lli->lli_fid), reconf);
3422
3423        /* in case this is a caching lock and reinstate with new inode */
3424        md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3425
3426        lock_res_and_lock(lock);
3427        lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3428        unlock_res_and_lock(lock);
3429        /* checking lvb_ready is racy but this is okay. The worst case is
3430         * that multi processes may configure the file on the same time. */
3431        if (lvb_ready || !reconf) {
3432                rc = -ENODATA;
3433                if (lvb_ready) {
3434                        /* layout_gen must be valid if layout lock is not
3435                         * cancelled and stripe has already set */
3436                        *gen = ll_layout_version_get(lli);
3437                        rc = 0;
3438                }
3439                goto out;
3440        }
3441
3442        rc = ll_layout_fetch(inode, lock);
3443        if (rc < 0)
3444                goto out;
3445
3446        /* for layout lock, lmm is returned in lock's lvb.
3447         * lvb_data is immutable if the lock is held so it's safe to access it
3448         * without res lock. See the description in ldlm_lock_decref_internal()
3449         * for the condition to free lvb_data of layout lock */
3450        if (lock->l_lvb_data != NULL) {
3451                rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3452                                  lock->l_lvb_data, lock->l_lvb_len);
3453                if (rc >= 0) {
3454                        *gen = LL_LAYOUT_GEN_EMPTY;
3455                        if (md.lsm != NULL)
3456                                *gen = md.lsm->lsm_layout_gen;
3457                        rc = 0;
3458                } else {
3459                        CERROR("%s: file "DFID" unpackmd error: %d\n",
3460                                ll_get_fsname(inode->i_sb, NULL, 0),
3461                                PFID(&lli->lli_fid), rc);
3462                }
3463        }
3464        if (rc < 0)
3465                goto out;
3466
3467        /* set layout to file. Unlikely this will fail as old layout was
3468         * surely eliminated */
3469        memset(&conf, 0, sizeof(conf));
3470        conf.coc_opc = OBJECT_CONF_SET;
3471        conf.coc_inode = inode;
3472        conf.coc_lock = lock;
3473        conf.u.coc_md = &md;
3474        rc = ll_layout_conf(inode, &conf);
3475
3476        if (md.lsm != NULL)
3477                obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3478
3479        /* refresh layout failed, need to wait */
3480        wait_layout = rc == -EBUSY;
3481
3482out:
3483        LDLM_LOCK_PUT(lock);
3484        ldlm_lock_decref(lockh, mode);
3485
3486        /* wait for IO to complete if it's still being used. */
3487        if (wait_layout) {
3488                CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3489                        ll_get_fsname(inode->i_sb, NULL, 0),
3490                        inode, PFID(&lli->lli_fid));
3491
3492                memset(&conf, 0, sizeof(conf));
3493                conf.coc_opc = OBJECT_CONF_WAIT;
3494                conf.coc_inode = inode;
3495                rc = ll_layout_conf(inode, &conf);
3496                if (rc == 0)
3497                        rc = -EAGAIN;
3498
3499                CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3500                        PFID(&lli->lli_fid), rc);
3501        }
3502        return rc;
3503}
3504
3505/**
3506 * This function checks if there exists a LAYOUT lock on the client side,
3507 * or enqueues it if it doesn't have one in cache.
3508 *
3509 * This function will not hold layout lock so it may be revoked any time after
3510 * this function returns. Any operations depend on layout should be redone
3511 * in that case.
3512 *
3513 * This function should be called before lov_io_init() to get an uptodate
3514 * layout version, the caller should save the version number and after IO
3515 * is finished, this function should be called again to verify that layout
3516 * is not changed during IO time.
3517 */
3518int ll_layout_refresh(struct inode *inode, __u32 *gen)
3519{
3520        struct ll_inode_info  *lli = ll_i2info(inode);
3521        struct ll_sb_info     *sbi = ll_i2sbi(inode);
3522        struct md_op_data     *op_data;
3523        struct lookup_intent   it;
3524        struct lustre_handle   lockh;
3525        ldlm_mode_t            mode;
3526        struct ldlm_enqueue_info einfo = {
3527                .ei_type = LDLM_IBITS,
3528                .ei_mode = LCK_CR,
3529                .ei_cb_bl = ll_md_blocking_ast,
3530                .ei_cb_cp = ldlm_completion_ast,
3531        };
3532        int rc;
3533
3534        *gen = ll_layout_version_get(lli);
3535        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3536                return 0;
3537
3538        /* sanity checks */
3539        LASSERT(fid_is_sane(ll_inode2fid(inode)));
3540        LASSERT(S_ISREG(inode->i_mode));
3541
3542        /* take layout lock mutex to enqueue layout lock exclusively. */
3543        mutex_lock(&lli->lli_layout_mutex);
3544
3545again:
3546        /* mostly layout lock is caching on the local side, so try to match
3547         * it before grabbing layout lock mutex. */
3548        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3549                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3550        if (mode != 0) { /* hit cached lock */
3551                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3552                if (rc == -EAGAIN)
3553                        goto again;
3554
3555                mutex_unlock(&lli->lli_layout_mutex);
3556                return rc;
3557        }
3558
3559        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3560                        0, 0, LUSTRE_OPC_ANY, NULL);
3561        if (IS_ERR(op_data)) {
3562                mutex_unlock(&lli->lli_layout_mutex);
3563                return PTR_ERR(op_data);
3564        }
3565
3566        /* have to enqueue one */
3567        memset(&it, 0, sizeof(it));
3568        it.it_op = IT_LAYOUT;
3569        lockh.cookie = 0ULL;
3570
3571        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3572                        ll_get_fsname(inode->i_sb, NULL, 0), inode,
3573                        PFID(&lli->lli_fid));
3574
3575        rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3576                        NULL, 0, NULL, 0);
3577        if (it.d.lustre.it_data != NULL)
3578                ptlrpc_req_finished(it.d.lustre.it_data);
3579        it.d.lustre.it_data = NULL;
3580
3581        ll_finish_md_op_data(op_data);
3582
3583        mode = it.d.lustre.it_lock_mode;
3584        it.d.lustre.it_lock_mode = 0;
3585        ll_intent_drop_lock(&it);
3586
3587        if (rc == 0) {
3588                /* set lock data in case this is a new lock */
3589                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3590                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3591                if (rc == -EAGAIN)
3592                        goto again;
3593        }
3594        mutex_unlock(&lli->lli_layout_mutex);
3595
3596        return rc;
3597}
3598
3599/**
3600 *  This function send a restore request to the MDT
3601 */
3602int ll_layout_restore(struct inode *inode)
3603{
3604        struct hsm_user_request *hur;
3605        int                      len, rc;
3606
3607        len = sizeof(struct hsm_user_request) +
3608              sizeof(struct hsm_user_item);
3609        hur = kzalloc(len, GFP_NOFS);
3610        if (!hur)
3611                return -ENOMEM;
3612
3613        hur->hur_request.hr_action = HUA_RESTORE;
3614        hur->hur_request.hr_archive_id = 0;
3615        hur->hur_request.hr_flags = 0;
3616        memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3617               sizeof(hur->hur_user_item[0].hui_fid));
3618        hur->hur_user_item[0].hui_extent.length = -1;
3619        hur->hur_request.hr_itemcount = 1;
3620        rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3621                           len, hur, NULL);
3622        OBD_FREE(hur, len);
3623        return rc;
3624}
3625