linux/drivers/staging/lustre/lustre/llite/file.c
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 *
  36 * lustre/llite/file.c
  37 *
  38 * Author: Peter Braam <braam@clusterfs.com>
  39 * Author: Phil Schwan <phil@clusterfs.com>
  40 * Author: Andreas Dilger <adilger@clusterfs.com>
  41 */
  42
  43#define DEBUG_SUBSYSTEM S_LLITE
  44#include "../include/lustre_dlm.h"
  45#include "../include/lustre_lite.h"
  46#include <linux/pagemap.h>
  47#include <linux/file.h>
  48#include "llite_internal.h"
  49#include "../include/lustre/ll_fiemap.h"
  50
  51#include "../include/cl_object.h"
  52
  53static int
  54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  55
  56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  57                          bool *lease_broken);
  58
  59static enum llioc_iter
  60ll_iocontrol_call(struct inode *inode, struct file *file,
  61                  unsigned int cmd, unsigned long arg, int *rcp);
  62
  63static struct ll_file_data *ll_file_data_get(void)
  64{
  65        struct ll_file_data *fd;
  66
  67        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  68        if (fd == NULL)
  69                return NULL;
  70        fd->fd_write_failed = false;
  71        return fd;
  72}
  73
  74static void ll_file_data_put(struct ll_file_data *fd)
  75{
  76        if (fd != NULL)
  77                OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  78}
  79
  80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  81                          struct lustre_handle *fh)
  82{
  83        op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  84        op_data->op_attr.ia_mode = inode->i_mode;
  85        op_data->op_attr.ia_atime = inode->i_atime;
  86        op_data->op_attr.ia_mtime = inode->i_mtime;
  87        op_data->op_attr.ia_ctime = inode->i_ctime;
  88        op_data->op_attr.ia_size = i_size_read(inode);
  89        op_data->op_attr_blocks = inode->i_blocks;
  90        ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  91                                        ll_inode_to_ext_flags(inode->i_flags);
  92        op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  93        if (fh)
  94                op_data->op_handle = *fh;
  95        op_data->op_capa1 = ll_mdscapa_get(inode);
  96
  97        if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  98                op_data->op_bias |= MDS_DATA_MODIFIED;
  99}
 100
 101/**
 102 * Closes the IO epoch and packs all the attributes into @op_data for
 103 * the CLOSE rpc.
 104 */
 105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
 106                             struct obd_client_handle *och)
 107{
 108        op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 109                                        ATTR_MTIME | ATTR_MTIME_SET |
 110                                        ATTR_CTIME | ATTR_CTIME_SET;
 111
 112        if (!(och->och_flags & FMODE_WRITE))
 113                goto out;
 114
 115        if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 116                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 117        else
 118                ll_ioepoch_close(inode, op_data, &och, 0);
 119
 120out:
 121        ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 122        ll_prep_md_op_data(op_data, inode, NULL, NULL,
 123                           0, 0, LUSTRE_OPC_ANY, NULL);
 124}
 125
 126static int ll_close_inode_openhandle(struct obd_export *md_exp,
 127                                     struct inode *inode,
 128                                     struct obd_client_handle *och,
 129                                     const __u64 *data_version)
 130{
 131        struct obd_export *exp = ll_i2mdexp(inode);
 132        struct md_op_data *op_data;
 133        struct ptlrpc_request *req = NULL;
 134        struct obd_device *obd = class_exp2obd(exp);
 135        int epoch_close = 1;
 136        int rc;
 137
 138        if (obd == NULL) {
 139                /*
  140                 * XXX: in case of LMV, is it correct to access
  141                 * ->exp_handle?
 142                 */
 143                CERROR("Invalid MDC connection handle %#llx\n",
 144                       ll_i2mdexp(inode)->exp_handle.h_cookie);
 145                rc = 0;
 146                goto out;
 147        }
 148
 149        op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
 150        if (!op_data) {
 151                /* XXX We leak openhandle and request here. */
 152                rc = -ENOMEM;
 153                goto out;
 154        }
 155
 156        ll_prepare_close(inode, op_data, och);
 157        if (data_version != NULL) {
 158                /* Pass in data_version implies release. */
 159                op_data->op_bias |= MDS_HSM_RELEASE;
 160                op_data->op_data_version = *data_version;
 161                op_data->op_lease_handle = och->och_lease_handle;
 162                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 163        }
 164        epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
 165        rc = md_close(md_exp, op_data, och->och_mod, &req);
 166        if (rc == -EAGAIN) {
  167                /* This close must have closed the epoch. */
 168                LASSERT(epoch_close);
  169                /* MDS has instructed us to obtain Size-on-MDS attribute from
  170                 * OSTs and send setattr back to MDS. */
 171                rc = ll_som_update(inode, op_data);
 172                if (rc) {
 173                        CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
 174                               inode->i_ino, rc);
 175                        rc = 0;
 176                }
 177        } else if (rc) {
 178                CERROR("inode %lu mdc close failed: rc = %d\n",
 179                       inode->i_ino, rc);
 180        }
 181
  182        /* The DATA_MODIFIED flag was successfully sent on close; clear the
  183         * data modification flag. */
 184        if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 185                struct ll_inode_info *lli = ll_i2info(inode);
 186
 187                spin_lock(&lli->lli_lock);
 188                lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 189                spin_unlock(&lli->lli_lock);
 190        }
 191
 192        if (rc == 0) {
 193                rc = ll_objects_destroy(req, inode);
 194                if (rc)
 195                        CERROR("inode %lu ll_objects destroy: rc = %d\n",
 196                               inode->i_ino, rc);
 197        }
 198        if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
 199                struct mdt_body *body;
 200
 201                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 202                if (!(body->valid & OBD_MD_FLRELEASED))
 203                        rc = -EBUSY;
 204        }
 205
 206        ll_finish_md_op_data(op_data);
 207
 208out:
 209        if (exp_connect_som(exp) && !epoch_close &&
 210            S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 211                ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 212        } else {
 213                md_clear_open_replay_data(md_exp, och);
 214                /* Free @och if it is not waiting for DONE_WRITING. */
 215                och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 216                kfree(och);
 217        }
  218        if (req) /* This is a close request */
 219                ptlrpc_req_finished(req);
 220        return rc;
 221}
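
     /*
      * Illustrative note (editor's addition, not part of the driver): the
      * @data_version argument selects between a plain close and an HSM
      * release. A caller holding a valid @och might do:
      *
      *	rc = ll_close_inode_openhandle(md_exp, inode, och, NULL);
      *		// regular close of the MDS open handle
      *	rc = ll_close_inode_openhandle(md_exp, inode, och, &data_version);
      *		// close that also asks the MDS to release (MDS_HSM_RELEASE)
      */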
 222
 223int ll_md_real_close(struct inode *inode, fmode_t fmode)
 224{
 225        struct ll_inode_info *lli = ll_i2info(inode);
 226        struct obd_client_handle **och_p;
 227        struct obd_client_handle *och;
 228        __u64 *och_usecount;
 229        int rc = 0;
 230
 231        if (fmode & FMODE_WRITE) {
 232                och_p = &lli->lli_mds_write_och;
 233                och_usecount = &lli->lli_open_fd_write_count;
 234        } else if (fmode & FMODE_EXEC) {
 235                och_p = &lli->lli_mds_exec_och;
 236                och_usecount = &lli->lli_open_fd_exec_count;
 237        } else {
 238                LASSERT(fmode & FMODE_READ);
 239                och_p = &lli->lli_mds_read_och;
 240                och_usecount = &lli->lli_open_fd_read_count;
 241        }
 242
 243        mutex_lock(&lli->lli_och_mutex);
 244        if (*och_usecount > 0) {
 245                /* There are still users of this handle, so skip
 246                 * freeing it. */
 247                mutex_unlock(&lli->lli_och_mutex);
 248                return 0;
 249        }
 250
 251        och = *och_p;
 252        *och_p = NULL;
 253        mutex_unlock(&lli->lli_och_mutex);
 254
 255        if (och != NULL) {
 256                /* There might be a race and this handle may already
 257                   be closed. */
 258                rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 259                                               inode, och, NULL);
 260        }
 261
 262        return rc;
 263}
 264
 265static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 266                       struct file *file)
 267{
 268        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 269        struct ll_inode_info *lli = ll_i2info(inode);
 270        int lockmode;
 271        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 272        struct lustre_handle lockh;
 273        ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN}};
 274        int rc = 0;
 275
 276        /* clear group lock, if present */
 277        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 278                ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 279
 280        if (fd->fd_lease_och != NULL) {
 281                bool lease_broken;
 282
  283                /* Usually the lease is not released when the
  284                 * application crashes, so we need to release it here. */
 285                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 286                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 287                        PFID(&lli->lli_fid), rc, lease_broken);
 288
 289                fd->fd_lease_och = NULL;
 290        }
 291
 292        if (fd->fd_och != NULL) {
 293                rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
 294                fd->fd_och = NULL;
 295                goto out;
 296        }
 297
  298        /* Let's see if we have a good enough OPEN lock on the file and
  299           can skip talking to the MDS */
 300
 301        mutex_lock(&lli->lli_och_mutex);
 302        if (fd->fd_omode & FMODE_WRITE) {
 303                lockmode = LCK_CW;
 304                LASSERT(lli->lli_open_fd_write_count);
 305                lli->lli_open_fd_write_count--;
 306        } else if (fd->fd_omode & FMODE_EXEC) {
 307                lockmode = LCK_PR;
 308                LASSERT(lli->lli_open_fd_exec_count);
 309                lli->lli_open_fd_exec_count--;
 310        } else {
 311                lockmode = LCK_CR;
 312                LASSERT(lli->lli_open_fd_read_count);
 313                lli->lli_open_fd_read_count--;
 314        }
 315        mutex_unlock(&lli->lli_och_mutex);
 316
 317        if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 318                           LDLM_IBITS, &policy, lockmode, &lockh))
 319                rc = ll_md_real_close(inode, fd->fd_omode);
 320
 321out:
 322        LUSTRE_FPRIVATE(file) = NULL;
 323        ll_file_data_put(fd);
 324        ll_capa_close(inode);
 325
 326        return rc;
 327}
 328
  329/* While this returns an error code, the caller (fput()) does not check it,
  330 * so we need to make every effort to clean up all of our state here.  Also,
  331 * applications rarely check close errors, and even if an error is returned
  332 * they will not retry the close call.
  333 */
 334int ll_file_release(struct inode *inode, struct file *file)
 335{
 336        struct ll_file_data *fd;
 337        struct ll_sb_info *sbi = ll_i2sbi(inode);
 338        struct ll_inode_info *lli = ll_i2info(inode);
 339        int rc;
 340
 341        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 342               inode->i_generation, inode);
 343
 344#ifdef CONFIG_FS_POSIX_ACL
 345        if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
 346                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 347
 348                LASSERT(fd != NULL);
 349                if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 350                        fd->fd_flags &= ~LL_FILE_RMTACL;
 351                        rct_del(&sbi->ll_rct, current_pid());
 352                        et_search_free(&sbi->ll_et, current_pid());
 353                }
 354        }
 355#endif
 356
 357        if (!is_root_inode(inode))
 358                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 359        fd = LUSTRE_FPRIVATE(file);
 360        LASSERT(fd != NULL);
 361
  362        /* This is the last ref on @file, but maybe not from the pid that
  363         * owns the statahead. Different processes can open the same dir, and
  364         * "ll_opendir_key" identifies who should stop the statahead thread. */
 365        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 366            lli->lli_opendir_pid != 0)
 367                ll_stop_statahead(inode, lli->lli_opendir_key);
 368
 369        if (is_root_inode(inode)) {
 370                LUSTRE_FPRIVATE(file) = NULL;
 371                ll_file_data_put(fd);
 372                return 0;
 373        }
 374
 375        if (!S_ISDIR(inode->i_mode)) {
 376                lov_read_and_clear_async_rc(lli->lli_clob);
 377                lli->lli_async_rc = 0;
 378        }
 379
 380        rc = ll_md_close(sbi->ll_md_exp, inode, file);
 381
 382        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 383                libcfs_debug_dumplog();
 384
 385        return rc;
 386}
 387
 388static int ll_intent_file_open(struct dentry *dentry, void *lmm,
 389                               int lmmsize, struct lookup_intent *itp)
 390{
 391        struct inode *inode = d_inode(dentry);
 392        struct ll_sb_info *sbi = ll_i2sbi(inode);
 393        struct dentry *parent = dentry->d_parent;
 394        const char *name = dentry->d_name.name;
 395        const int len = dentry->d_name.len;
 396        struct md_op_data *op_data;
 397        struct ptlrpc_request *req;
 398        __u32 opc = LUSTRE_OPC_ANY;
 399        int rc;
 400
  401        /* Usually we come here only for NFSD, and we want the open lock.
  402           But we can also get here with pre-2.6.15 patchless kernels, and in
  403           that case that lock is also ok */
  404        /* We can also get here if there was a cached open handle in
  405         * revalidate_it but it disappeared while we were getting from there
  406         * to ll_file_open. This means the file was closed and immediately
  407         * opened, which makes it a good candidate for using the OPEN lock */
  408        /* If lmmsize and lmm are not 0, we are just setting stripe info
  409         * parameters. No need for the open lock */
 410        if (lmm == NULL && lmmsize == 0) {
 411                itp->it_flags |= MDS_OPEN_LOCK;
 412                if (itp->it_flags & FMODE_WRITE)
 413                        opc = LUSTRE_OPC_CREATE;
 414        }
 415
 416        op_data  = ll_prep_md_op_data(NULL, d_inode(parent),
 417                                      inode, name, len,
 418                                      O_RDWR, opc, NULL);
 419        if (IS_ERR(op_data))
 420                return PTR_ERR(op_data);
 421
 422        itp->it_flags |= MDS_OPEN_BY_FID;
 423        rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 424                            0 /*unused */, &req, ll_md_blocking_ast, 0);
 425        ll_finish_md_op_data(op_data);
 426        if (rc == -ESTALE) {
  427                /* Keep a separate exit path so we don't flood the log
  428                 * with -ESTALE error messages.
  429                 */
 430                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 431                     it_open_error(DISP_OPEN_OPEN, itp))
 432                        goto out;
 433                ll_release_openhandle(inode, itp);
 434                goto out;
 435        }
 436
 437        if (it_disposition(itp, DISP_LOOKUP_NEG)) {
 438                rc = -ENOENT;
 439                goto out;
 440        }
 441
 442        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 443                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 444                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 445                goto out;
 446        }
 447
 448        rc = ll_prep_inode(&inode, req, NULL, itp);
 449        if (!rc && itp->d.lustre.it_lock_mode)
 450                ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
 451
 452out:
 453        ptlrpc_req_finished(req);
 454        ll_intent_drop_lock(itp);
 455
 456        return rc;
 457}
 458
 459/**
  460 * Assign an obtained @ioepoch to the client's inode. No lock is needed: MDS
  461 * does not trust attributes while several ioepoch holders exist. Attributes
  462 * for a previous ioepoch are also skipped by MDS once a new one is opened.
 463 */
 464void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 465{
 466        if (ioepoch && lli->lli_ioepoch != ioepoch) {
 467                lli->lli_ioepoch = ioepoch;
 468                CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
 469                       ioepoch, PFID(&lli->lli_fid));
 470        }
 471}
 472
 473static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 474                       struct obd_client_handle *och)
 475{
 476        struct ptlrpc_request *req = it->d.lustre.it_data;
 477        struct mdt_body *body;
 478
 479        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 480        och->och_fh = body->handle;
 481        och->och_fid = body->fid1;
 482        och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
 483        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 484        och->och_flags = it->it_flags;
 485
 486        return md_set_open_replay_data(md_exp, och, it);
 487}
 488
 489static int ll_local_open(struct file *file, struct lookup_intent *it,
 490                         struct ll_file_data *fd, struct obd_client_handle *och)
 491{
 492        struct inode *inode = file_inode(file);
 493        struct ll_inode_info *lli = ll_i2info(inode);
 494
 495        LASSERT(!LUSTRE_FPRIVATE(file));
 496
 497        LASSERT(fd != NULL);
 498
 499        if (och) {
 500                struct ptlrpc_request *req = it->d.lustre.it_data;
 501                struct mdt_body *body;
 502                int rc;
 503
 504                rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 505                if (rc != 0)
 506                        return rc;
 507
 508                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 509                ll_ioepoch_open(lli, body->ioepoch);
 510        }
 511
 512        LUSTRE_FPRIVATE(file) = fd;
 513        ll_readahead_init(inode, &fd->fd_ras);
 514        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 515        return 0;
 516}
 517
 518/* Open a file, and (for the very first open) create objects on the OSTs at
 519 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 520 * creation or open until ll_lov_setstripe() ioctl is called.
 521 *
  522 * If we already have the stripe MD locally then we don't request it in
  523 * md_open(), by passing lmm_size = 0.
 524 *
 525 * It is up to the application to ensure no other processes open this file
 526 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 527 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 528 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 529 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 530 */
 531int ll_file_open(struct inode *inode, struct file *file)
 532{
 533        struct ll_inode_info *lli = ll_i2info(inode);
 534        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 535                                          .it_flags = file->f_flags };
 536        struct obd_client_handle **och_p = NULL;
 537        __u64 *och_usecount = NULL;
 538        struct ll_file_data *fd;
 539        int rc = 0, opendir_set = 0;
 540
 541        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 542               inode->i_generation, inode, file->f_flags);
 543
 544        it = file->private_data; /* XXX: compat macro */
 545        file->private_data = NULL; /* prevent ll_local_open assertion */
 546
 547        fd = ll_file_data_get();
 548        if (fd == NULL) {
 549                rc = -ENOMEM;
 550                goto out_openerr;
 551        }
 552
 553        fd->fd_file = file;
 554        if (S_ISDIR(inode->i_mode)) {
 555                spin_lock(&lli->lli_sa_lock);
 556                if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
 557                    lli->lli_opendir_pid == 0) {
 558                        lli->lli_opendir_key = fd;
 559                        lli->lli_opendir_pid = current_pid();
 560                        opendir_set = 1;
 561                }
 562                spin_unlock(&lli->lli_sa_lock);
 563        }
 564
 565        if (is_root_inode(inode)) {
 566                LUSTRE_FPRIVATE(file) = fd;
 567                return 0;
 568        }
 569
 570        if (!it || !it->d.lustre.it_disposition) {
  571                /* Convert f_flags into an access mode. We cannot use
  572                 * file->f_mode, because everything but the O_ACCMODE mask
  573                 * was stripped from it */
 574                if ((oit.it_flags + 1) & O_ACCMODE)
 575                        oit.it_flags++;
 576                if (file->f_flags & O_TRUNC)
 577                        oit.it_flags |= FMODE_WRITE;
 578
  579                /* The kernel only calls f_op->open in dentry_open.  filp_open
  580                 * calls dentry_open after open_namei has checked permissions.
  581                 * Only nfsd_open calls dentry_open directly without checking
  582                 * permissions, which is why the code below is safe. */
 583                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 584                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 585
 586                /* We do not want O_EXCL here, presumably we opened the file
 587                 * already? XXX - NFS implications? */
 588                oit.it_flags &= ~O_EXCL;
 589
  590                /* bug20584: if "it_flags" contains O_CREAT, the file will be
  591                 * created if necessary, so "IT_CREAT" should be set to stay
  592                 * consistent with it */
 593                if (oit.it_flags & O_CREAT)
 594                        oit.it_op |= IT_CREAT;
 595
 596                it = &oit;
 597        }
 598
 599restart:
  600        /* Let's see if we already have the file open on the MDS. */
 601        if (it->it_flags & FMODE_WRITE) {
 602                och_p = &lli->lli_mds_write_och;
 603                och_usecount = &lli->lli_open_fd_write_count;
 604        } else if (it->it_flags & FMODE_EXEC) {
 605                och_p = &lli->lli_mds_exec_och;
 606                och_usecount = &lli->lli_open_fd_exec_count;
  607        } else {
 608                och_p = &lli->lli_mds_read_och;
 609                och_usecount = &lli->lli_open_fd_read_count;
 610        }
 611
 612        mutex_lock(&lli->lli_och_mutex);
 613        if (*och_p) { /* Open handle is present */
 614                if (it_disposition(it, DISP_OPEN_OPEN)) {
  615                        /* Well, there's an extra open request that we do not
  616                           need; let's close it somehow. This will decref the request. */
 617                        rc = it_open_error(DISP_OPEN_OPEN, it);
 618                        if (rc) {
 619                                mutex_unlock(&lli->lli_och_mutex);
 620                                goto out_openerr;
 621                        }
 622
 623                        ll_release_openhandle(inode, it);
 624                }
 625                (*och_usecount)++;
 626
 627                rc = ll_local_open(file, it, fd, NULL);
 628                if (rc) {
 629                        (*och_usecount)--;
 630                        mutex_unlock(&lli->lli_och_mutex);
 631                        goto out_openerr;
 632                }
 633        } else {
 634                LASSERT(*och_usecount == 0);
 635                if (!it->d.lustre.it_disposition) {
  636                        /* We cannot just request the lock handle now: the new
  637                           ELC code means that one of the other OPEN locks for
  638                           this file could be cancelled, and since the blocking
  639                           ast handler would attempt to grab och_mutex as well,
  640                           that would result in a deadlock */
 641                        mutex_unlock(&lli->lli_och_mutex);
 642                        it->it_create_mode |= M_CHECK_STALE;
 643                        rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
 644                        it->it_create_mode &= ~M_CHECK_STALE;
 645                        if (rc)
 646                                goto out_openerr;
 647
 648                        goto restart;
 649                }
 650                *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
 651                if (!*och_p) {
 652                        rc = -ENOMEM;
 653                        goto out_och_free;
 654                }
 655
 656                (*och_usecount)++;
 657
 658                /* md_intent_lock() didn't get a request ref if there was an
 659                 * open error, so don't do cleanup on the request here
 660                 * (bug 3430) */
  661                /* XXX (green): Shouldn't we bail out on any error here, not
  662                 * just an open error? */
 663                rc = it_open_error(DISP_OPEN_OPEN, it);
 664                if (rc)
 665                        goto out_och_free;
 666
 667                LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 668
 669                rc = ll_local_open(file, it, fd, *och_p);
 670                if (rc)
 671                        goto out_och_free;
 672        }
 673        mutex_unlock(&lli->lli_och_mutex);
 674        fd = NULL;
 675
  676        /* Must do this outside the lli_och_mutex lock to prevent a deadlock
  677           where a different kind of OPEN lock for this same inode gets
  678           cancelled by ldlm_cancel_lru */
 679        if (!S_ISREG(inode->i_mode))
 680                goto out_och_free;
 681
 682        ll_capa_open(inode);
 683
 684        if (!lli->lli_has_smd &&
 685            (cl_is_lov_delay_create(file->f_flags) ||
 686             (file->f_mode & FMODE_WRITE) == 0)) {
 687                CDEBUG(D_INODE, "object creation was delayed\n");
 688                goto out_och_free;
 689        }
 690        cl_lov_delay_create_clear(&file->f_flags);
 691        goto out_och_free;
 692
 693out_och_free:
 694        if (rc) {
 695                if (och_p && *och_p) {
 696                        kfree(*och_p);
 697                        *och_p = NULL; /* OBD_FREE writes some magic there */
 698                        (*och_usecount)--;
 699                }
 700                mutex_unlock(&lli->lli_och_mutex);
 701
 702out_openerr:
 703                if (opendir_set != 0)
 704                        ll_stop_statahead(inode, lli->lli_opendir_key);
 705                ll_file_data_put(fd);
 706        } else {
 707                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 708        }
 709
 710        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 711                ptlrpc_req_finished(it->d.lustre.it_data);
 712                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 713        }
 714
 715        return rc;
 716}
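
     /*
      * Illustrative sketch (editor's addition, not part of the driver): the
      * delayed object creation described in the comment above ll_file_open()
      * is normally driven from userspace roughly as follows (the flag and
      * ioctl names come from the Lustre user API and are not defined in this
      * excerpt):
      *
      *	fd = open(path, O_RDWR | O_CREAT | O_LOV_DELAY_CREATE, 0644);
      *	// no OST objects yet; pick a striping pattern first
      *	ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);	// see ll_lov_setstripe() below
      *	write(fd, buf, len);			// objects exist from here on
      */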
 717
 718static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 719                        struct ldlm_lock_desc *desc, void *data, int flag)
 720{
 721        int rc;
 722        struct lustre_handle lockh;
 723
 724        switch (flag) {
 725        case LDLM_CB_BLOCKING:
 726                ldlm_lock2handle(lock, &lockh);
 727                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 728                if (rc < 0) {
 729                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 730                        return rc;
 731                }
 732                break;
 733        case LDLM_CB_CANCELING:
 734                /* do nothing */
 735                break;
 736        }
 737        return 0;
 738}
 739
 740/**
 741 * Acquire a lease and open the file.
 742 */
 743static struct obd_client_handle *
 744ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 745              __u64 open_flags)
 746{
 747        struct lookup_intent it = { .it_op = IT_OPEN };
 748        struct ll_sb_info *sbi = ll_i2sbi(inode);
 749        struct md_op_data *op_data;
 750        struct ptlrpc_request *req;
 751        struct lustre_handle old_handle = { 0 };
 752        struct obd_client_handle *och = NULL;
 753        int rc;
 754        int rc2;
 755
 756        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 757                return ERR_PTR(-EINVAL);
 758
 759        if (file != NULL) {
 760                struct ll_inode_info *lli = ll_i2info(inode);
 761                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 762                struct obd_client_handle **och_p;
 763                __u64 *och_usecount;
 764
 765                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 766                        return ERR_PTR(-EPERM);
 767
 768                /* Get the openhandle of the file */
 769                rc = -EBUSY;
 770                mutex_lock(&lli->lli_och_mutex);
 771                if (fd->fd_lease_och != NULL) {
 772                        mutex_unlock(&lli->lli_och_mutex);
 773                        return ERR_PTR(rc);
 774                }
 775
 776                if (fd->fd_och == NULL) {
 777                        if (file->f_mode & FMODE_WRITE) {
 778                                LASSERT(lli->lli_mds_write_och != NULL);
 779                                och_p = &lli->lli_mds_write_och;
 780                                och_usecount = &lli->lli_open_fd_write_count;
 781                        } else {
 782                                LASSERT(lli->lli_mds_read_och != NULL);
 783                                och_p = &lli->lli_mds_read_och;
 784                                och_usecount = &lli->lli_open_fd_read_count;
 785                        }
 786                        if (*och_usecount == 1) {
 787                                fd->fd_och = *och_p;
 788                                *och_p = NULL;
 789                                *och_usecount = 0;
 790                                rc = 0;
 791                        }
 792                }
 793                mutex_unlock(&lli->lli_och_mutex);
 794                if (rc < 0) /* more than 1 opener */
 795                        return ERR_PTR(rc);
 796
 797                LASSERT(fd->fd_och != NULL);
 798                old_handle = fd->fd_och->och_fh;
 799        }
 800
 801        och = kzalloc(sizeof(*och), GFP_NOFS);
 802        if (!och)
 803                return ERR_PTR(-ENOMEM);
 804
 805        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 806                                        LUSTRE_OPC_ANY, NULL);
 807        if (IS_ERR(op_data)) {
 808                rc = PTR_ERR(op_data);
 809                goto out;
 810        }
 811
 812        /* To tell the MDT this openhandle is from the same owner */
 813        op_data->op_handle = old_handle;
 814
 815        it.it_flags = fmode | open_flags;
 816        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 817        rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
 818                                ll_md_blocking_lease_ast,
  819        /* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
  820         * otherwise it can be cancelled, which may mislead applications into
  821         * thinking the lease is broken;
  822         * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
  823         * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
  824         * doesn't deal with the openhandle, a normal openhandle would be leaked. */
 825                                LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 826        ll_finish_md_op_data(op_data);
 827        ptlrpc_req_finished(req);
 828        if (rc < 0)
 829                goto out_release_it;
 830
 831        if (it_disposition(&it, DISP_LOOKUP_NEG)) {
 832                rc = -ENOENT;
 833                goto out_release_it;
 834        }
 835
 836        rc = it_open_error(DISP_OPEN_OPEN, &it);
 837        if (rc)
 838                goto out_release_it;
 839
 840        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 841        ll_och_fill(sbi->ll_md_exp, &it, och);
 842
 843        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
 844                rc = -EOPNOTSUPP;
 845                goto out_close;
 846        }
 847
  848        /* we already got the lease; handle the lease lock */
 849        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 850        if (it.d.lustre.it_lock_mode == 0 ||
 851            it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
  852                /* an open lock must be returned for a lease */
 853                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 854                        PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
 855                        it.d.lustre.it_lock_bits);
 856                rc = -EPROTO;
 857                goto out_close;
 858        }
 859
 860        ll_intent_release(&it);
 861        return och;
 862
 863out_close:
 864        rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
 865        if (rc2)
 866                CERROR("Close openhandle returned %d\n", rc2);
 867
 868        /* cancel open lock */
 869        if (it.d.lustre.it_lock_mode != 0) {
 870                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 871                                                it.d.lustre.it_lock_mode);
 872                it.d.lustre.it_lock_mode = 0;
 873        }
 874out_release_it:
 875        ll_intent_release(&it);
 876out:
 877        kfree(och);
 878        return ERR_PTR(rc);
 879}
 880
 881/**
 882 * Release lease and close the file.
  883 * It will check if the lease has ever been broken.
 884 */
 885static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 886                          bool *lease_broken)
 887{
 888        struct ldlm_lock *lock;
 889        bool cancelled = true;
 890        int rc;
 891
 892        lock = ldlm_handle2lock(&och->och_lease_handle);
 893        if (lock != NULL) {
 894                lock_res_and_lock(lock);
 895                cancelled = ldlm_is_cancel(lock);
 896                unlock_res_and_lock(lock);
 897                ldlm_lock_put(lock);
 898        }
 899
 900        CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
 901                PFID(&ll_i2info(inode)->lli_fid), cancelled);
 902
 903        if (!cancelled)
 904                ldlm_cli_cancel(&och->och_lease_handle, 0);
 905        if (lease_broken != NULL)
 906                *lease_broken = cancelled;
 907
 908        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
 909                                       NULL);
 910        return rc;
 911}
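
     /*
      * Illustrative sketch (editor's addition, not part of the driver): how a
      * caller in this layer might pair the two lease helpers above, using only
      * the signatures visible in this file:
      *
      *	struct obd_client_handle *och;
      *	bool lease_broken;
      *	int rc;
      *
      *	och = ll_lease_open(inode, file, FMODE_READ, 0);
      *	if (IS_ERR(och))
      *		return PTR_ERR(och);
      *	// ... use the leased open handle ...
      *	rc = ll_lease_close(och, inode, &lease_broken);
      *	// lease_broken reports whether the lease was cancelled meanwhile
      */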
 912
 913/* Fills the obdo with the attributes for the lsm */
 914static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 915                          struct obd_capa *capa, struct obdo *obdo,
 916                          __u64 ioepoch, int sync)
 917{
 918        struct ptlrpc_request_set *set;
 919        struct obd_info     oinfo = { { { 0 } } };
 920        int                     rc;
 921
 922        LASSERT(lsm != NULL);
 923
 924        oinfo.oi_md = lsm;
 925        oinfo.oi_oa = obdo;
 926        oinfo.oi_oa->o_oi = lsm->lsm_oi;
 927        oinfo.oi_oa->o_mode = S_IFREG;
 928        oinfo.oi_oa->o_ioepoch = ioepoch;
 929        oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 930                               OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 931                               OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 932                               OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 933                               OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 934                               OBD_MD_FLDATAVERSION;
 935        oinfo.oi_capa = capa;
 936        if (sync) {
 937                oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 938                oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 939        }
 940
 941        set = ptlrpc_prep_set();
 942        if (set == NULL) {
 943                CERROR("can't allocate ptlrpc set\n");
 944                rc = -ENOMEM;
 945        } else {
 946                rc = obd_getattr_async(exp, &oinfo, set);
 947                if (rc == 0)
 948                        rc = ptlrpc_set_wait(set);
 949                ptlrpc_set_destroy(set);
 950        }
 951        if (rc == 0)
 952                oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 953                                         OBD_MD_FLATIME | OBD_MD_FLMTIME |
 954                                         OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 955                                         OBD_MD_FLDATAVERSION);
 956        return rc;
 957}
 958
 959/**
  960 * Performs the getattr on the inode and updates its fields.
  961 * If @sync != 0, perform the getattr under the server-side lock.
  962 */
 963int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 964                     __u64 ioepoch, int sync)
 965{
 966        struct obd_capa      *capa = ll_mdscapa_get(inode);
 967        struct lov_stripe_md *lsm;
 968        int rc;
 969
 970        lsm = ccc_inode_lsm_get(inode);
 971        rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 972                            capa, obdo, ioepoch, sync);
 973        capa_put(capa);
 974        if (rc == 0) {
 975                struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
 976
 977                obdo_refresh_inode(inode, obdo, obdo->o_valid);
 978                CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
 979                       POSTID(oi), i_size_read(inode),
 980                       (unsigned long long)inode->i_blocks,
 981                       1UL << inode->i_blkbits);
 982        }
 983        ccc_inode_lsm_put(inode, lsm);
 984        return rc;
 985}
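
     /*
      * Illustrative sketch (editor's addition, not part of the driver): a
      * typical call into ll_inode_getattr() above, based only on its
      * signature and the o_valid handling in ll_lsm_getattr():
      *
      *	struct obdo obdo = { 0 };
      *	int rc;
      *
      *	rc = ll_inode_getattr(inode, &obdo, 0, 0);	// no ioepoch, no srvlock
      *	if (rc == 0 && (obdo.o_valid & OBD_MD_FLSIZE))
      *		CDEBUG(D_INODE, "OST size %llu\n", obdo.o_size);
      */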
 986
 987int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 988{
 989        struct ll_inode_info *lli = ll_i2info(inode);
 990        struct cl_object *obj = lli->lli_clob;
 991        struct cl_attr *attr = ccc_env_thread_attr(env);
 992        struct ost_lvb lvb;
 993        int rc = 0;
 994
 995        ll_inode_size_lock(inode);
  996        /* merge the timestamps most recently obtained from the MDS with
  997           the timestamps obtained from the OSTs */
 998        LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 999        LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1000        LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1001
1002        lvb.lvb_size = i_size_read(inode);
1003        lvb.lvb_blocks = inode->i_blocks;
1004        lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1005        lvb.lvb_atime = LTIME_S(inode->i_atime);
1006        lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1007
1008        cl_object_attr_lock(obj);
1009        rc = cl_object_attr_get(env, obj, attr);
1010        cl_object_attr_unlock(obj);
1011
1012        if (rc == 0) {
1013                if (lvb.lvb_atime < attr->cat_atime)
1014                        lvb.lvb_atime = attr->cat_atime;
1015                if (lvb.lvb_ctime < attr->cat_ctime)
1016                        lvb.lvb_ctime = attr->cat_ctime;
1017                if (lvb.lvb_mtime < attr->cat_mtime)
1018                        lvb.lvb_mtime = attr->cat_mtime;
1019
1020                CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1021                                PFID(&lli->lli_fid), attr->cat_size);
1022                cl_isize_write_nolock(inode, attr->cat_size);
1023
1024                inode->i_blocks = attr->cat_blocks;
1025
1026                LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1027                LTIME_S(inode->i_atime) = lvb.lvb_atime;
1028                LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1029        }
1030        ll_inode_size_unlock(inode);
1031
1032        return rc;
1033}
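
     /*
      * Illustrative example (editor's addition): ll_merge_lvb() keeps the
      * newer of each timestamp. If the MDS-provided lli_lvb has mtime 1000
      * while the OST attributes report cat_mtime 1005, the merged inode
      * mtime becomes 1005; if the OSTs report an older value, the MDS
      * timestamp is kept.
      */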
1034
1035int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1036                     lstat_t *st)
1037{
1038        struct obdo obdo = { 0 };
1039        int rc;
1040
1041        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1042        if (rc == 0) {
1043                st->st_size   = obdo.o_size;
1044                st->st_blocks = obdo.o_blocks;
1045                st->st_mtime  = obdo.o_mtime;
1046                st->st_atime  = obdo.o_atime;
1047                st->st_ctime  = obdo.o_ctime;
1048        }
1049        return rc;
1050}
1051
1052static bool file_is_noatime(const struct file *file)
1053{
1054        const struct vfsmount *mnt = file->f_path.mnt;
1055        const struct inode *inode = file_inode(file);
1056
1057        /* Adapted from file_accessed() and touch_atime().*/
1058        if (file->f_flags & O_NOATIME)
1059                return true;
1060
1061        if (inode->i_flags & S_NOATIME)
1062                return true;
1063
1064        if (IS_NOATIME(inode))
1065                return true;
1066
1067        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1068                return true;
1069
1070        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1071                return true;
1072
1073        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1074                return true;
1075
1076        return false;
1077}
1078
1079void ll_io_init(struct cl_io *io, const struct file *file, int write)
1080{
1081        struct inode *inode = file_inode(file);
1082
1083        io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1084        if (write) {
1085                io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1086                io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1087                                      file->f_flags & O_DIRECT ||
1088                                      IS_SYNC(inode);
1089        }
1090        io->ci_obj     = ll_i2info(inode)->lli_clob;
1091        io->ci_lockreq = CILR_MAYBE;
1092        if (ll_file_nolock(file)) {
1093                io->ci_lockreq = CILR_NEVER;
1094                io->ci_no_srvlock = 1;
1095        } else if (file->f_flags & O_APPEND) {
1096                io->ci_lockreq = CILR_MANDATORY;
1097        }
1098
1099        io->ci_noatime = file_is_noatime(file);
1100}
1101
1102static ssize_t
1103ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1104                   struct file *file, enum cl_io_type iot,
1105                   loff_t *ppos, size_t count)
1106{
1107        struct ll_inode_info *lli = ll_i2info(file_inode(file));
1108        struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1109        struct cl_io     *io;
1110        ssize_t        result;
1111
1112restart:
1113        io = ccc_env_thread_io(env);
1114        ll_io_init(io, file, iot == CIT_WRITE);
1115
1116        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1117                struct vvp_io *vio = vvp_env_io(env);
1118                struct ccc_io *cio = ccc_env_io(env);
1119                int write_mutex_locked = 0;
1120
1121                cio->cui_fd  = LUSTRE_FPRIVATE(file);
1122                vio->cui_io_subtype = args->via_io_subtype;
1123
1124                switch (vio->cui_io_subtype) {
1125                case IO_NORMAL:
1126                        cio->cui_iter = args->u.normal.via_iter;
1127                        cio->cui_iocb = args->u.normal.via_iocb;
1128                        if ((iot == CIT_WRITE) &&
1129                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1130                                if (mutex_lock_interruptible(&lli->
1131                                                               lli_write_mutex)) {
1132                                        result = -ERESTARTSYS;
1133                                        goto out;
1134                                }
1135                                write_mutex_locked = 1;
1136                        } else if (iot == CIT_READ) {
1137                                down_read(&lli->lli_trunc_sem);
1138                        }
1139                        break;
1140                case IO_SPLICE:
1141                        vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1142                        vio->u.splice.cui_flags = args->u.splice.via_flags;
1143                        break;
1144                default:
1145                        CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1146                        LBUG();
1147                }
1148                result = cl_io_loop(env, io);
1149                if (write_mutex_locked)
1150                        mutex_unlock(&lli->lli_write_mutex);
1151                else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1152                        up_read(&lli->lli_trunc_sem);
1153        } else {
1154                /* cl_io_rw_init() handled IO */
1155                result = io->ci_result;
1156        }
1157
1158        if (io->ci_nob > 0) {
1159                result = io->ci_nob;
1160                *ppos = io->u.ci_wr.wr.crw_pos;
1161        }
1162        goto out;
1163out:
1164        cl_io_fini(env, io);
 1165        /* If anything has been read/written (result != 0), we just return
 1166         * the short read/write instead of restarting the io. */
1167        if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1168                CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1169                       iot == CIT_READ ? "read" : "write",
1170                       file, *ppos, count);
1171                LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1172                goto restart;
1173        }
1174
1175        if (iot == CIT_READ) {
1176                if (result >= 0)
1177                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1178                                           LPROC_LL_READ_BYTES, result);
1179        } else if (iot == CIT_WRITE) {
1180                if (result >= 0) {
1181                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1182                                           LPROC_LL_WRITE_BYTES, result);
1183                        fd->fd_write_failed = false;
1184                } else if (result != -ERESTARTSYS) {
1185                        fd->fd_write_failed = true;
1186                }
1187        }
1188
1189        return result;
1190}
1191
1192static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1193{
1194        struct lu_env      *env;
1195        struct vvp_io_args *args;
1196        ssize_t      result;
1197        int              refcheck;
1198
1199        env = cl_env_get(&refcheck);
1200        if (IS_ERR(env))
1201                return PTR_ERR(env);
1202
1203        args = vvp_env_args(env, IO_NORMAL);
1204        args->u.normal.via_iter = to;
1205        args->u.normal.via_iocb = iocb;
1206
1207        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1208                                    &iocb->ki_pos, iov_iter_count(to));
1209        cl_env_put(env, &refcheck);
1210        return result;
1211}
1212
1213/*
1214 * Write to a file (through the page cache).
1215 */
1216static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1217{
1218        struct lu_env      *env;
1219        struct vvp_io_args *args;
1220        ssize_t      result;
1221        int              refcheck;
1222
1223        env = cl_env_get(&refcheck);
1224        if (IS_ERR(env))
1225                return PTR_ERR(env);
1226
1227        args = vvp_env_args(env, IO_NORMAL);
1228        args->u.normal.via_iter = from;
1229        args->u.normal.via_iocb = iocb;
1230
1231        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1232                                  &iocb->ki_pos, iov_iter_count(from));
1233        cl_env_put(env, &refcheck);
1234        return result;
1235}
1236
1237/*
 1238 * Send file content (through the page cache) somewhere with a helper
1239 */
1240static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1241                                   struct pipe_inode_info *pipe, size_t count,
1242                                   unsigned int flags)
1243{
1244        struct lu_env      *env;
1245        struct vvp_io_args *args;
1246        ssize_t      result;
1247        int              refcheck;
1248
1249        env = cl_env_get(&refcheck);
1250        if (IS_ERR(env))
1251                return PTR_ERR(env);
1252
1253        args = vvp_env_args(env, IO_SPLICE);
1254        args->u.splice.via_pipe = pipe;
1255        args->u.splice.via_flags = flags;
1256
1257        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1258        cl_env_put(env, &refcheck);
1259        return result;
1260}
1261
1262static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1263{
1264        struct obd_export *exp = ll_i2dtexp(inode);
1265        struct obd_trans_info oti = { 0 };
1266        struct obdo *oa = NULL;
1267        int lsm_size;
1268        int rc = 0;
1269        struct lov_stripe_md *lsm = NULL, *lsm2;
1270
1271        OBDO_ALLOC(oa);
1272        if (oa == NULL)
1273                return -ENOMEM;
1274
1275        lsm = ccc_inode_lsm_get(inode);
1276        if (!lsm_has_objects(lsm)) {
1277                rc = -ENOENT;
1278                goto out;
1279        }
1280
1281        lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1282                   (lsm->lsm_stripe_count));
1283
1284        lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
1285        if (lsm2 == NULL) {
1286                rc = -ENOMEM;
1287                goto out;
1288        }
1289
1290        oa->o_oi = *oi;
1291        oa->o_nlink = ost_idx;
1292        oa->o_flags |= OBD_FL_RECREATE_OBJS;
1293        oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1294        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1295                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1296        obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1297        memcpy(lsm2, lsm, lsm_size);
1298        ll_inode_size_lock(inode);
1299        rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1300        ll_inode_size_unlock(inode);
1301
1302        kvfree(lsm2);
1303        goto out;
1304out:
1305        ccc_inode_lsm_put(inode, lsm);
1306        OBDO_FREE(oa);
1307        return rc;
1308}
1309
1310static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1311{
1312        struct ll_recreate_obj ucreat;
1313        struct ost_id           oi;
1314
1315        if (!capable(CFS_CAP_SYS_ADMIN))
1316                return -EPERM;
1317
1318        if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1319                           sizeof(ucreat)))
1320                return -EFAULT;
1321
1322        ostid_set_seq_mdt0(&oi);
1323        ostid_set_id(&oi, ucreat.lrc_id);
1324        return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1325}
1326
1327static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1328{
1329        struct lu_fid   fid;
1330        struct ost_id   oi;
1331        u32             ost_idx;
1332
1333        if (!capable(CFS_CAP_SYS_ADMIN))
1334                return -EPERM;
1335
1336        if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1337                return -EFAULT;
1338
1339        fid_to_ostid(&fid, &oi);
1340        ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1341        return ll_lov_recreate(inode, &oi, ost_idx);
1342}
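
     /*
      * Illustrative example (editor's addition; the IDIF sequence layout is
      * assumed from the extraction above): ll_lov_recreate_fid() recovers the
      * OST index from bits 16..31 of the fid sequence, e.g.
      *
      *	fid_seq(&fid) = 0x1000a0000
      *	(0x1000a0000 >> 16) & 0xffff = 0x000a, i.e. OST index 10
      */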
1343
1344int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1345                             int flags, struct lov_user_md *lum, int lum_size)
1346{
1347        struct lov_stripe_md *lsm = NULL;
1348        struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1349        int rc = 0;
1350
1351        lsm = ccc_inode_lsm_get(inode);
1352        if (lsm != NULL) {
1353                ccc_inode_lsm_put(inode, lsm);
1354                CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1355                       inode->i_ino);
1356                rc = -EEXIST;
1357                goto out;
1358        }
1359
1360        ll_inode_size_lock(inode);
1361        rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1362        if (rc)
1363                goto out_unlock;
1364        rc = oit.d.lustre.it_status;
1365        if (rc < 0)
1366                goto out_req_free;
1367
1368        ll_release_openhandle(inode, &oit);
1369
1370out_unlock:
1371        ll_inode_size_unlock(inode);
1372        ll_intent_release(&oit);
1373        ccc_inode_lsm_put(inode, lsm);
1374out:
1375        return rc;
1376out_req_free:
1377        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1378        goto out;
1379}
1380
1381int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1382                             struct lov_mds_md **lmmp, int *lmm_size,
1383                             struct ptlrpc_request **request)
1384{
1385        struct ll_sb_info *sbi = ll_i2sbi(inode);
1386        struct mdt_body  *body;
1387        struct lov_mds_md *lmm = NULL;
1388        struct ptlrpc_request *req = NULL;
1389        struct md_op_data *op_data;
1390        int rc, lmmsize;
1391
1392        rc = ll_get_default_mdsize(sbi, &lmmsize);
1393        if (rc)
1394                return rc;
1395
1396        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1397                                     strlen(filename), lmmsize,
1398                                     LUSTRE_OPC_ANY, NULL);
1399        if (IS_ERR(op_data))
1400                return PTR_ERR(op_data);
1401
1402        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1403        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1404        ll_finish_md_op_data(op_data);
1405        if (rc < 0) {
1406                CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1407                       filename, rc);
1408                goto out;
1409        }
1410
1411        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1412        LASSERT(body != NULL); /* checked by mdc_getattr_name */
1413
1414        lmmsize = body->eadatasize;
1415
1416        if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1417                        lmmsize == 0) {
1418                rc = -ENODATA;
1419                goto out;
1420        }
1421
1422        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1423        LASSERT(lmm != NULL);
1424
1425        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1426            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1427                rc = -EPROTO;
1428                goto out;
1429        }
1430
1431        /*
1432         * This is coming from the MDS, so is probably in
1433         * little endian.  We convert it to host endian before
1434         * passing it to userspace.
1435         */
1436        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1437                int stripe_count;
1438
1439                stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440                if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1441                        stripe_count = 0;
1442
1443                /* if this function is called for a directory, avoid
1444                 * swabbing non-existent lsm objects */
1445                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447                        if (S_ISREG(body->mode))
1448                                lustre_swab_lov_user_md_objects(
1449                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1450                                 stripe_count);
1451                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453                        if (S_ISREG(body->mode))
1454                                lustre_swab_lov_user_md_objects(
1455                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1456                                 stripe_count);
1457                }
1458        }
1459
1460out:
1461        *lmmp = lmm;
1462        *lmm_size = lmmsize;
1463        *request = req;
1464        return rc;
1465}
1466
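    /*
     * LL_IOC_LOV_SETEA: set a striping EA that already contains object
     * information (MDS_OPEN_HAS_OBJS).  Copies a lov_user_md plus one
     * lov_user_ost_data from user space.  Requires CFS_CAP_SYS_ADMIN.
     */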
1467static int ll_lov_setea(struct inode *inode, struct file *file,
1468                            unsigned long arg)
1469{
1470        int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471        struct lov_user_md      *lump;
1472        int                      lum_size = sizeof(struct lov_user_md) +
1473                                            sizeof(struct lov_user_ost_data);
1474        int                      rc;
1475
1476        if (!capable(CFS_CAP_SYS_ADMIN))
1477                return -EPERM;
1478
1479        lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
1480        if (lump == NULL)
1481                return -ENOMEM;
1482
1483        if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1484                kvfree(lump);
1485                return -EFAULT;
1486        }
1487
1488        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1489                                     lum_size);
1490        cl_lov_delay_create_clear(&file->f_flags);
1491
1492        kvfree(lump);
1493        return rc;
1494}
1495
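    /*
     * LL_IOC_LOV_SETSTRIPE: read a v1 or v3 lov_user_md from user space and
     * set it as the file's striping.  On success the layout is refreshed and
     * the resulting striping is copied back to the caller's buffer.
     */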
1496static int ll_lov_setstripe(struct inode *inode, struct file *file,
1497                            unsigned long arg)
1498{
1499        struct lov_user_md_v3    lumv3;
1500        struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1501        struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1502        struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1503        int                      lum_size, rc;
1504        int                      flags = FMODE_WRITE;
1505
1506        /* first try with v1 which is smaller than v3 */
1507        lum_size = sizeof(struct lov_user_md_v1);
1508        if (copy_from_user(lumv1, lumv1p, lum_size))
1509                return -EFAULT;
1510
1511        if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512                lum_size = sizeof(struct lov_user_md_v3);
1513                if (copy_from_user(&lumv3, lumv3p, lum_size))
1514                        return -EFAULT;
1515        }
1516
1517        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1518                                      lum_size);
1519        cl_lov_delay_create_clear(&file->f_flags);
1520        if (rc == 0) {
1521                struct lov_stripe_md *lsm;
1522                __u32 gen;
1523
1524                put_user(0, &lumv1p->lmm_stripe_count);
1525
1526                ll_layout_refresh(inode, &gen);
1527                lsm = ccc_inode_lsm_get(inode);
1528                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1529                                   0, lsm, (void *)arg);
1530                ccc_inode_lsm_put(inode, lsm);
1531        }
1532        return rc;
1533}
1534
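    /*
     * LL_IOC_LOV_GETSTRIPE: copy the file's striping information back to
     * user space, or return -ENODATA if no layout is set.
     */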
1535static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1536{
1537        struct lov_stripe_md *lsm;
1538        int rc = -ENODATA;
1539
1540        lsm = ccc_inode_lsm_get(inode);
1541        if (lsm != NULL)
1542                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1543                                   lsm, (void *)arg);
1544        ccc_inode_lsm_put(inode, lsm);
1545        return rc;
1546}
1547
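    /*
     * LL_IOC_GROUP_LOCK: take a group lock with group id @arg on behalf of
     * this file descriptor.  The gid must be non-zero and only one group
     * lock may be held per descriptor.
     */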
1548static int
1549ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1550{
1551        struct ll_inode_info   *lli = ll_i2info(inode);
1552        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1553        struct ccc_grouplock    grouplock;
1554        int                  rc;
1555
1556        if (arg == 0) {
1557                CWARN("group id for group lock must not be 0\n");
1558                return -EINVAL;
1559        }
1560
1561        if (ll_file_nolock(file))
1562                return -EOPNOTSUPP;
1563
1564        spin_lock(&lli->lli_lock);
1565        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1566                CWARN("group lock already exists with gid %lu\n",
1567                      fd->fd_grouplock.cg_gid);
1568                spin_unlock(&lli->lli_lock);
1569                return -EINVAL;
1570        }
1571        LASSERT(fd->fd_grouplock.cg_lock == NULL);
1572        spin_unlock(&lli->lli_lock);
1573
1574        rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1575                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
1576        if (rc)
1577                return rc;
1578
1579        spin_lock(&lli->lli_lock);
1580        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1581                spin_unlock(&lli->lli_lock);
1582                CERROR("another thread just won the race\n");
1583                cl_put_grouplock(&grouplock);
1584                return -EINVAL;
1585        }
1586
1587        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1588        fd->fd_grouplock = grouplock;
1589        spin_unlock(&lli->lli_lock);
1590
1591        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1592        return 0;
1593}
1594
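    /*
     * LL_IOC_GROUP_UNLOCK: release the group lock previously taken on this
     * file descriptor; @arg must match the gid it was taken with.
     */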
1595static int ll_put_grouplock(struct inode *inode, struct file *file,
1596                            unsigned long arg)
1597{
1598        struct ll_inode_info   *lli = ll_i2info(inode);
1599        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1600        struct ccc_grouplock    grouplock;
1601
1602        spin_lock(&lli->lli_lock);
1603        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1604                spin_unlock(&lli->lli_lock);
1605                CWARN("no group lock held\n");
1606                return -EINVAL;
1607        }
1608        LASSERT(fd->fd_grouplock.cg_lock != NULL);
1609
1610        if (fd->fd_grouplock.cg_gid != arg) {
1611                CWARN("group lock %lu doesn't match current id %lu\n",
1612                       arg, fd->fd_grouplock.cg_gid);
1613                spin_unlock(&lli->lli_lock);
1614                return -EINVAL;
1615        }
1616
1617        grouplock = fd->fd_grouplock;
1618        memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1619        fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1620        spin_unlock(&lli->lli_lock);
1621
1622        cl_put_grouplock(&grouplock);
1623        CDEBUG(D_INFO, "group lock %lu released\n", arg);
1624        return 0;
1625}
1626
1627/**
1628 * Close inode open handle
1629 *
1630 * \param inode  [in]     inode in question
1631 * \param it     [in,out] intent which contains open info and result
1632 *
1633 * \retval 0     success
1634 * \retval <0    failure
1635 */
1636int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1637{
1638        struct obd_client_handle *och;
1639        int rc;
1640
1641        LASSERT(inode);
1642
1643        /* Root ? Do nothing. */
1644        if (is_root_inode(inode))
1645                return 0;
1646
1647        /* No open handle to close? Move away */
1648        if (!it_disposition(it, DISP_OPEN_OPEN))
1649                return 0;
1650
1651        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1652
1653        och = kzalloc(sizeof(*och), GFP_NOFS);
1654        if (!och) {
1655                rc = -ENOMEM;
1656                goto out;
1657        }
1658
1659        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1660
1661        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1662                                       inode, och, NULL);
1663out:
1664        /* this one is in place of ll_file_open */
1665        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1666                ptlrpc_req_finished(it->d.lustre.it_data);
1667                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1668        }
1669        return rc;
1670}
1671
1672/**
1673 * Get size for inode for which FIEMAP mapping is requested.
1674 * Make the FIEMAP get_info call and returns the result.
1675 */
1676static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1677                        size_t num_bytes)
1678{
1679        struct obd_export *exp = ll_i2dtexp(inode);
1680        struct lov_stripe_md *lsm = NULL;
1681        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1682        __u32 vallen = num_bytes;
1683        int rc;
1684
1685        /* Checks for fiemap flags */
1686        if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1687                fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1688                return -EBADR;
1689        }
1690
1691        /* Check for FIEMAP_FLAG_SYNC */
1692        if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1693                rc = filemap_fdatawrite(inode->i_mapping);
1694                if (rc)
1695                        return rc;
1696        }
1697
1698        lsm = ccc_inode_lsm_get(inode);
1699        if (lsm == NULL)
1700                return -ENOENT;
1701
1702        /* If the stripe_count > 1 and the application does not understand
1703         * the DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1704         */
1705        if (lsm->lsm_stripe_count > 1 &&
1706            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1707                rc = -EOPNOTSUPP;
1708                goto out;
1709        }
1710
1711        fm_key.oa.o_oi = lsm->lsm_oi;
1712        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1713
1714        if (i_size_read(inode) == 0) {
1715                rc = ll_glimpse_size(inode);
1716                if (rc)
1717                        goto out;
1718        }
1719
1720        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1721        obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1722        /* If filesize is 0, then there would be no objects for mapping */
1723        if (fm_key.oa.o_size == 0) {
1724                fiemap->fm_mapped_extents = 0;
1725                rc = 0;
1726                goto out;
1727        }
1728
1729        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1730
1731        rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1732                          fiemap, lsm);
1733        if (rc)
1734                CERROR("obd_get_info failed: rc = %d\n", rc);
1735
1736out:
1737        ccc_inode_lsm_put(inode, lsm);
1738        return rc;
1739}
1740
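    /*
     * OBD_IOC_FID2PATH: resolve a FID to a path name through the MDC.
     * A getinfo_fid2path buffer with gf_pathlen bytes of path space is
     * allocated, filled via obd_iocontrol() and copied back to user space.
     */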
1741int ll_fid2path(struct inode *inode, void __user *arg)
1742{
1743        struct obd_export *exp = ll_i2mdexp(inode);
1744        const struct getinfo_fid2path __user *gfin = arg;
1745        struct getinfo_fid2path *gfout;
1746        u32 pathlen;
1747        size_t outsize;
1748        int rc;
1749
1750        if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1751            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1752                return -EPERM;
1753
1754        /* Only need to get the buflen */
1755        if (get_user(pathlen, &gfin->gf_pathlen))
1756                return -EFAULT;
1757
1758        if (pathlen > PATH_MAX)
1759                return -EINVAL;
1760
1761        outsize = sizeof(*gfout) + pathlen;
1762
1763        gfout = kzalloc(outsize, GFP_NOFS);
1764        if (!gfout)
1765                return -ENOMEM;
1766
1767        if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1768                rc = -EFAULT;
1769                goto gf_free;
1770        }
1771
1772        /* Call mdc_iocontrol */
1773        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1774        if (rc != 0)
1775                goto gf_free;
1776
1777        if (copy_to_user(arg, gfout, outsize))
1778                rc = -EFAULT;
1779
1780gf_free:
1781        kfree(gfout);
1782        return rc;
1783}
1784
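    /*
     * FSFILT_IOC_FIEMAP: size and copy in the user's fiemap request
     * (including, if fm_extent_count is set, the first extent used to
     * continue a previous call), run the mapping via ll_do_fiemap() and
     * copy the mapped extents back to user space.
     */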
1785static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1786{
1787        struct ll_user_fiemap *fiemap_s;
1788        size_t num_bytes, ret_bytes;
1789        unsigned int extent_count;
1790        int rc = 0;
1791
1792        /* Get the extent count so we can calculate the size of
1793         * the required fiemap buffer */
1794        if (get_user(extent_count,
1795            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1796                return -EFAULT;
1797
1798        if (extent_count >=
1799            (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1800                return -EINVAL;
1801        num_bytes = sizeof(*fiemap_s) + (extent_count *
1802                                         sizeof(struct ll_fiemap_extent));
1803
1804        fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
1805        if (fiemap_s == NULL)
1806                return -ENOMEM;
1807
1808        /* get the fiemap value */
1809        if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1810                           sizeof(*fiemap_s))) {
1811                rc = -EFAULT;
1812                goto error;
1813        }
1814
1815        /* If fm_extent_count is non-zero, read the first extent since
1816         * it is used to calculate the end_offset and device from the previous
1817         * fiemap call. */
1818        if (extent_count) {
1819                if (copy_from_user(&fiemap_s->fm_extents[0],
1820                    (char __user *)arg + sizeof(*fiemap_s),
1821                    sizeof(struct ll_fiemap_extent))) {
1822                        rc = -EFAULT;
1823                        goto error;
1824                }
1825        }
1826
1827        rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1828        if (rc)
1829                goto error;
1830
1831        ret_bytes = sizeof(struct ll_user_fiemap);
1832
1833        if (extent_count != 0)
1834                ret_bytes += (fiemap_s->fm_mapped_extents *
1835                                 sizeof(struct ll_fiemap_extent));
1836
1837        if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1838                rc = -EFAULT;
1839
1840error:
1841        kvfree(fiemap_s);
1842        return rc;
1843}
1844
1845/*
1846 * Read the data_version for inode.
1847 *
1848 * This value is computed using stripe object version on OST.
1849 * Version is computed using server side locking.
1850 *
1851 * @param extent_lock  Take extent lock. Not needed if a process is already
1852 *                     holding the OST object group locks.
1853 */
1854int ll_data_version(struct inode *inode, __u64 *data_version,
1855                    int extent_lock)
1856{
1857        struct lov_stripe_md    *lsm = NULL;
1858        struct ll_sb_info       *sbi = ll_i2sbi(inode);
1859        struct obdo             *obdo = NULL;
1860        int                      rc;
1861
1862        /* If there is no stripe, we consider the version to be 0. */
1863        lsm = ccc_inode_lsm_get(inode);
1864        if (!lsm_has_objects(lsm)) {
1865                *data_version = 0;
1866                CDEBUG(D_INODE, "No object for inode\n");
1867                rc = 0;
1868                goto out;
1869        }
1870
1871        obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1872        if (!obdo) {
1873                rc = -ENOMEM;
1874                goto out;
1875        }
1876
1877        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1878        if (rc == 0) {
1879                if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1880                        rc = -EOPNOTSUPP;
1881                else
1882                        *data_version = obdo->o_data_version;
1883        }
1884
1885        kfree(obdo);
1886out:
1887        ccc_inode_lsm_put(inode, lsm);
1888        return rc;
1889}
1890
1891/*
1892 * Trigger a HSM release request for the provided inode.
1893 */
1894int ll_hsm_release(struct inode *inode)
1895{
1896        struct cl_env_nest nest;
1897        struct lu_env *env;
1898        struct obd_client_handle *och = NULL;
1899        __u64 data_version = 0;
1900        int rc;
1901
1902
1903        CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1904               ll_get_fsname(inode->i_sb, NULL, 0),
1905               PFID(&ll_i2info(inode)->lli_fid));
1906
1907        och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1908        if (IS_ERR(och)) {
1909                rc = PTR_ERR(och);
1910                goto out;
1911        }
1912
1913        /* Grab latest data_version and [am]time values */
1914        rc = ll_data_version(inode, &data_version, 1);
1915        if (rc != 0)
1916                goto out;
1917
1918        env = cl_env_nested_get(&nest);
1919        if (IS_ERR(env)) {
1920                rc = PTR_ERR(env);
1921                goto out;
1922        }
1923
1924        ll_merge_lvb(env, inode);
1925        cl_env_nested_put(&nest, env);
1926
1927        /* Release the file.
1928         * NB: lease lock handle is released in mdc_hsm_release_pack() because
1929         * we still need it to pack l_remote_handle to MDT. */
1930        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1931                                       &data_version);
1932        och = NULL;
1933
1934
1935out:
1936        if (och != NULL && !IS_ERR(och)) /* close the file */
1937                ll_lease_close(och, inode, NULL);
1938
1939        return rc;
1940}
1941
1942struct ll_swap_stack {
1943        struct iattr             ia1, ia2;
1944        __u64                    dv1, dv2;
1945        struct inode            *inode1, *inode2;
1946        bool                     check_dv1, check_dv2;
1947};
1948
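    /*
     * LL_IOC_LOV_SWAP_LAYOUTS: swap the layouts of two open files.  The two
     * inodes are put in FID order, group locks are taken when a group id is
     * supplied (to flush dirty cache), data versions are verified if
     * requested, and mtime/atime are preserved according to the
     * SWAP_LAYOUTS_KEEP_* flags.
     */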
1949static int ll_swap_layouts(struct file *file1, struct file *file2,
1950                           struct lustre_swap_layouts *lsl)
1951{
1952        struct mdc_swap_layouts  msl;
1953        struct md_op_data       *op_data;
1954        __u32                    gid;
1955        __u64                    dv;
1956        struct ll_swap_stack    *llss = NULL;
1957        int                      rc;
1958
1959        llss = kzalloc(sizeof(*llss), GFP_NOFS);
1960        if (!llss)
1961                return -ENOMEM;
1962
1963        llss->inode1 = file_inode(file1);
1964        llss->inode2 = file_inode(file2);
1965
1966        if (!S_ISREG(llss->inode2->i_mode)) {
1967                rc = -EINVAL;
1968                goto free;
1969        }
1970
1971        if (inode_permission(llss->inode1, MAY_WRITE) ||
1972            inode_permission(llss->inode2, MAY_WRITE)) {
1973                rc = -EPERM;
1974                goto free;
1975        }
1976
1977        if (llss->inode2->i_sb != llss->inode1->i_sb) {
1978                rc = -EXDEV;
1979                goto free;
1980        }
1981
1982        /* we use 2 bools because they are easier to swap than 2 bits */
1983        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1984                llss->check_dv1 = true;
1985
1986        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1987                llss->check_dv2 = true;
1988
1989        /* we cannot use lsl->sl_dvX directly because we may swap them */
1990        llss->dv1 = lsl->sl_dv1;
1991        llss->dv2 = lsl->sl_dv2;
1992
1993        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1994        if (rc == 0) /* same file, done! */ {
1995                rc = 0;
1996                goto free;
1997        }
1998
1999        if (rc < 0) { /* sequentialize it */
2000                swap(llss->inode1, llss->inode2);
2001                swap(file1, file2);
2002                swap(llss->dv1, llss->dv2);
2003                swap(llss->check_dv1, llss->check_dv2);
2004        }
2005
2006        gid = lsl->sl_gid;
2007        if (gid != 0) { /* application asks to flush dirty cache */
2008                rc = ll_get_grouplock(llss->inode1, file1, gid);
2009                if (rc < 0)
2010                        goto free;
2011
2012                rc = ll_get_grouplock(llss->inode2, file2, gid);
2013                if (rc < 0) {
2014                        ll_put_grouplock(llss->inode1, file1, gid);
2015                        goto free;
2016                }
2017        }
2018
2019        /* to be able to restore mtime and atime after the swap
2020         * we need to save them first */
2021        if (lsl->sl_flags &
2022            (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2023                llss->ia1.ia_mtime = llss->inode1->i_mtime;
2024                llss->ia1.ia_atime = llss->inode1->i_atime;
2025                llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2026                llss->ia2.ia_mtime = llss->inode2->i_mtime;
2027                llss->ia2.ia_atime = llss->inode2->i_atime;
2028                llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2029        }
2030
2031        /* final check: before swapping the layouts, verify that the
2032         * data version has not changed (if requested) */
2033        if (llss->check_dv1) {
2034                rc = ll_data_version(llss->inode1, &dv, 0);
2035                if (rc)
2036                        goto putgl;
2037                if (dv != llss->dv1) {
2038                        rc = -EAGAIN;
2039                        goto putgl;
2040                }
2041        }
2042
2043        if (llss->check_dv2) {
2044                rc = ll_data_version(llss->inode2, &dv, 0);
2045                if (rc)
2046                        goto putgl;
2047                if (dv != llss->dv2) {
2048                        rc = -EAGAIN;
2049                        goto putgl;
2050                }
2051        }
2052
2053        /* struct md_op_data is used to send the swap args to the MDT;
2054         * only the flags are missing, so we pass struct mdc_swap_layouts
2055         * through md_op_data->op_data */
2056        /* flags from user space have to be converted before being sent to
2057         * the server; no flag is sent today, they are only used on the client */
2058        msl.msl_flags = 0;
2059        rc = -ENOMEM;
2060        op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2061                                     0, LUSTRE_OPC_ANY, &msl);
2062        if (IS_ERR(op_data)) {
2063                rc = PTR_ERR(op_data);
2064                goto free;
2065        }
2066
2067        rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2068                           sizeof(*op_data), op_data, NULL);
2069        ll_finish_md_op_data(op_data);
2070
2071putgl:
2072        if (gid != 0) {
2073                ll_put_grouplock(llss->inode2, file2, gid);
2074                ll_put_grouplock(llss->inode1, file1, gid);
2075        }
2076
2077        /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2078        if (rc != 0)
2079                goto free;
2080
2081        /* clear useless flags */
2082        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2083                llss->ia1.ia_valid &= ~ATTR_MTIME;
2084                llss->ia2.ia_valid &= ~ATTR_MTIME;
2085        }
2086
2087        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2088                llss->ia1.ia_valid &= ~ATTR_ATIME;
2089                llss->ia2.ia_valid &= ~ATTR_ATIME;
2090        }
2091
2092        /* update time if requested */
2093        rc = 0;
2094        if (llss->ia2.ia_valid != 0) {
2095                mutex_lock(&llss->inode1->i_mutex);
2096                rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2097                mutex_unlock(&llss->inode1->i_mutex);
2098        }
2099
2100        if (llss->ia1.ia_valid != 0) {
2101                int rc1;
2102
2103                mutex_lock(&llss->inode2->i_mutex);
2104                rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2105                mutex_unlock(&llss->inode2->i_mutex);
2106                if (rc == 0)
2107                        rc = rc1;
2108        }
2109
2110free:
2111        kfree(llss);
2112
2113        return rc;
2114}
2115
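    /*
     * Apply an hsm_state_set (set/clear HSM flags and optionally the archive
     * id) on the inode through an MDC ioctl.
     */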
2116static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2117{
2118        struct md_op_data       *op_data;
2119        int                      rc;
2120
2121        /* Non-root users are forbidden to set or clear flags which are
2122         * NOT defined in HSM_USER_MASK. */
2123        if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2124            !capable(CFS_CAP_SYS_ADMIN))
2125                return -EPERM;
2126
2127        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2128                                     LUSTRE_OPC_ANY, hss);
2129        if (IS_ERR(op_data))
2130                return PTR_ERR(op_data);
2131
2132        rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2133                           sizeof(*op_data), op_data, NULL);
2134
2135        ll_finish_md_op_data(op_data);
2136
2137        return rc;
2138}
2139
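    /*
     * LL_IOC_HSM_IMPORT: mark a regular file as archived, existing and
     * released, then restore the mode, ownership, size and timestamps
     * recorded in the hsm_user_import request.
     */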
2140static int ll_hsm_import(struct inode *inode, struct file *file,
2141                         struct hsm_user_import *hui)
2142{
2143        struct hsm_state_set    *hss = NULL;
2144        struct iattr            *attr = NULL;
2145        int                      rc;
2146
2147
2148        if (!S_ISREG(inode->i_mode))
2149                return -EINVAL;
2150
2151        /* set HSM flags */
2152        hss = kzalloc(sizeof(*hss), GFP_NOFS);
2153        if (!hss)
2154                return -ENOMEM;
2155
2156        hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2157        hss->hss_archive_id = hui->hui_archive_id;
2158        hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2159        rc = ll_hsm_state_set(inode, hss);
2160        if (rc != 0)
2161                goto free_hss;
2162
2163        attr = kzalloc(sizeof(*attr), GFP_NOFS);
2164        if (!attr) {
2165                rc = -ENOMEM;
2166                goto free_hss;
2167        }
2168
2169        attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2170        attr->ia_mode |= S_IFREG;
2171        attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2172        attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2173        attr->ia_size = hui->hui_size;
2174        attr->ia_mtime.tv_sec = hui->hui_mtime;
2175        attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2176        attr->ia_atime.tv_sec = hui->hui_atime;
2177        attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2178
2179        attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2180                         ATTR_UID | ATTR_GID |
2181                         ATTR_MTIME | ATTR_MTIME_SET |
2182                         ATTR_ATIME | ATTR_ATIME_SET;
2183
2184        mutex_lock(&inode->i_mutex);
2185
2186        rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2187        if (rc == -ENODATA)
2188                rc = 0;
2189
2190        mutex_unlock(&inode->i_mutex);
2191
2192        kfree(attr);
2193free_hss:
2194        kfree(hss);
2195        return rc;
2196}
2197
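    /*
     * Main ioctl entry point for regular files.  Lustre-specific commands
     * are dispatched here; anything unknown is offered to
     * ll_iocontrol_call() and finally passed to obd_iocontrol() on the
     * data export.
     */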
2198static long
2199ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2200{
2201        struct inode            *inode = file_inode(file);
2202        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2203        int                      flags, rc;
2204
2205        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2206               inode->i_generation, inode, cmd);
2207        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2208
2209        /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
2210        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2211                return -ENOTTY;
2212
2213        switch (cmd) {
2214        case LL_IOC_GETFLAGS:
2215                /* Get the current value of the file flags */
2216                return put_user(fd->fd_flags, (int *)arg);
2217        case LL_IOC_SETFLAGS:
2218        case LL_IOC_CLRFLAGS:
2219                /* Set or clear specific file flags */
2220                /* XXX This probably needs checks to ensure the flags are
2221                 *     not abused, and to handle any flag side effects.
2222                 */
2223                if (get_user(flags, (int *) arg))
2224                        return -EFAULT;
2225
2226                if (cmd == LL_IOC_SETFLAGS) {
2227                        if ((flags & LL_FILE_IGNORE_LOCK) &&
2228                            !(file->f_flags & O_DIRECT)) {
2229                                CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2230                                       current->comm);
2231                                return -EINVAL;
2232                        }
2233
2234                        fd->fd_flags |= flags;
2235                } else {
2236                        fd->fd_flags &= ~flags;
2237                }
2238                return 0;
2239        case LL_IOC_LOV_SETSTRIPE:
2240                return ll_lov_setstripe(inode, file, arg);
2241        case LL_IOC_LOV_SETEA:
2242                return ll_lov_setea(inode, file, arg);
2243        case LL_IOC_LOV_SWAP_LAYOUTS: {
2244                struct file *file2;
2245                struct lustre_swap_layouts lsl;
2246
2247                if (copy_from_user(&lsl, (char *)arg,
2248                                       sizeof(struct lustre_swap_layouts)))
2249                        return -EFAULT;
2250
2251                if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2252                        return -EPERM;
2253
2254                file2 = fget(lsl.sl_fd);
2255                if (file2 == NULL)
2256                        return -EBADF;
2257
2258                rc = -EPERM;
2259                if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2260                        rc = ll_swap_layouts(file, file2, &lsl);
2261                fput(file2);
2262                return rc;
2263        }
2264        case LL_IOC_LOV_GETSTRIPE:
2265                return ll_lov_getstripe(inode, arg);
2266        case LL_IOC_RECREATE_OBJ:
2267                return ll_lov_recreate_obj(inode, arg);
2268        case LL_IOC_RECREATE_FID:
2269                return ll_lov_recreate_fid(inode, arg);
2270        case FSFILT_IOC_FIEMAP:
2271                return ll_ioctl_fiemap(inode, arg);
2272        case FSFILT_IOC_GETFLAGS:
2273        case FSFILT_IOC_SETFLAGS:
2274                return ll_iocontrol(inode, file, cmd, arg);
2275        case FSFILT_IOC_GETVERSION_OLD:
2276        case FSFILT_IOC_GETVERSION:
2277                return put_user(inode->i_generation, (int *)arg);
2278        case LL_IOC_GROUP_LOCK:
2279                return ll_get_grouplock(inode, file, arg);
2280        case LL_IOC_GROUP_UNLOCK:
2281                return ll_put_grouplock(inode, file, arg);
2282        case IOC_OBD_STATFS:
2283                return ll_obd_statfs(inode, (void *)arg);
2284
2285        /* We need to special case any other ioctls we want to handle,
2286         * to send them to the MDS/OST as appropriate and to properly
2287         * network encode the arg field.
2288        case FSFILT_IOC_SETVERSION_OLD:
2289        case FSFILT_IOC_SETVERSION:
2290        */
2291        case LL_IOC_FLUSHCTX:
2292                return ll_flush_ctx(inode);
2293        case LL_IOC_PATH2FID: {
2294                if (copy_to_user((void *)arg, ll_inode2fid(inode),
2295                                 sizeof(struct lu_fid)))
2296                        return -EFAULT;
2297
2298                return 0;
2299        }
2300        case OBD_IOC_FID2PATH:
2301                return ll_fid2path(inode, (void *)arg);
2302        case LL_IOC_DATA_VERSION: {
2303                struct ioc_data_version idv;
2304                int                     rc;
2305
2306                if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2307                        return -EFAULT;
2308
2309                rc = ll_data_version(inode, &idv.idv_version,
2310                                !(idv.idv_flags & LL_DV_NOFLUSH));
2311
2312                if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2313                        return -EFAULT;
2314
2315                return rc;
2316        }
2317
2318        case LL_IOC_GET_MDTIDX: {
2319                int mdtidx;
2320
2321                mdtidx = ll_get_mdt_idx(inode);
2322                if (mdtidx < 0)
2323                        return mdtidx;
2324
2325                if (put_user((int)mdtidx, (int *)arg))
2326                        return -EFAULT;
2327
2328                return 0;
2329        }
2330        case OBD_IOC_GETDTNAME:
2331        case OBD_IOC_GETMDNAME:
2332                return ll_get_obd_name(inode, cmd, arg);
2333        case LL_IOC_HSM_STATE_GET: {
2334                struct md_op_data       *op_data;
2335                struct hsm_user_state   *hus;
2336                int                      rc;
2337
2338                hus = kzalloc(sizeof(*hus), GFP_NOFS);
2339                if (!hus)
2340                        return -ENOMEM;
2341
2342                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2343                                             LUSTRE_OPC_ANY, hus);
2344                if (IS_ERR(op_data)) {
2345                        kfree(hus);
2346                        return PTR_ERR(op_data);
2347                }
2348
2349                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2350                                   op_data, NULL);
2351
2352                if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2353                        rc = -EFAULT;
2354
2355                ll_finish_md_op_data(op_data);
2356                kfree(hus);
2357                return rc;
2358        }
2359        case LL_IOC_HSM_STATE_SET: {
2360                struct hsm_state_set    *hss;
2361                int                      rc;
2362
2363                hss = memdup_user((char *)arg, sizeof(*hss));
2364                if (IS_ERR(hss))
2365                        return PTR_ERR(hss);
2366
2367                rc = ll_hsm_state_set(inode, hss);
2368
2369                kfree(hss);
2370                return rc;
2371        }
2372        case LL_IOC_HSM_ACTION: {
2373                struct md_op_data               *op_data;
2374                struct hsm_current_action       *hca;
2375                int                              rc;
2376
2377                hca = kzalloc(sizeof(*hca), GFP_NOFS);
2378                if (!hca)
2379                        return -ENOMEM;
2380
2381                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2382                                             LUSTRE_OPC_ANY, hca);
2383                if (IS_ERR(op_data)) {
2384                        kfree(hca);
2385                        return PTR_ERR(op_data);
2386                }
2387
2388                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2389                                   op_data, NULL);
2390
2391                if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2392                        rc = -EFAULT;
2393
2394                ll_finish_md_op_data(op_data);
2395                kfree(hca);
2396                return rc;
2397        }
2398        case LL_IOC_SET_LEASE: {
2399                struct ll_inode_info *lli = ll_i2info(inode);
2400                struct obd_client_handle *och = NULL;
2401                bool lease_broken;
2402                fmode_t mode = 0;
2403
2404                switch (arg) {
2405                case F_WRLCK:
2406                        if (!(file->f_mode & FMODE_WRITE))
2407                                return -EPERM;
2408                        mode = FMODE_WRITE;
2409                        break;
2410                case F_RDLCK:
2411                        if (!(file->f_mode & FMODE_READ))
2412                                return -EPERM;
2413                        mode = FMODE_READ;
2414                        break;
2415                case F_UNLCK:
2416                        mutex_lock(&lli->lli_och_mutex);
2417                        if (fd->fd_lease_och != NULL) {
2418                                och = fd->fd_lease_och;
2419                                fd->fd_lease_och = NULL;
2420                        }
2421                        mutex_unlock(&lli->lli_och_mutex);
2422
2423                        if (och != NULL) {
2424                                mode = och->och_flags &
2425                                       (FMODE_READ|FMODE_WRITE);
2426                                rc = ll_lease_close(och, inode, &lease_broken);
2427                                if (rc == 0 && lease_broken)
2428                                        mode = 0;
2429                        } else {
2430                                rc = -ENOLCK;
2431                        }
2432
2433                        /* return the type of lease or error */
2434                        return rc < 0 ? rc : (int)mode;
2435                default:
2436                        return -EINVAL;
2437                }
2438
2439                CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2440
2441                /* apply for lease */
2442                och = ll_lease_open(inode, file, mode, 0);
2443                if (IS_ERR(och))
2444                        return PTR_ERR(och);
2445
2446                rc = 0;
2447                mutex_lock(&lli->lli_och_mutex);
2448                if (fd->fd_lease_och == NULL) {
2449                        fd->fd_lease_och = och;
2450                        och = NULL;
2451                }
2452                mutex_unlock(&lli->lli_och_mutex);
2453                if (och != NULL) {
2454                        /* should not happen: only excl leases are supported for now */
2455                        ll_lease_close(och, inode, &lease_broken);
2456                        rc = -EBUSY;
2457                }
2458                return rc;
2459        }
2460        case LL_IOC_GET_LEASE: {
2461                struct ll_inode_info *lli = ll_i2info(inode);
2462                struct ldlm_lock *lock = NULL;
2463
2464                rc = 0;
2465                mutex_lock(&lli->lli_och_mutex);
2466                if (fd->fd_lease_och != NULL) {
2467                        struct obd_client_handle *och = fd->fd_lease_och;
2468
2469                        lock = ldlm_handle2lock(&och->och_lease_handle);
2470                        if (lock != NULL) {
2471                                lock_res_and_lock(lock);
2472                                if (!ldlm_is_cancel(lock))
2473                                        rc = och->och_flags &
2474                                                (FMODE_READ | FMODE_WRITE);
2475                                unlock_res_and_lock(lock);
2476                                LDLM_LOCK_PUT(lock);
2477                        }
2478                }
2479                mutex_unlock(&lli->lli_och_mutex);
2480                return rc;
2481        }
2482        case LL_IOC_HSM_IMPORT: {
2483                struct hsm_user_import *hui;
2484
2485                hui = memdup_user((void *)arg, sizeof(*hui));
2486                if (IS_ERR(hui))
2487                        return PTR_ERR(hui);
2488
2489                rc = ll_hsm_import(inode, file, hui);
2490
2491                kfree(hui);
2492                return rc;
2493        }
2494        default: {
2495                int err;
2496
2497                if (LLIOC_STOP ==
2498                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2499                        return err;
2500
2501                return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2502                                     (void *)arg);
2503        }
2504        }
2505}
2506
2507
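    /*
     * llseek: for SEEK_END, SEEK_HOLE and SEEK_DATA the up-to-date file size
     * is first obtained with ll_glimpse_size(), then
     * generic_file_llseek_size() does the rest.
     */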
2508static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2509{
2510        struct inode *inode = file_inode(file);
2511        loff_t retval, eof = 0;
2512
2513        retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2514                           (origin == SEEK_CUR) ? file->f_pos : 0);
2515        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2516               inode->i_ino, inode->i_generation, inode, retval, retval,
2517               origin);
2518        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2519
2520        if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2521                retval = ll_glimpse_size(inode);
2522                if (retval != 0)
2523                        return retval;
2524                eof = i_size_read(inode);
2525        }
2526
2527        retval = generic_file_llseek_size(file, offset, origin,
2528                                          ll_file_maxbytes(inode), eof);
2529        return retval;
2530}
2531
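    /*
     * flush: report asynchronous write errors recorded against this inode at
     * close time, unless the application has already been told about the
     * failure on this file descriptor.
     */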
2532static int ll_flush(struct file *file, fl_owner_t id)
2533{
2534        struct inode *inode = file_inode(file);
2535        struct ll_inode_info *lli = ll_i2info(inode);
2536        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2537        int rc, err;
2538
2539        LASSERT(!S_ISDIR(inode->i_mode));
2540
2541        /* catch async errors that were recorded back when async writeback
2542         * failed for pages in this mapping. */
2543        rc = lli->lli_async_rc;
2544        lli->lli_async_rc = 0;
2545        err = lov_read_and_clear_async_rc(lli->lli_clob);
2546        if (rc == 0)
2547                rc = err;
2548
2549        /* The application has already been told about the write failure.
2550         * Do not report failure again. */
2551        if (fd->fd_write_failed)
2552                return 0;
2553        return rc ? -EIO : 0;
2554}
2555
2556/**
2557 * Called to make sure a portion of file has been written out.
2558 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2559 *
2560 * Return how many pages have been written.
2561 */
2562int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2563                       enum cl_fsync_mode mode, int ignore_layout)
2564{
2565        struct cl_env_nest nest;
2566        struct lu_env *env;
2567        struct cl_io *io;
2568        struct obd_capa *capa = NULL;
2569        struct cl_fsync_io *fio;
2570        int result;
2571
2572        if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2573            mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2574                return -EINVAL;
2575
2576        env = cl_env_nested_get(&nest);
2577        if (IS_ERR(env))
2578                return PTR_ERR(env);
2579
2580        capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2581
2582        io = ccc_env_thread_io(env);
2583        io->ci_obj = cl_i2info(inode)->lli_clob;
2584        io->ci_ignore_layout = ignore_layout;
2585
2586        /* initialize parameters for sync */
2587        fio = &io->u.ci_fsync;
2588        fio->fi_capa = capa;
2589        fio->fi_start = start;
2590        fio->fi_end = end;
2591        fio->fi_fid = ll_inode2fid(inode);
2592        fio->fi_mode = mode;
2593        fio->fi_nr_written = 0;
2594
2595        if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2596                result = cl_io_loop(env, io);
2597        else
2598                result = io->ci_result;
2599        if (result == 0)
2600                result = fio->fi_nr_written;
2601        cl_io_fini(env, io);
2602        cl_env_nested_put(&nest, env);
2603
2604        capa_put(capa);
2605
2606        return result;
2607}
2608
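    /*
     * fsync: write out and wait on dirty pages, pick up recorded async write
     * errors, sync the metadata on the MDS and, for regular files, the data
     * on the OSTs.
     */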
2609int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2610{
2611        struct inode *inode = file_inode(file);
2612        struct ll_inode_info *lli = ll_i2info(inode);
2613        struct ptlrpc_request *req;
2614        struct obd_capa *oc;
2615        int rc, err;
2616
2617        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2618               inode->i_generation, inode);
2619        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2620
2621        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2622        mutex_lock(&inode->i_mutex);
2623
2624        /* catch async errors that were recorded back when async writeback
2625         * failed for pages in this mapping. */
2626        if (!S_ISDIR(inode->i_mode)) {
2627                err = lli->lli_async_rc;
2628                lli->lli_async_rc = 0;
2629                if (rc == 0)
2630                        rc = err;
2631                err = lov_read_and_clear_async_rc(lli->lli_clob);
2632                if (rc == 0)
2633                        rc = err;
2634        }
2635
2636        oc = ll_mdscapa_get(inode);
2637        err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2638                      &req);
2639        capa_put(oc);
2640        if (!rc)
2641                rc = err;
2642        if (!err)
2643                ptlrpc_req_finished(req);
2644
2645        if (S_ISREG(inode->i_mode)) {
2646                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2647
2648                err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2649                if (rc == 0 && err < 0)
2650                        rc = err;
2651                if (rc < 0)
2652                        fd->fd_write_failed = true;
2653                else
2654                        fd->fd_write_failed = false;
2655        }
2656
2657        mutex_unlock(&inode->i_mutex);
2658        return rc;
2659}
2660
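    /*
     * Set or test a POSIX/flock lock by enqueueing an LDLM flock lock on the
     * MDS, then mirror the result in the local lock tables.  If the local
     * update fails, the server lock is dropped again with an LCK_NL enqueue.
     */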
2661static int
2662ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2663{
2664        struct inode *inode = file_inode(file);
2665        struct ll_sb_info *sbi = ll_i2sbi(inode);
2666        struct ldlm_enqueue_info einfo = {
2667                .ei_type        = LDLM_FLOCK,
2668                .ei_cb_cp       = ldlm_flock_completion_ast,
2669                .ei_cbdata      = file_lock,
2670        };
2671        struct md_op_data *op_data;
2672        struct lustre_handle lockh = {0};
2673        ldlm_policy_data_t flock = {{0}};
2674        __u64 flags = 0;
2675        int rc;
2676        int rc2 = 0;
2677
2678        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2679               inode->i_ino, file_lock);
2680
2681        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2682
2683        if (file_lock->fl_flags & FL_FLOCK)
2684                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2685        else if (!(file_lock->fl_flags & FL_POSIX))
2686                return -EINVAL;
2687
2688        flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2689        flock.l_flock.pid = file_lock->fl_pid;
2690        flock.l_flock.start = file_lock->fl_start;
2691        flock.l_flock.end = file_lock->fl_end;
2692
2693        /* Somewhat ugly workaround for svc lockd.
2694         * lockd installs a custom fl_lmops->lm_compare_owner that checks
2695         * that the fl_owner is the same (which it presumably always is
2696         * between lockd processes on the local node) and then compares the pid.
2697         * As such we assign the pid to the owner field to make it all work;
2698         * a conflict with normal locks is unlikely since the pid space and
2699         * the pointer space for current->files do not intersect */
2700        if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2701                flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2702
2703        switch (file_lock->fl_type) {
2704        case F_RDLCK:
2705                einfo.ei_mode = LCK_PR;
2706                break;
2707        case F_UNLCK:
2708                /* An unlock request may or may not have any relation to
2709                 * existing locks so we may not be able to pass a lock handle
2710                 * via a normal ldlm_lock_cancel() request. The request may even
2711                 * unlock a byte range in the middle of an existing lock. In
2712                 * order to process an unlock request we need all of the same
2713                 * information that is given with a normal read or write record
2714                 * lock request. To avoid creating another ldlm unlock (cancel)
2715                 * message we'll treat a LCK_NL flock request as an unlock. */
2716                einfo.ei_mode = LCK_NL;
2717                break;
2718        case F_WRLCK:
2719                einfo.ei_mode = LCK_PW;
2720                break;
2721        default:
2722                CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2723                        file_lock->fl_type);
2724                return -ENOTSUPP;
2725        }
2726
2727        switch (cmd) {
2728        case F_SETLKW:
2729#ifdef F_SETLKW64
2730        case F_SETLKW64:
2731#endif
2732                flags = 0;
2733                break;
2734        case F_SETLK:
2735#ifdef F_SETLK64
2736        case F_SETLK64:
2737#endif
2738                flags = LDLM_FL_BLOCK_NOWAIT;
2739                break;
2740        case F_GETLK:
2741#ifdef F_GETLK64
2742        case F_GETLK64:
2743#endif
2744                flags = LDLM_FL_TEST_LOCK;
2745                /* Save the old mode so that if the mode in the lock changes we
2746                 * can decrement the appropriate reader or writer refcount. */
2747                file_lock->fl_type = einfo.ei_mode;
2748                break;
2749        default:
2750                CERROR("unknown fcntl lock command: %d\n", cmd);
2751                return -EINVAL;
2752        }
2753
2754        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2755                                     LUSTRE_OPC_ANY, NULL);
2756        if (IS_ERR(op_data))
2757                return PTR_ERR(op_data);
2758
2759        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2760               inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2761               flock.l_flock.start, flock.l_flock.end);
2762
2763        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2764                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2765
2766        if ((file_lock->fl_flags & FL_FLOCK) &&
2767            (rc == 0 || file_lock->fl_type == F_UNLCK))
2768                rc2  = flock_lock_file_wait(file, file_lock);
2769        if ((file_lock->fl_flags & FL_POSIX) &&
2770            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2771            !(flags & LDLM_FL_TEST_LOCK))
2772                rc2  = posix_lock_file_wait(file, file_lock);
2773
2774        if (rc2 && file_lock->fl_type != F_UNLCK) {
2775                einfo.ei_mode = LCK_NL;
2776                md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2777                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2778                rc = rc2;
2779        }
2780
2781        ll_finish_md_op_data(op_data);
2782
2783        return rc;
2784}
2785
2786static int
2787ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2788{
2789        return -ENOSYS;
2790}
2791
2792/**
2793 * test if some locks matching bits and l_req_mode are acquired
2794 * - bits can be in different locks
2795 * - if found clear the common lock bits in *bits
2796 * - the bits not found, are kept in *bits
2797 * \param inode [IN] inode to check the locks on
2798 * \param bits [IN] searched lock bits
2799 * \param l_req_mode [IN] searched lock mode
2800 * \retval boolean, true iff all bits are found
2801 */
2802int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2803{
2804        struct lustre_handle lockh;
2805        ldlm_policy_data_t policy;
2806        ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2807                                (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2808        struct lu_fid *fid;
2809        __u64 flags;
2810        int i;
2811
2812        if (!inode)
2813                return 0;
2814
2815        fid = &ll_i2info(inode)->lli_fid;
2816        CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2817               ldlm_lockname[mode]);
2818
2819        flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2820        for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2821                policy.l_inodebits.bits = *bits & (1 << i);
2822                if (policy.l_inodebits.bits == 0)
2823                        continue;
2824
2825                if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2826                                  &policy, mode, &lockh)) {
2827                        struct ldlm_lock *lock;
2828
2829                        lock = ldlm_handle2lock(&lockh);
2830                        if (lock) {
2831                                *bits &=
2832                                      ~(lock->l_policy_data.l_inodebits.bits);
2833                                LDLM_LOCK_PUT(lock);
2834                        } else {
2835                                *bits &= ~policy.l_inodebits.bits;
2836                        }
2837                }
2838        }
2839        return *bits == 0;
2840}
2841
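    /*
     * Match an MDS inodebits lock on @bits with mode @mode; the matched lock
     * handle is returned in *lockh and the matched mode (or 0 if nothing
     * matched) is the return value.
     */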
2842ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2843                            struct lustre_handle *lockh, __u64 flags,
2844                            ldlm_mode_t mode)
2845{
2846        ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2847        struct lu_fid *fid;
2848        ldlm_mode_t rc;
2849
2850        fid = &ll_i2info(inode)->lli_fid;
2851        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2852
2853        rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2854                           fid, LDLM_IBITS, &policy, mode, lockh);
2855
2856        return rc;
2857}
2858
2859static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2860{
2861        /* Already unlinked. Just update nlink and return success */
2862        if (rc == -ENOENT) {
2863                clear_nlink(inode);
2864                /* This path cannot be hit for regular files except in the
2865                 * case of obscure races, so no need to validate the size.
2866                 */
2867                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2868                        return 0;
2869        } else if (rc != 0) {
2870                CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2871                             "%s: revalidate FID "DFID" error: rc = %d\n",
2872                             ll_get_fsname(inode->i_sb, NULL, 0),
2873                             PFID(ll_inode2fid(inode)), rc);
2874        }
2875
2876        return rc;
2877}
2878
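    /*
     * Revalidate the inode attributes.  If the server supports
     * OBD_CONNECT_ATTRFID, a getattr intent by FID is used so an ibits lock
     * is obtained; otherwise a plain md_getattr() is issued when no matching
     * MD lock is already held.
     */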
2879static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2880{
2881        struct inode *inode = d_inode(dentry);
2882        struct ptlrpc_request *req = NULL;
2883        struct obd_export *exp;
2884        int rc = 0;
2885
2886        LASSERT(inode != NULL);
2887
2888        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2889               inode->i_ino, inode->i_generation, inode, dentry);
2890
2891        exp = ll_i2mdexp(inode);
2892
2893        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPCs.
2894         *      But in the CMD case it caused some lock issues; this should be
2895         *      fixed with the new CMD ibits lock. See bug 12718 */
2896        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2897                struct lookup_intent oit = { .it_op = IT_GETATTR };
2898                struct md_op_data *op_data;
2899
2900                if (ibits == MDS_INODELOCK_LOOKUP)
2901                        oit.it_op = IT_LOOKUP;
2902
2903                /* Call getattr by fid, so do not provide name at all. */
2904                op_data = ll_prep_md_op_data(NULL, inode,
2905                                             inode, NULL, 0, 0,
2906                                             LUSTRE_OPC_ANY, NULL);
2907                if (IS_ERR(op_data))
2908                        return PTR_ERR(op_data);
2909
2910                oit.it_create_mode |= M_CHECK_STALE;
2911                /* We are not interested in name-based lookup. */
2912                rc = md_intent_lock(exp, op_data, NULL, 0, &oit, 0, &req,
2913                                    ll_md_blocking_ast, 0);
2916                ll_finish_md_op_data(op_data);
2917                oit.it_create_mode &= ~M_CHECK_STALE;
2918                if (rc < 0) {
2919                        rc = ll_inode_revalidate_fini(inode, rc);
2920                        goto out;
2921                }
2922
2923                rc = ll_revalidate_it_finish(req, &oit, inode);
2924                if (rc != 0) {
2925                        ll_intent_release(&oit);
2926                        goto out;
2927                }
2928
2929                /* Unlinked? Unhash the dentry, so it is not picked up later
2930                 * by do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2931                 * here, in order to preserve get_cwd functionality on 2.6.
2932                 * Bug 10503 */
2933                if (!d_inode(dentry)->i_nlink)
2934                        d_lustre_invalidate(dentry, 0);
2935
2936                ll_lookup_finish_locks(&oit, inode);
2937        } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
2938                struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
2939                u64 valid = OBD_MD_FLGETATTR;
2940                struct md_op_data *op_data;
2941                int ealen = 0;
2942
2943                if (S_ISREG(inode->i_mode)) {
2944                        rc = ll_get_default_mdsize(sbi, &ealen);
2945                        if (rc)
2946                                return rc;
2947                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2948                }
2949
2950                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2951                                             0, ealen, LUSTRE_OPC_ANY,
2952                                             NULL);
2953                if (IS_ERR(op_data))
2954                        return PTR_ERR(op_data);
2955
2956                op_data->op_valid = valid;
2957                /* When OBD_CONNECT_ATTRFID is not supported, we can't find a
2958                 * capa for this inode, because we only keep the capas of
2959                 * directories fresh. */
2960                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2961                ll_finish_md_op_data(op_data);
2962                if (rc) {
2963                        rc = ll_inode_revalidate_fini(inode, rc);
2964                        return rc;
2965                }
2966
2967                rc = ll_prep_inode(&inode, req, NULL, NULL);
2968        }
2969out:
2970        ptlrpc_req_finished(req);
2971        return rc;
2972}
2973
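    /*
     * Revalidate the inode and, for regular files, refresh the size from
     * the OSTs with a glimpse, unless an HSM restore is in progress (in
     * that case the MDT has already returned the correct size).
     */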
2974static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2975{
2976        struct inode *inode = d_inode(dentry);
2977        int rc;
2978
2979        rc = __ll_inode_revalidate(dentry, ibits);
2980        if (rc != 0)
2981                return rc;
2982
2983        /* if object isn't regular file, don't validate size */
2984        if (!S_ISREG(inode->i_mode)) {
2985                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2986                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2987                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2988        } else {
2989                /* In case of restore, the MDT has the right size and has
2990                 * already sent it back without granting the layout lock;
2991                 * the inode is up-to-date, so a glimpse is useless.
2992                 * Also, to glimpse we need the layout: while a restore is
2993                 * running the MDT holds the layout lock, so the glimpse
2994                 * would block until the end of the restore (getattr blocks).
2995                 */
2996                if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2997                        rc = ll_glimpse_size(inode);
2998        }
2999        return rc;
3000}
3001
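    /*
     * ->getattr() for Lustre: revalidate the inode against the MDS and
     * then fill in *stat from the refreshed inode fields.
     */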
3002int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3003{
3004        struct inode *inode = d_inode(de);
3005        struct ll_sb_info *sbi = ll_i2sbi(inode);
3006        struct ll_inode_info *lli = ll_i2info(inode);
3007        int res;
3008
3009        res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3010                                      MDS_INODELOCK_LOOKUP);
3011        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3012
3013        if (res)
3014                return res;
3015
3016        stat->dev = inode->i_sb->s_dev;
3017        if (ll_need_32bit_api(sbi))
3018                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3019        else
3020                stat->ino = inode->i_ino;
3021        stat->mode = inode->i_mode;
3022        stat->nlink = inode->i_nlink;
3023        stat->uid = inode->i_uid;
3024        stat->gid = inode->i_gid;
3025        stat->rdev = inode->i_rdev;
3026        stat->atime = inode->i_atime;
3027        stat->mtime = inode->i_mtime;
3028        stat->ctime = inode->i_ctime;
3029        stat->blksize = 1 << inode->i_blkbits;
3030
3031        stat->size = i_size_read(inode);
3032        stat->blocks = inode->i_blocks;
3033
3034        return 0;
3035}
3036
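    /*
     * ->fiemap() handler: translate the VFS fiemap_extent_info into a
     * Lustre ll_user_fiemap request, pass it to ll_do_fiemap() and copy
     * the mapped extents back into the caller's extent array.
     */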
3037static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3038                     __u64 start, __u64 len)
3039{
3040        int rc;
3041        size_t num_bytes;
3042        struct ll_user_fiemap *fiemap;
3043        unsigned int extent_count = fieinfo->fi_extents_max;
3044
3045        num_bytes = sizeof(*fiemap) + (extent_count *
3046                                       sizeof(struct ll_fiemap_extent));
3047        fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);
3048
3049        if (fiemap == NULL)
3050                return -ENOMEM;
3051
3052        fiemap->fm_flags = fieinfo->fi_flags;
3053        fiemap->fm_extent_count = fieinfo->fi_extents_max;
3054        fiemap->fm_start = start;
3055        fiemap->fm_length = len;
3056        if (extent_count > 0)
3057                memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3058                       sizeof(struct ll_fiemap_extent));
3059
3060        rc = ll_do_fiemap(inode, fiemap, num_bytes);
3061
3062        fieinfo->fi_flags = fiemap->fm_flags;
3063        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3064        if (extent_count > 0)
3065                memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3066                       fiemap->fm_mapped_extents *
3067                       sizeof(struct ll_fiemap_extent));
3068
3069        kvfree(fiemap);
3070        return rc;
3071}
3072
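    /*
     * Return a reference on the cached POSIX ACL of the inode; the VFS
     * permission code drops the reference after the check.
     */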
3073struct posix_acl *ll_get_acl(struct inode *inode, int type)
3074{
3075        struct ll_inode_info *lli = ll_i2info(inode);
3076        struct posix_acl *acl = NULL;
3077
3078        spin_lock(&lli->lli_lock);
3079        /* VFS' acl_permission_check->check_acl will release the refcount */
3080        acl = posix_acl_dup(lli->lli_posix_acl);
3081        spin_unlock(&lli->lli_lock);
3082
3083        return acl;
3084}
3086
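    /*
     * ->permission() for Lustre inodes: revalidate the root inode when
     * needed, use the remote permission check for remote-client mounts,
     * and fall back to generic_permission() otherwise.
     */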
3087int ll_inode_permission(struct inode *inode, int mask)
3088{
3089        int rc = 0;
3090
3091#ifdef MAY_NOT_BLOCK
3092        if (mask & MAY_NOT_BLOCK)
3093                return -ECHILD;
3094#endif
3095
3096        /* As the root inode is not validated during lookup, do it here
3097         * before the permission check. */
3098
3099        if (is_root_inode(inode)) {
3100                rc = __ll_inode_revalidate(inode->i_sb->s_root,
3101                                           MDS_INODELOCK_LOOKUP);
3102                if (rc)
3103                        return rc;
3104        }
3105
3106        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3107               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3108
3109        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3110                return lustre_check_remote_perm(inode, mask);
3111
3112        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3113        rc = generic_permission(inode, mask);
3114
3115        return rc;
3116}
3117
3118/* -o localflock - only provides locally consistent flock locks */
3119struct file_operations ll_file_operations = {
3120        .read_iter = ll_file_read_iter,
3121        .write_iter = ll_file_write_iter,
3122        .unlocked_ioctl = ll_file_ioctl,
3123        .open      = ll_file_open,
3124        .release        = ll_file_release,
3125        .mmap      = ll_file_mmap,
3126        .llseek  = ll_file_seek,
3127        .splice_read    = ll_file_splice_read,
3128        .fsync    = ll_fsync,
3129        .flush    = ll_flush
3130};
3131
3132struct file_operations ll_file_operations_flock = {
3133        .read_iter    = ll_file_read_iter,
3134        .write_iter   = ll_file_write_iter,
3135        .unlocked_ioctl = ll_file_ioctl,
3136        .open      = ll_file_open,
3137        .release        = ll_file_release,
3138        .mmap      = ll_file_mmap,
3139        .llseek  = ll_file_seek,
3140        .splice_read    = ll_file_splice_read,
3141        .fsync    = ll_fsync,
3142        .flush    = ll_flush,
3143        .flock    = ll_file_flock,
3144        .lock      = ll_file_flock
3145};
3146
3147/* These are for -o noflock - to return ENOSYS on flock calls */
3148struct file_operations ll_file_operations_noflock = {
3149        .read_iter    = ll_file_read_iter,
3150        .write_iter   = ll_file_write_iter,
3151        .unlocked_ioctl = ll_file_ioctl,
3152        .open      = ll_file_open,
3153        .release        = ll_file_release,
3154        .mmap      = ll_file_mmap,
3155        .llseek  = ll_file_seek,
3156        .splice_read    = ll_file_splice_read,
3157        .fsync    = ll_fsync,
3158        .flush    = ll_flush,
3159        .flock    = ll_file_noflock,
3160        .lock      = ll_file_noflock
3161};
3162
3163struct inode_operations ll_file_inode_operations = {
3164        .setattr        = ll_setattr,
3165        .getattr        = ll_getattr,
3166        .permission     = ll_inode_permission,
3167        .setxattr       = ll_setxattr,
3168        .getxattr       = ll_getxattr,
3169        .listxattr      = ll_listxattr,
3170        .removexattr    = ll_removexattr,
3171        .fiemap         = ll_fiemap,
3172        .get_acl        = ll_get_acl,
3173};
3174
3175/* dynamic ioctl number support routines */
3176static struct llioc_ctl_data {
3177        struct rw_semaphore     ioc_sem;
3178        struct list_head              ioc_head;
3179} llioc = {
3180        __RWSEM_INITIALIZER(llioc.ioc_sem),
3181        LIST_HEAD_INIT(llioc.ioc_head)
3182};
3183
3185struct llioc_data {
3186        struct list_head              iocd_list;
3187        unsigned int        iocd_size;
3188        llioc_callback_t        iocd_cb;
3189        unsigned int        iocd_count;
3190        unsigned int        iocd_cmd[0];
3191};
3192
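    /**
     * Register a dynamic ioctl handler.
     *
     * @cb is invoked from ll_iocontrol_call() for any of the @count ioctl
     * command numbers listed in @cmd. Returns an opaque cookie to pass to
     * ll_iocontrol_unregister(), or NULL on invalid arguments or memory
     * allocation failure.
     *
     * Illustrative usage (my_cb and the MY_IOC_* numbers are hypothetical,
     * not part of this file):
     *
     *     unsigned int my_cmds[] = { MY_IOC_FOO, MY_IOC_BAR };
     *     void *cookie;
     *
     *     cookie = ll_iocontrol_register(my_cb, ARRAY_SIZE(my_cmds), my_cmds);
     *     ...
     *     ll_iocontrol_unregister(cookie);
     */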
3193void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3194{
3195        unsigned int size;
3196        struct llioc_data *in_data = NULL;
3197
3198        if (cb == NULL || cmd == NULL ||
3199            count > LLIOC_MAX_CMD || count < 0)
3200                return NULL;
3201
3202        size = sizeof(*in_data) + count * sizeof(unsigned int);
3203        in_data = kzalloc(size, GFP_NOFS);
3204        if (!in_data)
3205                return NULL;
3206
3208        in_data->iocd_size = size;
3209        in_data->iocd_cb = cb;
3210        in_data->iocd_count = count;
3211        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3212
3213        down_write(&llioc.ioc_sem);
3214        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3215        up_write(&llioc.ioc_sem);
3216
3217        return in_data;
3218}
3219EXPORT_SYMBOL(ll_iocontrol_register);
3220
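    /**
     * Unregister a dynamic ioctl handler; @magic is the cookie returned by
     * ll_iocontrol_register(). A warning is logged if the cookie is not
     * found in the registration list.
     */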
3221void ll_iocontrol_unregister(void *magic)
3222{
3223        struct llioc_data *tmp;
3224
3225        if (magic == NULL)
3226                return;
3227
3228        down_write(&llioc.ioc_sem);
3229        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3230                if (tmp == magic) {
3231                        list_del(&tmp->iocd_list);
3232                        up_write(&llioc.ioc_sem);
3233
3234                        kfree(tmp);
3235                        return;
3236                }
3237        }
3238        up_write(&llioc.ioc_sem);
3239
3240        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3241}
3242EXPORT_SYMBOL(ll_iocontrol_unregister);
3243
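    /*
     * Walk the registered dynamic ioctl handlers, invoking each handler
     * that claims @cmd until one returns LLIOC_STOP. The last handler's
     * result is stored in *rcp; it defaults to -EINVAL when no handler
     * accepts the command.
     */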
3244static enum llioc_iter
3245ll_iocontrol_call(struct inode *inode, struct file *file,
3246                  unsigned int cmd, unsigned long arg, int *rcp)
3247{
3248        enum llioc_iter ret = LLIOC_CONT;
3249        struct llioc_data *data;
3250        int rc = -EINVAL, i;
3251
3252        down_read(&llioc.ioc_sem);
3253        list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3254                for (i = 0; i < data->iocd_count; i++) {
3255                        if (cmd != data->iocd_cmd[i])
3256                                continue;
3257
3258                        ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3259                        break;
3260                }
3261
3262                if (ret == LLIOC_STOP)
3263                        break;
3264        }
3265        up_read(&llioc.ioc_sem);
3266
3267        if (rcp)
3268                *rcp = rc;
3269        return ret;
3270}
3271
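    /*
     * Push a layout configuration change down to the cl_object of the
     * inode. For OBJECT_CONF_SET the layout lock is only allowed to match
     * after the new layout has been applied, so that a stale layout is
     * never exposed.
     */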
3272int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3273{
3274        struct ll_inode_info *lli = ll_i2info(inode);
3275        struct cl_env_nest nest;
3276        struct lu_env *env;
3277        int result;
3278
3279        if (lli->lli_clob == NULL)
3280                return 0;
3281
3282        env = cl_env_nested_get(&nest);
3283        if (IS_ERR(env))
3284                return PTR_ERR(env);
3285
3286        result = cl_conf_set(env, lli->lli_clob, conf);
3287        cl_env_nested_put(&nest, env);
3288
3289        if (conf->coc_opc == OBJECT_CONF_SET) {
3290                struct ldlm_lock *lock = conf->coc_lock;
3291
3292                LASSERT(lock != NULL);
3293                LASSERT(ldlm_has_layout(lock));
3294                if (result == 0) {
3295                        /* The layout lock can only be allowed to match after
3296                         * the layout has been applied to the inode, otherwise
3297                         * a wrong layout would be seen. Applying the layout
3298                         * must happen before dropping the intent lock. */
3299                        ldlm_lock_allow_match(lock);
3300                }
3301        }
3302        return result;
3303}
3304
3305/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3306static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3308{
3309        struct ll_sb_info *sbi = ll_i2sbi(inode);
3310        struct obd_capa *oc;
3311        struct ptlrpc_request *req;
3312        struct mdt_body *body;
3313        void *lvbdata;
3314        void *lmm;
3315        int lmmsize;
3316        int rc;
3317
3318        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3319               PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3320               lock->l_lvb_data, lock->l_lvb_len);
3321
3322        if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3323                return 0;
3324
3325        /* If the layout lock was granted right away, the layout is returned
3326         * within the DLM_LVB of the DLM reply; otherwise, if the lock was
3327         * ever blocked and then granted via a completion AST, we have to
3328         * fetch the layout here. Note that we can't use the LVB buffer in
3329         * the completion AST because it is not large enough. */
3330        oc = ll_mdscapa_get(inode);
3331        rc = ll_get_default_mdsize(sbi, &lmmsize);
3332        if (rc == 0)
3333                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3334                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3335                                lmmsize, 0, &req);
3336        capa_put(oc);
3337        if (rc < 0)
3338                return rc;
3339
3340        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3341        if (body == NULL) {
3342                rc = -EPROTO;
3343                goto out;
3344        }
3345
3346        lmmsize = body->eadatasize;
3347        if (lmmsize == 0) /* empty layout */ {
3348                rc = 0;
3349                goto out;
3350        }
3351
3352        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3353        if (lmm == NULL) {
3354                rc = -EFAULT;
3355                goto out;
3356        }
3357
3358        lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
3359        if (lvbdata == NULL) {
3360                rc = -ENOMEM;
3361                goto out;
3362        }
3363
3364        memcpy(lvbdata, lmm, lmmsize);
3365        lock_res_and_lock(lock);
3366        kvfree(lock->l_lvb_data);
3368
3369        lock->l_lvb_data = lvbdata;
3370        lock->l_lvb_len = lmmsize;
3371        unlock_res_and_lock(lock);
3372
3373out:
3374        ptlrpc_req_finished(req);
3375        return rc;
3376}
3377
3378/**
3379 * Apply the layout to the inode. Layout lock is held and will be released
3380 * in this function.
3381 */
3382static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3383                                struct inode *inode, __u32 *gen, bool reconf)
3384{
3385        struct ll_inode_info *lli = ll_i2info(inode);
3386        struct ll_sb_info    *sbi = ll_i2sbi(inode);
3387        struct ldlm_lock *lock;
3388        struct lustre_md md = { NULL };
3389        struct cl_object_conf conf;
3390        int rc = 0;
3391        bool lvb_ready;
3392        bool wait_layout = false;
3393
3394        LASSERT(lustre_handle_is_used(lockh));
3395
3396        lock = ldlm_handle2lock(lockh);
3397        LASSERT(lock != NULL);
3398        LASSERT(ldlm_has_layout(lock));
3399
3400        LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3401                   inode, PFID(&lli->lli_fid), reconf);
3402
3403        /* In case this is a cached lock, reinstate it with the new inode. */
3404        md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3405
3406        lock_res_and_lock(lock);
3407        lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3408        unlock_res_and_lock(lock);
3409        /* Checking lvb_ready is racy, but this is okay. The worst case is
3410         * that multiple processes may configure the file at the same time. */
3411        if (lvb_ready || !reconf) {
3412                rc = -ENODATA;
3413                if (lvb_ready) {
3414                        /* layout_gen must be valid if the layout lock is not
3415                         * cancelled and the stripe has already been set */
3416                        *gen = ll_layout_version_get(lli);
3417                        rc = 0;
3418                }
3419                goto out;
3420        }
3421
3422        rc = ll_layout_fetch(inode, lock);
3423        if (rc < 0)
3424                goto out;
3425
3426        /* for layout lock, lmm is returned in lock's lvb.
3427         * lvb_data is immutable if the lock is held so it's safe to access it
3428         * without res lock. See the description in ldlm_lock_decref_internal()
3429         * for the condition to free lvb_data of layout lock */
3430        if (lock->l_lvb_data != NULL) {
3431                rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3432                                  lock->l_lvb_data, lock->l_lvb_len);
3433                if (rc >= 0) {
3434                        *gen = LL_LAYOUT_GEN_EMPTY;
3435                        if (md.lsm != NULL)
3436                                *gen = md.lsm->lsm_layout_gen;
3437                        rc = 0;
3438                } else {
3439                        CERROR("%s: file "DFID" unpackmd error: %d\n",
3440                                ll_get_fsname(inode->i_sb, NULL, 0),
3441                                PFID(&lli->lli_fid), rc);
3442                }
3443        }
3444        if (rc < 0)
3445                goto out;
3446
3447        /* Set the layout on the file. This is unlikely to fail, as the old
3448         * layout has surely been eliminated. */
3449        memset(&conf, 0, sizeof(conf));
3450        conf.coc_opc = OBJECT_CONF_SET;
3451        conf.coc_inode = inode;
3452        conf.coc_lock = lock;
3453        conf.u.coc_md = &md;
3454        rc = ll_layout_conf(inode, &conf);
3455
3456        if (md.lsm != NULL)
3457                obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3458
3459        /* refresh layout failed, need to wait */
3460        wait_layout = rc == -EBUSY;
3461
3462out:
3463        LDLM_LOCK_PUT(lock);
3464        ldlm_lock_decref(lockh, mode);
3465
3466        /* wait for IO to complete if it's still being used. */
3467        if (wait_layout) {
3468                CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3469                        ll_get_fsname(inode->i_sb, NULL, 0),
3470                        inode, PFID(&lli->lli_fid));
3471
3472                memset(&conf, 0, sizeof(conf));
3473                conf.coc_opc = OBJECT_CONF_WAIT;
3474                conf.coc_inode = inode;
3475                rc = ll_layout_conf(inode, &conf);
3476                if (rc == 0)
3477                        rc = -EAGAIN;
3478
3479                CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3480                        PFID(&lli->lli_fid), rc);
3481        }
3482        return rc;
3483}
3484
3485/**
3486 * This function checks whether a LAYOUT lock exists on the client side,
3487 * and enqueues one if none is cached.
3488 *
3489 * This function does not hold the layout lock, so the lock may be revoked
3490 * at any time after it returns. Any operation that depends on the layout
3491 * should be redone in that case.
3492 *
3493 * This function should be called before lov_io_init() to get an up-to-date
3494 * layout version; the caller should save the version number and, after the
3495 * IO is finished, call this function again to verify that the layout was
3496 * not changed during the IO.
3497 */
3498int ll_layout_refresh(struct inode *inode, __u32 *gen)
3499{
3500        struct ll_inode_info  *lli = ll_i2info(inode);
3501        struct ll_sb_info     *sbi = ll_i2sbi(inode);
3502        struct md_op_data     *op_data;
3503        struct lookup_intent   it;
3504        struct lustre_handle   lockh;
3505        ldlm_mode_t            mode;
3506        struct ldlm_enqueue_info einfo = {
3507                .ei_type = LDLM_IBITS,
3508                .ei_mode = LCK_CR,
3509                .ei_cb_bl = ll_md_blocking_ast,
3510                .ei_cb_cp = ldlm_completion_ast,
3511        };
3512        int rc;
3513
3514        *gen = ll_layout_version_get(lli);
3515        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3516                return 0;
3517
3518        /* sanity checks */
3519        LASSERT(fid_is_sane(ll_inode2fid(inode)));
3520        LASSERT(S_ISREG(inode->i_mode));
3521
3522        /* take layout lock mutex to enqueue layout lock exclusively. */
3523        mutex_lock(&lli->lli_layout_mutex);
3524
3525again:
3526        /* The layout lock is mostly cached on the local side, so try to
3527         * match it before enqueuing a new one from the MDT. */
3528        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3529                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3530        if (mode != 0) { /* hit cached lock */
3531                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3532                if (rc == -EAGAIN)
3533                        goto again;
3534
3535                mutex_unlock(&lli->lli_layout_mutex);
3536                return rc;
3537        }
3538
3539        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3540                        0, 0, LUSTRE_OPC_ANY, NULL);
3541        if (IS_ERR(op_data)) {
3542                mutex_unlock(&lli->lli_layout_mutex);
3543                return PTR_ERR(op_data);
3544        }
3545
3546        /* have to enqueue one */
3547        memset(&it, 0, sizeof(it));
3548        it.it_op = IT_LAYOUT;
3549        lockh.cookie = 0ULL;
3550
3551        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3552                        ll_get_fsname(inode->i_sb, NULL, 0), inode,
3553                        PFID(&lli->lli_fid));
3554
3555        rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3556                        NULL, 0, NULL, 0);
3557        if (it.d.lustre.it_data != NULL)
3558                ptlrpc_req_finished(it.d.lustre.it_data);
3559        it.d.lustre.it_data = NULL;
3560
3561        ll_finish_md_op_data(op_data);
3562
3563        mode = it.d.lustre.it_lock_mode;
3564        it.d.lustre.it_lock_mode = 0;
3565        ll_intent_drop_lock(&it);
3566
3567        if (rc == 0) {
3568                /* set lock data in case this is a new lock */
3569                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3570                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3571                if (rc == -EAGAIN)
3572                        goto again;
3573        }
3574        mutex_unlock(&lli->lli_layout_mutex);
3575
3576        return rc;
3577}
3578
3579/**
3580 * This function sends a restore request to the MDT.
3581 */
3582int ll_layout_restore(struct inode *inode)
3583{
3584        struct hsm_user_request *hur;
3585        int                      len, rc;
3586
3587        len = sizeof(struct hsm_user_request) +
3588              sizeof(struct hsm_user_item);
3589        hur = kzalloc(len, GFP_NOFS);
3590        if (!hur)
3591                return -ENOMEM;
3592
3593        hur->hur_request.hr_action = HUA_RESTORE;
3594        hur->hur_request.hr_archive_id = 0;
3595        hur->hur_request.hr_flags = 0;
3596        memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3597               sizeof(hur->hur_user_item[0].hui_fid));
3598        hur->hur_user_item[0].hui_extent.length = -1;
3599        hur->hur_request.hr_itemcount = 1;
3600        rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3601                           len, hur, NULL);
3602        kfree(hur);
3603        return rc;
3604}
3605