linux/drivers/staging/lustre/lustre/llite/file.c
<<
>>
Prefs
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 *
  36 * lustre/llite/file.c
  37 *
  38 * Author: Peter Braam <braam@clusterfs.com>
  39 * Author: Phil Schwan <phil@clusterfs.com>
  40 * Author: Andreas Dilger <adilger@clusterfs.com>
  41 */
  42
  43#define DEBUG_SUBSYSTEM S_LLITE
  44#include "../include/lustre_dlm.h"
  45#include "../include/lustre_lite.h"
  46#include <linux/pagemap.h>
  47#include <linux/file.h>
  48#include "llite_internal.h"
  49#include "../include/lustre/ll_fiemap.h"
  50
  51#include "../include/cl_object.h"
  52
/*
 * Prototypes for static helpers that are referenced before their
 * definitions in this file.
 */

/* Called from ll_md_close() to drop a group lock still held on @file. */
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

/* Called from ll_md_close() to release a lease handle; reports via
 * @lease_broken (exact semantics are defined with the function body). */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken);

/* ioctl dispatch helper; result code is passed back through @rcp. */
static enum llioc_iter
ll_iocontrol_call(struct inode *inode, struct file *file,
                  unsigned int cmd, unsigned long arg, int *rcp);
  62
  63static struct ll_file_data *ll_file_data_get(void)
  64{
  65        struct ll_file_data *fd;
  66
  67        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  68        if (fd == NULL)
  69                return NULL;
  70        fd->fd_write_failed = false;
  71        return fd;
  72}
  73
  74static void ll_file_data_put(struct ll_file_data *fd)
  75{
  76        if (fd != NULL)
  77                OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  78}
  79
  80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  81                          struct lustre_handle *fh)
  82{
  83        op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  84        op_data->op_attr.ia_mode = inode->i_mode;
  85        op_data->op_attr.ia_atime = inode->i_atime;
  86        op_data->op_attr.ia_mtime = inode->i_mtime;
  87        op_data->op_attr.ia_ctime = inode->i_ctime;
  88        op_data->op_attr.ia_size = i_size_read(inode);
  89        op_data->op_attr_blocks = inode->i_blocks;
  90        ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  91                                        ll_inode_to_ext_flags(inode->i_flags);
  92        op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  93        if (fh)
  94                op_data->op_handle = *fh;
  95        op_data->op_capa1 = ll_mdscapa_get(inode);
  96
  97        if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  98                op_data->op_bias |= MDS_DATA_MODIFIED;
  99}
 100
 101/**
 102 * Closes the IO epoch and packs all the attributes into @op_data for
 103 * the CLOSE rpc.
 104 */
 105static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
 106                             struct obd_client_handle *och)
 107{
 108        op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 109                                        ATTR_MTIME | ATTR_MTIME_SET |
 110                                        ATTR_CTIME | ATTR_CTIME_SET;
 111
 112        if (!(och->och_flags & FMODE_WRITE))
 113                goto out;
 114
 115        if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 116                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 117        else
 118                ll_ioepoch_close(inode, op_data, &och, 0);
 119
 120out:
 121        ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 122        ll_prep_md_op_data(op_data, inode, NULL, NULL,
 123                           0, 0, LUSTRE_OPC_ANY, NULL);
 124}
 125
/*
 * Send the close for open handle @och of @inode through @md_exp and
 * dispose of @och.
 *
 * @md_exp        export handed to md_close() and
 *                md_clear_open_replay_data()
 * @inode         inode the handle belongs to; its own MD export
 *                (ll_i2mdexp()) is used for the device lookup and the
 *                Size-on-MDS capability check
 * @och           open handle to close; it is either freed here or left
 *                for the DONE_WRITING path (see the out: label)
 * @data_version  optional; a non-NULL pointer turns the close into an
 *                HSM release carrying this data version
 *
 * Returns 0 or a negative errno.  An HSM release whose reply lacks
 * OBD_MD_FLRELEASED yields -EBUSY.
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
                                     struct inode *inode,
                                     struct obd_client_handle *och,
                                     const __u64 *data_version)
{
        struct obd_export *exp = ll_i2mdexp(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct obd_device *obd = class_exp2obd(exp);
        /* Stays 1 on the early-exit paths so that @och is freed at out:. */
        int epoch_close = 1;
        int rc;

        if (obd == NULL) {
                /*
                 * XXX: in case of LMV, is this correct to access
                 * ->exp_handle?
                 */
                CERROR("Invalid MDC connection handle %#llx\n",
                       ll_i2mdexp(inode)->exp_handle.h_cookie);
                /* No RPC is sent; only the local handle cleanup runs. */
                rc = 0;
                goto out;
        }

        op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
        if (!op_data) {
                /* XXX We leak openhandle and request here. */
                rc = -ENOMEM;
                goto out;
        }

        ll_prepare_close(inode, op_data, och);
        if (data_version != NULL) {
                /* Pass in data_version implies release. */
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_data_version = *data_version;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
        }
        /* Sample the flag before the RPC; it is consulted again at out:. */
        epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc == -EAGAIN) {
                /* This close must have the epoch closed. */
                LASSERT(epoch_close);
                /* MDS has instructed us to obtain Size-on-MDS attribute from
                 * OSTs and send setattr to back to MDS. */
                rc = ll_som_update(inode, op_data);
                if (rc) {
                        CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
                               inode->i_ino, rc);
                        /* The failure is logged but not reported upward. */
                        rc = 0;
                }
        } else if (rc) {
                CERROR("inode %lu mdc close failed: rc = %d\n",
                       inode->i_ino, rc);
        }

        /* DATA_MODIFIED flag was successfully sent on close, cancel data
         * modification flag. */
        if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
                struct ll_inode_info *lli = ll_i2info(inode);

                spin_lock(&lli->lli_lock);
                lli->lli_flags &= ~LLIF_DATA_MODIFIED;
                spin_unlock(&lli->lli_lock);
        }

        if (rc == 0) {
                rc = ll_objects_destroy(req, inode);
                if (rc)
                        CERROR("inode %lu ll_objects destroy: rc = %d\n",
                               inode->i_ino, rc);
        }
        /* For an HSM release, the reply must confirm the release. */
        if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
                struct mdt_body *body;
                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                if (!(body->valid & OBD_MD_FLRELEASED))
                        rc = -EBUSY;
        }

        ll_finish_md_op_data(op_data);

out:
        /*
         * A write handle on a regular file with Size-on-MDS whose epoch was
         * not closed by this request is queued for DONE_WRITING and kept;
         * in every other case the replay data is dropped and @och is freed.
         */
        if (exp_connect_som(exp) && !epoch_close &&
            S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
                ll_queue_done_writing(inode, LLIF_DONE_WRITING);
        } else {
                md_clear_open_replay_data(md_exp, och);
                /* Free @och if it is not waiting for DONE_WRITING. */
                och->och_fh.cookie = DEAD_HANDLE_MAGIC;
                OBD_FREE_PTR(och);
        }
        if (req) /* This is close request */
                ptlrpc_req_finished(req);
        return rc;
}
 221
 222int ll_md_real_close(struct inode *inode, fmode_t fmode)
 223{
 224        struct ll_inode_info *lli = ll_i2info(inode);
 225        struct obd_client_handle **och_p;
 226        struct obd_client_handle *och;
 227        __u64 *och_usecount;
 228        int rc = 0;
 229
 230        if (fmode & FMODE_WRITE) {
 231                och_p = &lli->lli_mds_write_och;
 232                och_usecount = &lli->lli_open_fd_write_count;
 233        } else if (fmode & FMODE_EXEC) {
 234                och_p = &lli->lli_mds_exec_och;
 235                och_usecount = &lli->lli_open_fd_exec_count;
 236        } else {
 237                LASSERT(fmode & FMODE_READ);
 238                och_p = &lli->lli_mds_read_och;
 239                och_usecount = &lli->lli_open_fd_read_count;
 240        }
 241
 242        mutex_lock(&lli->lli_och_mutex);
 243        if (*och_usecount > 0) {
 244                /* There are still users of this handle, so skip
 245                 * freeing it. */
 246                mutex_unlock(&lli->lli_och_mutex);
 247                return 0;
 248        }
 249
 250        och = *och_p;
 251        *och_p = NULL;
 252        mutex_unlock(&lli->lli_och_mutex);
 253
 254        if (och != NULL) {
 255                /* There might be a race and this handle may already
 256                   be closed. */
 257                rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 258                                               inode, och, NULL);
 259        }
 260
 261        return rc;
 262}
 263
 264static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 265                       struct file *file)
 266{
 267        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 268        struct ll_inode_info *lli = ll_i2info(inode);
 269        int lockmode;
 270        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 271        struct lustre_handle lockh;
 272        ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 273        int rc = 0;
 274
 275        /* clear group lock, if present */
 276        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 277                ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 278
 279        if (fd->fd_lease_och != NULL) {
 280                bool lease_broken;
 281
 282                /* Usually the lease is not released when the
 283                 * application crashed, we need to release here. */
 284                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 285                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 286                        PFID(&lli->lli_fid), rc, lease_broken);
 287
 288                fd->fd_lease_och = NULL;
 289        }
 290
 291        if (fd->fd_och != NULL) {
 292                rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
 293                fd->fd_och = NULL;
 294                goto out;
 295        }
 296
 297        /* Let's see if we have good enough OPEN lock on the file and if
 298           we can skip talking to MDS */
 299
 300        mutex_lock(&lli->lli_och_mutex);
 301        if (fd->fd_omode & FMODE_WRITE) {
 302                lockmode = LCK_CW;
 303                LASSERT(lli->lli_open_fd_write_count);
 304                lli->lli_open_fd_write_count--;
 305        } else if (fd->fd_omode & FMODE_EXEC) {
 306                lockmode = LCK_PR;
 307                LASSERT(lli->lli_open_fd_exec_count);
 308                lli->lli_open_fd_exec_count--;
 309        } else {
 310                lockmode = LCK_CR;
 311                LASSERT(lli->lli_open_fd_read_count);
 312                lli->lli_open_fd_read_count--;
 313        }
 314        mutex_unlock(&lli->lli_och_mutex);
 315
 316        if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 317                           LDLM_IBITS, &policy, lockmode, &lockh))
 318                rc = ll_md_real_close(inode, fd->fd_omode);
 319
 320out:
 321        LUSTRE_FPRIVATE(file) = NULL;
 322        ll_file_data_put(fd);
 323        ll_capa_close(inode);
 324
 325        return rc;
 326}
 327
 328/* While this returns an error code, fput() the caller does not, so we need
 329 * to make every effort to clean up all of our state here.  Also, applications
 330 * rarely check close errors and even if an error is returned they will not
 331 * re-try the close call.
 332 */
 333int ll_file_release(struct inode *inode, struct file *file)
 334{
 335        struct ll_file_data *fd;
 336        struct ll_sb_info *sbi = ll_i2sbi(inode);
 337        struct ll_inode_info *lli = ll_i2info(inode);
 338        int rc;
 339
 340        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 341               inode->i_generation, inode);
 342
 343#ifdef CONFIG_FS_POSIX_ACL
 344        if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
 345                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 346
 347                LASSERT(fd != NULL);
 348                if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 349                        fd->fd_flags &= ~LL_FILE_RMTACL;
 350                        rct_del(&sbi->ll_rct, current_pid());
 351                        et_search_free(&sbi->ll_et, current_pid());
 352                }
 353        }
 354#endif
 355
 356        if (!is_root_inode(inode))
 357                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 358        fd = LUSTRE_FPRIVATE(file);
 359        LASSERT(fd != NULL);
 360
 361        /* The last ref on @file, maybe not the owner pid of statahead.
 362         * Different processes can open the same dir, "ll_opendir_key" means:
 363         * it is me that should stop the statahead thread. */
 364        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 365            lli->lli_opendir_pid != 0)
 366                ll_stop_statahead(inode, lli->lli_opendir_key);
 367
 368        if (is_root_inode(inode)) {
 369                LUSTRE_FPRIVATE(file) = NULL;
 370                ll_file_data_put(fd);
 371                return 0;
 372        }
 373
 374        if (!S_ISDIR(inode->i_mode)) {
 375                lov_read_and_clear_async_rc(lli->lli_clob);
 376                lli->lli_async_rc = 0;
 377        }
 378
 379        rc = ll_md_close(sbi->ll_md_exp, inode, file);
 380
 381        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 382                libcfs_debug_dumplog();
 383
 384        return rc;
 385}
 386
 387static int ll_intent_file_open(struct dentry *dentry, void *lmm,
 388                               int lmmsize, struct lookup_intent *itp)
 389{
 390        struct inode *inode = dentry->d_inode;
 391        struct ll_sb_info *sbi = ll_i2sbi(inode);
 392        struct dentry *parent = dentry->d_parent;
 393        const char *name = dentry->d_name.name;
 394        const int len = dentry->d_name.len;
 395        struct md_op_data *op_data;
 396        struct ptlrpc_request *req;
 397        __u32 opc = LUSTRE_OPC_ANY;
 398        int rc;
 399
 400        /* Usually we come here only for NFSD, and we want open lock.
 401           But we can also get here with pre 2.6.15 patchless kernels, and in
 402           that case that lock is also ok */
 403        /* We can also get here if there was cached open handle in revalidate_it
 404         * but it disappeared while we were getting from there to ll_file_open.
 405         * But this means this file was closed and immediately opened which
 406         * makes a good candidate for using OPEN lock */
 407        /* If lmmsize & lmm are not 0, we are just setting stripe info
 408         * parameters. No need for the open lock */
 409        if (lmm == NULL && lmmsize == 0) {
 410                itp->it_flags |= MDS_OPEN_LOCK;
 411                if (itp->it_flags & FMODE_WRITE)
 412                        opc = LUSTRE_OPC_CREATE;
 413        }
 414
 415        op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 416                                      inode, name, len,
 417                                      O_RDWR, opc, NULL);
 418        if (IS_ERR(op_data))
 419                return PTR_ERR(op_data);
 420
 421        itp->it_flags |= MDS_OPEN_BY_FID;
 422        rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 423                            0 /*unused */, &req, ll_md_blocking_ast, 0);
 424        ll_finish_md_op_data(op_data);
 425        if (rc == -ESTALE) {
 426                /* reason for keep own exit path - don`t flood log
 427                * with messages with -ESTALE errors.
 428                */
 429                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 430                     it_open_error(DISP_OPEN_OPEN, itp))
 431                        goto out;
 432                ll_release_openhandle(inode, itp);
 433                goto out;
 434        }
 435
 436        if (it_disposition(itp, DISP_LOOKUP_NEG)) {
 437                rc = -ENOENT;
 438                goto out;
 439        }
 440
 441        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 442                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 443                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 444                goto out;
 445        }
 446
 447        rc = ll_prep_inode(&inode, req, NULL, itp);
 448        if (!rc && itp->d.lustre.it_lock_mode)
 449                ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
 450
 451out:
 452        ptlrpc_req_finished(req);
 453        ll_intent_drop_lock(itp);
 454
 455        return rc;
 456}
 457
 458/**
 459 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 460 * not believe attributes if a few ioepoch holders exist. Attributes for
 461 * previous ioepoch if new one is opened are also skipped by MDS.
 462 */
 463void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 464{
 465        if (ioepoch && lli->lli_ioepoch != ioepoch) {
 466                lli->lli_ioepoch = ioepoch;
 467                CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
 468                       ioepoch, PFID(&lli->lli_fid));
 469        }
 470}
 471
 472static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 473                       struct obd_client_handle *och)
 474{
 475        struct ptlrpc_request *req = it->d.lustre.it_data;
 476        struct mdt_body *body;
 477
 478        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 479        och->och_fh = body->handle;
 480        och->och_fid = body->fid1;
 481        och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
 482        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 483        och->och_flags = it->it_flags;
 484
 485        return md_set_open_replay_data(md_exp, och, it);
 486}
 487
 488static int ll_local_open(struct file *file, struct lookup_intent *it,
 489                         struct ll_file_data *fd, struct obd_client_handle *och)
 490{
 491        struct inode *inode = file_inode(file);
 492        struct ll_inode_info *lli = ll_i2info(inode);
 493
 494        LASSERT(!LUSTRE_FPRIVATE(file));
 495
 496        LASSERT(fd != NULL);
 497
 498        if (och) {
 499                struct ptlrpc_request *req = it->d.lustre.it_data;
 500                struct mdt_body *body;
 501                int rc;
 502
 503                rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 504                if (rc != 0)
 505                        return rc;
 506
 507                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 508                ll_ioepoch_open(lli, body->ioepoch);
 509        }
 510
 511        LUSTRE_FPRIVATE(file) = fd;
 512        ll_readahead_init(inode, &fd->fd_ras);
 513        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 514        return 0;
 515}
 516
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * On entry file->private_data may carry a lookup intent; it is taken over
 * and replaced by the new ll_file_data on success.
 *
 * Returns 0 on success or a negative errno.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct obd_client_handle **och_p = NULL;
        __u64 *och_usecount = NULL;
        struct ll_file_data *fd;
        int rc = 0, opendir_set = 0;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
               inode->i_generation, inode, file->f_flags);

        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */

        fd = ll_file_data_get();
        if (fd == NULL) {
                rc = -ENOMEM;
                goto out_openerr;
        }

        fd->fd_file = file;
        /* For a directory with no statahead owner yet, register this open
         * as the owner; opendir_set remembers to undo that on error. */
        if (S_ISDIR(inode->i_mode)) {
                spin_lock(&lli->lli_sa_lock);
                if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
                    lli->lli_opendir_pid == 0) {
                        lli->lli_opendir_key = fd;
                        lli->lli_opendir_pid = current_pid();
                        opendir_set = 1;
                }
                spin_unlock(&lli->lli_sa_lock);
        }

        /* The root inode only needs the private data attached. */
        if (is_root_inode(inode)) {
                LUSTRE_FPRIVATE(file) = fd;
                return 0;
        }

        /* No usable intent was passed in: build one in @oit from f_flags. */
        if (!it || !it->d.lustre.it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but O_ACCMODE mask was stripped from
                 * there */
                if ((oit.it_flags + 1) & O_ACCMODE)
                        oit.it_flags++;
                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* kernel only call f_op->open in dentry_open.  filp_open calls
                 * dentry_open after call to open_namei that checks permissions.
                 * Only nfsd_open call dentry_open directly without checking
                 * permissions and because of that this code below is safe. */
                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                /* bug20584, if "it_flags" contains O_CREAT, the file will be
                 * created if necessary, then "IT_CREAT" should be set to keep
                 * consistent with it */
                if (oit.it_flags & O_CREAT)
                        oit.it_op |= IT_CREAT;

                it = &oit;
        }

restart:
        /* Let's see if we have file open on MDS already. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
         } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        mutex_lock(&lli->lli_och_mutex);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's extra open request that we do not need,
                           let's close it somehow. This will decref request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                        if (rc) {
                                mutex_unlock(&lli->lli_och_mutex);
                                goto out_openerr;
                        }

                        ll_release_openhandle(inode, it);
                }
                (*och_usecount)++;

                /* Reuse the cached handle: no new och is filled in. */
                rc = ll_local_open(file, it, fd, NULL);
                if (rc) {
                        (*och_usecount)--;
                        mutex_unlock(&lli->lli_och_mutex);
                        goto out_openerr;
                }
        } else {
                LASSERT(*och_usecount == 0);
                if (!it->d.lustre.it_disposition) {
                        /* We cannot just request lock handle now, new ELC code
                           means that one of other OPEN locks for this file
                           could be cancelled, and since blocking ast handler
                           would attempt to grab och_mutex as well, that would
                           result in a deadlock */
                        mutex_unlock(&lli->lli_och_mutex);
                        it->it_create_mode |= M_CHECK_STALE;
                        rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
                        it->it_create_mode &= ~M_CHECK_STALE;
                        if (rc)
                                goto out_openerr;

                        /* The intent now has a disposition; re-evaluate
                         * from the handle lookup with the mutex retaken. */
                        goto restart;
                }
                *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
                if (!*och_p) {
                        rc = -ENOMEM;
                        goto out_och_free;
                }

                (*och_usecount)++;

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here
                 * (bug 3430) */
                /* XXX (green): Should not we bail out on any error here, not
                 * just open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                if (rc)
                        goto out_och_free;

                LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));

                rc = ll_local_open(file, it, fd, *och_p);
                if (rc)
                        goto out_och_free;
        }
        mutex_unlock(&lli->lli_och_mutex);
        /* @fd now belongs to @file; make sure the exit path never frees it. */
        fd = NULL;

        /* Must do this outside lli_och_mutex lock to prevent deadlock where
           different kind of OPEN lock for this same inode gets cancelled
           by ldlm_cancel_lru */
        if (!S_ISREG(inode->i_mode))
                goto out_och_free;

        ll_capa_open(inode);

        if (!lli->lli_has_smd &&
            (cl_is_lov_delay_create(file->f_flags) ||
             (file->f_mode & FMODE_WRITE) == 0)) {
                CDEBUG(D_INODE, "object creation was delayed\n");
                goto out_och_free;
        }
        cl_lov_delay_create_clear(&file->f_flags);
        goto out_och_free;

        /*
         * Common exit.  Jumps that arrive here with rc != 0 still hold
         * lli_och_mutex; jumps with rc == 0 have already dropped it and
         * only fall into the else branch below.
         */
out_och_free:
        if (rc) {
                if (och_p && *och_p) {
                        OBD_FREE(*och_p, sizeof (struct obd_client_handle));
                        *och_p = NULL; /* OBD_FREE writes some magic there */
                        (*och_usecount)--;
                }
                mutex_unlock(&lli->lli_och_mutex);

                /* Entered directly by error paths that do not hold the
                 * mutex and have no new open handle to undo. */
out_openerr:
                if (opendir_set != 0)
                        ll_stop_statahead(inode, lli->lli_opendir_key);
                if (fd != NULL)
                        ll_file_data_put(fd);
        } else {
                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
        }

        /* Drop the request reference held by the intent, if any. */
        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
                ptlrpc_req_finished(it->d.lustre.it_data);
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
        }

        return rc;
}
 717
 718static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 719                        struct ldlm_lock_desc *desc, void *data, int flag)
 720{
 721        int rc;
 722        struct lustre_handle lockh;
 723
 724        switch (flag) {
 725        case LDLM_CB_BLOCKING:
 726                ldlm_lock2handle(lock, &lockh);
 727                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 728                if (rc < 0) {
 729                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 730                        return rc;
 731                }
 732                break;
 733        case LDLM_CB_CANCELING:
 734                /* do nothing */
 735                break;
 736        }
 737        return 0;
 738}
 739
 740/**
 741 * Acquire a lease and open the file.
 742 */
 743static struct obd_client_handle *
 744ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 745              __u64 open_flags)
 746{
 747        struct lookup_intent it = { .it_op = IT_OPEN };
 748        struct ll_sb_info *sbi = ll_i2sbi(inode);
 749        struct md_op_data *op_data;
 750        struct ptlrpc_request *req;
 751        struct lustre_handle old_handle = { 0 };
 752        struct obd_client_handle *och = NULL;
 753        int rc;
 754        int rc2;
 755
 756        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 757                return ERR_PTR(-EINVAL);
 758
 759        if (file != NULL) {
 760                struct ll_inode_info *lli = ll_i2info(inode);
 761                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 762                struct obd_client_handle **och_p;
 763                __u64 *och_usecount;
 764
 765                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 766                        return ERR_PTR(-EPERM);
 767
 768                /* Get the openhandle of the file */
 769                rc = -EBUSY;
 770                mutex_lock(&lli->lli_och_mutex);
 771                if (fd->fd_lease_och != NULL) {
 772                        mutex_unlock(&lli->lli_och_mutex);
 773                        return ERR_PTR(rc);
 774                }
 775
 776                if (fd->fd_och == NULL) {
 777                        if (file->f_mode & FMODE_WRITE) {
 778                                LASSERT(lli->lli_mds_write_och != NULL);
 779                                och_p = &lli->lli_mds_write_och;
 780                                och_usecount = &lli->lli_open_fd_write_count;
 781                        } else {
 782                                LASSERT(lli->lli_mds_read_och != NULL);
 783                                och_p = &lli->lli_mds_read_och;
 784                                och_usecount = &lli->lli_open_fd_read_count;
 785                        }
 786                        if (*och_usecount == 1) {
 787                                fd->fd_och = *och_p;
 788                                *och_p = NULL;
 789                                *och_usecount = 0;
 790                                rc = 0;
 791                        }
 792                }
 793                mutex_unlock(&lli->lli_och_mutex);
 794                if (rc < 0) /* more than 1 opener */
 795                        return ERR_PTR(rc);
 796
 797                LASSERT(fd->fd_och != NULL);
 798                old_handle = fd->fd_och->och_fh;
 799        }
 800
 801        och = kzalloc(sizeof(*och), GFP_NOFS);
 802        if (!och)
 803                return ERR_PTR(-ENOMEM);
 804
 805        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 806                                        LUSTRE_OPC_ANY, NULL);
 807        if (IS_ERR(op_data)) {
 808                rc = PTR_ERR(op_data);
 809                goto out;
 810        }
 811
 812        /* To tell the MDT this openhandle is from the same owner */
 813        op_data->op_handle = old_handle;
 814
 815        it.it_flags = fmode | open_flags;
 816        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 817        rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
 818                                ll_md_blocking_lease_ast,
 819        /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 820         * it can be cancelled which may mislead applications that the lease is
 821         * broken;
 822         * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 823         * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 824         * doesn't deal with openhandle, so normal openhandle will be leaked. */
 825                                LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 826        ll_finish_md_op_data(op_data);
 827        ptlrpc_req_finished(req);
 828        if (rc < 0)
 829                goto out_release_it;
 830
 831        if (it_disposition(&it, DISP_LOOKUP_NEG)) {
 832                rc = -ENOENT;
 833                goto out_release_it;
 834        }
 835
 836        rc = it_open_error(DISP_OPEN_OPEN, &it);
 837        if (rc)
 838                goto out_release_it;
 839
 840        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 841        ll_och_fill(sbi->ll_md_exp, &it, och);
 842
 843        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
 844                rc = -EOPNOTSUPP;
 845                goto out_close;
 846        }
 847
 848        /* already get lease, handle lease lock */
 849        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 850        if (it.d.lustre.it_lock_mode == 0 ||
 851            it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
 852                /* open lock must return for lease */
 853                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 854                        PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
 855                        it.d.lustre.it_lock_bits);
 856                rc = -EPROTO;
 857                goto out_close;
 858        }
 859
 860        ll_intent_release(&it);
 861        return och;
 862
 863out_close:
 864        rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
 865        if (rc2)
 866                CERROR("Close openhandle returned %d\n", rc2);
 867
 868        /* cancel open lock */
 869        if (it.d.lustre.it_lock_mode != 0) {
 870                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 871                                                it.d.lustre.it_lock_mode);
 872                it.d.lustre.it_lock_mode = 0;
 873        }
 874out_release_it:
 875        ll_intent_release(&it);
 876out:
 877        OBD_FREE_PTR(och);
 878        return ERR_PTR(rc);
 879}
 880
 881/**
 882 * Release lease and close the file.
 883 * It will check if the lease has ever broken.
 884 */
 885static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 886                          bool *lease_broken)
 887{
 888        struct ldlm_lock *lock;
 889        bool cancelled = true;
 890        int rc;
 891
 892        lock = ldlm_handle2lock(&och->och_lease_handle);
 893        if (lock != NULL) {
 894                lock_res_and_lock(lock);
 895                cancelled = ldlm_is_cancel(lock);
 896                unlock_res_and_lock(lock);
 897                ldlm_lock_put(lock);
 898        }
 899
 900        CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
 901                PFID(&ll_i2info(inode)->lli_fid), cancelled);
 902
 903        if (!cancelled)
 904                ldlm_cli_cancel(&och->och_lease_handle, 0);
 905        if (lease_broken != NULL)
 906                *lease_broken = cancelled;
 907
 908        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
 909                                       NULL);
 910        return rc;
 911}
 912
 913/* Fills the obdo with the attributes for the lsm */
 914static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 915                          struct obd_capa *capa, struct obdo *obdo,
 916                          __u64 ioepoch, int sync)
 917{
 918        struct ptlrpc_request_set *set;
 919        struct obd_info     oinfo = { { { 0 } } };
 920        int                     rc;
 921
 922        LASSERT(lsm != NULL);
 923
 924        oinfo.oi_md = lsm;
 925        oinfo.oi_oa = obdo;
 926        oinfo.oi_oa->o_oi = lsm->lsm_oi;
 927        oinfo.oi_oa->o_mode = S_IFREG;
 928        oinfo.oi_oa->o_ioepoch = ioepoch;
 929        oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 930                               OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 931                               OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 932                               OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 933                               OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 934                               OBD_MD_FLDATAVERSION;
 935        oinfo.oi_capa = capa;
 936        if (sync) {
 937                oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 938                oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 939        }
 940
 941        set = ptlrpc_prep_set();
 942        if (set == NULL) {
 943                CERROR("can't allocate ptlrpc set\n");
 944                rc = -ENOMEM;
 945        } else {
 946                rc = obd_getattr_async(exp, &oinfo, set);
 947                if (rc == 0)
 948                        rc = ptlrpc_set_wait(set);
 949                ptlrpc_set_destroy(set);
 950        }
 951        if (rc == 0)
 952                oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 953                                         OBD_MD_FLATIME | OBD_MD_FLMTIME |
 954                                         OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 955                                         OBD_MD_FLDATAVERSION);
 956        return rc;
 957}
 958
 959/**
 960  * Performs the getattr on the inode and updates its fields.
 961  * If @sync != 0, perform the getattr under the server-side lock.
 962  */
 963int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 964                     __u64 ioepoch, int sync)
 965{
 966        struct obd_capa      *capa = ll_mdscapa_get(inode);
 967        struct lov_stripe_md *lsm;
 968        int rc;
 969
 970        lsm = ccc_inode_lsm_get(inode);
 971        rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 972                            capa, obdo, ioepoch, sync);
 973        capa_put(capa);
 974        if (rc == 0) {
 975                struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
 976
 977                obdo_refresh_inode(inode, obdo, obdo->o_valid);
 978                CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
 979                       POSTID(oi), i_size_read(inode),
 980                       (unsigned long long)inode->i_blocks,
 981                       1UL << inode->i_blkbits);
 982        }
 983        ccc_inode_lsm_put(inode, lsm);
 984        return rc;
 985}
 986
 987int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 988{
 989        struct ll_inode_info *lli = ll_i2info(inode);
 990        struct cl_object *obj = lli->lli_clob;
 991        struct cl_attr *attr = ccc_env_thread_attr(env);
 992        struct ost_lvb lvb;
 993        int rc = 0;
 994
 995        ll_inode_size_lock(inode);
 996        /* merge timestamps the most recently obtained from mds with
 997           timestamps obtained from osts */
 998        LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 999        LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
1000        LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
1001
1002        lvb.lvb_size = i_size_read(inode);
1003        lvb.lvb_blocks = inode->i_blocks;
1004        lvb.lvb_mtime = LTIME_S(inode->i_mtime);
1005        lvb.lvb_atime = LTIME_S(inode->i_atime);
1006        lvb.lvb_ctime = LTIME_S(inode->i_ctime);
1007
1008        cl_object_attr_lock(obj);
1009        rc = cl_object_attr_get(env, obj, attr);
1010        cl_object_attr_unlock(obj);
1011
1012        if (rc == 0) {
1013                if (lvb.lvb_atime < attr->cat_atime)
1014                        lvb.lvb_atime = attr->cat_atime;
1015                if (lvb.lvb_ctime < attr->cat_ctime)
1016                        lvb.lvb_ctime = attr->cat_ctime;
1017                if (lvb.lvb_mtime < attr->cat_mtime)
1018                        lvb.lvb_mtime = attr->cat_mtime;
1019
1020                CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1021                                PFID(&lli->lli_fid), attr->cat_size);
1022                cl_isize_write_nolock(inode, attr->cat_size);
1023
1024                inode->i_blocks = attr->cat_blocks;
1025
1026                LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1027                LTIME_S(inode->i_atime) = lvb.lvb_atime;
1028                LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1029        }
1030        ll_inode_size_unlock(inode);
1031
1032        return rc;
1033}
1034
1035int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1036                     lstat_t *st)
1037{
1038        struct obdo obdo = { 0 };
1039        int rc;
1040
1041        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1042        if (rc == 0) {
1043                st->st_size   = obdo.o_size;
1044                st->st_blocks = obdo.o_blocks;
1045                st->st_mtime  = obdo.o_mtime;
1046                st->st_atime  = obdo.o_atime;
1047                st->st_ctime  = obdo.o_ctime;
1048        }
1049        return rc;
1050}
1051
1052static bool file_is_noatime(const struct file *file)
1053{
1054        const struct vfsmount *mnt = file->f_path.mnt;
1055        const struct inode *inode = file_inode(file);
1056
1057        /* Adapted from file_accessed() and touch_atime().*/
1058        if (file->f_flags & O_NOATIME)
1059                return true;
1060
1061        if (inode->i_flags & S_NOATIME)
1062                return true;
1063
1064        if (IS_NOATIME(inode))
1065                return true;
1066
1067        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1068                return true;
1069
1070        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1071                return true;
1072
1073        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1074                return true;
1075
1076        return false;
1077}
1078
1079void ll_io_init(struct cl_io *io, const struct file *file, int write)
1080{
1081        struct inode *inode = file_inode(file);
1082
1083        io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1084        if (write) {
1085                io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1086                io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1087                                      file->f_flags & O_DIRECT ||
1088                                      IS_SYNC(inode);
1089        }
1090        io->ci_obj     = ll_i2info(inode)->lli_clob;
1091        io->ci_lockreq = CILR_MAYBE;
1092        if (ll_file_nolock(file)) {
1093                io->ci_lockreq = CILR_NEVER;
1094                io->ci_no_srvlock = 1;
1095        } else if (file->f_flags & O_APPEND) {
1096                io->ci_lockreq = CILR_MANDATORY;
1097        }
1098
1099        io->ci_noatime = file_is_noatime(file);
1100}
1101
1102static ssize_t
1103ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1104                   struct file *file, enum cl_io_type iot,
1105                   loff_t *ppos, size_t count)
1106{
1107        struct ll_inode_info *lli = ll_i2info(file_inode(file));
1108        struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1109        struct cl_io     *io;
1110        ssize_t        result;
1111
1112restart:
1113        io = ccc_env_thread_io(env);
1114        ll_io_init(io, file, iot == CIT_WRITE);
1115
1116        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1117                struct vvp_io *vio = vvp_env_io(env);
1118                struct ccc_io *cio = ccc_env_io(env);
1119                int write_mutex_locked = 0;
1120
1121                cio->cui_fd  = LUSTRE_FPRIVATE(file);
1122                vio->cui_io_subtype = args->via_io_subtype;
1123
1124                switch (vio->cui_io_subtype) {
1125                case IO_NORMAL:
1126                        cio->cui_iter = args->u.normal.via_iter;
1127                        cio->cui_iocb = args->u.normal.via_iocb;
1128                        if ((iot == CIT_WRITE) &&
1129                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1130                                if (mutex_lock_interruptible(&lli->
1131                                                               lli_write_mutex)) {
1132                                        result = -ERESTARTSYS;
1133                                        goto out;
1134                                }
1135                                write_mutex_locked = 1;
1136                        } else if (iot == CIT_READ) {
1137                                down_read(&lli->lli_trunc_sem);
1138                        }
1139                        break;
1140                case IO_SPLICE:
1141                        vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1142                        vio->u.splice.cui_flags = args->u.splice.via_flags;
1143                        break;
1144                default:
1145                        CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1146                        LBUG();
1147                }
1148                result = cl_io_loop(env, io);
1149                if (write_mutex_locked)
1150                        mutex_unlock(&lli->lli_write_mutex);
1151                else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1152                        up_read(&lli->lli_trunc_sem);
1153        } else {
1154                /* cl_io_rw_init() handled IO */
1155                result = io->ci_result;
1156        }
1157
1158        if (io->ci_nob > 0) {
1159                result = io->ci_nob;
1160                *ppos = io->u.ci_wr.wr.crw_pos;
1161        }
1162        goto out;
1163out:
1164        cl_io_fini(env, io);
1165        /* If any bit been read/written (result != 0), we just return
1166         * short read/write instead of restart io. */
1167        if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1168                CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1169                       iot == CIT_READ ? "read" : "write",
1170                       file, *ppos, count);
1171                LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1172                goto restart;
1173        }
1174
1175        if (iot == CIT_READ) {
1176                if (result >= 0)
1177                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1178                                           LPROC_LL_READ_BYTES, result);
1179        } else if (iot == CIT_WRITE) {
1180                if (result >= 0) {
1181                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1182                                           LPROC_LL_WRITE_BYTES, result);
1183                        fd->fd_write_failed = false;
1184                } else if (result != -ERESTARTSYS) {
1185                        fd->fd_write_failed = true;
1186                }
1187        }
1188
1189        return result;
1190}
1191
1192static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1193{
1194        struct lu_env      *env;
1195        struct vvp_io_args *args;
1196        ssize_t      result;
1197        int              refcheck;
1198
1199        env = cl_env_get(&refcheck);
1200        if (IS_ERR(env))
1201                return PTR_ERR(env);
1202
1203        args = vvp_env_args(env, IO_NORMAL);
1204        args->u.normal.via_iter = to;
1205        args->u.normal.via_iocb = iocb;
1206
1207        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1208                                    &iocb->ki_pos, iov_iter_count(to));
1209        cl_env_put(env, &refcheck);
1210        return result;
1211}
1212
1213/*
1214 * Write to a file (through the page cache).
1215 */
1216static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1217{
1218        struct lu_env      *env;
1219        struct vvp_io_args *args;
1220        ssize_t      result;
1221        int              refcheck;
1222
1223        env = cl_env_get(&refcheck);
1224        if (IS_ERR(env))
1225                return PTR_ERR(env);
1226
1227        args = vvp_env_args(env, IO_NORMAL);
1228        args->u.normal.via_iter = from;
1229        args->u.normal.via_iocb = iocb;
1230
1231        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1232                                  &iocb->ki_pos, iov_iter_count(from));
1233        cl_env_put(env, &refcheck);
1234        return result;
1235}
1236
1237/*
1238 * Send file content (through pagecache) somewhere with helper
1239 */
1240static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1241                                   struct pipe_inode_info *pipe, size_t count,
1242                                   unsigned int flags)
1243{
1244        struct lu_env      *env;
1245        struct vvp_io_args *args;
1246        ssize_t      result;
1247        int              refcheck;
1248
1249        env = cl_env_get(&refcheck);
1250        if (IS_ERR(env))
1251                return PTR_ERR(env);
1252
1253        args = vvp_env_args(env, IO_SPLICE);
1254        args->u.splice.via_pipe = pipe;
1255        args->u.splice.via_flags = flags;
1256
1257        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1258        cl_env_put(env, &refcheck);
1259        return result;
1260}
1261
1262static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1263{
1264        struct obd_export *exp = ll_i2dtexp(inode);
1265        struct obd_trans_info oti = { 0 };
1266        struct obdo *oa = NULL;
1267        int lsm_size;
1268        int rc = 0;
1269        struct lov_stripe_md *lsm = NULL, *lsm2;
1270
1271        OBDO_ALLOC(oa);
1272        if (oa == NULL)
1273                return -ENOMEM;
1274
1275        lsm = ccc_inode_lsm_get(inode);
1276        if (!lsm_has_objects(lsm)) {
1277                rc = -ENOENT;
1278                goto out;
1279        }
1280
1281        lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1282                   (lsm->lsm_stripe_count));
1283
1284        OBD_ALLOC_LARGE(lsm2, lsm_size);
1285        if (lsm2 == NULL) {
1286                rc = -ENOMEM;
1287                goto out;
1288        }
1289
1290        oa->o_oi = *oi;
1291        oa->o_nlink = ost_idx;
1292        oa->o_flags |= OBD_FL_RECREATE_OBJS;
1293        oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1294        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1295                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1296        obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1297        memcpy(lsm2, lsm, lsm_size);
1298        ll_inode_size_lock(inode);
1299        rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1300        ll_inode_size_unlock(inode);
1301
1302        OBD_FREE_LARGE(lsm2, lsm_size);
1303        goto out;
1304out:
1305        ccc_inode_lsm_put(inode, lsm);
1306        OBDO_FREE(oa);
1307        return rc;
1308}
1309
1310static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1311{
1312        struct ll_recreate_obj ucreat;
1313        struct ost_id           oi;
1314
1315        if (!capable(CFS_CAP_SYS_ADMIN))
1316                return -EPERM;
1317
1318        if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1319                           sizeof(ucreat)))
1320                return -EFAULT;
1321
1322        ostid_set_seq_mdt0(&oi);
1323        ostid_set_id(&oi, ucreat.lrc_id);
1324        return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1325}
1326
1327static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1328{
1329        struct lu_fid   fid;
1330        struct ost_id   oi;
1331        u32             ost_idx;
1332
1333        if (!capable(CFS_CAP_SYS_ADMIN))
1334                return -EPERM;
1335
1336        if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1337                return -EFAULT;
1338
1339        fid_to_ostid(&fid, &oi);
1340        ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1341        return ll_lov_recreate(inode, &oi, ost_idx);
1342}
1343
1344int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1345                             int flags, struct lov_user_md *lum, int lum_size)
1346{
1347        struct lov_stripe_md *lsm = NULL;
1348        struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1349        int rc = 0;
1350
1351        lsm = ccc_inode_lsm_get(inode);
1352        if (lsm != NULL) {
1353                ccc_inode_lsm_put(inode, lsm);
1354                CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1355                       inode->i_ino);
1356                rc = -EEXIST;
1357                goto out;
1358        }
1359
1360        ll_inode_size_lock(inode);
1361        rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1362        if (rc)
1363                goto out_unlock;
1364        rc = oit.d.lustre.it_status;
1365        if (rc < 0)
1366                goto out_req_free;
1367
1368        ll_release_openhandle(inode, &oit);
1369
1370out_unlock:
1371        ll_inode_size_unlock(inode);
1372        ll_intent_release(&oit);
1373        ccc_inode_lsm_put(inode, lsm);
1374out:
1375        return rc;
1376out_req_free:
1377        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1378        goto out;
1379}
1380
1381int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1382                             struct lov_mds_md **lmmp, int *lmm_size,
1383                             struct ptlrpc_request **request)
1384{
1385        struct ll_sb_info *sbi = ll_i2sbi(inode);
1386        struct mdt_body  *body;
1387        struct lov_mds_md *lmm = NULL;
1388        struct ptlrpc_request *req = NULL;
1389        struct md_op_data *op_data;
1390        int rc, lmmsize;
1391
1392        rc = ll_get_default_mdsize(sbi, &lmmsize);
1393        if (rc)
1394                return rc;
1395
1396        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1397                                     strlen(filename), lmmsize,
1398                                     LUSTRE_OPC_ANY, NULL);
1399        if (IS_ERR(op_data))
1400                return PTR_ERR(op_data);
1401
1402        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1403        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1404        ll_finish_md_op_data(op_data);
1405        if (rc < 0) {
1406                CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1407                       filename, rc);
1408                goto out;
1409        }
1410
1411        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1412        LASSERT(body != NULL); /* checked by mdc_getattr_name */
1413
1414        lmmsize = body->eadatasize;
1415
1416        if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1417                        lmmsize == 0) {
1418                rc = -ENODATA;
1419                goto out;
1420        }
1421
1422        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1423        LASSERT(lmm != NULL);
1424
1425        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1426            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1427                rc = -EPROTO;
1428                goto out;
1429        }
1430
1431        /*
1432         * This is coming from the MDS, so is probably in
1433         * little endian.  We convert it to host endian before
1434         * passing it to userspace.
1435         */
1436        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1437                int stripe_count;
1438
1439                stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1440                if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1441                        stripe_count = 0;
1442
1443                /* if function called for directory - we should
1444                 * avoid swab not existent lsm objects */
1445                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1446                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1447                        if (S_ISREG(body->mode))
1448                                lustre_swab_lov_user_md_objects(
1449                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1450                                 stripe_count);
1451                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1452                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1453                        if (S_ISREG(body->mode))
1454                                lustre_swab_lov_user_md_objects(
1455                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1456                                 stripe_count);
1457                }
1458        }
1459
1460out:
1461        *lmmp = lmm;
1462        *lmm_size = lmmsize;
1463        *request = req;
1464        return rc;
1465}
1466
1467static int ll_lov_setea(struct inode *inode, struct file *file,
1468                            unsigned long arg)
1469{
1470        int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1471        struct lov_user_md      *lump;
1472        int                      lum_size = sizeof(struct lov_user_md) +
1473                                            sizeof(struct lov_user_ost_data);
1474        int                      rc;
1475
1476        if (!capable(CFS_CAP_SYS_ADMIN))
1477                return -EPERM;
1478
1479        OBD_ALLOC_LARGE(lump, lum_size);
1480        if (lump == NULL)
1481                return -ENOMEM;
1482
1483        if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1484                OBD_FREE_LARGE(lump, lum_size);
1485                return -EFAULT;
1486        }
1487
1488        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1489                                     lum_size);
1490        cl_lov_delay_create_clear(&file->f_flags);
1491
1492        OBD_FREE_LARGE(lump, lum_size);
1493        return rc;
1494}
1495
1496static int ll_lov_setstripe(struct inode *inode, struct file *file,
1497                            unsigned long arg)
1498{
1499        struct lov_user_md_v3    lumv3;
1500        struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1501        struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1502        struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1503        int                      lum_size, rc;
1504        int                      flags = FMODE_WRITE;
1505
1506        /* first try with v1 which is smaller than v3 */
1507        lum_size = sizeof(struct lov_user_md_v1);
1508        if (copy_from_user(lumv1, lumv1p, lum_size))
1509                return -EFAULT;
1510
1511        if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1512                lum_size = sizeof(struct lov_user_md_v3);
1513                if (copy_from_user(&lumv3, lumv3p, lum_size))
1514                        return -EFAULT;
1515        }
1516
1517        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1518                                      lum_size);
1519        cl_lov_delay_create_clear(&file->f_flags);
1520        if (rc == 0) {
1521                struct lov_stripe_md *lsm;
1522                __u32 gen;
1523
1524                put_user(0, &lumv1p->lmm_stripe_count);
1525
1526                ll_layout_refresh(inode, &gen);
1527                lsm = ccc_inode_lsm_get(inode);
1528                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1529                                   0, lsm, (void *)arg);
1530                ccc_inode_lsm_put(inode, lsm);
1531        }
1532        return rc;
1533}
1534
1535static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1536{
1537        struct lov_stripe_md *lsm;
1538        int rc = -ENODATA;
1539
1540        lsm = ccc_inode_lsm_get(inode);
1541        if (lsm != NULL)
1542                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1543                                   lsm, (void *)arg);
1544        ccc_inode_lsm_put(inode, lsm);
1545        return rc;
1546}
1547
1548static int
1549ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1550{
1551        struct ll_inode_info   *lli = ll_i2info(inode);
1552        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1553        struct ccc_grouplock    grouplock;
1554        int                  rc;
1555
1556        if (ll_file_nolock(file))
1557                return -EOPNOTSUPP;
1558
1559        spin_lock(&lli->lli_lock);
1560        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1561                CWARN("group lock already existed with gid %lu\n",
1562                      fd->fd_grouplock.cg_gid);
1563                spin_unlock(&lli->lli_lock);
1564                return -EINVAL;
1565        }
1566        LASSERT(fd->fd_grouplock.cg_lock == NULL);
1567        spin_unlock(&lli->lli_lock);
1568
1569        rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1570                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
1571        if (rc)
1572                return rc;
1573
1574        spin_lock(&lli->lli_lock);
1575        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1576                spin_unlock(&lli->lli_lock);
1577                CERROR("another thread just won the race\n");
1578                cl_put_grouplock(&grouplock);
1579                return -EINVAL;
1580        }
1581
1582        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1583        fd->fd_grouplock = grouplock;
1584        spin_unlock(&lli->lli_lock);
1585
1586        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1587        return 0;
1588}
1589
1590int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1591{
1592        struct ll_inode_info   *lli = ll_i2info(inode);
1593        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1594        struct ccc_grouplock    grouplock;
1595
1596        spin_lock(&lli->lli_lock);
1597        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1598                spin_unlock(&lli->lli_lock);
1599                CWARN("no group lock held\n");
1600                return -EINVAL;
1601        }
1602        LASSERT(fd->fd_grouplock.cg_lock != NULL);
1603
1604        if (fd->fd_grouplock.cg_gid != arg) {
1605                CWARN("group lock %lu doesn't match current id %lu\n",
1606                       arg, fd->fd_grouplock.cg_gid);
1607                spin_unlock(&lli->lli_lock);
1608                return -EINVAL;
1609        }
1610
1611        grouplock = fd->fd_grouplock;
1612        memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1613        fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1614        spin_unlock(&lli->lli_lock);
1615
1616        cl_put_grouplock(&grouplock);
1617        CDEBUG(D_INFO, "group lock %lu released\n", arg);
1618        return 0;
1619}
1620
1621/**
1622 * Close inode open handle
1623 *
1624 * \param inode  [in]     inode in question
1625 * \param it     [in,out] intent which contains open info and result
1626 *
1627 * \retval 0     success
1628 * \retval <0    failure
1629 */
1630int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1631{
1632        struct obd_client_handle *och;
1633        int rc;
1634
1635        LASSERT(inode);
1636
1637        /* Root ? Do nothing. */
1638        if (is_root_inode(inode))
1639                return 0;
1640
1641        /* No open handle to close? Move away */
1642        if (!it_disposition(it, DISP_OPEN_OPEN))
1643                return 0;
1644
1645        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1646
1647        och = kzalloc(sizeof(*och), GFP_NOFS);
1648        if (!och) {
1649                rc = -ENOMEM;
1650                goto out;
1651        }
1652
1653        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1654
1655        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1656                                       inode, och, NULL);
1657out:
1658        /* this one is in place of ll_file_open */
1659        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1660                ptlrpc_req_finished(it->d.lustre.it_data);
1661                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1662        }
1663        return rc;
1664}
1665
1666/**
1667 * Get size for inode for which FIEMAP mapping is requested.
1668 * Make the FIEMAP get_info call and returns the result.
1669 */
1670static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1671                        size_t num_bytes)
1672{
1673        struct obd_export *exp = ll_i2dtexp(inode);
1674        struct lov_stripe_md *lsm = NULL;
1675        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1676        __u32 vallen = num_bytes;
1677        int rc;
1678
1679        /* Checks for fiemap flags */
1680        if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1681                fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1682                return -EBADR;
1683        }
1684
1685        /* Check for FIEMAP_FLAG_SYNC */
1686        if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1687                rc = filemap_fdatawrite(inode->i_mapping);
1688                if (rc)
1689                        return rc;
1690        }
1691
1692        lsm = ccc_inode_lsm_get(inode);
1693        if (lsm == NULL)
1694                return -ENOENT;
1695
1696        /* If the stripe_count > 1 and the application does not understand
1697         * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1698         */
1699        if (lsm->lsm_stripe_count > 1 &&
1700            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1701                rc = -EOPNOTSUPP;
1702                goto out;
1703        }
1704
1705        fm_key.oa.o_oi = lsm->lsm_oi;
1706        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1707
1708        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1709        obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1710        /* If filesize is 0, then there would be no objects for mapping */
1711        if (fm_key.oa.o_size == 0) {
1712                fiemap->fm_mapped_extents = 0;
1713                rc = 0;
1714                goto out;
1715        }
1716
1717        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1718
1719        rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1720                          fiemap, lsm);
1721        if (rc)
1722                CERROR("obd_get_info failed: rc = %d\n", rc);
1723
1724out:
1725        ccc_inode_lsm_put(inode, lsm);
1726        return rc;
1727}
1728
1729int ll_fid2path(struct inode *inode, void __user *arg)
1730{
1731        struct obd_export *exp = ll_i2mdexp(inode);
1732        const struct getinfo_fid2path __user *gfin = arg;
1733        struct getinfo_fid2path *gfout;
1734        u32 pathlen;
1735        size_t outsize;
1736        int rc;
1737
1738        if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1739            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1740                return -EPERM;
1741
1742        /* Only need to get the buflen */
1743        if (get_user(pathlen, &gfin->gf_pathlen))
1744                return -EFAULT;
1745
1746        if (pathlen > PATH_MAX)
1747                return -EINVAL;
1748
1749        outsize = sizeof(*gfout) + pathlen;
1750
1751        gfout = kzalloc(outsize, GFP_NOFS);
1752        if (!gfout)
1753                return -ENOMEM;
1754
1755        if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1756                rc = -EFAULT;
1757                goto gf_free;
1758        }
1759
1760        /* Call mdc_iocontrol */
1761        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1762        if (rc != 0)
1763                goto gf_free;
1764
1765        if (copy_to_user(arg, gfout, outsize))
1766                rc = -EFAULT;
1767
1768gf_free:
1769        OBD_FREE(gfout, outsize);
1770        return rc;
1771}
1772
1773static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1774{
1775        struct ll_user_fiemap *fiemap_s;
1776        size_t num_bytes, ret_bytes;
1777        unsigned int extent_count;
1778        int rc = 0;
1779
1780        /* Get the extent count so we can calculate the size of
1781         * required fiemap buffer */
1782        if (get_user(extent_count,
1783            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1784                return -EFAULT;
1785
1786        if (extent_count >=
1787            (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1788                return -EINVAL;
1789        num_bytes = sizeof(*fiemap_s) + (extent_count *
1790                                         sizeof(struct ll_fiemap_extent));
1791
1792        OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1793        if (fiemap_s == NULL)
1794                return -ENOMEM;
1795
1796        /* get the fiemap value */
1797        if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1798                           sizeof(*fiemap_s))) {
1799                rc = -EFAULT;
1800                goto error;
1801        }
1802
1803        /* If fm_extent_count is non-zero, read the first extent since
1804         * it is used to calculate end_offset and device from previous
1805         * fiemap call. */
1806        if (extent_count) {
1807                if (copy_from_user(&fiemap_s->fm_extents[0],
1808                    (char __user *)arg + sizeof(*fiemap_s),
1809                    sizeof(struct ll_fiemap_extent))) {
1810                        rc = -EFAULT;
1811                        goto error;
1812                }
1813        }
1814
1815        rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1816        if (rc)
1817                goto error;
1818
1819        ret_bytes = sizeof(struct ll_user_fiemap);
1820
1821        if (extent_count != 0)
1822                ret_bytes += (fiemap_s->fm_mapped_extents *
1823                                 sizeof(struct ll_fiemap_extent));
1824
1825        if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1826                rc = -EFAULT;
1827
1828error:
1829        OBD_FREE_LARGE(fiemap_s, num_bytes);
1830        return rc;
1831}
1832
1833/*
1834 * Read the data_version for inode.
1835 *
1836 * This value is computed using stripe object version on OST.
1837 * Version is computed using server side locking.
1838 *
1839 * @param extent_lock  Take extent lock. Not needed if a process is already
1840 *                     holding the OST object group locks.
1841 */
1842int ll_data_version(struct inode *inode, __u64 *data_version,
1843                    int extent_lock)
1844{
1845        struct lov_stripe_md    *lsm = NULL;
1846        struct ll_sb_info       *sbi = ll_i2sbi(inode);
1847        struct obdo             *obdo = NULL;
1848        int                      rc;
1849
1850        /* If no stripe, we consider version is 0. */
1851        lsm = ccc_inode_lsm_get(inode);
1852        if (!lsm_has_objects(lsm)) {
1853                *data_version = 0;
1854                CDEBUG(D_INODE, "No object for inode\n");
1855                rc = 0;
1856                goto out;
1857        }
1858
1859        obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1860        if (!obdo) {
1861                rc = -ENOMEM;
1862                goto out;
1863        }
1864
1865        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1866        if (rc == 0) {
1867                if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1868                        rc = -EOPNOTSUPP;
1869                else
1870                        *data_version = obdo->o_data_version;
1871        }
1872
1873        OBD_FREE_PTR(obdo);
1874out:
1875        ccc_inode_lsm_put(inode, lsm);
1876        return rc;
1877}
1878
1879/*
1880 * Trigger a HSM release request for the provided inode.
1881 */
1882int ll_hsm_release(struct inode *inode)
1883{
1884        struct cl_env_nest nest;
1885        struct lu_env *env;
1886        struct obd_client_handle *och = NULL;
1887        __u64 data_version = 0;
1888        int rc;
1889
1890
1891        CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1892               ll_get_fsname(inode->i_sb, NULL, 0),
1893               PFID(&ll_i2info(inode)->lli_fid));
1894
1895        och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1896        if (IS_ERR(och)) {
1897                rc = PTR_ERR(och);
1898                goto out;
1899        }
1900
1901        /* Grab latest data_version and [am]time values */
1902        rc = ll_data_version(inode, &data_version, 1);
1903        if (rc != 0)
1904                goto out;
1905
1906        env = cl_env_nested_get(&nest);
1907        if (IS_ERR(env)) {
1908                rc = PTR_ERR(env);
1909                goto out;
1910        }
1911
1912        ll_merge_lvb(env, inode);
1913        cl_env_nested_put(&nest, env);
1914
1915        /* Release the file.
1916         * NB: lease lock handle is released in mdc_hsm_release_pack() because
1917         * we still need it to pack l_remote_handle to MDT. */
1918        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1919                                       &data_version);
1920        och = NULL;
1921
1922
1923out:
1924        if (och != NULL && !IS_ERR(och)) /* close the file */
1925                ll_lease_close(och, inode, NULL);
1926
1927        return rc;
1928}
1929
1930struct ll_swap_stack {
1931        struct iattr             ia1, ia2;
1932        __u64                    dv1, dv2;
1933        struct inode            *inode1, *inode2;
1934        bool                     check_dv1, check_dv2;
1935};
1936
1937static int ll_swap_layouts(struct file *file1, struct file *file2,
1938                           struct lustre_swap_layouts *lsl)
1939{
1940        struct mdc_swap_layouts  msl;
1941        struct md_op_data       *op_data;
1942        __u32                    gid;
1943        __u64                    dv;
1944        struct ll_swap_stack    *llss = NULL;
1945        int                      rc;
1946
1947        llss = kzalloc(sizeof(*llss), GFP_NOFS);
1948        if (!llss)
1949                return -ENOMEM;
1950
1951        llss->inode1 = file_inode(file1);
1952        llss->inode2 = file_inode(file2);
1953
1954        if (!S_ISREG(llss->inode2->i_mode)) {
1955                rc = -EINVAL;
1956                goto free;
1957        }
1958
1959        if (inode_permission(llss->inode1, MAY_WRITE) ||
1960            inode_permission(llss->inode2, MAY_WRITE)) {
1961                rc = -EPERM;
1962                goto free;
1963        }
1964
1965        if (llss->inode2->i_sb != llss->inode1->i_sb) {
1966                rc = -EXDEV;
1967                goto free;
1968        }
1969
1970        /* we use 2 bool because it is easier to swap than 2 bits */
1971        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1972                llss->check_dv1 = true;
1973
1974        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1975                llss->check_dv2 = true;
1976
1977        /* we cannot use lsl->sl_dvX directly because we may swap them */
1978        llss->dv1 = lsl->sl_dv1;
1979        llss->dv2 = lsl->sl_dv2;
1980
1981        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1982        if (rc == 0) /* same file, done! */ {
1983                rc = 0;
1984                goto free;
1985        }
1986
1987        if (rc < 0) { /* sequentialize it */
1988                swap(llss->inode1, llss->inode2);
1989                swap(file1, file2);
1990                swap(llss->dv1, llss->dv2);
1991                swap(llss->check_dv1, llss->check_dv2);
1992        }
1993
1994        gid = lsl->sl_gid;
1995        if (gid != 0) { /* application asks to flush dirty cache */
1996                rc = ll_get_grouplock(llss->inode1, file1, gid);
1997                if (rc < 0)
1998                        goto free;
1999
2000                rc = ll_get_grouplock(llss->inode2, file2, gid);
2001                if (rc < 0) {
2002                        ll_put_grouplock(llss->inode1, file1, gid);
2003                        goto free;
2004                }
2005        }
2006
2007        /* to be able to restore mtime and atime after swap
2008         * we need to first save them */
2009        if (lsl->sl_flags &
2010            (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2011                llss->ia1.ia_mtime = llss->inode1->i_mtime;
2012                llss->ia1.ia_atime = llss->inode1->i_atime;
2013                llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2014                llss->ia2.ia_mtime = llss->inode2->i_mtime;
2015                llss->ia2.ia_atime = llss->inode2->i_atime;
2016                llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2017        }
2018
2019        /* ultimate check, before swapping the layouts we check if
2020         * dataversion has changed (if requested) */
2021        if (llss->check_dv1) {
2022                rc = ll_data_version(llss->inode1, &dv, 0);
2023                if (rc)
2024                        goto putgl;
2025                if (dv != llss->dv1) {
2026                        rc = -EAGAIN;
2027                        goto putgl;
2028                }
2029        }
2030
2031        if (llss->check_dv2) {
2032                rc = ll_data_version(llss->inode2, &dv, 0);
2033                if (rc)
2034                        goto putgl;
2035                if (dv != llss->dv2) {
2036                        rc = -EAGAIN;
2037                        goto putgl;
2038                }
2039        }
2040
2041        /* struct md_op_data is used to send the swap args to the mdt
2042         * only flags is missing, so we use struct mdc_swap_layouts
2043         * through the md_op_data->op_data */
2044        /* flags from user space have to be converted before they are send to
2045         * server, no flag is sent today, they are only used on the client */
2046        msl.msl_flags = 0;
2047        rc = -ENOMEM;
2048        op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2049                                     0, LUSTRE_OPC_ANY, &msl);
2050        if (IS_ERR(op_data)) {
2051                rc = PTR_ERR(op_data);
2052                goto free;
2053        }
2054
2055        rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2056                           sizeof(*op_data), op_data, NULL);
2057        ll_finish_md_op_data(op_data);
2058
2059putgl:
2060        if (gid != 0) {
2061                ll_put_grouplock(llss->inode2, file2, gid);
2062                ll_put_grouplock(llss->inode1, file1, gid);
2063        }
2064
2065        /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2066        if (rc != 0)
2067                goto free;
2068
2069        /* clear useless flags */
2070        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2071                llss->ia1.ia_valid &= ~ATTR_MTIME;
2072                llss->ia2.ia_valid &= ~ATTR_MTIME;
2073        }
2074
2075        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2076                llss->ia1.ia_valid &= ~ATTR_ATIME;
2077                llss->ia2.ia_valid &= ~ATTR_ATIME;
2078        }
2079
2080        /* update time if requested */
2081        rc = 0;
2082        if (llss->ia2.ia_valid != 0) {
2083                mutex_lock(&llss->inode1->i_mutex);
2084                rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2085                mutex_unlock(&llss->inode1->i_mutex);
2086        }
2087
2088        if (llss->ia1.ia_valid != 0) {
2089                int rc1;
2090
2091                mutex_lock(&llss->inode2->i_mutex);
2092                rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2093                mutex_unlock(&llss->inode2->i_mutex);
2094                if (rc == 0)
2095                        rc = rc1;
2096        }
2097
2098free:
2099        if (llss != NULL)
2100                OBD_FREE_PTR(llss);
2101
2102        return rc;
2103}
2104
2105static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2106{
2107        struct md_op_data       *op_data;
2108        int                      rc;
2109
2110        /* Non-root users are forbidden to set or clear flags which are
2111         * NOT defined in HSM_USER_MASK. */
2112        if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2113            !capable(CFS_CAP_SYS_ADMIN))
2114                return -EPERM;
2115
2116        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2117                                     LUSTRE_OPC_ANY, hss);
2118        if (IS_ERR(op_data))
2119                return PTR_ERR(op_data);
2120
2121        rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2122                           sizeof(*op_data), op_data, NULL);
2123
2124        ll_finish_md_op_data(op_data);
2125
2126        return rc;
2127}
2128
2129static int ll_hsm_import(struct inode *inode, struct file *file,
2130                         struct hsm_user_import *hui)
2131{
2132        struct hsm_state_set    *hss = NULL;
2133        struct iattr            *attr = NULL;
2134        int                      rc;
2135
2136
2137        if (!S_ISREG(inode->i_mode))
2138                return -EINVAL;
2139
2140        /* set HSM flags */
2141        hss = kzalloc(sizeof(*hss), GFP_NOFS);
2142        if (!hss) {
2143                rc = -ENOMEM;
2144                goto out;
2145        }
2146
2147        hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2148        hss->hss_archive_id = hui->hui_archive_id;
2149        hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2150        rc = ll_hsm_state_set(inode, hss);
2151        if (rc != 0)
2152                goto out;
2153
2154        attr = kzalloc(sizeof(*attr), GFP_NOFS);
2155        if (!attr) {
2156                rc = -ENOMEM;
2157                goto out;
2158        }
2159
2160        attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2161        attr->ia_mode |= S_IFREG;
2162        attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2163        attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2164        attr->ia_size = hui->hui_size;
2165        attr->ia_mtime.tv_sec = hui->hui_mtime;
2166        attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2167        attr->ia_atime.tv_sec = hui->hui_atime;
2168        attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2169
2170        attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2171                         ATTR_UID | ATTR_GID |
2172                         ATTR_MTIME | ATTR_MTIME_SET |
2173                         ATTR_ATIME | ATTR_ATIME_SET;
2174
2175        mutex_lock(&inode->i_mutex);
2176
2177        rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2178        if (rc == -ENODATA)
2179                rc = 0;
2180
2181        mutex_unlock(&inode->i_mutex);
2182
2183out:
2184        if (hss != NULL)
2185                OBD_FREE_PTR(hss);
2186
2187        if (attr != NULL)
2188                OBD_FREE_PTR(attr);
2189
2190        return rc;
2191}
2192
2193static long
2194ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2195{
2196        struct inode            *inode = file_inode(file);
2197        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2198        int                      flags, rc;
2199
2200        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2201               inode->i_generation, inode, cmd);
2202        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2203
2204        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2205        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2206                return -ENOTTY;
2207
2208        switch (cmd) {
2209        case LL_IOC_GETFLAGS:
2210                /* Get the current value of the file flags */
2211                return put_user(fd->fd_flags, (int *)arg);
2212        case LL_IOC_SETFLAGS:
2213        case LL_IOC_CLRFLAGS:
2214                /* Set or clear specific file flags */
2215                /* XXX This probably needs checks to ensure the flags are
2216                 *     not abused, and to handle any flag side effects.
2217                 */
2218                if (get_user(flags, (int *) arg))
2219                        return -EFAULT;
2220
2221                if (cmd == LL_IOC_SETFLAGS) {
2222                        if ((flags & LL_FILE_IGNORE_LOCK) &&
2223                            !(file->f_flags & O_DIRECT)) {
2224                                CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2225                                       current->comm);
2226                                return -EINVAL;
2227                        }
2228
2229                        fd->fd_flags |= flags;
2230                } else {
2231                        fd->fd_flags &= ~flags;
2232                }
2233                return 0;
2234        case LL_IOC_LOV_SETSTRIPE:
2235                return ll_lov_setstripe(inode, file, arg);
2236        case LL_IOC_LOV_SETEA:
2237                return ll_lov_setea(inode, file, arg);
2238        case LL_IOC_LOV_SWAP_LAYOUTS: {
2239                struct file *file2;
2240                struct lustre_swap_layouts lsl;
2241
2242                if (copy_from_user(&lsl, (char *)arg,
2243                                       sizeof(struct lustre_swap_layouts)))
2244                        return -EFAULT;
2245
2246                if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2247                        return -EPERM;
2248
2249                file2 = fget(lsl.sl_fd);
2250                if (file2 == NULL)
2251                        return -EBADF;
2252
2253                rc = -EPERM;
2254                if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2255                        rc = ll_swap_layouts(file, file2, &lsl);
2256                fput(file2);
2257                return rc;
2258        }
2259        case LL_IOC_LOV_GETSTRIPE:
2260                return ll_lov_getstripe(inode, arg);
2261        case LL_IOC_RECREATE_OBJ:
2262                return ll_lov_recreate_obj(inode, arg);
2263        case LL_IOC_RECREATE_FID:
2264                return ll_lov_recreate_fid(inode, arg);
2265        case FSFILT_IOC_FIEMAP:
2266                return ll_ioctl_fiemap(inode, arg);
2267        case FSFILT_IOC_GETFLAGS:
2268        case FSFILT_IOC_SETFLAGS:
2269                return ll_iocontrol(inode, file, cmd, arg);
2270        case FSFILT_IOC_GETVERSION_OLD:
2271        case FSFILT_IOC_GETVERSION:
2272                return put_user(inode->i_generation, (int *)arg);
2273        case LL_IOC_GROUP_LOCK:
2274                return ll_get_grouplock(inode, file, arg);
2275        case LL_IOC_GROUP_UNLOCK:
2276                return ll_put_grouplock(inode, file, arg);
2277        case IOC_OBD_STATFS:
2278                return ll_obd_statfs(inode, (void *)arg);
2279
2280        /* We need to special case any other ioctls we want to handle,
2281         * to send them to the MDS/OST as appropriate and to properly
2282         * network encode the arg field.
2283        case FSFILT_IOC_SETVERSION_OLD:
2284        case FSFILT_IOC_SETVERSION:
2285        */
2286        case LL_IOC_FLUSHCTX:
2287                return ll_flush_ctx(inode);
2288        case LL_IOC_PATH2FID: {
2289                if (copy_to_user((void *)arg, ll_inode2fid(inode),
2290                                 sizeof(struct lu_fid)))
2291                        return -EFAULT;
2292
2293                return 0;
2294        }
2295        case OBD_IOC_FID2PATH:
2296                return ll_fid2path(inode, (void *)arg);
2297        case LL_IOC_DATA_VERSION: {
2298                struct ioc_data_version idv;
2299                int                     rc;
2300
2301                if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2302                        return -EFAULT;
2303
2304                rc = ll_data_version(inode, &idv.idv_version,
2305                                !(idv.idv_flags & LL_DV_NOFLUSH));
2306
2307                if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2308                        return -EFAULT;
2309
2310                return rc;
2311        }
2312
2313        case LL_IOC_GET_MDTIDX: {
2314                int mdtidx;
2315
2316                mdtidx = ll_get_mdt_idx(inode);
2317                if (mdtidx < 0)
2318                        return mdtidx;
2319
2320                if (put_user((int)mdtidx, (int *)arg))
2321                        return -EFAULT;
2322
2323                return 0;
2324        }
2325        case OBD_IOC_GETDTNAME:
2326        case OBD_IOC_GETMDNAME:
2327                return ll_get_obd_name(inode, cmd, arg);
2328        case LL_IOC_HSM_STATE_GET: {
2329                struct md_op_data       *op_data;
2330                struct hsm_user_state   *hus;
2331                int                      rc;
2332
2333                hus = kzalloc(sizeof(*hus), GFP_NOFS);
2334                if (!hus)
2335                        return -ENOMEM;
2336
2337                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2338                                             LUSTRE_OPC_ANY, hus);
2339                if (IS_ERR(op_data)) {
2340                        OBD_FREE_PTR(hus);
2341                        return PTR_ERR(op_data);
2342                }
2343
2344                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2345                                   op_data, NULL);
2346
2347                if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2348                        rc = -EFAULT;
2349
2350                ll_finish_md_op_data(op_data);
2351                OBD_FREE_PTR(hus);
2352                return rc;
2353        }
2354        case LL_IOC_HSM_STATE_SET: {
2355                struct hsm_state_set    *hss;
2356                int                      rc;
2357
2358                hss = kzalloc(sizeof(*hss), GFP_NOFS);
2359                if (!hss)
2360                        return -ENOMEM;
2361
2362                if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2363                        OBD_FREE_PTR(hss);
2364                        return -EFAULT;
2365                }
2366
2367                rc = ll_hsm_state_set(inode, hss);
2368
2369                OBD_FREE_PTR(hss);
2370                return rc;
2371        }
2372        case LL_IOC_HSM_ACTION: {
2373                struct md_op_data               *op_data;
2374                struct hsm_current_action       *hca;
2375                int                              rc;
2376
2377                hca = kzalloc(sizeof(*hca), GFP_NOFS);
2378                if (!hca)
2379                        return -ENOMEM;
2380
2381                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2382                                             LUSTRE_OPC_ANY, hca);
2383                if (IS_ERR(op_data)) {
2384                        OBD_FREE_PTR(hca);
2385                        return PTR_ERR(op_data);
2386                }
2387
2388                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2389                                   op_data, NULL);
2390
2391                if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2392                        rc = -EFAULT;
2393
2394                ll_finish_md_op_data(op_data);
2395                OBD_FREE_PTR(hca);
2396                return rc;
2397        }
2398        case LL_IOC_SET_LEASE: {
2399                struct ll_inode_info *lli = ll_i2info(inode);
2400                struct obd_client_handle *och = NULL;
2401                bool lease_broken;
2402                fmode_t mode = 0;
2403
2404                switch (arg) {
2405                case F_WRLCK:
2406                        if (!(file->f_mode & FMODE_WRITE))
2407                                return -EPERM;
2408                        mode = FMODE_WRITE;
2409                        break;
2410                case F_RDLCK:
2411                        if (!(file->f_mode & FMODE_READ))
2412                                return -EPERM;
2413                        mode = FMODE_READ;
2414                        break;
2415                case F_UNLCK:
2416                        mutex_lock(&lli->lli_och_mutex);
2417                        if (fd->fd_lease_och != NULL) {
2418                                och = fd->fd_lease_och;
2419                                fd->fd_lease_och = NULL;
2420                        }
2421                        mutex_unlock(&lli->lli_och_mutex);
2422
2423                        if (och != NULL) {
2424                                mode = och->och_flags &
2425                                       (FMODE_READ|FMODE_WRITE);
2426                                rc = ll_lease_close(och, inode, &lease_broken);
2427                                if (rc == 0 && lease_broken)
2428                                        mode = 0;
2429                        } else {
2430                                rc = -ENOLCK;
2431                        }
2432
2433                        /* return the type of lease or error */
2434                        return rc < 0 ? rc : (int)mode;
2435                default:
2436                        return -EINVAL;
2437                }
2438
2439                CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2440
2441                /* apply for lease */
2442                och = ll_lease_open(inode, file, mode, 0);
2443                if (IS_ERR(och))
2444                        return PTR_ERR(och);
2445
2446                rc = 0;
2447                mutex_lock(&lli->lli_och_mutex);
2448                if (fd->fd_lease_och == NULL) {
2449                        fd->fd_lease_och = och;
2450                        och = NULL;
2451                }
2452                mutex_unlock(&lli->lli_och_mutex);
2453                if (och != NULL) {
2454                        /* impossible now that only excl is supported for now */
2455                        ll_lease_close(och, inode, &lease_broken);
2456                        rc = -EBUSY;
2457                }
2458                return rc;
2459        }
2460        case LL_IOC_GET_LEASE: {
2461                struct ll_inode_info *lli = ll_i2info(inode);
2462                struct ldlm_lock *lock = NULL;
2463
2464                rc = 0;
2465                mutex_lock(&lli->lli_och_mutex);
2466                if (fd->fd_lease_och != NULL) {
2467                        struct obd_client_handle *och = fd->fd_lease_och;
2468
2469                        lock = ldlm_handle2lock(&och->och_lease_handle);
2470                        if (lock != NULL) {
2471                                lock_res_and_lock(lock);
2472                                if (!ldlm_is_cancel(lock))
2473                                        rc = och->och_flags &
2474                                                (FMODE_READ | FMODE_WRITE);
2475                                unlock_res_and_lock(lock);
2476                                ldlm_lock_put(lock);
2477                        }
2478                }
2479                mutex_unlock(&lli->lli_och_mutex);
2480                return rc;
2481        }
2482        case LL_IOC_HSM_IMPORT: {
2483                struct hsm_user_import *hui;
2484
2485                hui = kzalloc(sizeof(*hui), GFP_NOFS);
2486                if (!hui)
2487                        return -ENOMEM;
2488
2489                if (copy_from_user(hui, (void *)arg, sizeof(*hui))) {
2490                        OBD_FREE_PTR(hui);
2491                        return -EFAULT;
2492                }
2493
2494                rc = ll_hsm_import(inode, file, hui);
2495
2496                OBD_FREE_PTR(hui);
2497                return rc;
2498        }
2499        default: {
2500                int err;
2501
2502                if (LLIOC_STOP ==
2503                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2504                        return err;
2505
2506                return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2507                                     (void *)arg);
2508        }
2509        }
2510}
2511
2512
2513static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2514{
2515        struct inode *inode = file_inode(file);
2516        loff_t retval, eof = 0;
2517
2518        retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2519                           (origin == SEEK_CUR) ? file->f_pos : 0);
2520        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2521               inode->i_ino, inode->i_generation, inode, retval, retval,
2522               origin);
2523        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2524
2525        if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2526                retval = ll_glimpse_size(inode);
2527                if (retval != 0)
2528                        return retval;
2529                eof = i_size_read(inode);
2530        }
2531
2532        retval = generic_file_llseek_size(file, offset, origin,
2533                                          ll_file_maxbytes(inode), eof);
2534        return retval;
2535}
2536
2537static int ll_flush(struct file *file, fl_owner_t id)
2538{
2539        struct inode *inode = file_inode(file);
2540        struct ll_inode_info *lli = ll_i2info(inode);
2541        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2542        int rc, err;
2543
2544        LASSERT(!S_ISDIR(inode->i_mode));
2545
2546        /* catch async errors that were recorded back when async writeback
2547         * failed for pages in this mapping. */
2548        rc = lli->lli_async_rc;
2549        lli->lli_async_rc = 0;
2550        err = lov_read_and_clear_async_rc(lli->lli_clob);
2551        if (rc == 0)
2552                rc = err;
2553
2554        /* The application has been told write failure already.
2555         * Do not report failure again. */
2556        if (fd->fd_write_failed)
2557                return 0;
2558        return rc ? -EIO : 0;
2559}
2560
2561/**
2562 * Called to make sure a portion of file has been written out.
2563 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2564 *
2565 * Return how many pages have been written.
2566 */
2567int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2568                       enum cl_fsync_mode mode, int ignore_layout)
2569{
2570        struct cl_env_nest nest;
2571        struct lu_env *env;
2572        struct cl_io *io;
2573        struct obd_capa *capa = NULL;
2574        struct cl_fsync_io *fio;
2575        int result;
2576
2577        if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2578            mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2579                return -EINVAL;
2580
2581        env = cl_env_nested_get(&nest);
2582        if (IS_ERR(env))
2583                return PTR_ERR(env);
2584
2585        capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2586
2587        io = ccc_env_thread_io(env);
2588        io->ci_obj = cl_i2info(inode)->lli_clob;
2589        io->ci_ignore_layout = ignore_layout;
2590
2591        /* initialize parameters for sync */
2592        fio = &io->u.ci_fsync;
2593        fio->fi_capa = capa;
2594        fio->fi_start = start;
2595        fio->fi_end = end;
2596        fio->fi_fid = ll_inode2fid(inode);
2597        fio->fi_mode = mode;
2598        fio->fi_nr_written = 0;
2599
2600        if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2601                result = cl_io_loop(env, io);
2602        else
2603                result = io->ci_result;
2604        if (result == 0)
2605                result = fio->fi_nr_written;
2606        cl_io_fini(env, io);
2607        cl_env_nested_put(&nest, env);
2608
2609        capa_put(capa);
2610
2611        return result;
2612}
2613
2614int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2615{
2616        struct inode *inode = file_inode(file);
2617        struct ll_inode_info *lli = ll_i2info(inode);
2618        struct ptlrpc_request *req;
2619        struct obd_capa *oc;
2620        int rc, err;
2621
2622        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2623               inode->i_generation, inode);
2624        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2625
2626        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2627        mutex_lock(&inode->i_mutex);
2628
2629        /* catch async errors that were recorded back when async writeback
2630         * failed for pages in this mapping. */
2631        if (!S_ISDIR(inode->i_mode)) {
2632                err = lli->lli_async_rc;
2633                lli->lli_async_rc = 0;
2634                if (rc == 0)
2635                        rc = err;
2636                err = lov_read_and_clear_async_rc(lli->lli_clob);
2637                if (rc == 0)
2638                        rc = err;
2639        }
2640
2641        oc = ll_mdscapa_get(inode);
2642        err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2643                      &req);
2644        capa_put(oc);
2645        if (!rc)
2646                rc = err;
2647        if (!err)
2648                ptlrpc_req_finished(req);
2649
2650        if (S_ISREG(inode->i_mode)) {
2651                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2652
2653                err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2654                if (rc == 0 && err < 0)
2655                        rc = err;
2656                if (rc < 0)
2657                        fd->fd_write_failed = true;
2658                else
2659                        fd->fd_write_failed = false;
2660        }
2661
2662        mutex_unlock(&inode->i_mutex);
2663        return rc;
2664}
2665
2666static int
2667ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2668{
2669        struct inode *inode = file_inode(file);
2670        struct ll_sb_info *sbi = ll_i2sbi(inode);
2671        struct ldlm_enqueue_info einfo = {
2672                .ei_type        = LDLM_FLOCK,
2673                .ei_cb_cp       = ldlm_flock_completion_ast,
2674                .ei_cbdata      = file_lock,
2675        };
2676        struct md_op_data *op_data;
2677        struct lustre_handle lockh = {0};
2678        ldlm_policy_data_t flock = {{0}};
2679        __u64 flags = 0;
2680        int rc;
2681        int rc2 = 0;
2682
2683        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2684               inode->i_ino, file_lock);
2685
2686        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2687
2688        if (file_lock->fl_flags & FL_FLOCK)
2689                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2690        else if (!(file_lock->fl_flags & FL_POSIX))
2691                return -EINVAL;
2692
2693        flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2694        flock.l_flock.pid = file_lock->fl_pid;
2695        flock.l_flock.start = file_lock->fl_start;
2696        flock.l_flock.end = file_lock->fl_end;
2697
2698        /* Somewhat ugly workaround for svc lockd.
2699         * lockd installs custom fl_lmops->lm_compare_owner that checks
2700         * for the fl_owner to be the same (which it always is on local node
2701         * I guess between lockd processes) and then compares pid.
2702         * As such we assign pid to the owner field to make it all work,
2703         * conflict with normal locks is unlikely since pid space and
2704         * pointer space for current->files are not intersecting */
2705        if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2706                flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2707
2708        switch (file_lock->fl_type) {
2709        case F_RDLCK:
2710                einfo.ei_mode = LCK_PR;
2711                break;
2712        case F_UNLCK:
2713                /* An unlock request may or may not have any relation to
2714                 * existing locks so we may not be able to pass a lock handle
2715                 * via a normal ldlm_lock_cancel() request. The request may even
2716                 * unlock a byte range in the middle of an existing lock. In
2717                 * order to process an unlock request we need all of the same
2718                 * information that is given with a normal read or write record
2719                 * lock request. To avoid creating another ldlm unlock (cancel)
2720                 * message we'll treat a LCK_NL flock request as an unlock. */
2721                einfo.ei_mode = LCK_NL;
2722                break;
2723        case F_WRLCK:
2724                einfo.ei_mode = LCK_PW;
2725                break;
2726        default:
2727                CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2728                        file_lock->fl_type);
2729                return -ENOTSUPP;
2730        }
2731
2732        switch (cmd) {
2733        case F_SETLKW:
2734#ifdef F_SETLKW64
2735        case F_SETLKW64:
2736#endif
2737                flags = 0;
2738                break;
2739        case F_SETLK:
2740#ifdef F_SETLK64
2741        case F_SETLK64:
2742#endif
2743                flags = LDLM_FL_BLOCK_NOWAIT;
2744                break;
2745        case F_GETLK:
2746#ifdef F_GETLK64
2747        case F_GETLK64:
2748#endif
2749                flags = LDLM_FL_TEST_LOCK;
2750                /* Save the old mode so that if the mode in the lock changes we
2751                 * can decrement the appropriate reader or writer refcount. */
2752                file_lock->fl_type = einfo.ei_mode;
2753                break;
2754        default:
2755                CERROR("unknown fcntl lock command: %d\n", cmd);
2756                return -EINVAL;
2757        }
2758
2759        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2760                                     LUSTRE_OPC_ANY, NULL);
2761        if (IS_ERR(op_data))
2762                return PTR_ERR(op_data);
2763
2764        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2765               inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2766               flock.l_flock.start, flock.l_flock.end);
2767
2768        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2769                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2770
2771        if ((file_lock->fl_flags & FL_FLOCK) &&
2772            (rc == 0 || file_lock->fl_type == F_UNLCK))
2773                rc2  = flock_lock_file_wait(file, file_lock);
2774        if ((file_lock->fl_flags & FL_POSIX) &&
2775            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2776            !(flags & LDLM_FL_TEST_LOCK))
2777                rc2  = posix_lock_file_wait(file, file_lock);
2778
2779        if (rc2 && file_lock->fl_type != F_UNLCK) {
2780                einfo.ei_mode = LCK_NL;
2781                md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2782                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2783                rc = rc2;
2784        }
2785
2786        ll_finish_md_op_data(op_data);
2787
2788        return rc;
2789}
2790
2791static int
2792ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2793{
2794        return -ENOSYS;
2795}
2796
2797/**
2798 * test if some locks matching bits and l_req_mode are acquired
2799 * - bits can be in different locks
2800 * - if found clear the common lock bits in *bits
2801 * - the bits not found, are kept in *bits
2802 * \param inode [IN]
2803 * \param bits [IN] searched lock bits [IN]
2804 * \param l_req_mode [IN] searched lock mode
2805 * \retval boolean, true iff all bits are found
2806 */
2807int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2808{
2809        struct lustre_handle lockh;
2810        ldlm_policy_data_t policy;
2811        ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2812                                (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2813        struct lu_fid *fid;
2814        __u64 flags;
2815        int i;
2816
2817        if (!inode)
2818               return 0;
2819
2820        fid = &ll_i2info(inode)->lli_fid;
2821        CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2822               ldlm_lockname[mode]);
2823
2824        flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2825        for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2826                policy.l_inodebits.bits = *bits & (1 << i);
2827                if (policy.l_inodebits.bits == 0)
2828                        continue;
2829
2830                if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2831                                  &policy, mode, &lockh)) {
2832                        struct ldlm_lock *lock;
2833
2834                        lock = ldlm_handle2lock(&lockh);
2835                        if (lock) {
2836                                *bits &=
2837                                      ~(lock->l_policy_data.l_inodebits.bits);
2838                                LDLM_LOCK_PUT(lock);
2839                        } else {
2840                                *bits &= ~policy.l_inodebits.bits;
2841                        }
2842                }
2843        }
2844        return *bits == 0;
2845}
2846
2847ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2848                            struct lustre_handle *lockh, __u64 flags,
2849                            ldlm_mode_t mode)
2850{
2851        ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2852        struct lu_fid *fid;
2853        ldlm_mode_t rc;
2854
2855        fid = &ll_i2info(inode)->lli_fid;
2856        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2857
2858        rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2859                           fid, LDLM_IBITS, &policy, mode, lockh);
2860
2861        return rc;
2862}
2863
2864static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2865{
2866        /* Already unlinked. Just update nlink and return success */
2867        if (rc == -ENOENT) {
2868                clear_nlink(inode);
2869                /* This path cannot be hit for regular files unless in
2870                 * case of obscure races, so no need to validate size.
2871                 */
2872                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2873                        return 0;
2874        } else if (rc != 0) {
2875                CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2876                             "%s: revalidate FID "DFID" error: rc = %d\n",
2877                             ll_get_fsname(inode->i_sb, NULL, 0),
2878                             PFID(ll_inode2fid(inode)), rc);
2879        }
2880
2881        return rc;
2882}
2883
2884static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2885{
2886        struct inode *inode = dentry->d_inode;
2887        struct ptlrpc_request *req = NULL;
2888        struct obd_export *exp;
2889        int rc = 0;
2890
2891        LASSERT(inode != NULL);
2892
2893        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2894               inode->i_ino, inode->i_generation, inode, dentry);
2895
2896        exp = ll_i2mdexp(inode);
2897
2898        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2899         *      But under CMD case, it caused some lock issues, should be fixed
2900         *      with new CMD ibits lock. See bug 12718 */
2901        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2902                struct lookup_intent oit = { .it_op = IT_GETATTR };
2903                struct md_op_data *op_data;
2904
2905                if (ibits == MDS_INODELOCK_LOOKUP)
2906                        oit.it_op = IT_LOOKUP;
2907
2908                /* Call getattr by fid, so do not provide name at all. */
2909                op_data = ll_prep_md_op_data(NULL, dentry->d_inode,
2910                                             dentry->d_inode, NULL, 0, 0,
2911                                             LUSTRE_OPC_ANY, NULL);
2912                if (IS_ERR(op_data))
2913                        return PTR_ERR(op_data);
2914
2915                oit.it_create_mode |= M_CHECK_STALE;
2916                rc = md_intent_lock(exp, op_data, NULL, 0,
2917                                    /* we are not interested in name
2918                                       based lookup */
2919                                    &oit, 0, &req,
2920                                    ll_md_blocking_ast, 0);
2921                ll_finish_md_op_data(op_data);
2922                oit.it_create_mode &= ~M_CHECK_STALE;
2923                if (rc < 0) {
2924                        rc = ll_inode_revalidate_fini(inode, rc);
2925                        goto out;
2926                }
2927
2928                rc = ll_revalidate_it_finish(req, &oit, dentry);
2929                if (rc != 0) {
2930                        ll_intent_release(&oit);
2931                        goto out;
2932                }
2933
2934                /* Unlinked? Unhash dentry, so it is not picked up later by
2935                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2936                   here to preserve get_cwd functionality on 2.6.
2937                   Bug 10503 */
2938                if (!dentry->d_inode->i_nlink)
2939                        d_lustre_invalidate(dentry, 0);
2940
2941                ll_lookup_finish_locks(&oit, dentry);
2942        } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2943                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2944                u64 valid = OBD_MD_FLGETATTR;
2945                struct md_op_data *op_data;
2946                int ealen = 0;
2947
2948                if (S_ISREG(inode->i_mode)) {
2949                        rc = ll_get_default_mdsize(sbi, &ealen);
2950                        if (rc)
2951                                return rc;
2952                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2953                }
2954
2955                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2956                                             0, ealen, LUSTRE_OPC_ANY,
2957                                             NULL);
2958                if (IS_ERR(op_data))
2959                        return PTR_ERR(op_data);
2960
2961                op_data->op_valid = valid;
2962                /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2963                 * capa for this inode. Because we only keep capas of dirs
2964                 * fresh. */
2965                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2966                ll_finish_md_op_data(op_data);
2967                if (rc) {
2968                        rc = ll_inode_revalidate_fini(inode, rc);
2969                        return rc;
2970                }
2971
2972                rc = ll_prep_inode(&inode, req, NULL, NULL);
2973        }
2974out:
2975        ptlrpc_req_finished(req);
2976        return rc;
2977}
2978
2979static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2980{
2981        struct inode *inode = dentry->d_inode;
2982        int rc;
2983
2984        rc = __ll_inode_revalidate(dentry, ibits);
2985        if (rc != 0)
2986                return rc;
2987
2988        /* if object isn't regular file, don't validate size */
2989        if (!S_ISREG(inode->i_mode)) {
2990                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2991                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2992                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2993        } else {
2994                /* In case of restore, the MDT has the right size and has
2995                 * already send it back without granting the layout lock,
2996                 * inode is up-to-date so glimpse is useless.
2997                 * Also to glimpse we need the layout, in case of a running
2998                 * restore the MDT holds the layout lock so the glimpse will
2999                 * block up to the end of restore (getattr will block)
3000                 */
3001                if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3002                        rc = ll_glimpse_size(inode);
3003        }
3004        return rc;
3005}
3006
3007int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3008{
3009        struct inode *inode = de->d_inode;
3010        struct ll_sb_info *sbi = ll_i2sbi(inode);
3011        struct ll_inode_info *lli = ll_i2info(inode);
3012        int res = 0;
3013
3014        res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
3015                                      MDS_INODELOCK_LOOKUP);
3016        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3017
3018        if (res)
3019                return res;
3020
3021        stat->dev = inode->i_sb->s_dev;
3022        if (ll_need_32bit_api(sbi))
3023                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3024        else
3025                stat->ino = inode->i_ino;
3026        stat->mode = inode->i_mode;
3027        stat->nlink = inode->i_nlink;
3028        stat->uid = inode->i_uid;
3029        stat->gid = inode->i_gid;
3030        stat->rdev = inode->i_rdev;
3031        stat->atime = inode->i_atime;
3032        stat->mtime = inode->i_mtime;
3033        stat->ctime = inode->i_ctime;
3034        stat->blksize = 1 << inode->i_blkbits;
3035
3036        stat->size = i_size_read(inode);
3037        stat->blocks = inode->i_blocks;
3038
3039        return 0;
3040}
3041
3042static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3043                     __u64 start, __u64 len)
3044{
3045        int rc;
3046        size_t num_bytes;
3047        struct ll_user_fiemap *fiemap;
3048        unsigned int extent_count = fieinfo->fi_extents_max;
3049
3050        num_bytes = sizeof(*fiemap) + (extent_count *
3051                                       sizeof(struct ll_fiemap_extent));
3052        OBD_ALLOC_LARGE(fiemap, num_bytes);
3053
3054        if (fiemap == NULL)
3055                return -ENOMEM;
3056
3057        fiemap->fm_flags = fieinfo->fi_flags;
3058        fiemap->fm_extent_count = fieinfo->fi_extents_max;
3059        fiemap->fm_start = start;
3060        fiemap->fm_length = len;
3061        if (extent_count > 0)
3062                memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3063                       sizeof(struct ll_fiemap_extent));
3064
3065        rc = ll_do_fiemap(inode, fiemap, num_bytes);
3066
3067        fieinfo->fi_flags = fiemap->fm_flags;
3068        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3069        if (extent_count > 0)
3070                memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3071                       fiemap->fm_mapped_extents *
3072                       sizeof(struct ll_fiemap_extent));
3073
3074        OBD_FREE_LARGE(fiemap, num_bytes);
3075        return rc;
3076}
3077
3078struct posix_acl *ll_get_acl(struct inode *inode, int type)
3079{
3080        struct ll_inode_info *lli = ll_i2info(inode);
3081        struct posix_acl *acl = NULL;
3082
3083        spin_lock(&lli->lli_lock);
3084        /* VFS' acl_permission_check->check_acl will release the refcount */
3085        acl = posix_acl_dup(lli->lli_posix_acl);
3086        spin_unlock(&lli->lli_lock);
3087
3088        return acl;
3089}
3090
3091
3092int ll_inode_permission(struct inode *inode, int mask)
3093{
3094        int rc = 0;
3095
3096#ifdef MAY_NOT_BLOCK
3097        if (mask & MAY_NOT_BLOCK)
3098                return -ECHILD;
3099#endif
3100
3101       /* as root inode are NOT getting validated in lookup operation,
3102        * need to do it before permission check. */
3103
3104        if (is_root_inode(inode)) {
3105                rc = __ll_inode_revalidate(inode->i_sb->s_root,
3106                                           MDS_INODELOCK_LOOKUP);
3107                if (rc)
3108                        return rc;
3109        }
3110
3111        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3112               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3113
3114        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3115                return lustre_check_remote_perm(inode, mask);
3116
3117        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3118        rc = generic_permission(inode, mask);
3119
3120        return rc;
3121}
3122
3123/* -o localflock - only provides locally consistent flock locks */
3124struct file_operations ll_file_operations = {
3125        .read      = new_sync_read,
3126        .read_iter = ll_file_read_iter,
3127        .write    = new_sync_write,
3128        .write_iter = ll_file_write_iter,
3129        .unlocked_ioctl = ll_file_ioctl,
3130        .open      = ll_file_open,
3131        .release        = ll_file_release,
3132        .mmap      = ll_file_mmap,
3133        .llseek  = ll_file_seek,
3134        .splice_read    = ll_file_splice_read,
3135        .fsync    = ll_fsync,
3136        .flush    = ll_flush
3137};
3138
3139struct file_operations ll_file_operations_flock = {
3140        .read      = new_sync_read,
3141        .read_iter    = ll_file_read_iter,
3142        .write    = new_sync_write,
3143        .write_iter   = ll_file_write_iter,
3144        .unlocked_ioctl = ll_file_ioctl,
3145        .open      = ll_file_open,
3146        .release        = ll_file_release,
3147        .mmap      = ll_file_mmap,
3148        .llseek  = ll_file_seek,
3149        .splice_read    = ll_file_splice_read,
3150        .fsync    = ll_fsync,
3151        .flush    = ll_flush,
3152        .flock    = ll_file_flock,
3153        .lock      = ll_file_flock
3154};
3155
3156/* These are for -o noflock - to return ENOSYS on flock calls */
3157struct file_operations ll_file_operations_noflock = {
3158        .read      = new_sync_read,
3159        .read_iter    = ll_file_read_iter,
3160        .write    = new_sync_write,
3161        .write_iter   = ll_file_write_iter,
3162        .unlocked_ioctl = ll_file_ioctl,
3163        .open      = ll_file_open,
3164        .release        = ll_file_release,
3165        .mmap      = ll_file_mmap,
3166        .llseek  = ll_file_seek,
3167        .splice_read    = ll_file_splice_read,
3168        .fsync    = ll_fsync,
3169        .flush    = ll_flush,
3170        .flock    = ll_file_noflock,
3171        .lock      = ll_file_noflock
3172};
3173
3174struct inode_operations ll_file_inode_operations = {
3175        .setattr        = ll_setattr,
3176        .getattr        = ll_getattr,
3177        .permission     = ll_inode_permission,
3178        .setxattr       = ll_setxattr,
3179        .getxattr       = ll_getxattr,
3180        .listxattr      = ll_listxattr,
3181        .removexattr    = ll_removexattr,
3182        .fiemap         = ll_fiemap,
3183        .get_acl        = ll_get_acl,
3184};
3185
3186/* dynamic ioctl number support routines */
3187static struct llioc_ctl_data {
3188        struct rw_semaphore     ioc_sem;
3189        struct list_head              ioc_head;
3190} llioc = {
3191        __RWSEM_INITIALIZER(llioc.ioc_sem),
3192        LIST_HEAD_INIT(llioc.ioc_head)
3193};
3194
3195
3196struct llioc_data {
3197        struct list_head              iocd_list;
3198        unsigned int        iocd_size;
3199        llioc_callback_t        iocd_cb;
3200        unsigned int        iocd_count;
3201        unsigned int        iocd_cmd[0];
3202};
3203
3204void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3205{
3206        unsigned int size;
3207        struct llioc_data *in_data = NULL;
3208
3209        if (cb == NULL || cmd == NULL ||
3210            count > LLIOC_MAX_CMD || count < 0)
3211                return NULL;
3212
3213        size = sizeof(*in_data) + count * sizeof(unsigned int);
3214        in_data = kzalloc(size, GFP_NOFS);
3215        if (!in_data)
3216                return NULL;
3217
3218        memset(in_data, 0, sizeof(*in_data));
3219        in_data->iocd_size = size;
3220        in_data->iocd_cb = cb;
3221        in_data->iocd_count = count;
3222        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3223
3224        down_write(&llioc.ioc_sem);
3225        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3226        up_write(&llioc.ioc_sem);
3227
3228        return in_data;
3229}
3230
3231void ll_iocontrol_unregister(void *magic)
3232{
3233        struct llioc_data *tmp;
3234
3235        if (magic == NULL)
3236                return;
3237
3238        down_write(&llioc.ioc_sem);
3239        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3240                if (tmp == magic) {
3241                        unsigned int size = tmp->iocd_size;
3242
3243                        list_del(&tmp->iocd_list);
3244                        up_write(&llioc.ioc_sem);
3245
3246                        OBD_FREE(tmp, size);
3247                        return;
3248                }
3249        }
3250        up_write(&llioc.ioc_sem);
3251
3252        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3253}
3254
3255EXPORT_SYMBOL(ll_iocontrol_register);
3256EXPORT_SYMBOL(ll_iocontrol_unregister);
3257
3258static enum llioc_iter
3259ll_iocontrol_call(struct inode *inode, struct file *file,
3260                  unsigned int cmd, unsigned long arg, int *rcp)
3261{
3262        enum llioc_iter ret = LLIOC_CONT;
3263        struct llioc_data *data;
3264        int rc = -EINVAL, i;
3265
3266        down_read(&llioc.ioc_sem);
3267        list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3268                for (i = 0; i < data->iocd_count; i++) {
3269                        if (cmd != data->iocd_cmd[i])
3270                                continue;
3271
3272                        ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3273                        break;
3274                }
3275
3276                if (ret == LLIOC_STOP)
3277                        break;
3278        }
3279        up_read(&llioc.ioc_sem);
3280
3281        if (rcp)
3282                *rcp = rc;
3283        return ret;
3284}
3285
3286int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3287{
3288        struct ll_inode_info *lli = ll_i2info(inode);
3289        struct cl_env_nest nest;
3290        struct lu_env *env;
3291        int result;
3292
3293        if (lli->lli_clob == NULL)
3294                return 0;
3295
3296        env = cl_env_nested_get(&nest);
3297        if (IS_ERR(env))
3298                return PTR_ERR(env);
3299
3300        result = cl_conf_set(env, lli->lli_clob, conf);
3301        cl_env_nested_put(&nest, env);
3302
3303        if (conf->coc_opc == OBJECT_CONF_SET) {
3304                struct ldlm_lock *lock = conf->coc_lock;
3305
3306                LASSERT(lock != NULL);
3307                LASSERT(ldlm_has_layout(lock));
3308                if (result == 0) {
3309                        /* it can only be allowed to match after layout is
3310                         * applied to inode otherwise false layout would be
3311                         * seen. Applying layout should happen before dropping
3312                         * the intent lock. */
3313                        ldlm_lock_allow_match(lock);
3314                }
3315        }
3316        return result;
3317}
3318
3319/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3320static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3321
3322{
3323        struct ll_sb_info *sbi = ll_i2sbi(inode);
3324        struct obd_capa *oc;
3325        struct ptlrpc_request *req;
3326        struct mdt_body *body;
3327        void *lvbdata;
3328        void *lmm;
3329        int lmmsize;
3330        int rc;
3331
3332        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3333               PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3334               lock->l_lvb_data, lock->l_lvb_len);
3335
3336        if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3337                return 0;
3338
3339        /* if layout lock was granted right away, the layout is returned
3340         * within DLM_LVB of dlm reply; otherwise if the lock was ever
3341         * blocked and then granted via completion ast, we have to fetch
3342         * layout here. Please note that we can't use the LVB buffer in
3343         * completion AST because it doesn't have a large enough buffer */
3344        oc = ll_mdscapa_get(inode);
3345        rc = ll_get_default_mdsize(sbi, &lmmsize);
3346        if (rc == 0)
3347                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3348                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3349                                lmmsize, 0, &req);
3350        capa_put(oc);
3351        if (rc < 0)
3352                return rc;
3353
3354        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3355        if (body == NULL) {
3356                rc = -EPROTO;
3357                goto out;
3358        }
3359
3360        lmmsize = body->eadatasize;
3361        if (lmmsize == 0) /* empty layout */ {
3362                rc = 0;
3363                goto out;
3364        }
3365
3366        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3367        if (lmm == NULL) {
3368                rc = -EFAULT;
3369                goto out;
3370        }
3371
3372        OBD_ALLOC_LARGE(lvbdata, lmmsize);
3373        if (lvbdata == NULL) {
3374                rc = -ENOMEM;
3375                goto out;
3376        }
3377
3378        memcpy(lvbdata, lmm, lmmsize);
3379        lock_res_and_lock(lock);
3380        if (lock->l_lvb_data != NULL)
3381                OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3382
3383        lock->l_lvb_data = lvbdata;
3384        lock->l_lvb_len = lmmsize;
3385        unlock_res_and_lock(lock);
3386
3387out:
3388        ptlrpc_req_finished(req);
3389        return rc;
3390}
3391
3392/**
3393 * Apply the layout to the inode. Layout lock is held and will be released
3394 * in this function.
3395 */
3396static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3397                                struct inode *inode, __u32 *gen, bool reconf)
3398{
3399        struct ll_inode_info *lli = ll_i2info(inode);
3400        struct ll_sb_info    *sbi = ll_i2sbi(inode);
3401        struct ldlm_lock *lock;
3402        struct lustre_md md = { NULL };
3403        struct cl_object_conf conf;
3404        int rc = 0;
3405        bool lvb_ready;
3406        bool wait_layout = false;
3407
3408        LASSERT(lustre_handle_is_used(lockh));
3409
3410        lock = ldlm_handle2lock(lockh);
3411        LASSERT(lock != NULL);
3412        LASSERT(ldlm_has_layout(lock));
3413
3414        LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3415                   inode, PFID(&lli->lli_fid), reconf);
3416
3417        /* in case this is a caching lock and reinstate with new inode */
3418        md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3419
3420        lock_res_and_lock(lock);
3421        lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3422        unlock_res_and_lock(lock);
3423        /* checking lvb_ready is racy but this is okay. The worst case is
3424         * that multi processes may configure the file on the same time. */
3425        if (lvb_ready || !reconf) {
3426                rc = -ENODATA;
3427                if (lvb_ready) {
3428                        /* layout_gen must be valid if layout lock is not
3429                         * cancelled and stripe has already set */
3430                        *gen = ll_layout_version_get(lli);
3431                        rc = 0;
3432                }
3433                goto out;
3434        }
3435
3436        rc = ll_layout_fetch(inode, lock);
3437        if (rc < 0)
3438                goto out;
3439
3440        /* for layout lock, lmm is returned in lock's lvb.
3441         * lvb_data is immutable if the lock is held so it's safe to access it
3442         * without res lock. See the description in ldlm_lock_decref_internal()
3443         * for the condition to free lvb_data of layout lock */
3444        if (lock->l_lvb_data != NULL) {
3445                rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3446                                  lock->l_lvb_data, lock->l_lvb_len);
3447                if (rc >= 0) {
3448                        *gen = LL_LAYOUT_GEN_EMPTY;
3449                        if (md.lsm != NULL)
3450                                *gen = md.lsm->lsm_layout_gen;
3451                        rc = 0;
3452                } else {
3453                        CERROR("%s: file "DFID" unpackmd error: %d\n",
3454                                ll_get_fsname(inode->i_sb, NULL, 0),
3455                                PFID(&lli->lli_fid), rc);
3456                }
3457        }
3458        if (rc < 0)
3459                goto out;
3460
3461        /* set layout to file. Unlikely this will fail as old layout was
3462         * surely eliminated */
3463        memset(&conf, 0, sizeof(conf));
3464        conf.coc_opc = OBJECT_CONF_SET;
3465        conf.coc_inode = inode;
3466        conf.coc_lock = lock;
3467        conf.u.coc_md = &md;
3468        rc = ll_layout_conf(inode, &conf);
3469
3470        if (md.lsm != NULL)
3471                obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3472
3473        /* refresh layout failed, need to wait */
3474        wait_layout = rc == -EBUSY;
3475
3476out:
3477        LDLM_LOCK_PUT(lock);
3478        ldlm_lock_decref(lockh, mode);
3479
3480        /* wait for IO to complete if it's still being used. */
3481        if (wait_layout) {
3482                CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3483                        ll_get_fsname(inode->i_sb, NULL, 0),
3484                        inode, PFID(&lli->lli_fid));
3485
3486                memset(&conf, 0, sizeof(conf));
3487                conf.coc_opc = OBJECT_CONF_WAIT;
3488                conf.coc_inode = inode;
3489                rc = ll_layout_conf(inode, &conf);
3490                if (rc == 0)
3491                        rc = -EAGAIN;
3492
3493                CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3494                        PFID(&lli->lli_fid), rc);
3495        }
3496        return rc;
3497}
3498
3499/**
3500 * This function checks if there exists a LAYOUT lock on the client side,
3501 * or enqueues it if it doesn't have one in cache.
3502 *
3503 * This function will not hold layout lock so it may be revoked any time after
3504 * this function returns. Any operations depend on layout should be redone
3505 * in that case.
3506 *
3507 * This function should be called before lov_io_init() to get an uptodate
3508 * layout version, the caller should save the version number and after IO
3509 * is finished, this function should be called again to verify that layout
3510 * is not changed during IO time.
3511 */
3512int ll_layout_refresh(struct inode *inode, __u32 *gen)
3513{
3514        struct ll_inode_info  *lli = ll_i2info(inode);
3515        struct ll_sb_info     *sbi = ll_i2sbi(inode);
3516        struct md_op_data     *op_data;
3517        struct lookup_intent   it;
3518        struct lustre_handle   lockh;
3519        ldlm_mode_t            mode;
3520        struct ldlm_enqueue_info einfo = {
3521                .ei_type = LDLM_IBITS,
3522                .ei_mode = LCK_CR,
3523                .ei_cb_bl = ll_md_blocking_ast,
3524                .ei_cb_cp = ldlm_completion_ast,
3525        };
3526        int rc;
3527
3528        *gen = ll_layout_version_get(lli);
3529        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3530                return 0;
3531
3532        /* sanity checks */
3533        LASSERT(fid_is_sane(ll_inode2fid(inode)));
3534        LASSERT(S_ISREG(inode->i_mode));
3535
3536        /* take layout lock mutex to enqueue layout lock exclusively. */
3537        mutex_lock(&lli->lli_layout_mutex);
3538
3539again:
3540        /* mostly layout lock is caching on the local side, so try to match
3541         * it before grabbing layout lock mutex. */
3542        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3543                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3544        if (mode != 0) { /* hit cached lock */
3545                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3546                if (rc == -EAGAIN)
3547                        goto again;
3548
3549                mutex_unlock(&lli->lli_layout_mutex);
3550                return rc;
3551        }
3552
3553        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3554                        0, 0, LUSTRE_OPC_ANY, NULL);
3555        if (IS_ERR(op_data)) {
3556                mutex_unlock(&lli->lli_layout_mutex);
3557                return PTR_ERR(op_data);
3558        }
3559
3560        /* have to enqueue one */
3561        memset(&it, 0, sizeof(it));
3562        it.it_op = IT_LAYOUT;
3563        lockh.cookie = 0ULL;
3564
3565        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3566                        ll_get_fsname(inode->i_sb, NULL, 0), inode,
3567                        PFID(&lli->lli_fid));
3568
3569        rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3570                        NULL, 0, NULL, 0);
3571        if (it.d.lustre.it_data != NULL)
3572                ptlrpc_req_finished(it.d.lustre.it_data);
3573        it.d.lustre.it_data = NULL;
3574
3575        ll_finish_md_op_data(op_data);
3576
3577        mode = it.d.lustre.it_lock_mode;
3578        it.d.lustre.it_lock_mode = 0;
3579        ll_intent_drop_lock(&it);
3580
3581        if (rc == 0) {
3582                /* set lock data in case this is a new lock */
3583                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3584                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3585                if (rc == -EAGAIN)
3586                        goto again;
3587        }
3588        mutex_unlock(&lli->lli_layout_mutex);
3589
3590        return rc;
3591}
3592
3593/**
3594 *  This function send a restore request to the MDT
3595 */
3596int ll_layout_restore(struct inode *inode)
3597{
3598        struct hsm_user_request *hur;
3599        int                      len, rc;
3600
3601        len = sizeof(struct hsm_user_request) +
3602              sizeof(struct hsm_user_item);
3603        hur = kzalloc(len, GFP_NOFS);
3604        if (!hur)
3605                return -ENOMEM;
3606
3607        hur->hur_request.hr_action = HUA_RESTORE;
3608        hur->hur_request.hr_archive_id = 0;
3609        hur->hur_request.hr_flags = 0;
3610        memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3611               sizeof(hur->hur_user_item[0].hui_fid));
3612        hur->hur_user_item[0].hui_extent.length = -1;
3613        hur->hur_request.hr_itemcount = 1;
3614        rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3615                           len, hur, NULL);
3616        OBD_FREE(hur, len);
3617        return rc;
3618}
3619