linux/drivers/staging/lustre/lustre/llite/file.c
<<
>>
Prefs
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 *
  36 * lustre/llite/file.c
  37 *
  38 * Author: Peter Braam <braam@clusterfs.com>
  39 * Author: Phil Schwan <phil@clusterfs.com>
  40 * Author: Andreas Dilger <adilger@clusterfs.com>
  41 */
  42
  43#define DEBUG_SUBSYSTEM S_LLITE
  44#include "../include/lustre_dlm.h"
  45#include "../include/lustre_lite.h"
  46#include <linux/pagemap.h>
  47#include <linux/file.h>
  48#include "llite_internal.h"
  49#include "../include/lustre/ll_fiemap.h"
  50
  51#include "../include/cl_object.h"
  52
  /* Forward declarations of static helpers.  ll_put_grouplock() and
   * ll_lease_close() are called from ll_md_close() further down, before
   * any definition of them has been seen. */
  53static int
  54ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  55
  56static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  57                          bool *lease_broken);
  58
  59static enum llioc_iter
  60ll_iocontrol_call(struct inode *inode, struct file *file,
  61                  unsigned int cmd, unsigned long arg, int *rcp);
  62
  63static struct ll_file_data *ll_file_data_get(void)
  64{
  65        struct ll_file_data *fd;
  66
  67        fd = kmem_cache_alloc(ll_file_data_slab, GFP_NOFS | __GFP_ZERO);
  68        if (fd == NULL)
  69                return NULL;
  70        fd->fd_write_failed = false;
  71        return fd;
  72}
  73
  74static void ll_file_data_put(struct ll_file_data *fd)
  75{
  76        if (fd != NULL)
  77                kmem_cache_free(ll_file_data_slab, fd);
  78}
  79
  80void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  81                          struct lustre_handle *fh)
  82{
  83        op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  84        op_data->op_attr.ia_mode = inode->i_mode;
  85        op_data->op_attr.ia_atime = inode->i_atime;
  86        op_data->op_attr.ia_mtime = inode->i_mtime;
  87        op_data->op_attr.ia_ctime = inode->i_ctime;
  88        op_data->op_attr.ia_size = i_size_read(inode);
  89        op_data->op_attr_blocks = inode->i_blocks;
  90        ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  91                                        ll_inode_to_ext_flags(inode->i_flags);
  92        op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  93        if (fh)
  94                op_data->op_handle = *fh;
  95
  96        if (ll_i2info(inode)->lli_flags & LLIF_DATA_MODIFIED)
  97                op_data->op_bias |= MDS_DATA_MODIFIED;
  98}
  99
 100/**
 101 * Closes the IO epoch and packs all the attributes into @op_data for
 102 * the CLOSE rpc.
 103 */
 104static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
 105                             struct obd_client_handle *och)
 106{
 107        op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 108                                        ATTR_MTIME | ATTR_MTIME_SET |
 109                                        ATTR_CTIME | ATTR_CTIME_SET;
 110
 111        if (!(och->och_flags & FMODE_WRITE))
 112                goto out;
 113
 114        if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 115                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 116        else
 117                ll_ioepoch_close(inode, op_data, &och, 0);
 118
 119out:
 120        ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 121        ll_prep_md_op_data(op_data, inode, NULL, NULL,
 122                           0, 0, LUSTRE_OPC_ANY, NULL);
 123}
 124
 /*
  * Send the MDS close for open handle @och and dispose of the handle.
  *
  * @md_exp:       export passed to md_close() and md_clear_open_replay_data()
  * @inode:        inode the handle belongs to
  * @och:          open handle being closed; it is kfree()d at the out: label
  *                unless the DONE_WRITING step is queued instead.  The
  *                callers in this file clear their own pointer to it.
  * @data_version: if non-NULL, the close is flagged MDS_HSM_RELEASE and
  *                carries this data version and the lease handle of @och
  *
  * Return: 0 on success, including the cases where the MD export has no obd
  * device or where only the Size-on-MDS update failed; -ENOMEM if op_data
  * cannot be allocated; -EBUSY if a release close is answered without
  * OBD_MD_FLRELEASED; otherwise the error from md_close() or
  * ll_objects_destroy().
  */
 125static int ll_close_inode_openhandle(struct obd_export *md_exp,
 126                                     struct inode *inode,
 127                                     struct obd_client_handle *och,
 128                                     const __u64 *data_version)
 129{
 130        struct obd_export *exp = ll_i2mdexp(inode);
 131        struct md_op_data *op_data;
 132        struct ptlrpc_request *req = NULL;
 133        struct obd_device *obd = class_exp2obd(exp);
            /* Stays 1 on the early exits below, so that the out: label takes
             * the branch that frees @och. */
 134        int epoch_close = 1;
 135        int rc;
 136
 137        if (obd == NULL) {
 138                /*
 139                 * XXX: in case of LMV, is this correct to access
 140                 * ->exp_handle?
 141                 */
 142                CERROR("Invalid MDC connection handle %#llx\n",
 143                       ll_i2mdexp(inode)->exp_handle.h_cookie);
 144                rc = 0;
 145                goto out;
 146        }
 147
 148        op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
 149        if (!op_data) {
 150                /* XXX We leak openhandle and request here. */
 151                rc = -ENOMEM;
 152                goto out;
 153        }
 154
 155        ll_prepare_close(inode, op_data, och);
 156        if (data_version != NULL) {
 157                /* Pass in data_version implies release. */
 158                op_data->op_bias |= MDS_HSM_RELEASE;
 159                op_data->op_data_version = *data_version;
 160                op_data->op_lease_handle = och->och_lease_handle;
 161                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 162        }
            /* Sample the epoch-close flag before the rpc; it is checked on
             * -EAGAIN and again at out: after op_data has been released. */
 163        epoch_close = op_data->op_flags & MF_EPOCH_CLOSE;
 164        rc = md_close(md_exp, op_data, och->och_mod, &req);
 165        if (rc == -EAGAIN) {
 166                /* This close must have the epoch closed. */
 167                LASSERT(epoch_close);
 168                /* MDS has instructed us to obtain Size-on-MDS attribute from
 169                 * OSTs and send setattr to back to MDS. */
 170                rc = ll_som_update(inode, op_data);
 171                if (rc) {
 172                        CERROR("inode %lu mdc Size-on-MDS update failed: rc = %d\n",
 173                               inode->i_ino, rc);
                            /* A failed Size-on-MDS update is logged only; the
                             * close itself is still reported as successful. */
 174                        rc = 0;
 175                }
 176        } else if (rc) {
 177                CERROR("inode %lu mdc close failed: rc = %d\n",
 178                       inode->i_ino, rc);
 179        }
 180
 181        /* DATA_MODIFIED flag was successfully sent on close, cancel data
 182         * modification flag. */
 183        if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 184                struct ll_inode_info *lli = ll_i2info(inode);
 185
 186                spin_lock(&lli->lli_lock);
 187                lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 188                spin_unlock(&lli->lli_lock);
 189        }
 190
 191        if (rc == 0) {
 192                rc = ll_objects_destroy(req, inode);
 193                if (rc)
 194                        CERROR("inode %lu ll_objects destroy: rc = %d\n",
 195                               inode->i_ino, rc);
 196        }
            /* For a release close, the reply must carry OBD_MD_FLRELEASED;
             * otherwise the close is reported as -EBUSY. */
 197        if (rc == 0 && op_data->op_bias & MDS_HSM_RELEASE) {
 198                struct mdt_body *body;
 199
 200                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 201                if (!(body->valid & OBD_MD_FLRELEASED))
 202                        rc = -EBUSY;
 203        }
 204
 205        ll_finish_md_op_data(op_data);
 206
 207out:
            /* Either queue DONE_WRITING (write handle on a regular file whose
             * epoch was not closed, on an export for which exp_connect_som()
             * is true) or drop the replay data and free @och right away. */
 208        if (exp_connect_som(exp) && !epoch_close &&
 209            S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 210                ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 211        } else {
 212                md_clear_open_replay_data(md_exp, och);
 213                /* Free @och if it is not waiting for DONE_WRITING. */
 214                och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 215                kfree(och);
 216        }
 217        if (req) /* This is close request */
 218                ptlrpc_req_finished(req);
 219        return rc;
 220}
 221
 222int ll_md_real_close(struct inode *inode, fmode_t fmode)
 223{
 224        struct ll_inode_info *lli = ll_i2info(inode);
 225        struct obd_client_handle **och_p;
 226        struct obd_client_handle *och;
 227        __u64 *och_usecount;
 228        int rc = 0;
 229
 230        if (fmode & FMODE_WRITE) {
 231                och_p = &lli->lli_mds_write_och;
 232                och_usecount = &lli->lli_open_fd_write_count;
 233        } else if (fmode & FMODE_EXEC) {
 234                och_p = &lli->lli_mds_exec_och;
 235                och_usecount = &lli->lli_open_fd_exec_count;
 236        } else {
 237                LASSERT(fmode & FMODE_READ);
 238                och_p = &lli->lli_mds_read_och;
 239                och_usecount = &lli->lli_open_fd_read_count;
 240        }
 241
 242        mutex_lock(&lli->lli_och_mutex);
 243        if (*och_usecount > 0) {
 244                /* There are still users of this handle, so skip
 245                 * freeing it. */
 246                mutex_unlock(&lli->lli_och_mutex);
 247                return 0;
 248        }
 249
 250        och = *och_p;
 251        *och_p = NULL;
 252        mutex_unlock(&lli->lli_och_mutex);
 253
 254        if (och != NULL) {
 255                /* There might be a race and this handle may already
 256                   be closed. */
 257                rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 258                                               inode, och, NULL);
 259        }
 260
 261        return rc;
 262}
 263
 264static int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 265                       struct file *file)
 266{
 267        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 268        struct ll_inode_info *lli = ll_i2info(inode);
 269        int lockmode;
 270        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 271        struct lustre_handle lockh;
 272        ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_OPEN} };
 273        int rc = 0;
 274
 275        /* clear group lock, if present */
 276        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 277                ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 278
 279        if (fd->fd_lease_och != NULL) {
 280                bool lease_broken;
 281
 282                /* Usually the lease is not released when the
 283                 * application crashed, we need to release here. */
 284                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 285                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 286                        PFID(&lli->lli_fid), rc, lease_broken);
 287
 288                fd->fd_lease_och = NULL;
 289        }
 290
 291        if (fd->fd_och != NULL) {
 292                rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och, NULL);
 293                fd->fd_och = NULL;
 294                goto out;
 295        }
 296
 297        /* Let's see if we have good enough OPEN lock on the file and if
 298           we can skip talking to MDS */
 299
 300        mutex_lock(&lli->lli_och_mutex);
 301        if (fd->fd_omode & FMODE_WRITE) {
 302                lockmode = LCK_CW;
 303                LASSERT(lli->lli_open_fd_write_count);
 304                lli->lli_open_fd_write_count--;
 305        } else if (fd->fd_omode & FMODE_EXEC) {
 306                lockmode = LCK_PR;
 307                LASSERT(lli->lli_open_fd_exec_count);
 308                lli->lli_open_fd_exec_count--;
 309        } else {
 310                lockmode = LCK_CR;
 311                LASSERT(lli->lli_open_fd_read_count);
 312                lli->lli_open_fd_read_count--;
 313        }
 314        mutex_unlock(&lli->lli_och_mutex);
 315
 316        if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 317                           LDLM_IBITS, &policy, lockmode, &lockh))
 318                rc = ll_md_real_close(inode, fd->fd_omode);
 319
 320out:
 321        LUSTRE_FPRIVATE(file) = NULL;
 322        ll_file_data_put(fd);
 323
 324        return rc;
 325}
 326
 327/* While this returns an error code, fput() the caller does not, so we need
 328 * to make every effort to clean up all of our state here.  Also, applications
 329 * rarely check close errors and even if an error is returned they will not
 330 * re-try the close call.
 331 */
 332int ll_file_release(struct inode *inode, struct file *file)
 333{
 334        struct ll_file_data *fd;
 335        struct ll_sb_info *sbi = ll_i2sbi(inode);
 336        struct ll_inode_info *lli = ll_i2info(inode);
 337        int rc;
 338
 339        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 340               inode->i_generation, inode);
 341
 342#ifdef CONFIG_FS_POSIX_ACL
 343        if (sbi->ll_flags & LL_SBI_RMT_CLIENT && is_root_inode(inode)) {
 344                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 345
 346                LASSERT(fd != NULL);
 347                if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 348                        fd->fd_flags &= ~LL_FILE_RMTACL;
 349                        rct_del(&sbi->ll_rct, current_pid());
 350                        et_search_free(&sbi->ll_et, current_pid());
 351                }
 352        }
 353#endif
 354
 355        if (!is_root_inode(inode))
 356                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 357        fd = LUSTRE_FPRIVATE(file);
 358        LASSERT(fd != NULL);
 359
 360        /* The last ref on @file, maybe not the owner pid of statahead.
 361         * Different processes can open the same dir, "ll_opendir_key" means:
 362         * it is me that should stop the statahead thread. */
 363        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 364            lli->lli_opendir_pid != 0)
 365                ll_stop_statahead(inode, lli->lli_opendir_key);
 366
 367        if (is_root_inode(inode)) {
 368                LUSTRE_FPRIVATE(file) = NULL;
 369                ll_file_data_put(fd);
 370                return 0;
 371        }
 372
 373        if (!S_ISDIR(inode->i_mode)) {
 374                lov_read_and_clear_async_rc(lli->lli_clob);
 375                lli->lli_async_rc = 0;
 376        }
 377
 378        rc = ll_md_close(sbi->ll_md_exp, inode, file);
 379
 380        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 381                libcfs_debug_dumplog();
 382
 383        return rc;
 384}
 385
 386static int ll_intent_file_open(struct dentry *dentry, void *lmm,
 387                               int lmmsize, struct lookup_intent *itp)
 388{
 389        struct inode *inode = d_inode(dentry);
 390        struct ll_sb_info *sbi = ll_i2sbi(inode);
 391        struct dentry *parent = dentry->d_parent;
 392        const char *name = dentry->d_name.name;
 393        const int len = dentry->d_name.len;
 394        struct md_op_data *op_data;
 395        struct ptlrpc_request *req;
 396        __u32 opc = LUSTRE_OPC_ANY;
 397        int rc;
 398
 399        /* Usually we come here only for NFSD, and we want open lock.
 400           But we can also get here with pre 2.6.15 patchless kernels, and in
 401           that case that lock is also ok */
 402        /* We can also get here if there was cached open handle in revalidate_it
 403         * but it disappeared while we were getting from there to ll_file_open.
 404         * But this means this file was closed and immediately opened which
 405         * makes a good candidate for using OPEN lock */
 406        /* If lmmsize & lmm are not 0, we are just setting stripe info
 407         * parameters. No need for the open lock */
 408        if (lmm == NULL && lmmsize == 0) {
 409                itp->it_flags |= MDS_OPEN_LOCK;
 410                if (itp->it_flags & FMODE_WRITE)
 411                        opc = LUSTRE_OPC_CREATE;
 412        }
 413
 414        op_data  = ll_prep_md_op_data(NULL, d_inode(parent),
 415                                      inode, name, len,
 416                                      O_RDWR, opc, NULL);
 417        if (IS_ERR(op_data))
 418                return PTR_ERR(op_data);
 419
 420        itp->it_flags |= MDS_OPEN_BY_FID;
 421        rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 422                            0 /*unused */, &req, ll_md_blocking_ast, 0);
 423        ll_finish_md_op_data(op_data);
 424        if (rc == -ESTALE) {
 425                /* reason for keep own exit path - don`t flood log
 426                * with messages with -ESTALE errors.
 427                */
 428                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 429                     it_open_error(DISP_OPEN_OPEN, itp))
 430                        goto out;
 431                ll_release_openhandle(inode, itp);
 432                goto out;
 433        }
 434
 435        if (it_disposition(itp, DISP_LOOKUP_NEG)) {
 436                rc = -ENOENT;
 437                goto out;
 438        }
 439
 440        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 441                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 442                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 443                goto out;
 444        }
 445
 446        rc = ll_prep_inode(&inode, req, NULL, itp);
 447        if (!rc && itp->d.lustre.it_lock_mode)
 448                ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL);
 449
 450out:
 451        ptlrpc_req_finished(req);
 452        ll_intent_drop_lock(itp);
 453
 454        return rc;
 455}
 456
 457/**
 458 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 459 * not believe attributes if a few ioepoch holders exist. Attributes for
 460 * previous ioepoch if new one is opened are also skipped by MDS.
 461 */
 462void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 463{
 464        if (ioepoch && lli->lli_ioepoch != ioepoch) {
 465                lli->lli_ioepoch = ioepoch;
 466                CDEBUG(D_INODE, "Epoch %llu opened on "DFID"\n",
 467                       ioepoch, PFID(&lli->lli_fid));
 468        }
 469}
 470
 471static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 472                       struct obd_client_handle *och)
 473{
 474        struct ptlrpc_request *req = it->d.lustre.it_data;
 475        struct mdt_body *body;
 476
 477        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 478        och->och_fh = body->handle;
 479        och->och_fid = body->fid1;
 480        och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
 481        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 482        och->och_flags = it->it_flags;
 483
 484        return md_set_open_replay_data(md_exp, och, it);
 485}
 486
 487static int ll_local_open(struct file *file, struct lookup_intent *it,
 488                         struct ll_file_data *fd, struct obd_client_handle *och)
 489{
 490        struct inode *inode = file_inode(file);
 491        struct ll_inode_info *lli = ll_i2info(inode);
 492
 493        LASSERT(!LUSTRE_FPRIVATE(file));
 494
 495        LASSERT(fd != NULL);
 496
 497        if (och) {
 498                struct ptlrpc_request *req = it->d.lustre.it_data;
 499                struct mdt_body *body;
 500                int rc;
 501
 502                rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 503                if (rc != 0)
 504                        return rc;
 505
 506                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 507                ll_ioepoch_open(lli, body->ioepoch);
 508        }
 509
 510        LUSTRE_FPRIVATE(file) = fd;
 511        ll_readahead_init(inode, &fd->fd_ras);
 512        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 513        return 0;
 514}
 515
 516/* Open a file, and (for the very first open) create objects on the OSTs at
 517 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 518 * creation or open until ll_lov_setstripe() ioctl is called.
 519 *
 520 * If we already have the stripe MD locally then we don't request it in
 521 * md_open(), by passing a lmm_size = 0.
 522 *
 523 * It is up to the application to ensure no other processes open this file
 524 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 525 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 526 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 527 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
     *
     * The open intent, if one was prepared, is taken from file->private_data;
     * otherwise a local intent is built from file->f_flags.
     *
     * Returns 0 on success, -ENOMEM if the per-file data or the open handle
     * cannot be allocated, or the error reported by it_open_error(),
     * ll_intent_file_open() or ll_local_open().
 528 */
 529int ll_file_open(struct inode *inode, struct file *file)
 530{
 531        struct ll_inode_info *lli = ll_i2info(inode);
 532        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 533                                          .it_flags = file->f_flags };
 534        struct obd_client_handle **och_p = NULL;
 535        __u64 *och_usecount = NULL;
 536        struct ll_file_data *fd;
 537        int rc = 0, opendir_set = 0;
 538
 539        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 540               inode->i_generation, inode, file->f_flags);
 541
 542        it = file->private_data; /* XXX: compat macro */
 543        file->private_data = NULL; /* prevent ll_local_open assertion */
 544
 545        fd = ll_file_data_get();
 546        if (fd == NULL) {
 547                rc = -ENOMEM;
 548                goto out_openerr;
 549        }
 550
 551        fd->fd_file = file;
            /* For a directory with no opendir owner recorded yet, record this
             * fd and the current pid as the owner.  opendir_set lets the error
             * path below stop statahead again. */
 552        if (S_ISDIR(inode->i_mode)) {
 553                spin_lock(&lli->lli_sa_lock);
 554                if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
 555                    lli->lli_opendir_pid == 0) {
 556                        lli->lli_opendir_key = fd;
 557                        lli->lli_opendir_pid = current_pid();
 558                        opendir_set = 1;
 559                }
 560                spin_unlock(&lli->lli_sa_lock);
 561        }
 562
            /* The root inode only gets its file data attached; none of the
             * open handle bookkeeping below is done for it. */
 563        if (is_root_inode(inode)) {
 564                LUSTRE_FPRIVATE(file) = fd;
 565                return 0;
 566        }
 567
            /* No executed intent was handed in: build one in oit. */
 568        if (!it || !it->d.lustre.it_disposition) {
 569                /* Convert f_flags into access mode. We cannot use file->f_mode,
 570                 * because everything but O_ACCMODE mask was stripped from
 571                 * there */
 572                if ((oit.it_flags + 1) & O_ACCMODE)
 573                        oit.it_flags++;
 574                if (file->f_flags & O_TRUNC)
 575                        oit.it_flags |= FMODE_WRITE;
 576
 577                /* kernel only call f_op->open in dentry_open.  filp_open calls
 578                 * dentry_open after call to open_namei that checks permissions.
 579                 * Only nfsd_open call dentry_open directly without checking
 580                 * permissions and because of that this code below is safe. */
 581                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 582                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 583
 584                /* We do not want O_EXCL here, presumably we opened the file
 585                 * already? XXX - NFS implications? */
 586                oit.it_flags &= ~O_EXCL;
 587
 588                /* bug20584, if "it_flags" contains O_CREAT, the file will be
 589                 * created if necessary, then "IT_CREAT" should be set to keep
 590                 * consistent with it */
 591                if (oit.it_flags & O_CREAT)
 592                        oit.it_op |= IT_CREAT;
 593
 594                it = &oit;
 595        }
 596
        /* Re-entered once ll_intent_file_open() has executed the intent. */
 597restart:
 598        /* Let's see if we have file open on MDS already. */
            /* The handle slot and its use counter are picked by mode: write
             * first, then exec, otherwise read. */
 599        if (it->it_flags & FMODE_WRITE) {
 600                och_p = &lli->lli_mds_write_och;
 601                och_usecount = &lli->lli_open_fd_write_count;
 602        } else if (it->it_flags & FMODE_EXEC) {
 603                och_p = &lli->lli_mds_exec_och;
 604                och_usecount = &lli->lli_open_fd_exec_count;
 605         } else {
 606                och_p = &lli->lli_mds_read_och;
 607                och_usecount = &lli->lli_open_fd_read_count;
 608        }
 609
 610        mutex_lock(&lli->lli_och_mutex);
 611        if (*och_p) { /* Open handle is present */
 612                if (it_disposition(it, DISP_OPEN_OPEN)) {
 613                        /* Well, there's extra open request that we do not need,
 614                           let's close it somehow. This will decref request. */
 615                        rc = it_open_error(DISP_OPEN_OPEN, it);
 616                        if (rc) {
 617                                mutex_unlock(&lli->lli_och_mutex);
 618                                goto out_openerr;
 619                        }
 620
 621                        ll_release_openhandle(inode, it);
 622                }
                    /* Share the cached handle.  No och is passed, so
                     * ll_local_open() only attaches fd to the file. */
 623                (*och_usecount)++;
 624
 625                rc = ll_local_open(file, it, fd, NULL);
 626                if (rc) {
 627                        (*och_usecount)--;
 628                        mutex_unlock(&lli->lli_och_mutex);
 629                        goto out_openerr;
 630                }
 631        } else {
 632                LASSERT(*och_usecount == 0);
 633                if (!it->d.lustre.it_disposition) {
 634                        /* We cannot just request lock handle now, new ELC code
 635                           means that one of other OPEN locks for this file
 636                           could be cancelled, and since blocking ast handler
 637                           would attempt to grab och_mutex as well, that would
 638                           result in a deadlock */
 639                        mutex_unlock(&lli->lli_och_mutex);
 640                        it->it_create_mode |= M_CHECK_STALE;
 641                        rc = ll_intent_file_open(file->f_path.dentry, NULL, 0, it);
 642                        it->it_create_mode &= ~M_CHECK_STALE;
 643                        if (rc)
 644                                goto out_openerr;
 645
 646                        goto restart;
 647                }
                    /* The intent has been executed and no handle is cached:
                     * allocate the inode-wide handle for this mode. */
 648                *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS);
 649                if (!*och_p) {
 650                        rc = -ENOMEM;
 651                        goto out_och_free;
 652                }
 653
 654                (*och_usecount)++;
 655
 656                /* md_intent_lock() didn't get a request ref if there was an
 657                 * open error, so don't do cleanup on the request here
 658                 * (bug 3430) */
 659                /* XXX (green): Should not we bail out on any error here, not
 660                 * just open error? */
 661                rc = it_open_error(DISP_OPEN_OPEN, it);
 662                if (rc)
 663                        goto out_och_free;
 664
 665                LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 666
 667                rc = ll_local_open(file, it, fd, *och_p);
 668                if (rc)
 669                        goto out_och_free;
 670        }
 671        mutex_unlock(&lli->lli_och_mutex);
            /* ll_local_open() has stored fd as the file's private data; clear
             * the local pointer so the cleanup code cannot free it. */
 672        fd = NULL;
 673
 674        /* Must do this outside lli_och_mutex lock to prevent deadlock where
 675           different kind of OPEN lock for this same inode gets cancelled
 676           by ldlm_cancel_lru */
 677        if (!S_ISREG(inode->i_mode))
 678                goto out_och_free;
 679
 680        if (!lli->lli_has_smd &&
 681            (cl_is_lov_delay_create(file->f_flags) ||
 682             (file->f_mode & FMODE_WRITE) == 0)) {
 683                CDEBUG(D_INODE, "object creation was delayed\n");
 684                goto out_och_free;
 685        }
 686        cl_lov_delay_create_clear(&file->f_flags);
 687        goto out_och_free;
 688
        /* Reached with rc == 0 on success, or with rc != 0 while lli_och_mutex
         * is still held.  In the failure case a handle allocated above is
         * freed and its use count undone before the mutex is dropped. */
 689out_och_free:
 690        if (rc) {
 691                if (och_p && *och_p) {
 692                        kfree(*och_p);
 693                        *och_p = NULL;
 694                        (*och_usecount)--;
 695                }
 696                mutex_unlock(&lli->lli_och_mutex);
 697
        /* Failure entry for paths that do not hold lli_och_mutex; the label
         * sits inside the rc != 0 branch. */
 698out_openerr:
 699                if (opendir_set != 0)
 700                        ll_stop_statahead(inode, lli->lli_opendir_key);
 701                ll_file_data_put(fd);
 702        } else {
 703                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 704        }
 705
            /* Drop the request reference held by the intent, if there is one. */
 706        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 707                ptlrpc_req_finished(it->d.lustre.it_data);
 708                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 709        }
 710
 711        return rc;
 712}
 713
 714static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 715                        struct ldlm_lock_desc *desc, void *data, int flag)
 716{
 717        int rc;
 718        struct lustre_handle lockh;
 719
 720        switch (flag) {
 721        case LDLM_CB_BLOCKING:
 722                ldlm_lock2handle(lock, &lockh);
 723                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 724                if (rc < 0) {
 725                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 726                        return rc;
 727                }
 728                break;
 729        case LDLM_CB_CANCELING:
 730                /* do nothing */
 731                break;
 732        }
 733        return 0;
 734}
 735
 736/**
 737 * Acquire a lease and open the file.
 738 */
 739static struct obd_client_handle *
 740ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 741              __u64 open_flags)
 742{
 743        struct lookup_intent it = { .it_op = IT_OPEN };
 744        struct ll_sb_info *sbi = ll_i2sbi(inode);
 745        struct md_op_data *op_data;
 746        struct ptlrpc_request *req;
 747        struct lustre_handle old_handle = { 0 };
 748        struct obd_client_handle *och = NULL;
 749        int rc;
 750        int rc2;
 751
 752        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 753                return ERR_PTR(-EINVAL);
 754
 755        if (file != NULL) {
 756                struct ll_inode_info *lli = ll_i2info(inode);
 757                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 758                struct obd_client_handle **och_p;
 759                __u64 *och_usecount;
 760
 761                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 762                        return ERR_PTR(-EPERM);
 763
 764                /* Get the openhandle of the file */
 765                rc = -EBUSY;
 766                mutex_lock(&lli->lli_och_mutex);
 767                if (fd->fd_lease_och != NULL) {
 768                        mutex_unlock(&lli->lli_och_mutex);
 769                        return ERR_PTR(rc);
 770                }
 771
 772                if (fd->fd_och == NULL) {
 773                        if (file->f_mode & FMODE_WRITE) {
 774                                LASSERT(lli->lli_mds_write_och != NULL);
 775                                och_p = &lli->lli_mds_write_och;
 776                                och_usecount = &lli->lli_open_fd_write_count;
 777                        } else {
 778                                LASSERT(lli->lli_mds_read_och != NULL);
 779                                och_p = &lli->lli_mds_read_och;
 780                                och_usecount = &lli->lli_open_fd_read_count;
 781                        }
 782                        if (*och_usecount == 1) {
 783                                fd->fd_och = *och_p;
 784                                *och_p = NULL;
 785                                *och_usecount = 0;
 786                                rc = 0;
 787                        }
 788                }
 789                mutex_unlock(&lli->lli_och_mutex);
 790                if (rc < 0) /* more than 1 opener */
 791                        return ERR_PTR(rc);
 792
 793                LASSERT(fd->fd_och != NULL);
 794                old_handle = fd->fd_och->och_fh;
 795        }
 796
 797        och = kzalloc(sizeof(*och), GFP_NOFS);
 798        if (!och)
 799                return ERR_PTR(-ENOMEM);
 800
 801        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 802                                        LUSTRE_OPC_ANY, NULL);
 803        if (IS_ERR(op_data)) {
 804                rc = PTR_ERR(op_data);
 805                goto out;
 806        }
 807
 808        /* To tell the MDT this openhandle is from the same owner */
 809        op_data->op_handle = old_handle;
 810
 811        it.it_flags = fmode | open_flags;
 812        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 813        rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
 814                                ll_md_blocking_lease_ast,
 815        /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 816         * it can be cancelled which may mislead applications that the lease is
 817         * broken;
 818         * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 819         * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 820         * doesn't deal with openhandle, so normal openhandle will be leaked. */
 821                                LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 822        ll_finish_md_op_data(op_data);
 823        ptlrpc_req_finished(req);
 824        if (rc < 0)
 825                goto out_release_it;
 826
 827        if (it_disposition(&it, DISP_LOOKUP_NEG)) {
 828                rc = -ENOENT;
 829                goto out_release_it;
 830        }
 831
 832        rc = it_open_error(DISP_OPEN_OPEN, &it);
 833        if (rc)
 834                goto out_release_it;
 835
 836        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 837        ll_och_fill(sbi->ll_md_exp, &it, och);
 838
 839        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ {
 840                rc = -EOPNOTSUPP;
 841                goto out_close;
 842        }
 843
 844        /* already get lease, handle lease lock */
 845        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 846        if (it.d.lustre.it_lock_mode == 0 ||
 847            it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
 848                /* open lock must return for lease */
 849                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 850                        PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
 851                        it.d.lustre.it_lock_bits);
 852                rc = -EPROTO;
 853                goto out_close;
 854        }
 855
 856        ll_intent_release(&it);
 857        return och;
 858
 859out_close:
 860        rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och, NULL);
 861        if (rc2)
 862                CERROR("Close openhandle returned %d\n", rc2);
 863
 864        /* cancel open lock */
 865        if (it.d.lustre.it_lock_mode != 0) {
 866                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 867                                                it.d.lustre.it_lock_mode);
 868                it.d.lustre.it_lock_mode = 0;
 869        }
 870out_release_it:
 871        ll_intent_release(&it);
 872out:
 873        kfree(och);
 874        return ERR_PTR(rc);
 875}
 876
 877/**
 878 * Release lease and close the file.
 879 * It will check if the lease has ever broken.
 880 */
 881static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 882                          bool *lease_broken)
 883{
 884        struct ldlm_lock *lock;
 885        bool cancelled = true;
 886        int rc;
 887
 888        lock = ldlm_handle2lock(&och->och_lease_handle);
 889        if (lock != NULL) {
 890                lock_res_and_lock(lock);
 891                cancelled = ldlm_is_cancel(lock);
 892                unlock_res_and_lock(lock);
 893                ldlm_lock_put(lock);
 894        }
 895
 896        CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
 897                PFID(&ll_i2info(inode)->lli_fid), cancelled);
 898
 899        if (!cancelled)
 900                ldlm_cli_cancel(&och->och_lease_handle, 0);
 901        if (lease_broken != NULL)
 902                *lease_broken = cancelled;
 903
 904        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
 905                                       NULL);
 906        return rc;
 907}
 908
 909/* Fills the obdo with the attributes for the lsm */
 910static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 911                          struct obdo *obdo, __u64 ioepoch, int sync)
 912{
 913        struct ptlrpc_request_set *set;
 914        struct obd_info     oinfo = { };
 915        int                     rc;
 916
 917        LASSERT(lsm != NULL);
 918
 919        oinfo.oi_md = lsm;
 920        oinfo.oi_oa = obdo;
 921        oinfo.oi_oa->o_oi = lsm->lsm_oi;
 922        oinfo.oi_oa->o_mode = S_IFREG;
 923        oinfo.oi_oa->o_ioepoch = ioepoch;
 924        oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 925                               OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 926                               OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 927                               OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 928                               OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 929                               OBD_MD_FLDATAVERSION;
 930        if (sync) {
 931                oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 932                oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 933        }
 934
 935        set = ptlrpc_prep_set();
 936        if (set == NULL) {
 937                CERROR("can't allocate ptlrpc set\n");
 938                rc = -ENOMEM;
 939        } else {
 940                rc = obd_getattr_async(exp, &oinfo, set);
 941                if (rc == 0)
 942                        rc = ptlrpc_set_wait(set);
 943                ptlrpc_set_destroy(set);
 944        }
 945        if (rc == 0)
 946                oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 947                                         OBD_MD_FLATIME | OBD_MD_FLMTIME |
 948                                         OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 949                                         OBD_MD_FLDATAVERSION);
 950        return rc;
 951}
 952
 953/**
 954  * Performs the getattr on the inode and updates its fields.
 955  * If @sync != 0, perform the getattr under the server-side lock.
 956  */
 957int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 958                     __u64 ioepoch, int sync)
 959{
 960        struct lov_stripe_md *lsm;
 961        int rc;
 962
 963        lsm = ccc_inode_lsm_get(inode);
 964        rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 965                            obdo, ioepoch, sync);
 966        if (rc == 0) {
 967                struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
 968
 969                obdo_refresh_inode(inode, obdo, obdo->o_valid);
 970                CDEBUG(D_INODE, "objid " DOSTID " size %llu, blocks %llu, blksize %lu\n",
 971                       POSTID(oi), i_size_read(inode),
 972                       (unsigned long long)inode->i_blocks,
 973                       1UL << inode->i_blkbits);
 974        }
 975        ccc_inode_lsm_put(inode, lsm);
 976        return rc;
 977}
 978
 979int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 980{
 981        struct ll_inode_info *lli = ll_i2info(inode);
 982        struct cl_object *obj = lli->lli_clob;
 983        struct cl_attr *attr = ccc_env_thread_attr(env);
 984        struct ost_lvb lvb;
 985        int rc = 0;
 986
 987        ll_inode_size_lock(inode);
 988        /* merge timestamps the most recently obtained from mds with
 989           timestamps obtained from osts */
 990        LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 991        LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
 992        LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
 993
 994        lvb.lvb_size = i_size_read(inode);
 995        lvb.lvb_blocks = inode->i_blocks;
 996        lvb.lvb_mtime = LTIME_S(inode->i_mtime);
 997        lvb.lvb_atime = LTIME_S(inode->i_atime);
 998        lvb.lvb_ctime = LTIME_S(inode->i_ctime);
 999
1000        cl_object_attr_lock(obj);
1001        rc = cl_object_attr_get(env, obj, attr);
1002        cl_object_attr_unlock(obj);
1003
1004        if (rc == 0) {
1005                if (lvb.lvb_atime < attr->cat_atime)
1006                        lvb.lvb_atime = attr->cat_atime;
1007                if (lvb.lvb_ctime < attr->cat_ctime)
1008                        lvb.lvb_ctime = attr->cat_ctime;
1009                if (lvb.lvb_mtime < attr->cat_mtime)
1010                        lvb.lvb_mtime = attr->cat_mtime;
1011
1012                CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1013                                PFID(&lli->lli_fid), attr->cat_size);
1014                cl_isize_write_nolock(inode, attr->cat_size);
1015
1016                inode->i_blocks = attr->cat_blocks;
1017
1018                LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1019                LTIME_S(inode->i_atime) = lvb.lvb_atime;
1020                LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1021        }
1022        ll_inode_size_unlock(inode);
1023
1024        return rc;
1025}
1026
1027int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1028                     lstat_t *st)
1029{
1030        struct obdo obdo = { 0 };
1031        int rc;
1032
1033        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, &obdo, 0, 0);
1034        if (rc == 0) {
1035                st->st_size   = obdo.o_size;
1036                st->st_blocks = obdo.o_blocks;
1037                st->st_mtime  = obdo.o_mtime;
1038                st->st_atime  = obdo.o_atime;
1039                st->st_ctime  = obdo.o_ctime;
1040        }
1041        return rc;
1042}
1043
1044static bool file_is_noatime(const struct file *file)
1045{
1046        const struct vfsmount *mnt = file->f_path.mnt;
1047        const struct inode *inode = file_inode(file);
1048
1049        /* Adapted from file_accessed() and touch_atime().*/
1050        if (file->f_flags & O_NOATIME)
1051                return true;
1052
1053        if (inode->i_flags & S_NOATIME)
1054                return true;
1055
1056        if (IS_NOATIME(inode))
1057                return true;
1058
1059        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1060                return true;
1061
1062        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1063                return true;
1064
1065        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1066                return true;
1067
1068        return false;
1069}
1070
1071void ll_io_init(struct cl_io *io, const struct file *file, int write)
1072{
1073        struct inode *inode = file_inode(file);
1074
1075        io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1076        if (write) {
1077                io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1078                io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1079                                      file->f_flags & O_DIRECT ||
1080                                      IS_SYNC(inode);
1081        }
1082        io->ci_obj     = ll_i2info(inode)->lli_clob;
1083        io->ci_lockreq = CILR_MAYBE;
1084        if (ll_file_nolock(file)) {
1085                io->ci_lockreq = CILR_NEVER;
1086                io->ci_no_srvlock = 1;
1087        } else if (file->f_flags & O_APPEND) {
1088                io->ci_lockreq = CILR_MANDATORY;
1089        }
1090
1091        io->ci_noatime = file_is_noatime(file);
1092}
1093
1094static ssize_t
1095ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1096                   struct file *file, enum cl_io_type iot,
1097                   loff_t *ppos, size_t count)
1098{
1099        struct ll_inode_info *lli = ll_i2info(file_inode(file));
1100        struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1101        struct cl_io     *io;
1102        ssize_t        result;
1103
1104restart:
1105        io = ccc_env_thread_io(env);
1106        ll_io_init(io, file, iot == CIT_WRITE);
1107
1108        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1109                struct vvp_io *vio = vvp_env_io(env);
1110                struct ccc_io *cio = ccc_env_io(env);
1111                int write_mutex_locked = 0;
1112
1113                cio->cui_fd  = LUSTRE_FPRIVATE(file);
1114                vio->cui_io_subtype = args->via_io_subtype;
1115
1116                switch (vio->cui_io_subtype) {
1117                case IO_NORMAL:
1118                        cio->cui_iter = args->u.normal.via_iter;
1119                        cio->cui_iocb = args->u.normal.via_iocb;
1120                        if ((iot == CIT_WRITE) &&
1121                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1122                                if (mutex_lock_interruptible(&lli->
1123                                                               lli_write_mutex)) {
1124                                        result = -ERESTARTSYS;
1125                                        goto out;
1126                                }
1127                                write_mutex_locked = 1;
1128                        } else if (iot == CIT_READ) {
1129                                down_read(&lli->lli_trunc_sem);
1130                        }
1131                        break;
1132                case IO_SPLICE:
1133                        vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1134                        vio->u.splice.cui_flags = args->u.splice.via_flags;
1135                        break;
1136                default:
1137                        CERROR("Unknown IO type - %u\n", vio->cui_io_subtype);
1138                        LBUG();
1139                }
1140                result = cl_io_loop(env, io);
1141                if (write_mutex_locked)
1142                        mutex_unlock(&lli->lli_write_mutex);
1143                else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1144                        up_read(&lli->lli_trunc_sem);
1145        } else {
1146                /* cl_io_rw_init() handled IO */
1147                result = io->ci_result;
1148        }
1149
1150        if (io->ci_nob > 0) {
1151                result = io->ci_nob;
1152                *ppos = io->u.ci_wr.wr.crw_pos;
1153        }
1154        goto out;
1155out:
1156        cl_io_fini(env, io);
1157        /* If any bit been read/written (result != 0), we just return
1158         * short read/write instead of restart io. */
1159        if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1160                CDEBUG(D_VFSTRACE, "Restart %s on %pD from %lld, count:%zd\n",
1161                       iot == CIT_READ ? "read" : "write",
1162                       file, *ppos, count);
1163                LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1164                goto restart;
1165        }
1166
1167        if (iot == CIT_READ) {
1168                if (result >= 0)
1169                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1170                                           LPROC_LL_READ_BYTES, result);
1171        } else if (iot == CIT_WRITE) {
1172                if (result >= 0) {
1173                        ll_stats_ops_tally(ll_i2sbi(file_inode(file)),
1174                                           LPROC_LL_WRITE_BYTES, result);
1175                        fd->fd_write_failed = false;
1176                } else if (result != -ERESTARTSYS) {
1177                        fd->fd_write_failed = true;
1178                }
1179        }
1180
1181        return result;
1182}
1183
1184static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1185{
1186        struct lu_env      *env;
1187        struct vvp_io_args *args;
1188        ssize_t      result;
1189        int              refcheck;
1190
1191        env = cl_env_get(&refcheck);
1192        if (IS_ERR(env))
1193                return PTR_ERR(env);
1194
1195        args = vvp_env_args(env, IO_NORMAL);
1196        args->u.normal.via_iter = to;
1197        args->u.normal.via_iocb = iocb;
1198
1199        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1200                                    &iocb->ki_pos, iov_iter_count(to));
1201        cl_env_put(env, &refcheck);
1202        return result;
1203}
1204
1205/*
1206 * Write to a file (through the page cache).
1207 */
1208static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1209{
1210        struct lu_env      *env;
1211        struct vvp_io_args *args;
1212        ssize_t      result;
1213        int              refcheck;
1214
1215        env = cl_env_get(&refcheck);
1216        if (IS_ERR(env))
1217                return PTR_ERR(env);
1218
1219        args = vvp_env_args(env, IO_NORMAL);
1220        args->u.normal.via_iter = from;
1221        args->u.normal.via_iocb = iocb;
1222
1223        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1224                                  &iocb->ki_pos, iov_iter_count(from));
1225        cl_env_put(env, &refcheck);
1226        return result;
1227}
1228
1229/*
1230 * Send file content (through pagecache) somewhere with helper
1231 */
1232static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1233                                   struct pipe_inode_info *pipe, size_t count,
1234                                   unsigned int flags)
1235{
1236        struct lu_env      *env;
1237        struct vvp_io_args *args;
1238        ssize_t      result;
1239        int              refcheck;
1240
1241        env = cl_env_get(&refcheck);
1242        if (IS_ERR(env))
1243                return PTR_ERR(env);
1244
1245        args = vvp_env_args(env, IO_SPLICE);
1246        args->u.splice.via_pipe = pipe;
1247        args->u.splice.via_flags = flags;
1248
1249        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1250        cl_env_put(env, &refcheck);
1251        return result;
1252}
1253
1254static int ll_lov_recreate(struct inode *inode, struct ost_id *oi, u32 ost_idx)
1255{
1256        struct obd_export *exp = ll_i2dtexp(inode);
1257        struct obd_trans_info oti = { 0 };
1258        struct obdo *oa = NULL;
1259        int lsm_size;
1260        int rc = 0;
1261        struct lov_stripe_md *lsm = NULL, *lsm2;
1262
1263        oa = kmem_cache_alloc(obdo_cachep, GFP_NOFS | __GFP_ZERO);
1264        if (oa == NULL)
1265                return -ENOMEM;
1266
1267        lsm = ccc_inode_lsm_get(inode);
1268        if (!lsm_has_objects(lsm)) {
1269                rc = -ENOENT;
1270                goto out;
1271        }
1272
1273        lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1274                   (lsm->lsm_stripe_count));
1275
1276        lsm2 = libcfs_kvzalloc(lsm_size, GFP_NOFS);
1277        if (lsm2 == NULL) {
1278                rc = -ENOMEM;
1279                goto out;
1280        }
1281
1282        oa->o_oi = *oi;
1283        oa->o_nlink = ost_idx;
1284        oa->o_flags |= OBD_FL_RECREATE_OBJS;
1285        oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1286        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1287                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1288        obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1289        memcpy(lsm2, lsm, lsm_size);
1290        ll_inode_size_lock(inode);
1291        rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1292        ll_inode_size_unlock(inode);
1293
1294        kvfree(lsm2);
1295        goto out;
1296out:
1297        ccc_inode_lsm_put(inode, lsm);
1298        kmem_cache_free(obdo_cachep, oa);
1299        return rc;
1300}
1301
1302static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1303{
1304        struct ll_recreate_obj ucreat;
1305        struct ost_id           oi;
1306
1307        if (!capable(CFS_CAP_SYS_ADMIN))
1308                return -EPERM;
1309
1310        if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1311                           sizeof(ucreat)))
1312                return -EFAULT;
1313
1314        ostid_set_seq_mdt0(&oi);
1315        ostid_set_id(&oi, ucreat.lrc_id);
1316        return ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx);
1317}
1318
1319static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1320{
1321        struct lu_fid   fid;
1322        struct ost_id   oi;
1323        u32             ost_idx;
1324
1325        if (!capable(CFS_CAP_SYS_ADMIN))
1326                return -EPERM;
1327
1328        if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1329                return -EFAULT;
1330
1331        fid_to_ostid(&fid, &oi);
1332        ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1333        return ll_lov_recreate(inode, &oi, ost_idx);
1334}
1335
1336int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1337                             int flags, struct lov_user_md *lum, int lum_size)
1338{
1339        struct lov_stripe_md *lsm = NULL;
1340        struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1341        int rc = 0;
1342
1343        lsm = ccc_inode_lsm_get(inode);
1344        if (lsm != NULL) {
1345                ccc_inode_lsm_put(inode, lsm);
1346                CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1347                       inode->i_ino);
1348                rc = -EEXIST;
1349                goto out;
1350        }
1351
1352        ll_inode_size_lock(inode);
1353        rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1354        if (rc)
1355                goto out_unlock;
1356        rc = oit.d.lustre.it_status;
1357        if (rc < 0)
1358                goto out_req_free;
1359
1360        ll_release_openhandle(inode, &oit);
1361
1362out_unlock:
1363        ll_inode_size_unlock(inode);
1364        ll_intent_release(&oit);
1365        ccc_inode_lsm_put(inode, lsm);
1366out:
1367        return rc;
1368out_req_free:
1369        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1370        goto out;
1371}
1372
1373int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1374                             struct lov_mds_md **lmmp, int *lmm_size,
1375                             struct ptlrpc_request **request)
1376{
1377        struct ll_sb_info *sbi = ll_i2sbi(inode);
1378        struct mdt_body  *body;
1379        struct lov_mds_md *lmm = NULL;
1380        struct ptlrpc_request *req = NULL;
1381        struct md_op_data *op_data;
1382        int rc, lmmsize;
1383
1384        rc = ll_get_default_mdsize(sbi, &lmmsize);
1385        if (rc)
1386                return rc;
1387
1388        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1389                                     strlen(filename), lmmsize,
1390                                     LUSTRE_OPC_ANY, NULL);
1391        if (IS_ERR(op_data))
1392                return PTR_ERR(op_data);
1393
1394        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1395        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1396        ll_finish_md_op_data(op_data);
1397        if (rc < 0) {
1398                CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
1399                       filename, rc);
1400                goto out;
1401        }
1402
1403        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1404        LASSERT(body != NULL); /* checked by mdc_getattr_name */
1405
1406        lmmsize = body->eadatasize;
1407
1408        if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1409                        lmmsize == 0) {
1410                rc = -ENODATA;
1411                goto out;
1412        }
1413
1414        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1415        LASSERT(lmm != NULL);
1416
1417        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1418            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1419                rc = -EPROTO;
1420                goto out;
1421        }
1422
1423        /*
1424         * This is coming from the MDS, so is probably in
1425         * little endian.  We convert it to host endian before
1426         * passing it to userspace.
1427         */
1428        if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
1429                int stripe_count;
1430
1431                stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1432                if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1433                        stripe_count = 0;
1434
1435                /* if function called for directory - we should
1436                 * avoid swab not existent lsm objects */
1437                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1438                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1439                        if (S_ISREG(body->mode))
1440                                lustre_swab_lov_user_md_objects(
1441                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1442                                 stripe_count);
1443                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1444                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1445                        if (S_ISREG(body->mode))
1446                                lustre_swab_lov_user_md_objects(
1447                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1448                                 stripe_count);
1449                }
1450        }
1451
1452out:
1453        *lmmp = lmm;
1454        *lmm_size = lmmsize;
1455        *request = req;
1456        return rc;
1457}
1458
1459static int ll_lov_setea(struct inode *inode, struct file *file,
1460                            unsigned long arg)
1461{
1462        int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1463        struct lov_user_md      *lump;
1464        int                      lum_size = sizeof(struct lov_user_md) +
1465                                            sizeof(struct lov_user_ost_data);
1466        int                      rc;
1467
1468        if (!capable(CFS_CAP_SYS_ADMIN))
1469                return -EPERM;
1470
1471        lump = libcfs_kvzalloc(lum_size, GFP_NOFS);
1472        if (lump == NULL)
1473                return -ENOMEM;
1474
1475        if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
1476                kvfree(lump);
1477                return -EFAULT;
1478        }
1479
1480        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
1481                                     lum_size);
1482        cl_lov_delay_create_clear(&file->f_flags);
1483
1484        kvfree(lump);
1485        return rc;
1486}
1487
1488static int ll_lov_setstripe(struct inode *inode, struct file *file,
1489                            unsigned long arg)
1490{
1491        struct lov_user_md_v3    lumv3;
1492        struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1493        struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1494        struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1495        int                      lum_size, rc;
1496        int                      flags = FMODE_WRITE;
1497
1498        /* first try with v1 which is smaller than v3 */
1499        lum_size = sizeof(struct lov_user_md_v1);
1500        if (copy_from_user(lumv1, lumv1p, lum_size))
1501                return -EFAULT;
1502
1503        if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1504                lum_size = sizeof(struct lov_user_md_v3);
1505                if (copy_from_user(&lumv3, lumv3p, lum_size))
1506                        return -EFAULT;
1507        }
1508
1509        rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lumv1,
1510                                      lum_size);
1511        cl_lov_delay_create_clear(&file->f_flags);
1512        if (rc == 0) {
1513                struct lov_stripe_md *lsm;
1514                __u32 gen;
1515
1516                put_user(0, &lumv1p->lmm_stripe_count);
1517
1518                ll_layout_refresh(inode, &gen);
1519                lsm = ccc_inode_lsm_get(inode);
1520                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1521                                   0, lsm, (void *)arg);
1522                ccc_inode_lsm_put(inode, lsm);
1523        }
1524        return rc;
1525}
1526
1527static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1528{
1529        struct lov_stripe_md *lsm;
1530        int rc = -ENODATA;
1531
1532        lsm = ccc_inode_lsm_get(inode);
1533        if (lsm != NULL)
1534                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1535                                   lsm, (void *)arg);
1536        ccc_inode_lsm_put(inode, lsm);
1537        return rc;
1538}
1539
1540static int
1541ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1542{
1543        struct ll_inode_info   *lli = ll_i2info(inode);
1544        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1545        struct ccc_grouplock    grouplock;
1546        int                  rc;
1547
1548        if (arg == 0) {
1549                CWARN("group id for group lock must not be 0\n");
1550                return -EINVAL;
1551        }
1552
1553        if (ll_file_nolock(file))
1554                return -EOPNOTSUPP;
1555
1556        spin_lock(&lli->lli_lock);
1557        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1558                CWARN("group lock already existed with gid %lu\n",
1559                      fd->fd_grouplock.cg_gid);
1560                spin_unlock(&lli->lli_lock);
1561                return -EINVAL;
1562        }
1563        LASSERT(fd->fd_grouplock.cg_lock == NULL);
1564        spin_unlock(&lli->lli_lock);
1565
1566        rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1567                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
1568        if (rc)
1569                return rc;
1570
1571        spin_lock(&lli->lli_lock);
1572        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1573                spin_unlock(&lli->lli_lock);
1574                CERROR("another thread just won the race\n");
1575                cl_put_grouplock(&grouplock);
1576                return -EINVAL;
1577        }
1578
1579        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1580        fd->fd_grouplock = grouplock;
1581        spin_unlock(&lli->lli_lock);
1582
1583        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1584        return 0;
1585}
1586
1587static int ll_put_grouplock(struct inode *inode, struct file *file,
1588                            unsigned long arg)
1589{
1590        struct ll_inode_info   *lli = ll_i2info(inode);
1591        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1592        struct ccc_grouplock    grouplock;
1593
1594        spin_lock(&lli->lli_lock);
1595        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1596                spin_unlock(&lli->lli_lock);
1597                CWARN("no group lock held\n");
1598                return -EINVAL;
1599        }
1600        LASSERT(fd->fd_grouplock.cg_lock != NULL);
1601
1602        if (fd->fd_grouplock.cg_gid != arg) {
1603                CWARN("group lock %lu doesn't match current id %lu\n",
1604                       arg, fd->fd_grouplock.cg_gid);
1605                spin_unlock(&lli->lli_lock);
1606                return -EINVAL;
1607        }
1608
1609        grouplock = fd->fd_grouplock;
1610        memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1611        fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1612        spin_unlock(&lli->lli_lock);
1613
1614        cl_put_grouplock(&grouplock);
1615        CDEBUG(D_INFO, "group lock %lu released\n", arg);
1616        return 0;
1617}
1618
1619/**
1620 * Close inode open handle
1621 *
1622 * \param inode  [in]     inode in question
1623 * \param it     [in,out] intent which contains open info and result
1624 *
1625 * \retval 0     success
1626 * \retval <0    failure
1627 */
1628int ll_release_openhandle(struct inode *inode, struct lookup_intent *it)
1629{
1630        struct obd_client_handle *och;
1631        int rc;
1632
1633        LASSERT(inode);
1634
1635        /* Root ? Do nothing. */
1636        if (is_root_inode(inode))
1637                return 0;
1638
1639        /* No open handle to close? Move away */
1640        if (!it_disposition(it, DISP_OPEN_OPEN))
1641                return 0;
1642
1643        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1644
1645        och = kzalloc(sizeof(*och), GFP_NOFS);
1646        if (!och) {
1647                rc = -ENOMEM;
1648                goto out;
1649        }
1650
1651        ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1652
1653        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1654                                       inode, och, NULL);
1655out:
1656        /* this one is in place of ll_file_open */
1657        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1658                ptlrpc_req_finished(it->d.lustre.it_data);
1659                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1660        }
1661        return rc;
1662}
1663
1664/**
1665 * Get size for inode for which FIEMAP mapping is requested.
1666 * Make the FIEMAP get_info call and returns the result.
1667 */
1668static int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1669                        size_t num_bytes)
1670{
1671        struct obd_export *exp = ll_i2dtexp(inode);
1672        struct lov_stripe_md *lsm = NULL;
1673        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1674        __u32 vallen = num_bytes;
1675        int rc;
1676
1677        /* Checks for fiemap flags */
1678        if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1679                fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1680                return -EBADR;
1681        }
1682
1683        /* Check for FIEMAP_FLAG_SYNC */
1684        if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1685                rc = filemap_fdatawrite(inode->i_mapping);
1686                if (rc)
1687                        return rc;
1688        }
1689
1690        lsm = ccc_inode_lsm_get(inode);
1691        if (lsm == NULL)
1692                return -ENOENT;
1693
1694        /* If the stripe_count > 1 and the application does not understand
1695         * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1696         */
1697        if (lsm->lsm_stripe_count > 1 &&
1698            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
1699                rc = -EOPNOTSUPP;
1700                goto out;
1701        }
1702
1703        fm_key.oa.o_oi = lsm->lsm_oi;
1704        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1705
1706        if (i_size_read(inode) == 0) {
1707                rc = ll_glimpse_size(inode);
1708                if (rc)
1709                        goto out;
1710        }
1711
1712        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1713        obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1714        /* If filesize is 0, then there would be no objects for mapping */
1715        if (fm_key.oa.o_size == 0) {
1716                fiemap->fm_mapped_extents = 0;
1717                rc = 0;
1718                goto out;
1719        }
1720
1721        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1722
1723        rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1724                          fiemap, lsm);
1725        if (rc)
1726                CERROR("obd_get_info failed: rc = %d\n", rc);
1727
1728out:
1729        ccc_inode_lsm_put(inode, lsm);
1730        return rc;
1731}
1732
1733int ll_fid2path(struct inode *inode, void __user *arg)
1734{
1735        struct obd_export *exp = ll_i2mdexp(inode);
1736        const struct getinfo_fid2path __user *gfin = arg;
1737        struct getinfo_fid2path *gfout;
1738        u32 pathlen;
1739        size_t outsize;
1740        int rc;
1741
1742        if (!capable(CFS_CAP_DAC_READ_SEARCH) &&
1743            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1744                return -EPERM;
1745
1746        /* Only need to get the buflen */
1747        if (get_user(pathlen, &gfin->gf_pathlen))
1748                return -EFAULT;
1749
1750        if (pathlen > PATH_MAX)
1751                return -EINVAL;
1752
1753        outsize = sizeof(*gfout) + pathlen;
1754
1755        gfout = kzalloc(outsize, GFP_NOFS);
1756        if (!gfout)
1757                return -ENOMEM;
1758
1759        if (copy_from_user(gfout, arg, sizeof(*gfout))) {
1760                rc = -EFAULT;
1761                goto gf_free;
1762        }
1763
1764        /* Call mdc_iocontrol */
1765        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1766        if (rc != 0)
1767                goto gf_free;
1768
1769        if (copy_to_user(arg, gfout, outsize))
1770                rc = -EFAULT;
1771
1772gf_free:
1773        kfree(gfout);
1774        return rc;
1775}
1776
1777static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1778{
1779        struct ll_user_fiemap *fiemap_s;
1780        size_t num_bytes, ret_bytes;
1781        unsigned int extent_count;
1782        int rc = 0;
1783
1784        /* Get the extent count so we can calculate the size of
1785         * required fiemap buffer */
1786        if (get_user(extent_count,
1787            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1788                return -EFAULT;
1789
1790        if (extent_count >=
1791            (SIZE_MAX - sizeof(*fiemap_s)) / sizeof(struct ll_fiemap_extent))
1792                return -EINVAL;
1793        num_bytes = sizeof(*fiemap_s) + (extent_count *
1794                                         sizeof(struct ll_fiemap_extent));
1795
1796        fiemap_s = libcfs_kvzalloc(num_bytes, GFP_NOFS);
1797        if (fiemap_s == NULL)
1798                return -ENOMEM;
1799
1800        /* get the fiemap value */
1801        if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1802                           sizeof(*fiemap_s))) {
1803                rc = -EFAULT;
1804                goto error;
1805        }
1806
1807        /* If fm_extent_count is non-zero, read the first extent since
1808         * it is used to calculate end_offset and device from previous
1809         * fiemap call. */
1810        if (extent_count) {
1811                if (copy_from_user(&fiemap_s->fm_extents[0],
1812                    (char __user *)arg + sizeof(*fiemap_s),
1813                    sizeof(struct ll_fiemap_extent))) {
1814                        rc = -EFAULT;
1815                        goto error;
1816                }
1817        }
1818
1819        rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1820        if (rc)
1821                goto error;
1822
1823        ret_bytes = sizeof(struct ll_user_fiemap);
1824
1825        if (extent_count != 0)
1826                ret_bytes += (fiemap_s->fm_mapped_extents *
1827                                 sizeof(struct ll_fiemap_extent));
1828
1829        if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1830                rc = -EFAULT;
1831
1832error:
1833        kvfree(fiemap_s);
1834        return rc;
1835}
1836
1837/*
1838 * Read the data_version for inode.
1839 *
1840 * This value is computed using stripe object version on OST.
1841 * Version is computed using server side locking.
1842 *
1843 * @param extent_lock  Take extent lock. Not needed if a process is already
1844 *                     holding the OST object group locks.
1845 */
1846int ll_data_version(struct inode *inode, __u64 *data_version,
1847                    int extent_lock)
1848{
1849        struct lov_stripe_md    *lsm = NULL;
1850        struct ll_sb_info       *sbi = ll_i2sbi(inode);
1851        struct obdo             *obdo = NULL;
1852        int                      rc;
1853
1854        /* If no stripe, we consider version is 0. */
1855        lsm = ccc_inode_lsm_get(inode);
1856        if (!lsm_has_objects(lsm)) {
1857                *data_version = 0;
1858                CDEBUG(D_INODE, "No object for inode\n");
1859                rc = 0;
1860                goto out;
1861        }
1862
1863        obdo = kzalloc(sizeof(*obdo), GFP_NOFS);
1864        if (!obdo) {
1865                rc = -ENOMEM;
1866                goto out;
1867        }
1868
1869        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, obdo, 0, extent_lock);
1870        if (rc == 0) {
1871                if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1872                        rc = -EOPNOTSUPP;
1873                else
1874                        *data_version = obdo->o_data_version;
1875        }
1876
1877        kfree(obdo);
1878out:
1879        ccc_inode_lsm_put(inode, lsm);
1880        return rc;
1881}
1882
1883/*
1884 * Trigger a HSM release request for the provided inode.
1885 */
1886int ll_hsm_release(struct inode *inode)
1887{
1888        struct cl_env_nest nest;
1889        struct lu_env *env;
1890        struct obd_client_handle *och = NULL;
1891        __u64 data_version = 0;
1892        int rc;
1893
1894        CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
1895               ll_get_fsname(inode->i_sb, NULL, 0),
1896               PFID(&ll_i2info(inode)->lli_fid));
1897
1898        och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
1899        if (IS_ERR(och)) {
1900                rc = PTR_ERR(och);
1901                goto out;
1902        }
1903
1904        /* Grab latest data_version and [am]time values */
1905        rc = ll_data_version(inode, &data_version, 1);
1906        if (rc != 0)
1907                goto out;
1908
1909        env = cl_env_nested_get(&nest);
1910        if (IS_ERR(env)) {
1911                rc = PTR_ERR(env);
1912                goto out;
1913        }
1914
1915        ll_merge_lvb(env, inode);
1916        cl_env_nested_put(&nest, env);
1917
1918        /* Release the file.
1919         * NB: lease lock handle is released in mdc_hsm_release_pack() because
1920         * we still need it to pack l_remote_handle to MDT. */
1921        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och,
1922                                       &data_version);
1923        och = NULL;
1924
1925out:
1926        if (och != NULL && !IS_ERR(och)) /* close the file */
1927                ll_lease_close(och, inode, NULL);
1928
1929        return rc;
1930}
1931
1932struct ll_swap_stack {
1933        struct iattr             ia1, ia2;
1934        __u64                    dv1, dv2;
1935        struct inode            *inode1, *inode2;
1936        bool                     check_dv1, check_dv2;
1937};
1938
1939static int ll_swap_layouts(struct file *file1, struct file *file2,
1940                           struct lustre_swap_layouts *lsl)
1941{
1942        struct mdc_swap_layouts  msl;
1943        struct md_op_data       *op_data;
1944        __u32                    gid;
1945        __u64                    dv;
1946        struct ll_swap_stack    *llss = NULL;
1947        int                      rc;
1948
1949        llss = kzalloc(sizeof(*llss), GFP_NOFS);
1950        if (!llss)
1951                return -ENOMEM;
1952
1953        llss->inode1 = file_inode(file1);
1954        llss->inode2 = file_inode(file2);
1955
1956        if (!S_ISREG(llss->inode2->i_mode)) {
1957                rc = -EINVAL;
1958                goto free;
1959        }
1960
1961        if (inode_permission(llss->inode1, MAY_WRITE) ||
1962            inode_permission(llss->inode2, MAY_WRITE)) {
1963                rc = -EPERM;
1964                goto free;
1965        }
1966
1967        if (llss->inode2->i_sb != llss->inode1->i_sb) {
1968                rc = -EXDEV;
1969                goto free;
1970        }
1971
1972        /* we use 2 bool because it is easier to swap than 2 bits */
1973        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1974                llss->check_dv1 = true;
1975
1976        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1977                llss->check_dv2 = true;
1978
1979        /* we cannot use lsl->sl_dvX directly because we may swap them */
1980        llss->dv1 = lsl->sl_dv1;
1981        llss->dv2 = lsl->sl_dv2;
1982
1983        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1984        if (rc == 0) /* same file, done! */ {
1985                rc = 0;
1986                goto free;
1987        }
1988
1989        if (rc < 0) { /* sequentialize it */
1990                swap(llss->inode1, llss->inode2);
1991                swap(file1, file2);
1992                swap(llss->dv1, llss->dv2);
1993                swap(llss->check_dv1, llss->check_dv2);
1994        }
1995
1996        gid = lsl->sl_gid;
1997        if (gid != 0) { /* application asks to flush dirty cache */
1998                rc = ll_get_grouplock(llss->inode1, file1, gid);
1999                if (rc < 0)
2000                        goto free;
2001
2002                rc = ll_get_grouplock(llss->inode2, file2, gid);
2003                if (rc < 0) {
2004                        ll_put_grouplock(llss->inode1, file1, gid);
2005                        goto free;
2006                }
2007        }
2008
2009        /* to be able to restore mtime and atime after swap
2010         * we need to first save them */
2011        if (lsl->sl_flags &
2012            (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2013                llss->ia1.ia_mtime = llss->inode1->i_mtime;
2014                llss->ia1.ia_atime = llss->inode1->i_atime;
2015                llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2016                llss->ia2.ia_mtime = llss->inode2->i_mtime;
2017                llss->ia2.ia_atime = llss->inode2->i_atime;
2018                llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2019        }
2020
2021        /* ultimate check, before swapping the layouts we check if
2022         * dataversion has changed (if requested) */
2023        if (llss->check_dv1) {
2024                rc = ll_data_version(llss->inode1, &dv, 0);
2025                if (rc)
2026                        goto putgl;
2027                if (dv != llss->dv1) {
2028                        rc = -EAGAIN;
2029                        goto putgl;
2030                }
2031        }
2032
2033        if (llss->check_dv2) {
2034                rc = ll_data_version(llss->inode2, &dv, 0);
2035                if (rc)
2036                        goto putgl;
2037                if (dv != llss->dv2) {
2038                        rc = -EAGAIN;
2039                        goto putgl;
2040                }
2041        }
2042
2043        /* struct md_op_data is used to send the swap args to the mdt
2044         * only flags is missing, so we use struct mdc_swap_layouts
2045         * through the md_op_data->op_data */
2046        /* flags from user space have to be converted before they are send to
2047         * server, no flag is sent today, they are only used on the client */
2048        msl.msl_flags = 0;
2049        rc = -ENOMEM;
2050        op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2051                                     0, LUSTRE_OPC_ANY, &msl);
2052        if (IS_ERR(op_data)) {
2053                rc = PTR_ERR(op_data);
2054                goto free;
2055        }
2056
2057        rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2058                           sizeof(*op_data), op_data, NULL);
2059        ll_finish_md_op_data(op_data);
2060
2061putgl:
2062        if (gid != 0) {
2063                ll_put_grouplock(llss->inode2, file2, gid);
2064                ll_put_grouplock(llss->inode1, file1, gid);
2065        }
2066
2067        /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2068        if (rc != 0)
2069                goto free;
2070
2071        /* clear useless flags */
2072        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2073                llss->ia1.ia_valid &= ~ATTR_MTIME;
2074                llss->ia2.ia_valid &= ~ATTR_MTIME;
2075        }
2076
2077        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2078                llss->ia1.ia_valid &= ~ATTR_ATIME;
2079                llss->ia2.ia_valid &= ~ATTR_ATIME;
2080        }
2081
2082        /* update time if requested */
2083        rc = 0;
2084        if (llss->ia2.ia_valid != 0) {
2085                mutex_lock(&llss->inode1->i_mutex);
2086                rc = ll_setattr(file1->f_path.dentry, &llss->ia2);
2087                mutex_unlock(&llss->inode1->i_mutex);
2088        }
2089
2090        if (llss->ia1.ia_valid != 0) {
2091                int rc1;
2092
2093                mutex_lock(&llss->inode2->i_mutex);
2094                rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1);
2095                mutex_unlock(&llss->inode2->i_mutex);
2096                if (rc == 0)
2097                        rc = rc1;
2098        }
2099
2100free:
2101        kfree(llss);
2102
2103        return rc;
2104}
2105
2106static int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2107{
2108        struct md_op_data       *op_data;
2109        int                      rc;
2110
2111        /* Detect out-of range masks */
2112        if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2113                return -EINVAL;
2114
2115        /* Non-root users are forbidden to set or clear flags which are
2116         * NOT defined in HSM_USER_MASK. */
2117        if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2118            !capable(CFS_CAP_SYS_ADMIN))
2119                return -EPERM;
2120
2121        /* Detect out-of range archive id */
2122        if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2123            (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2124                return -EINVAL;
2125
2126        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2127                                     LUSTRE_OPC_ANY, hss);
2128        if (IS_ERR(op_data))
2129                return PTR_ERR(op_data);
2130
2131        rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2132                           sizeof(*op_data), op_data, NULL);
2133
2134        ll_finish_md_op_data(op_data);
2135
2136        return rc;
2137}
2138
2139static int ll_hsm_import(struct inode *inode, struct file *file,
2140                         struct hsm_user_import *hui)
2141{
2142        struct hsm_state_set    *hss = NULL;
2143        struct iattr            *attr = NULL;
2144        int                      rc;
2145
2146        if (!S_ISREG(inode->i_mode))
2147                return -EINVAL;
2148
2149        /* set HSM flags */
2150        hss = kzalloc(sizeof(*hss), GFP_NOFS);
2151        if (!hss)
2152                return -ENOMEM;
2153
2154        hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2155        hss->hss_archive_id = hui->hui_archive_id;
2156        hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2157        rc = ll_hsm_state_set(inode, hss);
2158        if (rc != 0)
2159                goto free_hss;
2160
2161        attr = kzalloc(sizeof(*attr), GFP_NOFS);
2162        if (!attr) {
2163                rc = -ENOMEM;
2164                goto free_hss;
2165        }
2166
2167        attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2168        attr->ia_mode |= S_IFREG;
2169        attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2170        attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2171        attr->ia_size = hui->hui_size;
2172        attr->ia_mtime.tv_sec = hui->hui_mtime;
2173        attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2174        attr->ia_atime.tv_sec = hui->hui_atime;
2175        attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2176
2177        attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2178                         ATTR_UID | ATTR_GID |
2179                         ATTR_MTIME | ATTR_MTIME_SET |
2180                         ATTR_ATIME | ATTR_ATIME_SET;
2181
2182        mutex_lock(&inode->i_mutex);
2183
2184        rc = ll_setattr_raw(file->f_path.dentry, attr, true);
2185        if (rc == -ENODATA)
2186                rc = 0;
2187
2188        mutex_unlock(&inode->i_mutex);
2189
2190        kfree(attr);
2191free_hss:
2192        kfree(hss);
2193        return rc;
2194}
2195
2196static long
2197ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2198{
2199        struct inode            *inode = file_inode(file);
2200        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2201        int                      flags, rc;
2202
2203        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2204               inode->i_generation, inode, cmd);
2205        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2206
2207        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2208        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2209                return -ENOTTY;
2210
2211        switch (cmd) {
2212        case LL_IOC_GETFLAGS:
2213                /* Get the current value of the file flags */
2214                return put_user(fd->fd_flags, (int *)arg);
2215        case LL_IOC_SETFLAGS:
2216        case LL_IOC_CLRFLAGS:
2217                /* Set or clear specific file flags */
2218                /* XXX This probably needs checks to ensure the flags are
2219                 *     not abused, and to handle any flag side effects.
2220                 */
2221                if (get_user(flags, (int *) arg))
2222                        return -EFAULT;
2223
2224                if (cmd == LL_IOC_SETFLAGS) {
2225                        if ((flags & LL_FILE_IGNORE_LOCK) &&
2226                            !(file->f_flags & O_DIRECT)) {
2227                                CERROR("%s: unable to disable locking on non-O_DIRECT file\n",
2228                                       current->comm);
2229                                return -EINVAL;
2230                        }
2231
2232                        fd->fd_flags |= flags;
2233                } else {
2234                        fd->fd_flags &= ~flags;
2235                }
2236                return 0;
2237        case LL_IOC_LOV_SETSTRIPE:
2238                return ll_lov_setstripe(inode, file, arg);
2239        case LL_IOC_LOV_SETEA:
2240                return ll_lov_setea(inode, file, arg);
2241        case LL_IOC_LOV_SWAP_LAYOUTS: {
2242                struct file *file2;
2243                struct lustre_swap_layouts lsl;
2244
2245                if (copy_from_user(&lsl, (char *)arg,
2246                                       sizeof(struct lustre_swap_layouts)))
2247                        return -EFAULT;
2248
2249                if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2250                        return -EPERM;
2251
2252                file2 = fget(lsl.sl_fd);
2253                if (file2 == NULL)
2254                        return -EBADF;
2255
2256                rc = -EPERM;
2257                if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2258                        rc = ll_swap_layouts(file, file2, &lsl);
2259                fput(file2);
2260                return rc;
2261        }
2262        case LL_IOC_LOV_GETSTRIPE:
2263                return ll_lov_getstripe(inode, arg);
2264        case LL_IOC_RECREATE_OBJ:
2265                return ll_lov_recreate_obj(inode, arg);
2266        case LL_IOC_RECREATE_FID:
2267                return ll_lov_recreate_fid(inode, arg);
2268        case FSFILT_IOC_FIEMAP:
2269                return ll_ioctl_fiemap(inode, arg);
2270        case FSFILT_IOC_GETFLAGS:
2271        case FSFILT_IOC_SETFLAGS:
2272                return ll_iocontrol(inode, file, cmd, arg);
2273        case FSFILT_IOC_GETVERSION_OLD:
2274        case FSFILT_IOC_GETVERSION:
2275                return put_user(inode->i_generation, (int *)arg);
2276        case LL_IOC_GROUP_LOCK:
2277                return ll_get_grouplock(inode, file, arg);
2278        case LL_IOC_GROUP_UNLOCK:
2279                return ll_put_grouplock(inode, file, arg);
2280        case IOC_OBD_STATFS:
2281                return ll_obd_statfs(inode, (void *)arg);
2282
2283        /* We need to special case any other ioctls we want to handle,
2284         * to send them to the MDS/OST as appropriate and to properly
2285         * network encode the arg field.
2286        case FSFILT_IOC_SETVERSION_OLD:
2287        case FSFILT_IOC_SETVERSION:
2288        */
2289        case LL_IOC_FLUSHCTX:
2290                return ll_flush_ctx(inode);
2291        case LL_IOC_PATH2FID: {
2292                if (copy_to_user((void *)arg, ll_inode2fid(inode),
2293                                 sizeof(struct lu_fid)))
2294                        return -EFAULT;
2295
2296                return 0;
2297        }
2298        case OBD_IOC_FID2PATH:
2299                return ll_fid2path(inode, (void *)arg);
2300        case LL_IOC_DATA_VERSION: {
2301                struct ioc_data_version idv;
2302                int                     rc;
2303
2304                if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2305                        return -EFAULT;
2306
2307                rc = ll_data_version(inode, &idv.idv_version,
2308                                !(idv.idv_flags & LL_DV_NOFLUSH));
2309
2310                if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2311                        return -EFAULT;
2312
2313                return rc;
2314        }
2315
2316        case LL_IOC_GET_MDTIDX: {
2317                int mdtidx;
2318
2319                mdtidx = ll_get_mdt_idx(inode);
2320                if (mdtidx < 0)
2321                        return mdtidx;
2322
2323                if (put_user((int)mdtidx, (int *)arg))
2324                        return -EFAULT;
2325
2326                return 0;
2327        }
2328        case OBD_IOC_GETDTNAME:
2329        case OBD_IOC_GETMDNAME:
2330                return ll_get_obd_name(inode, cmd, arg);
2331        case LL_IOC_HSM_STATE_GET: {
2332                struct md_op_data       *op_data;
2333                struct hsm_user_state   *hus;
2334                int                      rc;
2335
2336                hus = kzalloc(sizeof(*hus), GFP_NOFS);
2337                if (!hus)
2338                        return -ENOMEM;
2339
2340                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2341                                             LUSTRE_OPC_ANY, hus);
2342                if (IS_ERR(op_data)) {
2343                        kfree(hus);
2344                        return PTR_ERR(op_data);
2345                }
2346
2347                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2348                                   op_data, NULL);
2349
2350                if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2351                        rc = -EFAULT;
2352
2353                ll_finish_md_op_data(op_data);
2354                kfree(hus);
2355                return rc;
2356        }
2357        case LL_IOC_HSM_STATE_SET: {
2358                struct hsm_state_set    *hss;
2359                int                      rc;
2360
2361                hss = memdup_user((char *)arg, sizeof(*hss));
2362                if (IS_ERR(hss))
2363                        return PTR_ERR(hss);
2364
2365                rc = ll_hsm_state_set(inode, hss);
2366
2367                kfree(hss);
2368                return rc;
2369        }
2370        case LL_IOC_HSM_ACTION: {
2371                struct md_op_data               *op_data;
2372                struct hsm_current_action       *hca;
2373                int                              rc;
2374
2375                hca = kzalloc(sizeof(*hca), GFP_NOFS);
2376                if (!hca)
2377                        return -ENOMEM;
2378
2379                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2380                                             LUSTRE_OPC_ANY, hca);
2381                if (IS_ERR(op_data)) {
2382                        kfree(hca);
2383                        return PTR_ERR(op_data);
2384                }
2385
2386                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2387                                   op_data, NULL);
2388
2389                if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2390                        rc = -EFAULT;
2391
2392                ll_finish_md_op_data(op_data);
2393                kfree(hca);
2394                return rc;
2395        }
2396        case LL_IOC_SET_LEASE: {
2397                struct ll_inode_info *lli = ll_i2info(inode);
2398                struct obd_client_handle *och = NULL;
2399                bool lease_broken;
2400                fmode_t mode = 0;
2401
2402                switch (arg) {
2403                case F_WRLCK:
2404                        if (!(file->f_mode & FMODE_WRITE))
2405                                return -EPERM;
2406                        mode = FMODE_WRITE;
2407                        break;
2408                case F_RDLCK:
2409                        if (!(file->f_mode & FMODE_READ))
2410                                return -EPERM;
2411                        mode = FMODE_READ;
2412                        break;
2413                case F_UNLCK:
2414                        mutex_lock(&lli->lli_och_mutex);
2415                        if (fd->fd_lease_och != NULL) {
2416                                och = fd->fd_lease_och;
2417                                fd->fd_lease_och = NULL;
2418                        }
2419                        mutex_unlock(&lli->lli_och_mutex);
2420
2421                        if (och != NULL) {
2422                                mode = och->och_flags &
2423                                       (FMODE_READ|FMODE_WRITE);
2424                                rc = ll_lease_close(och, inode, &lease_broken);
2425                                if (rc == 0 && lease_broken)
2426                                        mode = 0;
2427                        } else {
2428                                rc = -ENOLCK;
2429                        }
2430
2431                        /* return the type of lease or error */
2432                        return rc < 0 ? rc : (int)mode;
2433                default:
2434                        return -EINVAL;
2435                }
2436
2437                CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2438
2439                /* apply for lease */
2440                och = ll_lease_open(inode, file, mode, 0);
2441                if (IS_ERR(och))
2442                        return PTR_ERR(och);
2443
2444                rc = 0;
2445                mutex_lock(&lli->lli_och_mutex);
2446                if (fd->fd_lease_och == NULL) {
2447                        fd->fd_lease_och = och;
2448                        och = NULL;
2449                }
2450                mutex_unlock(&lli->lli_och_mutex);
2451                if (och != NULL) {
2452                        /* impossible now that only excl is supported for now */
2453                        ll_lease_close(och, inode, &lease_broken);
2454                        rc = -EBUSY;
2455                }
2456                return rc;
2457        }
2458        case LL_IOC_GET_LEASE: {
2459                struct ll_inode_info *lli = ll_i2info(inode);
2460                struct ldlm_lock *lock = NULL;
2461
2462                rc = 0;
2463                mutex_lock(&lli->lli_och_mutex);
2464                if (fd->fd_lease_och != NULL) {
2465                        struct obd_client_handle *och = fd->fd_lease_och;
2466
2467                        lock = ldlm_handle2lock(&och->och_lease_handle);
2468                        if (lock != NULL) {
2469                                lock_res_and_lock(lock);
2470                                if (!ldlm_is_cancel(lock))
2471                                        rc = och->och_flags &
2472                                                (FMODE_READ | FMODE_WRITE);
2473                                unlock_res_and_lock(lock);
2474                                ldlm_lock_put(lock);
2475                        }
2476                }
2477                mutex_unlock(&lli->lli_och_mutex);
2478                return rc;
2479        }
2480        case LL_IOC_HSM_IMPORT: {
2481                struct hsm_user_import *hui;
2482
2483                hui = memdup_user((void *)arg, sizeof(*hui));
2484                if (IS_ERR(hui))
2485                        return PTR_ERR(hui);
2486
2487                rc = ll_hsm_import(inode, file, hui);
2488
2489                kfree(hui);
2490                return rc;
2491        }
2492        default: {
2493                int err;
2494
2495                if (ll_iocontrol_call(inode, file, cmd, arg, &err) ==
2496                     LLIOC_STOP)
2497                        return err;
2498
2499                return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2500                                     (void *)arg);
2501        }
2502        }
2503}
2504
2505static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2506{
2507        struct inode *inode = file_inode(file);
2508        loff_t retval, eof = 0;
2509
2510        retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2511                           (origin == SEEK_CUR) ? file->f_pos : 0);
2512        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2513               inode->i_ino, inode->i_generation, inode, retval, retval,
2514               origin);
2515        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2516
2517        if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2518                retval = ll_glimpse_size(inode);
2519                if (retval != 0)
2520                        return retval;
2521                eof = i_size_read(inode);
2522        }
2523
2524        retval = generic_file_llseek_size(file, offset, origin,
2525                                          ll_file_maxbytes(inode), eof);
2526        return retval;
2527}
2528
2529static int ll_flush(struct file *file, fl_owner_t id)
2530{
2531        struct inode *inode = file_inode(file);
2532        struct ll_inode_info *lli = ll_i2info(inode);
2533        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2534        int rc, err;
2535
2536        LASSERT(!S_ISDIR(inode->i_mode));
2537
2538        /* catch async errors that were recorded back when async writeback
2539         * failed for pages in this mapping. */
2540        rc = lli->lli_async_rc;
2541        lli->lli_async_rc = 0;
2542        err = lov_read_and_clear_async_rc(lli->lli_clob);
2543        if (rc == 0)
2544                rc = err;
2545
2546        /* The application has been told write failure already.
2547         * Do not report failure again. */
2548        if (fd->fd_write_failed)
2549                return 0;
2550        return rc ? -EIO : 0;
2551}
2552
2553/**
2554 * Called to make sure a portion of file has been written out.
2555 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
2556 *
2557 * Return how many pages have been written.
2558 */
2559int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2560                       enum cl_fsync_mode mode, int ignore_layout)
2561{
2562        struct cl_env_nest nest;
2563        struct lu_env *env;
2564        struct cl_io *io;
2565        struct cl_fsync_io *fio;
2566        int result;
2567
2568        if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2569            mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2570                return -EINVAL;
2571
2572        env = cl_env_nested_get(&nest);
2573        if (IS_ERR(env))
2574                return PTR_ERR(env);
2575
2576        io = ccc_env_thread_io(env);
2577        io->ci_obj = cl_i2info(inode)->lli_clob;
2578        io->ci_ignore_layout = ignore_layout;
2579
2580        /* initialize parameters for sync */
2581        fio = &io->u.ci_fsync;
2582        fio->fi_start = start;
2583        fio->fi_end = end;
2584        fio->fi_fid = ll_inode2fid(inode);
2585        fio->fi_mode = mode;
2586        fio->fi_nr_written = 0;
2587
2588        if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2589                result = cl_io_loop(env, io);
2590        else
2591                result = io->ci_result;
2592        if (result == 0)
2593                result = fio->fi_nr_written;
2594        cl_io_fini(env, io);
2595        cl_env_nested_put(&nest, env);
2596
2597        return result;
2598}
2599
2600int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2601{
2602        struct inode *inode = file_inode(file);
2603        struct ll_inode_info *lli = ll_i2info(inode);
2604        struct ptlrpc_request *req;
2605        int rc, err;
2606
2607        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2608               inode->i_generation, inode);
2609        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2610
2611        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2612        mutex_lock(&inode->i_mutex);
2613
2614        /* catch async errors that were recorded back when async writeback
2615         * failed for pages in this mapping. */
2616        if (!S_ISDIR(inode->i_mode)) {
2617                err = lli->lli_async_rc;
2618                lli->lli_async_rc = 0;
2619                if (rc == 0)
2620                        rc = err;
2621                err = lov_read_and_clear_async_rc(lli->lli_clob);
2622                if (rc == 0)
2623                        rc = err;
2624        }
2625
2626        err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
2627        if (!rc)
2628                rc = err;
2629        if (!err)
2630                ptlrpc_req_finished(req);
2631
2632        if (S_ISREG(inode->i_mode)) {
2633                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2634
2635                err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
2636                if (rc == 0 && err < 0)
2637                        rc = err;
2638                if (rc < 0)
2639                        fd->fd_write_failed = true;
2640                else
2641                        fd->fd_write_failed = false;
2642        }
2643
2644        mutex_unlock(&inode->i_mutex);
2645        return rc;
2646}
2647
2648static int
2649ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2650{
2651        struct inode *inode = file_inode(file);
2652        struct ll_sb_info *sbi = ll_i2sbi(inode);
2653        struct ldlm_enqueue_info einfo = {
2654                .ei_type        = LDLM_FLOCK,
2655                .ei_cb_cp       = ldlm_flock_completion_ast,
2656                .ei_cbdata      = file_lock,
2657        };
2658        struct md_op_data *op_data;
2659        struct lustre_handle lockh = {0};
2660        ldlm_policy_data_t flock = { {0} };
2661        __u64 flags = 0;
2662        int rc;
2663        int rc2 = 0;
2664
2665        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2666               inode->i_ino, file_lock);
2667
2668        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2669
2670        if (file_lock->fl_flags & FL_FLOCK)
2671                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2672        else if (!(file_lock->fl_flags & FL_POSIX))
2673                return -EINVAL;
2674
2675        flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2676        flock.l_flock.pid = file_lock->fl_pid;
2677        flock.l_flock.start = file_lock->fl_start;
2678        flock.l_flock.end = file_lock->fl_end;
2679
2680        /* Somewhat ugly workaround for svc lockd.
2681         * lockd installs custom fl_lmops->lm_compare_owner that checks
2682         * for the fl_owner to be the same (which it always is on local node
2683         * I guess between lockd processes) and then compares pid.
2684         * As such we assign pid to the owner field to make it all work,
2685         * conflict with normal locks is unlikely since pid space and
2686         * pointer space for current->files are not intersecting */
2687        if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2688                flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2689
2690        switch (file_lock->fl_type) {
2691        case F_RDLCK:
2692                einfo.ei_mode = LCK_PR;
2693                break;
2694        case F_UNLCK:
2695                /* An unlock request may or may not have any relation to
2696                 * existing locks so we may not be able to pass a lock handle
2697                 * via a normal ldlm_lock_cancel() request. The request may even
2698                 * unlock a byte range in the middle of an existing lock. In
2699                 * order to process an unlock request we need all of the same
2700                 * information that is given with a normal read or write record
2701                 * lock request. To avoid creating another ldlm unlock (cancel)
2702                 * message we'll treat a LCK_NL flock request as an unlock. */
2703                einfo.ei_mode = LCK_NL;
2704                break;
2705        case F_WRLCK:
2706                einfo.ei_mode = LCK_PW;
2707                break;
2708        default:
2709                CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2710                        file_lock->fl_type);
2711                return -ENOTSUPP;
2712        }
2713
2714        switch (cmd) {
2715        case F_SETLKW:
2716#ifdef F_SETLKW64
2717        case F_SETLKW64:
2718#endif
2719                flags = 0;
2720                break;
2721        case F_SETLK:
2722#ifdef F_SETLK64
2723        case F_SETLK64:
2724#endif
2725                flags = LDLM_FL_BLOCK_NOWAIT;
2726                break;
2727        case F_GETLK:
2728#ifdef F_GETLK64
2729        case F_GETLK64:
2730#endif
2731                flags = LDLM_FL_TEST_LOCK;
2732                /* Save the old mode so that if the mode in the lock changes we
2733                 * can decrement the appropriate reader or writer refcount. */
2734                file_lock->fl_type = einfo.ei_mode;
2735                break;
2736        default:
2737                CERROR("unknown fcntl lock command: %d\n", cmd);
2738                return -EINVAL;
2739        }
2740
2741        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2742                                     LUSTRE_OPC_ANY, NULL);
2743        if (IS_ERR(op_data))
2744                return PTR_ERR(op_data);
2745
2746        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n",
2747               inode->i_ino, flock.l_flock.pid, flags, einfo.ei_mode,
2748               flock.l_flock.start, flock.l_flock.end);
2749
2750        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2751                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2752
2753        if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
2754            !(flags & LDLM_FL_TEST_LOCK))
2755                rc2  = locks_lock_file_wait(file, file_lock);
2756
2757        if (rc2 && file_lock->fl_type != F_UNLCK) {
2758                einfo.ei_mode = LCK_NL;
2759                md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2760                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2761                rc = rc2;
2762        }
2763
2764        ll_finish_md_op_data(op_data);
2765
2766        return rc;
2767}
2768
/**
 * ->flock / ->lock handler for the table that disables file locking:
 * every request is rejected with -ENOSYS, whatever its arguments.
 */
static int
ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
{
	const int unsupported = -ENOSYS;

	return unsupported;
}
2774
2775/**
2776 * test if some locks matching bits and l_req_mode are acquired
2777 * - bits can be in different locks
2778 * - if found clear the common lock bits in *bits
2779 * - the bits not found, are kept in *bits
2780 * \param inode [IN]
2781 * \param bits [IN] searched lock bits [IN]
2782 * \param l_req_mode [IN] searched lock mode
2783 * \retval boolean, true iff all bits are found
2784 */
2785int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2786{
2787        struct lustre_handle lockh;
2788        ldlm_policy_data_t policy;
2789        ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2790                                (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2791        struct lu_fid *fid;
2792        __u64 flags;
2793        int i;
2794
2795        if (!inode)
2796                return 0;
2797
2798        fid = &ll_i2info(inode)->lli_fid;
2799        CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2800               ldlm_lockname[mode]);
2801
2802        flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2803        for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2804                policy.l_inodebits.bits = *bits & (1 << i);
2805                if (policy.l_inodebits.bits == 0)
2806                        continue;
2807
2808                if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2809                                  &policy, mode, &lockh)) {
2810                        struct ldlm_lock *lock;
2811
2812                        lock = ldlm_handle2lock(&lockh);
2813                        if (lock) {
2814                                *bits &=
2815                                      ~(lock->l_policy_data.l_inodebits.bits);
2816                                LDLM_LOCK_PUT(lock);
2817                        } else {
2818                                *bits &= ~policy.l_inodebits.bits;
2819                        }
2820                }
2821        }
2822        return *bits == 0;
2823}
2824
2825ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2826                            struct lustre_handle *lockh, __u64 flags,
2827                            ldlm_mode_t mode)
2828{
2829        ldlm_policy_data_t policy = { .l_inodebits = {bits} };
2830        struct lu_fid *fid;
2831        ldlm_mode_t rc;
2832
2833        fid = &ll_i2info(inode)->lli_fid;
2834        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2835
2836        rc = md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED,
2837                           fid, LDLM_IBITS, &policy, mode, lockh);
2838
2839        return rc;
2840}
2841
2842static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2843{
2844        /* Already unlinked. Just update nlink and return success */
2845        if (rc == -ENOENT) {
2846                clear_nlink(inode);
2847                /* This path cannot be hit for regular files unless in
2848                 * case of obscure races, so no need to validate size.
2849                 */
2850                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2851                        return 0;
2852        } else if (rc != 0) {
2853                CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
2854                             "%s: revalidate FID "DFID" error: rc = %d\n",
2855                             ll_get_fsname(inode->i_sb, NULL, 0),
2856                             PFID(ll_inode2fid(inode)), rc);
2857        }
2858
2859        return rc;
2860}
2861
2862static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2863{
2864        struct inode *inode = d_inode(dentry);
2865        struct ptlrpc_request *req = NULL;
2866        struct obd_export *exp;
2867        int rc = 0;
2868
2869        LASSERT(inode != NULL);
2870
2871        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%pd\n",
2872               inode->i_ino, inode->i_generation, inode, dentry);
2873
2874        exp = ll_i2mdexp(inode);
2875
2876        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2877         *      But under CMD case, it caused some lock issues, should be fixed
2878         *      with new CMD ibits lock. See bug 12718 */
2879        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2880                struct lookup_intent oit = { .it_op = IT_GETATTR };
2881                struct md_op_data *op_data;
2882
2883                if (ibits == MDS_INODELOCK_LOOKUP)
2884                        oit.it_op = IT_LOOKUP;
2885
2886                /* Call getattr by fid, so do not provide name at all. */
2887                op_data = ll_prep_md_op_data(NULL, inode,
2888                                             inode, NULL, 0, 0,
2889                                             LUSTRE_OPC_ANY, NULL);
2890                if (IS_ERR(op_data))
2891                        return PTR_ERR(op_data);
2892
2893                oit.it_create_mode |= M_CHECK_STALE;
2894                rc = md_intent_lock(exp, op_data, NULL, 0,
2895                                    /* we are not interested in name
2896                                       based lookup */
2897                                    &oit, 0, &req,
2898                                    ll_md_blocking_ast, 0);
2899                ll_finish_md_op_data(op_data);
2900                oit.it_create_mode &= ~M_CHECK_STALE;
2901                if (rc < 0) {
2902                        rc = ll_inode_revalidate_fini(inode, rc);
2903                        goto out;
2904                }
2905
2906                rc = ll_revalidate_it_finish(req, &oit, inode);
2907                if (rc != 0) {
2908                        ll_intent_release(&oit);
2909                        goto out;
2910                }
2911
2912                /* Unlinked? Unhash dentry, so it is not picked up later by
2913                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2914                   here to preserve get_cwd functionality on 2.6.
2915                   Bug 10503 */
2916                if (!d_inode(dentry)->i_nlink)
2917                        d_lustre_invalidate(dentry, 0);
2918
2919                ll_lookup_finish_locks(&oit, inode);
2920        } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) {
2921                struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry));
2922                u64 valid = OBD_MD_FLGETATTR;
2923                struct md_op_data *op_data;
2924                int ealen = 0;
2925
2926                if (S_ISREG(inode->i_mode)) {
2927                        rc = ll_get_default_mdsize(sbi, &ealen);
2928                        if (rc)
2929                                return rc;
2930                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2931                }
2932
2933                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2934                                             0, ealen, LUSTRE_OPC_ANY,
2935                                             NULL);
2936                if (IS_ERR(op_data))
2937                        return PTR_ERR(op_data);
2938
2939                op_data->op_valid = valid;
2940                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2941                ll_finish_md_op_data(op_data);
2942                if (rc) {
2943                        rc = ll_inode_revalidate_fini(inode, rc);
2944                        return rc;
2945                }
2946
2947                rc = ll_prep_inode(&inode, req, NULL, NULL);
2948        }
2949out:
2950        ptlrpc_req_finished(req);
2951        return rc;
2952}
2953
2954static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits)
2955{
2956        struct inode *inode = d_inode(dentry);
2957        int rc;
2958
2959        rc = __ll_inode_revalidate(dentry, ibits);
2960        if (rc != 0)
2961                return rc;
2962
2963        /* if object isn't regular file, don't validate size */
2964        if (!S_ISREG(inode->i_mode)) {
2965                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2966                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2967                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2968        } else {
2969                /* In case of restore, the MDT has the right size and has
2970                 * already send it back without granting the layout lock,
2971                 * inode is up-to-date so glimpse is useless.
2972                 * Also to glimpse we need the layout, in case of a running
2973                 * restore the MDT holds the layout lock so the glimpse will
2974                 * block up to the end of restore (getattr will block)
2975                 */
2976                if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
2977                        rc = ll_glimpse_size(inode);
2978        }
2979        return rc;
2980}
2981
2982int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2983{
2984        struct inode *inode = d_inode(de);
2985        struct ll_sb_info *sbi = ll_i2sbi(inode);
2986        struct ll_inode_info *lli = ll_i2info(inode);
2987        int res;
2988
2989        res = ll_inode_revalidate(de, MDS_INODELOCK_UPDATE |
2990                                      MDS_INODELOCK_LOOKUP);
2991        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2992
2993        if (res)
2994                return res;
2995
2996        stat->dev = inode->i_sb->s_dev;
2997        if (ll_need_32bit_api(sbi))
2998                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2999        else
3000                stat->ino = inode->i_ino;
3001        stat->mode = inode->i_mode;
3002        stat->nlink = inode->i_nlink;
3003        stat->uid = inode->i_uid;
3004        stat->gid = inode->i_gid;
3005        stat->rdev = inode->i_rdev;
3006        stat->atime = inode->i_atime;
3007        stat->mtime = inode->i_mtime;
3008        stat->ctime = inode->i_ctime;
3009        stat->blksize = 1 << inode->i_blkbits;
3010
3011        stat->size = i_size_read(inode);
3012        stat->blocks = inode->i_blocks;
3013
3014        return 0;
3015}
3016
3017static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3018                     __u64 start, __u64 len)
3019{
3020        int rc;
3021        size_t num_bytes;
3022        struct ll_user_fiemap *fiemap;
3023        unsigned int extent_count = fieinfo->fi_extents_max;
3024
3025        num_bytes = sizeof(*fiemap) + (extent_count *
3026                                       sizeof(struct ll_fiemap_extent));
3027        fiemap = libcfs_kvzalloc(num_bytes, GFP_NOFS);
3028
3029        if (fiemap == NULL)
3030                return -ENOMEM;
3031
3032        fiemap->fm_flags = fieinfo->fi_flags;
3033        fiemap->fm_extent_count = fieinfo->fi_extents_max;
3034        fiemap->fm_start = start;
3035        fiemap->fm_length = len;
3036        if (extent_count > 0)
3037                memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3038                       sizeof(struct ll_fiemap_extent));
3039
3040        rc = ll_do_fiemap(inode, fiemap, num_bytes);
3041
3042        fieinfo->fi_flags = fiemap->fm_flags;
3043        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3044        if (extent_count > 0)
3045                memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3046                       fiemap->fm_mapped_extents *
3047                       sizeof(struct ll_fiemap_extent));
3048
3049        kvfree(fiemap);
3050        return rc;
3051}
3052
3053struct posix_acl *ll_get_acl(struct inode *inode, int type)
3054{
3055        struct ll_inode_info *lli = ll_i2info(inode);
3056        struct posix_acl *acl = NULL;
3057
3058        spin_lock(&lli->lli_lock);
3059        /* VFS' acl_permission_check->check_acl will release the refcount */
3060        acl = posix_acl_dup(lli->lli_posix_acl);
3061        spin_unlock(&lli->lli_lock);
3062
3063        return acl;
3064}
3065
3066int ll_inode_permission(struct inode *inode, int mask)
3067{
3068        int rc = 0;
3069
3070#ifdef MAY_NOT_BLOCK
3071        if (mask & MAY_NOT_BLOCK)
3072                return -ECHILD;
3073#endif
3074
3075       /* as root inode are NOT getting validated in lookup operation,
3076        * need to do it before permission check. */
3077
3078        if (is_root_inode(inode)) {
3079                rc = __ll_inode_revalidate(inode->i_sb->s_root,
3080                                           MDS_INODELOCK_LOOKUP);
3081                if (rc)
3082                        return rc;
3083        }
3084
3085        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3086               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3087
3088        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3089                return lustre_check_remote_perm(inode, mask);
3090
3091        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3092        rc = generic_permission(inode, mask);
3093
3094        return rc;
3095}
3096
3097/* -o localflock - only provides locally consistent flock locks */
3098struct file_operations ll_file_operations = {
3099        .read_iter = ll_file_read_iter,
3100        .write_iter = ll_file_write_iter,
3101        .unlocked_ioctl = ll_file_ioctl,
3102        .open      = ll_file_open,
3103        .release        = ll_file_release,
3104        .mmap      = ll_file_mmap,
3105        .llseek  = ll_file_seek,
3106        .splice_read    = ll_file_splice_read,
3107        .fsync    = ll_fsync,
3108        .flush    = ll_flush
3109};
3110
3111struct file_operations ll_file_operations_flock = {
3112        .read_iter    = ll_file_read_iter,
3113        .write_iter   = ll_file_write_iter,
3114        .unlocked_ioctl = ll_file_ioctl,
3115        .open      = ll_file_open,
3116        .release        = ll_file_release,
3117        .mmap      = ll_file_mmap,
3118        .llseek  = ll_file_seek,
3119        .splice_read    = ll_file_splice_read,
3120        .fsync    = ll_fsync,
3121        .flush    = ll_flush,
3122        .flock    = ll_file_flock,
3123        .lock      = ll_file_flock
3124};
3125
3126/* These are for -o noflock - to return ENOSYS on flock calls */
3127struct file_operations ll_file_operations_noflock = {
3128        .read_iter    = ll_file_read_iter,
3129        .write_iter   = ll_file_write_iter,
3130        .unlocked_ioctl = ll_file_ioctl,
3131        .open      = ll_file_open,
3132        .release        = ll_file_release,
3133        .mmap      = ll_file_mmap,
3134        .llseek  = ll_file_seek,
3135        .splice_read    = ll_file_splice_read,
3136        .fsync    = ll_fsync,
3137        .flush    = ll_flush,
3138        .flock    = ll_file_noflock,
3139        .lock      = ll_file_noflock
3140};
3141
3142struct inode_operations ll_file_inode_operations = {
3143        .setattr        = ll_setattr,
3144        .getattr        = ll_getattr,
3145        .permission     = ll_inode_permission,
3146        .setxattr       = ll_setxattr,
3147        .getxattr       = ll_getxattr,
3148        .listxattr      = ll_listxattr,
3149        .removexattr    = ll_removexattr,
3150        .fiemap         = ll_fiemap,
3151        .get_acl        = ll_get_acl,
3152};
3153
3154/* dynamic ioctl number support routines */
3155static struct llioc_ctl_data {
3156        struct rw_semaphore     ioc_sem;
3157        struct list_head              ioc_head;
3158} llioc = {
3159        __RWSEM_INITIALIZER(llioc.ioc_sem),
3160        LIST_HEAD_INIT(llioc.ioc_head)
3161};
3162
3163struct llioc_data {
3164        struct list_head              iocd_list;
3165        unsigned int        iocd_size;
3166        llioc_callback_t        iocd_cb;
3167        unsigned int        iocd_count;
3168        unsigned int        iocd_cmd[0];
3169};
3170
3171void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3172{
3173        unsigned int size;
3174        struct llioc_data *in_data = NULL;
3175
3176        if (cb == NULL || cmd == NULL ||
3177            count > LLIOC_MAX_CMD || count < 0)
3178                return NULL;
3179
3180        size = sizeof(*in_data) + count * sizeof(unsigned int);
3181        in_data = kzalloc(size, GFP_NOFS);
3182        if (!in_data)
3183                return NULL;
3184
3185        memset(in_data, 0, sizeof(*in_data));
3186        in_data->iocd_size = size;
3187        in_data->iocd_cb = cb;
3188        in_data->iocd_count = count;
3189        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3190
3191        down_write(&llioc.ioc_sem);
3192        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3193        up_write(&llioc.ioc_sem);
3194
3195        return in_data;
3196}
3197EXPORT_SYMBOL(ll_iocontrol_register);
3198
3199void ll_iocontrol_unregister(void *magic)
3200{
3201        struct llioc_data *tmp;
3202
3203        if (magic == NULL)
3204                return;
3205
3206        down_write(&llioc.ioc_sem);
3207        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3208                if (tmp == magic) {
3209                        list_del(&tmp->iocd_list);
3210                        up_write(&llioc.ioc_sem);
3211
3212                        kfree(tmp);
3213                        return;
3214                }
3215        }
3216        up_write(&llioc.ioc_sem);
3217
3218        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3219}
3220EXPORT_SYMBOL(ll_iocontrol_unregister);
3221
3222static enum llioc_iter
3223ll_iocontrol_call(struct inode *inode, struct file *file,
3224                  unsigned int cmd, unsigned long arg, int *rcp)
3225{
3226        enum llioc_iter ret = LLIOC_CONT;
3227        struct llioc_data *data;
3228        int rc = -EINVAL, i;
3229
3230        down_read(&llioc.ioc_sem);
3231        list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3232                for (i = 0; i < data->iocd_count; i++) {
3233                        if (cmd != data->iocd_cmd[i])
3234                                continue;
3235
3236                        ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3237                        break;
3238                }
3239
3240                if (ret == LLIOC_STOP)
3241                        break;
3242        }
3243        up_read(&llioc.ioc_sem);
3244
3245        if (rcp)
3246                *rcp = rc;
3247        return ret;
3248}
3249
3250int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3251{
3252        struct ll_inode_info *lli = ll_i2info(inode);
3253        struct cl_env_nest nest;
3254        struct lu_env *env;
3255        int result;
3256
3257        if (lli->lli_clob == NULL)
3258                return 0;
3259
3260        env = cl_env_nested_get(&nest);
3261        if (IS_ERR(env))
3262                return PTR_ERR(env);
3263
3264        result = cl_conf_set(env, lli->lli_clob, conf);
3265        cl_env_nested_put(&nest, env);
3266
3267        if (conf->coc_opc == OBJECT_CONF_SET) {
3268                struct ldlm_lock *lock = conf->coc_lock;
3269
3270                LASSERT(lock != NULL);
3271                LASSERT(ldlm_has_layout(lock));
3272                if (result == 0) {
3273                        /* it can only be allowed to match after layout is
3274                         * applied to inode otherwise false layout would be
3275                         * seen. Applying layout should happen before dropping
3276                         * the intent lock. */
3277                        ldlm_lock_allow_match(lock);
3278                }
3279        }
3280        return result;
3281}
3282
3283/* Fetch layout from MDT with getxattr request, if it's not ready yet */
3284static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3285
3286{
3287        struct ll_sb_info *sbi = ll_i2sbi(inode);
3288        struct ptlrpc_request *req;
3289        struct mdt_body *body;
3290        void *lvbdata;
3291        void *lmm;
3292        int lmmsize;
3293        int rc;
3294
3295        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3296               PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3297               lock->l_lvb_data, lock->l_lvb_len);
3298
3299        if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3300                return 0;
3301
3302        /* if layout lock was granted right away, the layout is returned
3303         * within DLM_LVB of dlm reply; otherwise if the lock was ever
3304         * blocked and then granted via completion ast, we have to fetch
3305         * layout here. Please note that we can't use the LVB buffer in
3306         * completion AST because it doesn't have a large enough buffer */
3307        rc = ll_get_default_mdsize(sbi, &lmmsize);
3308        if (rc == 0)
3309                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
3310                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3311                                 lmmsize, 0, &req);
3312        if (rc < 0)
3313                return rc;
3314
3315        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3316        if (body == NULL) {
3317                rc = -EPROTO;
3318                goto out;
3319        }
3320
3321        lmmsize = body->eadatasize;
3322        if (lmmsize == 0) /* empty layout */ {
3323                rc = 0;
3324                goto out;
3325        }
3326
3327        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3328        if (lmm == NULL) {
3329                rc = -EFAULT;
3330                goto out;
3331        }
3332
3333        lvbdata = libcfs_kvzalloc(lmmsize, GFP_NOFS);
3334        if (lvbdata == NULL) {
3335                rc = -ENOMEM;
3336                goto out;
3337        }
3338
3339        memcpy(lvbdata, lmm, lmmsize);
3340        lock_res_and_lock(lock);
3341        if (lock->l_lvb_data != NULL)
3342                kvfree(lock->l_lvb_data);
3343
3344        lock->l_lvb_data = lvbdata;
3345        lock->l_lvb_len = lmmsize;
3346        unlock_res_and_lock(lock);
3347
3348out:
3349        ptlrpc_req_finished(req);
3350        return rc;
3351}
3352
3353/**
3354 * Apply the layout to the inode. Layout lock is held and will be released
3355 * in this function.
3356 */
3357static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3358                                struct inode *inode, __u32 *gen, bool reconf)
3359{
3360        struct ll_inode_info *lli = ll_i2info(inode);
3361        struct ll_sb_info    *sbi = ll_i2sbi(inode);
3362        struct ldlm_lock *lock;
3363        struct lustre_md md = { NULL };
3364        struct cl_object_conf conf;
3365        int rc = 0;
3366        bool lvb_ready;
3367        bool wait_layout = false;
3368
3369        LASSERT(lustre_handle_is_used(lockh));
3370
3371        lock = ldlm_handle2lock(lockh);
3372        LASSERT(lock != NULL);
3373        LASSERT(ldlm_has_layout(lock));
3374
3375        LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3376                   inode, PFID(&lli->lli_fid), reconf);
3377
3378        /* in case this is a caching lock and reinstate with new inode */
3379        md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3380
3381        lock_res_and_lock(lock);
3382        lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3383        unlock_res_and_lock(lock);
3384        /* checking lvb_ready is racy but this is okay. The worst case is
3385         * that multi processes may configure the file on the same time. */
3386        if (lvb_ready || !reconf) {
3387                rc = -ENODATA;
3388                if (lvb_ready) {
3389                        /* layout_gen must be valid if layout lock is not
3390                         * cancelled and stripe has already set */
3391                        *gen = ll_layout_version_get(lli);
3392                        rc = 0;
3393                }
3394                goto out;
3395        }
3396
3397        rc = ll_layout_fetch(inode, lock);
3398        if (rc < 0)
3399                goto out;
3400
3401        /* for layout lock, lmm is returned in lock's lvb.
3402         * lvb_data is immutable if the lock is held so it's safe to access it
3403         * without res lock. See the description in ldlm_lock_decref_internal()
3404         * for the condition to free lvb_data of layout lock */
3405        if (lock->l_lvb_data != NULL) {
3406                rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3407                                  lock->l_lvb_data, lock->l_lvb_len);
3408                if (rc >= 0) {
3409                        *gen = LL_LAYOUT_GEN_EMPTY;
3410                        if (md.lsm != NULL)
3411                                *gen = md.lsm->lsm_layout_gen;
3412                        rc = 0;
3413                } else {
3414                        CERROR("%s: file "DFID" unpackmd error: %d\n",
3415                                ll_get_fsname(inode->i_sb, NULL, 0),
3416                                PFID(&lli->lli_fid), rc);
3417                }
3418        }
3419        if (rc < 0)
3420                goto out;
3421
3422        /* set layout to file. Unlikely this will fail as old layout was
3423         * surely eliminated */
3424        memset(&conf, 0, sizeof(conf));
3425        conf.coc_opc = OBJECT_CONF_SET;
3426        conf.coc_inode = inode;
3427        conf.coc_lock = lock;
3428        conf.u.coc_md = &md;
3429        rc = ll_layout_conf(inode, &conf);
3430
3431        if (md.lsm != NULL)
3432                obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3433
3434        /* refresh layout failed, need to wait */
3435        wait_layout = rc == -EBUSY;
3436
3437out:
3438        LDLM_LOCK_PUT(lock);
3439        ldlm_lock_decref(lockh, mode);
3440
3441        /* wait for IO to complete if it's still being used. */
3442        if (wait_layout) {
3443                CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3444                        ll_get_fsname(inode->i_sb, NULL, 0),
3445                        inode, PFID(&lli->lli_fid));
3446
3447                memset(&conf, 0, sizeof(conf));
3448                conf.coc_opc = OBJECT_CONF_WAIT;
3449                conf.coc_inode = inode;
3450                rc = ll_layout_conf(inode, &conf);
3451                if (rc == 0)
3452                        rc = -EAGAIN;
3453
3454                CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3455                        PFID(&lli->lli_fid), rc);
3456        }
3457        return rc;
3458}
3459
3460/**
3461 * This function checks if there exists a LAYOUT lock on the client side,
3462 * or enqueues it if it doesn't have one in cache.
3463 *
3464 * This function will not hold layout lock so it may be revoked any time after
3465 * this function returns. Any operations depend on layout should be redone
3466 * in that case.
3467 *
3468 * This function should be called before lov_io_init() to get an uptodate
3469 * layout version, the caller should save the version number and after IO
3470 * is finished, this function should be called again to verify that layout
3471 * is not changed during IO time.
3472 */
3473int ll_layout_refresh(struct inode *inode, __u32 *gen)
3474{
3475        struct ll_inode_info  *lli = ll_i2info(inode);
3476        struct ll_sb_info     *sbi = ll_i2sbi(inode);
3477        struct md_op_data     *op_data;
3478        struct lookup_intent   it;
3479        struct lustre_handle   lockh;
3480        ldlm_mode_t            mode;
3481        struct ldlm_enqueue_info einfo = {
3482                .ei_type = LDLM_IBITS,
3483                .ei_mode = LCK_CR,
3484                .ei_cb_bl = ll_md_blocking_ast,
3485                .ei_cb_cp = ldlm_completion_ast,
3486        };
3487        int rc;
3488
3489        *gen = ll_layout_version_get(lli);
3490        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != LL_LAYOUT_GEN_NONE)
3491                return 0;
3492
3493        /* sanity checks */
3494        LASSERT(fid_is_sane(ll_inode2fid(inode)));
3495        LASSERT(S_ISREG(inode->i_mode));
3496
3497        /* take layout lock mutex to enqueue layout lock exclusively. */
3498        mutex_lock(&lli->lli_layout_mutex);
3499
3500again:
3501        /* mostly layout lock is caching on the local side, so try to match
3502         * it before grabbing layout lock mutex. */
3503        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
3504                               LCK_CR | LCK_CW | LCK_PR | LCK_PW);
3505        if (mode != 0) { /* hit cached lock */
3506                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3507                if (rc == -EAGAIN)
3508                        goto again;
3509
3510                mutex_unlock(&lli->lli_layout_mutex);
3511                return rc;
3512        }
3513
3514        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3515                        0, 0, LUSTRE_OPC_ANY, NULL);
3516        if (IS_ERR(op_data)) {
3517                mutex_unlock(&lli->lli_layout_mutex);
3518                return PTR_ERR(op_data);
3519        }
3520
3521        /* have to enqueue one */
3522        memset(&it, 0, sizeof(it));
3523        it.it_op = IT_LAYOUT;
3524        lockh.cookie = 0ULL;
3525
3526        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3527                        ll_get_fsname(inode->i_sb, NULL, 0), inode,
3528                        PFID(&lli->lli_fid));
3529
3530        rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3531                        NULL, 0, NULL, 0);
3532        if (it.d.lustre.it_data != NULL)
3533                ptlrpc_req_finished(it.d.lustre.it_data);
3534        it.d.lustre.it_data = NULL;
3535
3536        ll_finish_md_op_data(op_data);
3537
3538        mode = it.d.lustre.it_lock_mode;
3539        it.d.lustre.it_lock_mode = 0;
3540        ll_intent_drop_lock(&it);
3541
3542        if (rc == 0) {
3543                /* set lock data in case this is a new lock */
3544                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3545                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3546                if (rc == -EAGAIN)
3547                        goto again;
3548        }
3549        mutex_unlock(&lli->lli_layout_mutex);
3550
3551        return rc;
3552}
3553
3554/**
3555 *  This function send a restore request to the MDT
3556 */
3557int ll_layout_restore(struct inode *inode)
3558{
3559        struct hsm_user_request *hur;
3560        int                      len, rc;
3561
3562        len = sizeof(struct hsm_user_request) +
3563              sizeof(struct hsm_user_item);
3564        hur = kzalloc(len, GFP_NOFS);
3565        if (!hur)
3566                return -ENOMEM;
3567
3568        hur->hur_request.hr_action = HUA_RESTORE;
3569        hur->hur_request.hr_archive_id = 0;
3570        hur->hur_request.hr_flags = 0;
3571        memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3572               sizeof(hur->hur_user_item[0].hui_fid));
3573        hur->hur_user_item[0].hui_extent.length = -1;
3574        hur->hur_request.hr_itemcount = 1;
3575        rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3576                           len, hur, NULL);
3577        kfree(hur);
3578        return rc;
3579}
3580