linux/drivers/staging/lustre/lustre/llite/file.c
<<
>>
Prefs
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 *
  36 * lustre/llite/file.c
  37 *
  38 * Author: Peter Braam <braam@clusterfs.com>
  39 * Author: Phil Schwan <phil@clusterfs.com>
  40 * Author: Andreas Dilger <adilger@clusterfs.com>
  41 */
  42
  43#define DEBUG_SUBSYSTEM S_LLITE
  44#include <lustre_dlm.h>
  45#include <lustre_lite.h>
  46#include <linux/pagemap.h>
  47#include <linux/file.h>
  48#include "llite_internal.h"
  49#include <lustre/ll_fiemap.h>
  50
  51#include "cl_object.h"
  52
  53struct ll_file_data *ll_file_data_get(void)
  54{
  55        struct ll_file_data *fd;
  56
  57        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
  58        fd->fd_write_failed = false;
  59        return fd;
  60}
  61
  62static void ll_file_data_put(struct ll_file_data *fd)
  63{
  64        if (fd != NULL)
  65                OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  66}
  67
  68void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  69                          struct lustre_handle *fh)
  70{
  71        op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  72        op_data->op_attr.ia_mode = inode->i_mode;
  73        op_data->op_attr.ia_atime = inode->i_atime;
  74        op_data->op_attr.ia_mtime = inode->i_mtime;
  75        op_data->op_attr.ia_ctime = inode->i_ctime;
  76        op_data->op_attr.ia_size = i_size_read(inode);
  77        op_data->op_attr_blocks = inode->i_blocks;
  78        ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  79                                        ll_inode_to_ext_flags(inode->i_flags);
  80        op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  81        if (fh)
  82                op_data->op_handle = *fh;
  83        op_data->op_capa1 = ll_mdscapa_get(inode);
  84
  85        if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  86                op_data->op_bias |= MDS_DATA_MODIFIED;
  87}
  88
  89/**
  90 * Closes the IO epoch and packs all the attributes into @op_data for
  91 * the CLOSE rpc.
  92 */
  93static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  94                             struct obd_client_handle *och)
  95{
  96        ENTRY;
  97
  98        op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
  99                                        ATTR_MTIME | ATTR_MTIME_SET |
 100                                        ATTR_CTIME | ATTR_CTIME_SET;
 101
 102        if (!(och->och_flags & FMODE_WRITE))
 103                goto out;
 104
 105        if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 106                op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 107        else
 108                ll_ioepoch_close(inode, op_data, &och, 0);
 109
 110out:
 111        ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 112        ll_prep_md_op_data(op_data, inode, NULL, NULL,
 113                           0, 0, LUSTRE_OPC_ANY, NULL);
 114        EXIT;
 115}
 116
 117static int ll_close_inode_openhandle(struct obd_export *md_exp,
 118                                     struct inode *inode,
 119                                     struct obd_client_handle *och)
 120{
 121        struct obd_export *exp = ll_i2mdexp(inode);
 122        struct md_op_data *op_data;
 123        struct ptlrpc_request *req = NULL;
 124        struct obd_device *obd = class_exp2obd(exp);
 125        int epoch_close = 1;
 126        int rc;
 127        ENTRY;
 128
 129        if (obd == NULL) {
 130                /*
 131                 * XXX: in case of LMV, is this correct to access
 132                 * ->exp_handle?
 133                 */
 134                CERROR("Invalid MDC connection handle "LPX64"\n",
 135                       ll_i2mdexp(inode)->exp_handle.h_cookie);
 136                GOTO(out, rc = 0);
 137        }
 138
 139        OBD_ALLOC_PTR(op_data);
 140        if (op_data == NULL)
 141                GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
 142
 143        ll_prepare_close(inode, op_data, och);
 144        epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 145        rc = md_close(md_exp, op_data, och->och_mod, &req);
 146        if (rc == -EAGAIN) {
 147                /* This close must have the epoch closed. */
 148                LASSERT(epoch_close);
 149                /* MDS has instructed us to obtain Size-on-MDS attribute from
 150                 * OSTs and send setattr to back to MDS. */
 151                rc = ll_som_update(inode, op_data);
 152                if (rc) {
 153                        CERROR("inode %lu mdc Size-on-MDS update failed: "
 154                               "rc = %d\n", inode->i_ino, rc);
 155                        rc = 0;
 156                }
 157        } else if (rc) {
 158                CERROR("inode %lu mdc close failed: rc = %d\n",
 159                       inode->i_ino, rc);
 160        }
 161
 162        /* DATA_MODIFIED flag was successfully sent on close, cancel data
 163         * modification flag. */
 164        if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 165                struct ll_inode_info *lli = ll_i2info(inode);
 166
 167                spin_lock(&lli->lli_lock);
 168                lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 169                spin_unlock(&lli->lli_lock);
 170        }
 171
 172        ll_finish_md_op_data(op_data);
 173
 174        if (rc == 0) {
 175                rc = ll_objects_destroy(req, inode);
 176                if (rc)
 177                        CERROR("inode %lu ll_objects destroy: rc = %d\n",
 178                               inode->i_ino, rc);
 179        }
 180
 181        EXIT;
 182out:
 183
 184        if (exp_connect_som(exp) && !epoch_close &&
 185            S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 186                ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 187        } else {
 188                md_clear_open_replay_data(md_exp, och);
 189                /* Free @och if it is not waiting for DONE_WRITING. */
 190                och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 191                OBD_FREE_PTR(och);
 192        }
 193        if (req) /* This is close request */
 194                ptlrpc_req_finished(req);
 195        return rc;
 196}
 197
 198int ll_md_real_close(struct inode *inode, int flags)
 199{
 200        struct ll_inode_info *lli = ll_i2info(inode);
 201        struct obd_client_handle **och_p;
 202        struct obd_client_handle *och;
 203        __u64 *och_usecount;
 204        int rc = 0;
 205        ENTRY;
 206
 207        if (flags & FMODE_WRITE) {
 208                och_p = &lli->lli_mds_write_och;
 209                och_usecount = &lli->lli_open_fd_write_count;
 210        } else if (flags & FMODE_EXEC) {
 211                och_p = &lli->lli_mds_exec_och;
 212                och_usecount = &lli->lli_open_fd_exec_count;
 213        } else {
 214                LASSERT(flags & FMODE_READ);
 215                och_p = &lli->lli_mds_read_och;
 216                och_usecount = &lli->lli_open_fd_read_count;
 217        }
 218
 219        mutex_lock(&lli->lli_och_mutex);
 220        if (*och_usecount) { /* There are still users of this handle, so
 221                                skip freeing it. */
 222                mutex_unlock(&lli->lli_och_mutex);
 223                RETURN(0);
 224        }
 225        och=*och_p;
 226        *och_p = NULL;
 227        mutex_unlock(&lli->lli_och_mutex);
 228
 229        if (och) { /* There might be a race and somebody have freed this och
 230                      already */
 231                rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 232                                               inode, och);
 233        }
 234
 235        RETURN(rc);
 236}
 237
 238int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 239                struct file *file)
 240{
 241        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 242        struct ll_inode_info *lli = ll_i2info(inode);
 243        int rc = 0;
 244        ENTRY;
 245
 246        /* clear group lock, if present */
 247        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 248                ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 249
 250        /* Let's see if we have good enough OPEN lock on the file and if
 251           we can skip talking to MDS */
 252        if (file->f_dentry->d_inode) { /* Can this ever be false? */
 253                int lockmode;
 254                int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 255                struct lustre_handle lockh;
 256                struct inode *inode = file->f_dentry->d_inode;
 257                ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 258
 259                mutex_lock(&lli->lli_och_mutex);
 260                if (fd->fd_omode & FMODE_WRITE) {
 261                        lockmode = LCK_CW;
 262                        LASSERT(lli->lli_open_fd_write_count);
 263                        lli->lli_open_fd_write_count--;
 264                } else if (fd->fd_omode & FMODE_EXEC) {
 265                        lockmode = LCK_PR;
 266                        LASSERT(lli->lli_open_fd_exec_count);
 267                        lli->lli_open_fd_exec_count--;
 268                } else {
 269                        lockmode = LCK_CR;
 270                        LASSERT(lli->lli_open_fd_read_count);
 271                        lli->lli_open_fd_read_count--;
 272                }
 273                mutex_unlock(&lli->lli_och_mutex);
 274
 275                if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 276                                   LDLM_IBITS, &policy, lockmode,
 277                                   &lockh)) {
 278                        rc = ll_md_real_close(file->f_dentry->d_inode,
 279                                              fd->fd_omode);
 280                }
 281        } else {
 282                CERROR("Releasing a file %p with negative dentry %p. Name %s",
 283                       file, file->f_dentry, file->f_dentry->d_name.name);
 284        }
 285
 286        LUSTRE_FPRIVATE(file) = NULL;
 287        ll_file_data_put(fd);
 288        ll_capa_close(inode);
 289
 290        RETURN(rc);
 291}
 292
 293/* While this returns an error code, fput() the caller does not, so we need
 294 * to make every effort to clean up all of our state here.  Also, applications
 295 * rarely check close errors and even if an error is returned they will not
 296 * re-try the close call.
 297 */
 298int ll_file_release(struct inode *inode, struct file *file)
 299{
 300        struct ll_file_data *fd;
 301        struct ll_sb_info *sbi = ll_i2sbi(inode);
 302        struct ll_inode_info *lli = ll_i2info(inode);
 303        int rc;
 304        ENTRY;
 305
 306        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 307               inode->i_generation, inode);
 308
 309#ifdef CONFIG_FS_POSIX_ACL
 310        if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
 311            inode == inode->i_sb->s_root->d_inode) {
 312                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 313
 314                LASSERT(fd != NULL);
 315                if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 316                        fd->fd_flags &= ~LL_FILE_RMTACL;
 317                        rct_del(&sbi->ll_rct, current_pid());
 318                        et_search_free(&sbi->ll_et, current_pid());
 319                }
 320        }
 321#endif
 322
 323        if (inode->i_sb->s_root != file->f_dentry)
 324                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 325        fd = LUSTRE_FPRIVATE(file);
 326        LASSERT(fd != NULL);
 327
 328        /* The last ref on @file, maybe not the the owner pid of statahead.
 329         * Different processes can open the same dir, "ll_opendir_key" means:
 330         * it is me that should stop the statahead thread. */
 331        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 332            lli->lli_opendir_pid != 0)
 333                ll_stop_statahead(inode, lli->lli_opendir_key);
 334
 335        if (inode->i_sb->s_root == file->f_dentry) {
 336                LUSTRE_FPRIVATE(file) = NULL;
 337                ll_file_data_put(fd);
 338                RETURN(0);
 339        }
 340
 341        if (!S_ISDIR(inode->i_mode)) {
 342                lov_read_and_clear_async_rc(lli->lli_clob);
 343                lli->lli_async_rc = 0;
 344        }
 345
 346        rc = ll_md_close(sbi->ll_md_exp, inode, file);
 347
 348        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 349                libcfs_debug_dumplog();
 350
 351        RETURN(rc);
 352}
 353
 354static int ll_intent_file_open(struct file *file, void *lmm,
 355                               int lmmsize, struct lookup_intent *itp)
 356{
 357        struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 358        struct dentry *parent = file->f_dentry->d_parent;
 359        const char *name = file->f_dentry->d_name.name;
 360        const int len = file->f_dentry->d_name.len;
 361        struct md_op_data *op_data;
 362        struct ptlrpc_request *req;
 363        __u32 opc = LUSTRE_OPC_ANY;
 364        int rc;
 365        ENTRY;
 366
 367        if (!parent)
 368                RETURN(-ENOENT);
 369
 370        /* Usually we come here only for NFSD, and we want open lock.
 371           But we can also get here with pre 2.6.15 patchless kernels, and in
 372           that case that lock is also ok */
 373        /* We can also get here if there was cached open handle in revalidate_it
 374         * but it disappeared while we were getting from there to ll_file_open.
 375         * But this means this file was closed and immediatelly opened which
 376         * makes a good candidate for using OPEN lock */
 377        /* If lmmsize & lmm are not 0, we are just setting stripe info
 378         * parameters. No need for the open lock */
 379        if (lmm == NULL && lmmsize == 0) {
 380                itp->it_flags |= MDS_OPEN_LOCK;
 381                if (itp->it_flags & FMODE_WRITE)
 382                        opc = LUSTRE_OPC_CREATE;
 383        }
 384
 385        op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 386                                      file->f_dentry->d_inode, name, len,
 387                                      O_RDWR, opc, NULL);
 388        if (IS_ERR(op_data))
 389                RETURN(PTR_ERR(op_data));
 390
 391        itp->it_flags |= MDS_OPEN_BY_FID;
 392        rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 393                            0 /*unused */, &req, ll_md_blocking_ast, 0);
 394        ll_finish_md_op_data(op_data);
 395        if (rc == -ESTALE) {
 396                /* reason for keep own exit path - don`t flood log
 397                * with messages with -ESTALE errors.
 398                */
 399                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 400                     it_open_error(DISP_OPEN_OPEN, itp))
 401                        GOTO(out, rc);
 402                ll_release_openhandle(file->f_dentry, itp);
 403                GOTO(out, rc);
 404        }
 405
 406        if (it_disposition(itp, DISP_LOOKUP_NEG))
 407                GOTO(out, rc = -ENOENT);
 408
 409        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 410                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 411                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 412                GOTO(out, rc);
 413        }
 414
 415        rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
 416        if (!rc && itp->d.lustre.it_lock_mode)
 417                ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
 418                                 itp, NULL);
 419
 420out:
 421        ptlrpc_req_finished(itp->d.lustre.it_data);
 422        it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 423        ll_intent_drop_lock(itp);
 424
 425        RETURN(rc);
 426}
 427
 428/**
 429 * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 430 * not believe attributes if a few ioepoch holders exist. Attributes for
 431 * previous ioepoch if new one is opened are also skipped by MDS.
 432 */
 433void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 434{
 435        if (ioepoch && lli->lli_ioepoch != ioepoch) {
 436                lli->lli_ioepoch = ioepoch;
 437                CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 438                       ioepoch, PFID(&lli->lli_fid));
 439        }
 440}
 441
 442static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
 443                       struct lookup_intent *it, struct obd_client_handle *och)
 444{
 445        struct ptlrpc_request *req = it->d.lustre.it_data;
 446        struct mdt_body *body;
 447
 448        LASSERT(och);
 449
 450        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 451        LASSERT(body != NULL);                /* reply already checked out */
 452
 453        memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
 454        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 455        och->och_fid = lli->lli_fid;
 456        och->och_flags = it->it_flags;
 457        ll_ioepoch_open(lli, body->ioepoch);
 458
 459        return md_set_open_replay_data(md_exp, och, req);
 460}
 461
 462int ll_local_open(struct file *file, struct lookup_intent *it,
 463                  struct ll_file_data *fd, struct obd_client_handle *och)
 464{
 465        struct inode *inode = file->f_dentry->d_inode;
 466        struct ll_inode_info *lli = ll_i2info(inode);
 467        ENTRY;
 468
 469        LASSERT(!LUSTRE_FPRIVATE(file));
 470
 471        LASSERT(fd != NULL);
 472
 473        if (och) {
 474                struct ptlrpc_request *req = it->d.lustre.it_data;
 475                struct mdt_body *body;
 476                int rc;
 477
 478                rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
 479                if (rc)
 480                        RETURN(rc);
 481
 482                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 483                if ((it->it_flags & FMODE_WRITE) &&
 484                    (body->valid & OBD_MD_FLSIZE))
 485                        CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 486                               lli->lli_ioepoch, PFID(&lli->lli_fid));
 487        }
 488
 489        LUSTRE_FPRIVATE(file) = fd;
 490        ll_readahead_init(inode, &fd->fd_ras);
 491        fd->fd_omode = it->it_flags;
 492        RETURN(0);
 493}
 494
 495/* Open a file, and (for the very first open) create objects on the OSTs at
 496 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 497 * creation or open until ll_lov_setstripe() ioctl is called.
 498 *
 499 * If we already have the stripe MD locally then we don't request it in
 500 * md_open(), by passing a lmm_size = 0.
 501 *
 502 * It is up to the application to ensure no other processes open this file
 503 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 504 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 505 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 506 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 507 */
 508int ll_file_open(struct inode *inode, struct file *file)
 509{
 510        struct ll_inode_info *lli = ll_i2info(inode);
 511        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 512                                          .it_flags = file->f_flags };
 513        struct obd_client_handle **och_p = NULL;
 514        __u64 *och_usecount = NULL;
 515        struct ll_file_data *fd;
 516        int rc = 0, opendir_set = 0;
 517        ENTRY;
 518
 519        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 520               inode->i_generation, inode, file->f_flags);
 521
 522        it = file->private_data; /* XXX: compat macro */
 523        file->private_data = NULL; /* prevent ll_local_open assertion */
 524
 525        fd = ll_file_data_get();
 526        if (fd == NULL)
 527                GOTO(out_och_free, rc = -ENOMEM);
 528
 529        fd->fd_file = file;
 530        if (S_ISDIR(inode->i_mode)) {
 531                spin_lock(&lli->lli_sa_lock);
 532                if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
 533                    lli->lli_opendir_pid == 0) {
 534                        lli->lli_opendir_key = fd;
 535                        lli->lli_opendir_pid = current_pid();
 536                        opendir_set = 1;
 537                }
 538                spin_unlock(&lli->lli_sa_lock);
 539        }
 540
 541        if (inode->i_sb->s_root == file->f_dentry) {
 542                LUSTRE_FPRIVATE(file) = fd;
 543                RETURN(0);
 544        }
 545
 546        if (!it || !it->d.lustre.it_disposition) {
 547                /* Convert f_flags into access mode. We cannot use file->f_mode,
 548                 * because everything but O_ACCMODE mask was stripped from
 549                 * there */
 550                if ((oit.it_flags + 1) & O_ACCMODE)
 551                        oit.it_flags++;
 552                if (file->f_flags & O_TRUNC)
 553                        oit.it_flags |= FMODE_WRITE;
 554
 555                /* kernel only call f_op->open in dentry_open.  filp_open calls
 556                 * dentry_open after call to open_namei that checks permissions.
 557                 * Only nfsd_open call dentry_open directly without checking
 558                 * permissions and because of that this code below is safe. */
 559                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 560                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 561
 562                /* We do not want O_EXCL here, presumably we opened the file
 563                 * already? XXX - NFS implications? */
 564                oit.it_flags &= ~O_EXCL;
 565
 566                /* bug20584, if "it_flags" contains O_CREAT, the file will be
 567                 * created if necessary, then "IT_CREAT" should be set to keep
 568                 * consistent with it */
 569                if (oit.it_flags & O_CREAT)
 570                        oit.it_op |= IT_CREAT;
 571
 572                it = &oit;
 573        }
 574
 575restart:
 576        /* Let's see if we have file open on MDS already. */
 577        if (it->it_flags & FMODE_WRITE) {
 578                och_p = &lli->lli_mds_write_och;
 579                och_usecount = &lli->lli_open_fd_write_count;
 580        } else if (it->it_flags & FMODE_EXEC) {
 581                och_p = &lli->lli_mds_exec_och;
 582                och_usecount = &lli->lli_open_fd_exec_count;
 583         } else {
 584                och_p = &lli->lli_mds_read_och;
 585                och_usecount = &lli->lli_open_fd_read_count;
 586        }
 587
 588        mutex_lock(&lli->lli_och_mutex);
 589        if (*och_p) { /* Open handle is present */
 590                if (it_disposition(it, DISP_OPEN_OPEN)) {
 591                        /* Well, there's extra open request that we do not need,
 592                           let's close it somehow. This will decref request. */
 593                        rc = it_open_error(DISP_OPEN_OPEN, it);
 594                        if (rc) {
 595                                mutex_unlock(&lli->lli_och_mutex);
 596                                GOTO(out_openerr, rc);
 597                        }
 598
 599                        ll_release_openhandle(file->f_dentry, it);
 600                }
 601                (*och_usecount)++;
 602
 603                rc = ll_local_open(file, it, fd, NULL);
 604                if (rc) {
 605                        (*och_usecount)--;
 606                        mutex_unlock(&lli->lli_och_mutex);
 607                        GOTO(out_openerr, rc);
 608                }
 609        } else {
 610                LASSERT(*och_usecount == 0);
 611                if (!it->d.lustre.it_disposition) {
 612                        /* We cannot just request lock handle now, new ELC code
 613                           means that one of other OPEN locks for this file
 614                           could be cancelled, and since blocking ast handler
 615                           would attempt to grab och_mutex as well, that would
 616                           result in a deadlock */
 617                        mutex_unlock(&lli->lli_och_mutex);
 618                        it->it_create_mode |= M_CHECK_STALE;
 619                        rc = ll_intent_file_open(file, NULL, 0, it);
 620                        it->it_create_mode &= ~M_CHECK_STALE;
 621                        if (rc)
 622                                GOTO(out_openerr, rc);
 623
 624                        goto restart;
 625                }
 626                OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 627                if (!*och_p)
 628                        GOTO(out_och_free, rc = -ENOMEM);
 629
 630                (*och_usecount)++;
 631
 632                /* md_intent_lock() didn't get a request ref if there was an
 633                 * open error, so don't do cleanup on the request here
 634                 * (bug 3430) */
 635                /* XXX (green): Should not we bail out on any error here, not
 636                 * just open error? */
 637                rc = it_open_error(DISP_OPEN_OPEN, it);
 638                if (rc)
 639                        GOTO(out_och_free, rc);
 640
 641                LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 642
 643                rc = ll_local_open(file, it, fd, *och_p);
 644                if (rc)
 645                        GOTO(out_och_free, rc);
 646        }
 647        mutex_unlock(&lli->lli_och_mutex);
 648        fd = NULL;
 649
 650        /* Must do this outside lli_och_mutex lock to prevent deadlock where
 651           different kind of OPEN lock for this same inode gets cancelled
 652           by ldlm_cancel_lru */
 653        if (!S_ISREG(inode->i_mode))
 654                GOTO(out_och_free, rc);
 655
 656        ll_capa_open(inode);
 657
 658        if (!lli->lli_has_smd) {
 659                if (file->f_flags & O_LOV_DELAY_CREATE ||
 660                    !(file->f_mode & FMODE_WRITE)) {
 661                        CDEBUG(D_INODE, "object creation was delayed\n");
 662                        GOTO(out_och_free, rc);
 663                }
 664        }
 665        file->f_flags &= ~O_LOV_DELAY_CREATE;
 666        GOTO(out_och_free, rc);
 667
 668out_och_free:
 669        if (rc) {
 670                if (och_p && *och_p) {
 671                        OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 672                        *och_p = NULL; /* OBD_FREE writes some magic there */
 673                        (*och_usecount)--;
 674                }
 675                mutex_unlock(&lli->lli_och_mutex);
 676
 677out_openerr:
 678                if (opendir_set != 0)
 679                        ll_stop_statahead(inode, lli->lli_opendir_key);
 680                if (fd != NULL)
 681                        ll_file_data_put(fd);
 682        } else {
 683                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 684        }
 685
 686        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 687                ptlrpc_req_finished(it->d.lustre.it_data);
 688                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 689        }
 690
 691        return rc;
 692}
 693
 694/* Fills the obdo with the attributes for the lsm */
 695static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 696                          struct obd_capa *capa, struct obdo *obdo,
 697                          __u64 ioepoch, int sync)
 698{
 699        struct ptlrpc_request_set *set;
 700        struct obd_info     oinfo = { { { 0 } } };
 701        int                     rc;
 702
 703        ENTRY;
 704
 705        LASSERT(lsm != NULL);
 706
 707        oinfo.oi_md = lsm;
 708        oinfo.oi_oa = obdo;
 709        oinfo.oi_oa->o_oi = lsm->lsm_oi;
 710        oinfo.oi_oa->o_mode = S_IFREG;
 711        oinfo.oi_oa->o_ioepoch = ioepoch;
 712        oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 713                               OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 714                               OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 715                               OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 716                               OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 717                               OBD_MD_FLDATAVERSION;
 718        oinfo.oi_capa = capa;
 719        if (sync) {
 720                oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 721                oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 722        }
 723
 724        set = ptlrpc_prep_set();
 725        if (set == NULL) {
 726                CERROR("can't allocate ptlrpc set\n");
 727                rc = -ENOMEM;
 728        } else {
 729                rc = obd_getattr_async(exp, &oinfo, set);
 730                if (rc == 0)
 731                        rc = ptlrpc_set_wait(set);
 732                ptlrpc_set_destroy(set);
 733        }
 734        if (rc == 0)
 735                oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 736                                         OBD_MD_FLATIME | OBD_MD_FLMTIME |
 737                                         OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 738                                         OBD_MD_FLDATAVERSION);
 739        RETURN(rc);
 740}
 741
 742/**
 743  * Performs the getattr on the inode and updates its fields.
 744  * If @sync != 0, perform the getattr under the server-side lock.
 745  */
 746int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 747                     __u64 ioepoch, int sync)
 748{
 749        struct obd_capa      *capa = ll_mdscapa_get(inode);
 750        struct lov_stripe_md *lsm;
 751        int rc;
 752        ENTRY;
 753
 754        lsm = ccc_inode_lsm_get(inode);
 755        rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 756                            capa, obdo, ioepoch, sync);
 757        capa_put(capa);
 758        if (rc == 0) {
 759                struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
 760
 761                obdo_refresh_inode(inode, obdo, obdo->o_valid);
 762                CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
 763                       " blksize %lu\n", POSTID(oi), i_size_read(inode),
 764                       (unsigned long long)inode->i_blocks,
 765                       (unsigned long)ll_inode_blksize(inode));
 766        }
 767        ccc_inode_lsm_put(inode, lsm);
 768        RETURN(rc);
 769}
 770
 771int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 772{
 773        struct ll_inode_info *lli = ll_i2info(inode);
 774        struct cl_object *obj = lli->lli_clob;
 775        struct cl_attr *attr = ccc_env_thread_attr(env);
 776        struct ost_lvb lvb;
 777        int rc = 0;
 778
 779        ENTRY;
 780
 781        ll_inode_size_lock(inode);
 782        /* merge timestamps the most recently obtained from mds with
 783           timestamps obtained from osts */
 784        LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 785        LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
 786        LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
 787        inode_init_lvb(inode, &lvb);
 788
 789        cl_object_attr_lock(obj);
 790        rc = cl_object_attr_get(env, obj, attr);
 791        cl_object_attr_unlock(obj);
 792
 793        if (rc == 0) {
 794                if (lvb.lvb_atime < attr->cat_atime)
 795                        lvb.lvb_atime = attr->cat_atime;
 796                if (lvb.lvb_ctime < attr->cat_ctime)
 797                        lvb.lvb_ctime = attr->cat_ctime;
 798                if (lvb.lvb_mtime < attr->cat_mtime)
 799                        lvb.lvb_mtime = attr->cat_mtime;
 800
 801                CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
 802                                PFID(&lli->lli_fid), attr->cat_size);
 803                cl_isize_write_nolock(inode, attr->cat_size);
 804
 805                inode->i_blocks = attr->cat_blocks;
 806
 807                LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
 808                LTIME_S(inode->i_atime) = lvb.lvb_atime;
 809                LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
 810        }
 811        ll_inode_size_unlock(inode);
 812
 813        RETURN(rc);
 814}
 815
 816int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
 817                     lstat_t *st)
 818{
 819        struct obdo obdo = { 0 };
 820        int rc;
 821
 822        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
 823        if (rc == 0) {
 824                st->st_size   = obdo.o_size;
 825                st->st_blocks = obdo.o_blocks;
 826                st->st_mtime  = obdo.o_mtime;
 827                st->st_atime  = obdo.o_atime;
 828                st->st_ctime  = obdo.o_ctime;
 829        }
 830        return rc;
 831}
 832
 833void ll_io_init(struct cl_io *io, const struct file *file, int write)
 834{
 835        struct inode *inode = file->f_dentry->d_inode;
 836
 837        io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
 838        if (write) {
 839                io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
 840                io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
 841                                      file->f_flags & O_DIRECT ||
 842                                      IS_SYNC(inode);
 843        }
 844        io->ci_obj     = ll_i2info(inode)->lli_clob;
 845        io->ci_lockreq = CILR_MAYBE;
 846        if (ll_file_nolock(file)) {
 847                io->ci_lockreq = CILR_NEVER;
 848                io->ci_no_srvlock = 1;
 849        } else if (file->f_flags & O_APPEND) {
 850                io->ci_lockreq = CILR_MANDATORY;
 851        }
 852}
 853
 854static ssize_t
 855ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
 856                   struct file *file, enum cl_io_type iot,
 857                   loff_t *ppos, size_t count)
 858{
 859        struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
 860        struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
 861        struct cl_io     *io;
 862        ssize_t        result;
 863        ENTRY;
 864
 865restart:
 866        io = ccc_env_thread_io(env);
 867        ll_io_init(io, file, iot == CIT_WRITE);
 868
 869        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
 870                struct vvp_io *vio = vvp_env_io(env);
 871                struct ccc_io *cio = ccc_env_io(env);
 872                int write_mutex_locked = 0;
 873
 874                cio->cui_fd  = LUSTRE_FPRIVATE(file);
 875                vio->cui_io_subtype = args->via_io_subtype;
 876
 877                switch (vio->cui_io_subtype) {
 878                case IO_NORMAL:
 879                        cio->cui_iov = args->u.normal.via_iov;
 880                        cio->cui_nrsegs = args->u.normal.via_nrsegs;
 881                        cio->cui_tot_nrsegs = cio->cui_nrsegs;
 882                        cio->cui_iocb = args->u.normal.via_iocb;
 883                        if ((iot == CIT_WRITE) &&
 884                            !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
 885                                if (mutex_lock_interruptible(&lli->
 886                                                               lli_write_mutex))
 887                                        GOTO(out, result = -ERESTARTSYS);
 888                                write_mutex_locked = 1;
 889                        } else if (iot == CIT_READ) {
 890                                down_read(&lli->lli_trunc_sem);
 891                        }
 892                        break;
 893                case IO_SENDFILE:
 894                        vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
 895                        vio->u.sendfile.cui_target = args->u.sendfile.via_target;
 896                        break;
 897                case IO_SPLICE:
 898                        vio->u.splice.cui_pipe = args->u.splice.via_pipe;
 899                        vio->u.splice.cui_flags = args->u.splice.via_flags;
 900                        break;
 901                default:
 902                        CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
 903                        LBUG();
 904                }
 905                result = cl_io_loop(env, io);
 906                if (write_mutex_locked)
 907                        mutex_unlock(&lli->lli_write_mutex);
 908                else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
 909                        up_read(&lli->lli_trunc_sem);
 910        } else {
 911                /* cl_io_rw_init() handled IO */
 912                result = io->ci_result;
 913        }
 914
 915        if (io->ci_nob > 0) {
 916                result = io->ci_nob;
 917                *ppos = io->u.ci_wr.wr.crw_pos;
 918        }
 919        GOTO(out, result);
 920out:
 921        cl_io_fini(env, io);
 922        /* If any bit been read/written (result != 0), we just return
 923         * short read/write instead of restart io. */
 924        if (result == 0 && io->ci_need_restart) {
 925                CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
 926                       iot == CIT_READ ? "read" : "write",
 927                       file->f_dentry->d_name.name, *ppos, count);
 928                LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
 929                goto restart;
 930        }
 931
 932        if (iot == CIT_READ) {
 933                if (result >= 0)
 934                        ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 935                                           LPROC_LL_READ_BYTES, result);
 936        } else if (iot == CIT_WRITE) {
 937                if (result >= 0) {
 938                        ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
 939                                           LPROC_LL_WRITE_BYTES, result);
 940                        fd->fd_write_failed = false;
 941                } else if (result != -ERESTARTSYS) {
 942                        fd->fd_write_failed = true;
 943                }
 944        }
 945
 946        return result;
 947}
 948
 949
 950/*
 951 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 952 */
 953static int ll_file_get_iov_count(const struct iovec *iov,
 954                                 unsigned long *nr_segs, size_t *count)
 955{
 956        size_t cnt = 0;
 957        unsigned long seg;
 958
 959        for (seg = 0; seg < *nr_segs; seg++) {
 960                const struct iovec *iv = &iov[seg];
 961
 962                /*
 963                 * If any segment has a negative length, or the cumulative
 964                 * length ever wraps negative then return -EINVAL.
 965                 */
 966                cnt += iv->iov_len;
 967                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
 968                        return -EINVAL;
 969                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
 970                        continue;
 971                if (seg == 0)
 972                        return -EFAULT;
 973                *nr_segs = seg;
 974                cnt -= iv->iov_len;   /* This segment is no good */
 975                break;
 976        }
 977        *count = cnt;
 978        return 0;
 979}
 980
 981static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 982                                unsigned long nr_segs, loff_t pos)
 983{
 984        struct lu_env      *env;
 985        struct vvp_io_args *args;
 986        size_t        count;
 987        ssize_t      result;
 988        int              refcheck;
 989        ENTRY;
 990
 991        result = ll_file_get_iov_count(iov, &nr_segs, &count);
 992        if (result)
 993                RETURN(result);
 994
 995        env = cl_env_get(&refcheck);
 996        if (IS_ERR(env))
 997                RETURN(PTR_ERR(env));
 998
 999        args = vvp_env_args(env, IO_NORMAL);
1000        args->u.normal.via_iov = (struct iovec *)iov;
1001        args->u.normal.via_nrsegs = nr_segs;
1002        args->u.normal.via_iocb = iocb;
1003
1004        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1005                                    &iocb->ki_pos, count);
1006        cl_env_put(env, &refcheck);
1007        RETURN(result);
1008}
1009
1010static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1011                            loff_t *ppos)
1012{
1013        struct lu_env *env;
1014        struct iovec  *local_iov;
1015        struct kiocb  *kiocb;
1016        ssize_t result;
1017        int         refcheck;
1018        ENTRY;
1019
1020        env = cl_env_get(&refcheck);
1021        if (IS_ERR(env))
1022                RETURN(PTR_ERR(env));
1023
1024        local_iov = &vvp_env_info(env)->vti_local_iov;
1025        kiocb = &vvp_env_info(env)->vti_kiocb;
1026        local_iov->iov_base = (void __user *)buf;
1027        local_iov->iov_len = count;
1028        init_sync_kiocb(kiocb, file);
1029        kiocb->ki_pos = *ppos;
1030        kiocb->ki_left = count;
1031
1032        result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1033        *ppos = kiocb->ki_pos;
1034
1035        cl_env_put(env, &refcheck);
1036        RETURN(result);
1037}
1038
1039/*
1040 * Write to a file (through the page cache).
1041 */
1042static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1043                                 unsigned long nr_segs, loff_t pos)
1044{
1045        struct lu_env      *env;
1046        struct vvp_io_args *args;
1047        size_t        count;
1048        ssize_t      result;
1049        int              refcheck;
1050        ENTRY;
1051
1052        result = ll_file_get_iov_count(iov, &nr_segs, &count);
1053        if (result)
1054                RETURN(result);
1055
1056        env = cl_env_get(&refcheck);
1057        if (IS_ERR(env))
1058                RETURN(PTR_ERR(env));
1059
1060        args = vvp_env_args(env, IO_NORMAL);
1061        args->u.normal.via_iov = (struct iovec *)iov;
1062        args->u.normal.via_nrsegs = nr_segs;
1063        args->u.normal.via_iocb = iocb;
1064
1065        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1066                                  &iocb->ki_pos, count);
1067        cl_env_put(env, &refcheck);
1068        RETURN(result);
1069}
1070
1071static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1072                             loff_t *ppos)
1073{
1074        struct lu_env *env;
1075        struct iovec  *local_iov;
1076        struct kiocb  *kiocb;
1077        ssize_t result;
1078        int         refcheck;
1079        ENTRY;
1080
1081        env = cl_env_get(&refcheck);
1082        if (IS_ERR(env))
1083                RETURN(PTR_ERR(env));
1084
1085        local_iov = &vvp_env_info(env)->vti_local_iov;
1086        kiocb = &vvp_env_info(env)->vti_kiocb;
1087        local_iov->iov_base = (void __user *)buf;
1088        local_iov->iov_len = count;
1089        init_sync_kiocb(kiocb, file);
1090        kiocb->ki_pos = *ppos;
1091        kiocb->ki_left = count;
1092
1093        result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1094        *ppos = kiocb->ki_pos;
1095
1096        cl_env_put(env, &refcheck);
1097        RETURN(result);
1098}
1099
1100
1101
1102/*
1103 * Send file content (through pagecache) somewhere with helper
1104 */
1105static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1106                                   struct pipe_inode_info *pipe, size_t count,
1107                                   unsigned int flags)
1108{
1109        struct lu_env      *env;
1110        struct vvp_io_args *args;
1111        ssize_t      result;
1112        int              refcheck;
1113        ENTRY;
1114
1115        env = cl_env_get(&refcheck);
1116        if (IS_ERR(env))
1117                RETURN(PTR_ERR(env));
1118
1119        args = vvp_env_args(env, IO_SPLICE);
1120        args->u.splice.via_pipe = pipe;
1121        args->u.splice.via_flags = flags;
1122
1123        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1124        cl_env_put(env, &refcheck);
1125        RETURN(result);
1126}
1127
1128static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1129                           obd_count ost_idx)
1130{
1131        struct obd_export *exp = ll_i2dtexp(inode);
1132        struct obd_trans_info oti = { 0 };
1133        struct obdo *oa = NULL;
1134        int lsm_size;
1135        int rc = 0;
1136        struct lov_stripe_md *lsm = NULL, *lsm2;
1137        ENTRY;
1138
1139        OBDO_ALLOC(oa);
1140        if (oa == NULL)
1141                RETURN(-ENOMEM);
1142
1143        lsm = ccc_inode_lsm_get(inode);
1144        if (lsm == NULL)
1145                GOTO(out, rc = -ENOENT);
1146
1147        lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1148                   (lsm->lsm_stripe_count));
1149
1150        OBD_ALLOC_LARGE(lsm2, lsm_size);
1151        if (lsm2 == NULL)
1152                GOTO(out, rc = -ENOMEM);
1153
1154        oa->o_oi = *oi;
1155        oa->o_nlink = ost_idx;
1156        oa->o_flags |= OBD_FL_RECREATE_OBJS;
1157        oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1158        obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1159                                   OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1160        obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1161        memcpy(lsm2, lsm, lsm_size);
1162        ll_inode_size_lock(inode);
1163        rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1164        ll_inode_size_unlock(inode);
1165
1166        OBD_FREE_LARGE(lsm2, lsm_size);
1167        GOTO(out, rc);
1168out:
1169        ccc_inode_lsm_put(inode, lsm);
1170        OBDO_FREE(oa);
1171        return rc;
1172}
1173
1174static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1175{
1176        struct ll_recreate_obj ucreat;
1177        struct ost_id           oi;
1178        ENTRY;
1179
1180        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1181                RETURN(-EPERM);
1182
1183        if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1184                           sizeof(ucreat)))
1185                RETURN(-EFAULT);
1186
1187        ostid_set_seq_mdt0(&oi);
1188        ostid_set_id(&oi, ucreat.lrc_id);
1189        RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
1190}
1191
1192static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1193{
1194        struct lu_fid   fid;
1195        struct ost_id   oi;
1196        obd_count       ost_idx;
1197        ENTRY;
1198
1199        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1200                RETURN(-EPERM);
1201
1202        if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1203                RETURN(-EFAULT);
1204
1205        fid_to_ostid(&fid, &oi);
1206        ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1207        RETURN(ll_lov_recreate(inode, &oi, ost_idx));
1208}
1209
1210int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1211                             int flags, struct lov_user_md *lum, int lum_size)
1212{
1213        struct lov_stripe_md *lsm = NULL;
1214        struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1215        int rc = 0;
1216        ENTRY;
1217
1218        lsm = ccc_inode_lsm_get(inode);
1219        if (lsm != NULL) {
1220                ccc_inode_lsm_put(inode, lsm);
1221                CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1222                       inode->i_ino);
1223                RETURN(-EEXIST);
1224        }
1225
1226        ll_inode_size_lock(inode);
1227        rc = ll_intent_file_open(file, lum, lum_size, &oit);
1228        if (rc)
1229                GOTO(out, rc);
1230        rc = oit.d.lustre.it_status;
1231        if (rc < 0)
1232                GOTO(out_req_free, rc);
1233
1234        ll_release_openhandle(file->f_dentry, &oit);
1235
1236 out:
1237        ll_inode_size_unlock(inode);
1238        ll_intent_release(&oit);
1239        ccc_inode_lsm_put(inode, lsm);
1240        RETURN(rc);
1241out_req_free:
1242        ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1243        goto out;
1244}
1245
1246int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1247                             struct lov_mds_md **lmmp, int *lmm_size,
1248                             struct ptlrpc_request **request)
1249{
1250        struct ll_sb_info *sbi = ll_i2sbi(inode);
1251        struct mdt_body  *body;
1252        struct lov_mds_md *lmm = NULL;
1253        struct ptlrpc_request *req = NULL;
1254        struct md_op_data *op_data;
1255        int rc, lmmsize;
1256
1257        rc = ll_get_max_mdsize(sbi, &lmmsize);
1258        if (rc)
1259                RETURN(rc);
1260
1261        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1262                                     strlen(filename), lmmsize,
1263                                     LUSTRE_OPC_ANY, NULL);
1264        if (IS_ERR(op_data))
1265                RETURN(PTR_ERR(op_data));
1266
1267        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1268        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1269        ll_finish_md_op_data(op_data);
1270        if (rc < 0) {
1271                CDEBUG(D_INFO, "md_getattr_name failed "
1272                       "on %s: rc %d\n", filename, rc);
1273                GOTO(out, rc);
1274        }
1275
1276        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1277        LASSERT(body != NULL); /* checked by mdc_getattr_name */
1278
1279        lmmsize = body->eadatasize;
1280
1281        if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1282                        lmmsize == 0) {
1283                GOTO(out, rc = -ENODATA);
1284        }
1285
1286        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1287        LASSERT(lmm != NULL);
1288
1289        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1290            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1291                GOTO(out, rc = -EPROTO);
1292        }
1293
1294        /*
1295         * This is coming from the MDS, so is probably in
1296         * little endian.  We convert it to host endian before
1297         * passing it to userspace.
1298         */
1299        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1300                /* if function called for directory - we should
1301                 * avoid swab not existent lsm objects */
1302                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1303                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1304                        if (S_ISREG(body->mode))
1305                                lustre_swab_lov_user_md_objects(
1306                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1307                                 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1308                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1309                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1310                        if (S_ISREG(body->mode))
1311                                lustre_swab_lov_user_md_objects(
1312                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1313                                 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1314                }
1315        }
1316
1317out:
1318        *lmmp = lmm;
1319        *lmm_size = lmmsize;
1320        *request = req;
1321        return rc;
1322}
1323
1324static int ll_lov_setea(struct inode *inode, struct file *file,
1325                            unsigned long arg)
1326{
1327        int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1328        struct lov_user_md      *lump;
1329        int                      lum_size = sizeof(struct lov_user_md) +
1330                                            sizeof(struct lov_user_ost_data);
1331        int                      rc;
1332        ENTRY;
1333
1334        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1335                RETURN(-EPERM);
1336
1337        OBD_ALLOC_LARGE(lump, lum_size);
1338        if (lump == NULL)
1339                RETURN(-ENOMEM);
1340
1341        if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1342                OBD_FREE_LARGE(lump, lum_size);
1343                RETURN(-EFAULT);
1344        }
1345
1346        rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1347
1348        OBD_FREE_LARGE(lump, lum_size);
1349        RETURN(rc);
1350}
1351
1352static int ll_lov_setstripe(struct inode *inode, struct file *file,
1353                            unsigned long arg)
1354{
1355        struct lov_user_md_v3    lumv3;
1356        struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1357        struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1358        struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1359        int                      lum_size, rc;
1360        int                      flags = FMODE_WRITE;
1361        ENTRY;
1362
1363        /* first try with v1 which is smaller than v3 */
1364        lum_size = sizeof(struct lov_user_md_v1);
1365        if (copy_from_user(lumv1, lumv1p, lum_size))
1366                RETURN(-EFAULT);
1367
1368        if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1369                lum_size = sizeof(struct lov_user_md_v3);
1370                if (copy_from_user(&lumv3, lumv3p, lum_size))
1371                        RETURN(-EFAULT);
1372        }
1373
1374        rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1375        if (rc == 0) {
1376                struct lov_stripe_md *lsm;
1377                __u32 gen;
1378
1379                put_user(0, &lumv1p->lmm_stripe_count);
1380
1381                ll_layout_refresh(inode, &gen);
1382                lsm = ccc_inode_lsm_get(inode);
1383                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1384                                   0, lsm, (void *)arg);
1385                ccc_inode_lsm_put(inode, lsm);
1386        }
1387        RETURN(rc);
1388}
1389
1390static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1391{
1392        struct lov_stripe_md *lsm;
1393        int rc = -ENODATA;
1394        ENTRY;
1395
1396        lsm = ccc_inode_lsm_get(inode);
1397        if (lsm != NULL)
1398                rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1399                                   lsm, (void *)arg);
1400        ccc_inode_lsm_put(inode, lsm);
1401        RETURN(rc);
1402}
1403
1404int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1405{
1406        struct ll_inode_info   *lli = ll_i2info(inode);
1407        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1408        struct ccc_grouplock    grouplock;
1409        int                  rc;
1410        ENTRY;
1411
1412        if (ll_file_nolock(file))
1413                RETURN(-EOPNOTSUPP);
1414
1415        spin_lock(&lli->lli_lock);
1416        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1417                CWARN("group lock already existed with gid %lu\n",
1418                      fd->fd_grouplock.cg_gid);
1419                spin_unlock(&lli->lli_lock);
1420                RETURN(-EINVAL);
1421        }
1422        LASSERT(fd->fd_grouplock.cg_lock == NULL);
1423        spin_unlock(&lli->lli_lock);
1424
1425        rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1426                              arg, (file->f_flags & O_NONBLOCK), &grouplock);
1427        if (rc)
1428                RETURN(rc);
1429
1430        spin_lock(&lli->lli_lock);
1431        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1432                spin_unlock(&lli->lli_lock);
1433                CERROR("another thread just won the race\n");
1434                cl_put_grouplock(&grouplock);
1435                RETURN(-EINVAL);
1436        }
1437
1438        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1439        fd->fd_grouplock = grouplock;
1440        spin_unlock(&lli->lli_lock);
1441
1442        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1443        RETURN(0);
1444}
1445
1446int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1447{
1448        struct ll_inode_info   *lli = ll_i2info(inode);
1449        struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1450        struct ccc_grouplock    grouplock;
1451        ENTRY;
1452
1453        spin_lock(&lli->lli_lock);
1454        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1455                spin_unlock(&lli->lli_lock);
1456                CWARN("no group lock held\n");
1457                RETURN(-EINVAL);
1458        }
1459        LASSERT(fd->fd_grouplock.cg_lock != NULL);
1460
1461        if (fd->fd_grouplock.cg_gid != arg) {
1462                CWARN("group lock %lu doesn't match current id %lu\n",
1463                       arg, fd->fd_grouplock.cg_gid);
1464                spin_unlock(&lli->lli_lock);
1465                RETURN(-EINVAL);
1466        }
1467
1468        grouplock = fd->fd_grouplock;
1469        memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1470        fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1471        spin_unlock(&lli->lli_lock);
1472
1473        cl_put_grouplock(&grouplock);
1474        CDEBUG(D_INFO, "group lock %lu released\n", arg);
1475        RETURN(0);
1476}
1477
1478/**
1479 * Close inode open handle
1480 *
1481 * \param dentry [in]     dentry which contains the inode
1482 * \param it     [in,out] intent which contains open info and result
1483 *
1484 * \retval 0     success
1485 * \retval <0    failure
1486 */
1487int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1488{
1489        struct inode *inode = dentry->d_inode;
1490        struct obd_client_handle *och;
1491        int rc;
1492        ENTRY;
1493
1494        LASSERT(inode);
1495
1496        /* Root ? Do nothing. */
1497        if (dentry->d_inode->i_sb->s_root == dentry)
1498                RETURN(0);
1499
1500        /* No open handle to close? Move away */
1501        if (!it_disposition(it, DISP_OPEN_OPEN))
1502                RETURN(0);
1503
1504        LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1505
1506        OBD_ALLOC(och, sizeof(*och));
1507        if (!och)
1508                GOTO(out, rc = -ENOMEM);
1509
1510        ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1511                    ll_i2info(inode), it, och);
1512
1513        rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1514                                       inode, och);
1515 out:
1516        /* this one is in place of ll_file_open */
1517        if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1518                ptlrpc_req_finished(it->d.lustre.it_data);
1519                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1520        }
1521        RETURN(rc);
1522}
1523
1524/**
1525 * Get size for inode for which FIEMAP mapping is requested.
1526 * Make the FIEMAP get_info call and returns the result.
1527 */
1528int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1529              int num_bytes)
1530{
1531        struct obd_export *exp = ll_i2dtexp(inode);
1532        struct lov_stripe_md *lsm = NULL;
1533        struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1534        int vallen = num_bytes;
1535        int rc;
1536        ENTRY;
1537
1538        /* Checks for fiemap flags */
1539        if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1540                fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1541                return -EBADR;
1542        }
1543
1544        /* Check for FIEMAP_FLAG_SYNC */
1545        if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1546                rc = filemap_fdatawrite(inode->i_mapping);
1547                if (rc)
1548                        return rc;
1549        }
1550
1551        lsm = ccc_inode_lsm_get(inode);
1552        if (lsm == NULL)
1553                return -ENOENT;
1554
1555        /* If the stripe_count > 1 and the application does not understand
1556         * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1557         */
1558        if (lsm->lsm_stripe_count > 1 &&
1559            !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1560                GOTO(out, rc = -EOPNOTSUPP);
1561
1562        fm_key.oa.o_oi = lsm->lsm_oi;
1563        fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1564
1565        obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1566        obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1567        /* If filesize is 0, then there would be no objects for mapping */
1568        if (fm_key.oa.o_size == 0) {
1569                fiemap->fm_mapped_extents = 0;
1570                GOTO(out, rc = 0);
1571        }
1572
1573        memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1574
1575        rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1576                          fiemap, lsm);
1577        if (rc)
1578                CERROR("obd_get_info failed: rc = %d\n", rc);
1579
1580out:
1581        ccc_inode_lsm_put(inode, lsm);
1582        RETURN(rc);
1583}
1584
1585int ll_fid2path(struct inode *inode, void *arg)
1586{
1587        struct obd_export       *exp = ll_i2mdexp(inode);
1588        struct getinfo_fid2path *gfout, *gfin;
1589        int                      outsize, rc;
1590        ENTRY;
1591
1592        if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1593            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1594                RETURN(-EPERM);
1595
1596        /* Need to get the buflen */
1597        OBD_ALLOC_PTR(gfin);
1598        if (gfin == NULL)
1599                RETURN(-ENOMEM);
1600        if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1601                OBD_FREE_PTR(gfin);
1602                RETURN(-EFAULT);
1603        }
1604
1605        outsize = sizeof(*gfout) + gfin->gf_pathlen;
1606        OBD_ALLOC(gfout, outsize);
1607        if (gfout == NULL) {
1608                OBD_FREE_PTR(gfin);
1609                RETURN(-ENOMEM);
1610        }
1611        memcpy(gfout, gfin, sizeof(*gfout));
1612        OBD_FREE_PTR(gfin);
1613
1614        /* Call mdc_iocontrol */
1615        rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1616        if (rc)
1617                GOTO(gf_free, rc);
1618
1619        if (copy_to_user(arg, gfout, outsize))
1620                rc = -EFAULT;
1621
1622gf_free:
1623        OBD_FREE(gfout, outsize);
1624        RETURN(rc);
1625}
1626
1627static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1628{
1629        struct ll_user_fiemap *fiemap_s;
1630        size_t num_bytes, ret_bytes;
1631        unsigned int extent_count;
1632        int rc = 0;
1633
1634        /* Get the extent count so we can calculate the size of
1635         * required fiemap buffer */
1636        if (get_user(extent_count,
1637            &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1638                RETURN(-EFAULT);
1639        num_bytes = sizeof(*fiemap_s) + (extent_count *
1640                                         sizeof(struct ll_fiemap_extent));
1641
1642        OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1643        if (fiemap_s == NULL)
1644                RETURN(-ENOMEM);
1645
1646        /* get the fiemap value */
1647        if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1648                           sizeof(*fiemap_s)))
1649                GOTO(error, rc = -EFAULT);
1650
1651        /* If fm_extent_count is non-zero, read the first extent since
1652         * it is used to calculate end_offset and device from previous
1653         * fiemap call. */
1654        if (extent_count) {
1655                if (copy_from_user(&fiemap_s->fm_extents[0],
1656                    (char __user *)arg + sizeof(*fiemap_s),
1657                    sizeof(struct ll_fiemap_extent)))
1658                        GOTO(error, rc = -EFAULT);
1659        }
1660
1661        rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1662        if (rc)
1663                GOTO(error, rc);
1664
1665        ret_bytes = sizeof(struct ll_user_fiemap);
1666
1667        if (extent_count != 0)
1668                ret_bytes += (fiemap_s->fm_mapped_extents *
1669                                 sizeof(struct ll_fiemap_extent));
1670
1671        if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1672                rc = -EFAULT;
1673
1674error:
1675        OBD_FREE_LARGE(fiemap_s, num_bytes);
1676        RETURN(rc);
1677}
1678
1679/*
1680 * Read the data_version for inode.
1681 *
1682 * This value is computed using stripe object version on OST.
1683 * Version is computed using server side locking.
1684 *
1685 * @param extent_lock  Take extent lock. Not needed if a process is already
1686 *                     holding the OST object group locks.
1687 */
1688int ll_data_version(struct inode *inode, __u64 *data_version,
1689                    int extent_lock)
1690{
1691        struct lov_stripe_md    *lsm = NULL;
1692        struct ll_sb_info       *sbi = ll_i2sbi(inode);
1693        struct obdo             *obdo = NULL;
1694        int                      rc;
1695        ENTRY;
1696
1697        /* If no stripe, we consider version is 0. */
1698        lsm = ccc_inode_lsm_get(inode);
1699        if (lsm == NULL) {
1700                *data_version = 0;
1701                CDEBUG(D_INODE, "No object for inode\n");
1702                RETURN(0);
1703        }
1704
1705        OBD_ALLOC_PTR(obdo);
1706        if (obdo == NULL) {
1707                ccc_inode_lsm_put(inode, lsm);
1708                RETURN(-ENOMEM);
1709        }
1710
1711        rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1712        if (!rc) {
1713                if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1714                        rc = -EOPNOTSUPP;
1715                else
1716                        *data_version = obdo->o_data_version;
1717        }
1718
1719        OBD_FREE_PTR(obdo);
1720        ccc_inode_lsm_put(inode, lsm);
1721
1722        RETURN(rc);
1723}
1724
1725struct ll_swap_stack {
1726        struct iattr             ia1, ia2;
1727        __u64                    dv1, dv2;
1728        struct inode            *inode1, *inode2;
1729        bool                     check_dv1, check_dv2;
1730};
1731
1732static int ll_swap_layouts(struct file *file1, struct file *file2,
1733                           struct lustre_swap_layouts *lsl)
1734{
1735        struct mdc_swap_layouts  msl;
1736        struct md_op_data       *op_data;
1737        __u32                    gid;
1738        __u64                    dv;
1739        struct ll_swap_stack    *llss = NULL;
1740        int                      rc;
1741
1742        OBD_ALLOC_PTR(llss);
1743        if (llss == NULL)
1744                RETURN(-ENOMEM);
1745
1746        llss->inode1 = file1->f_dentry->d_inode;
1747        llss->inode2 = file2->f_dentry->d_inode;
1748
1749        if (!S_ISREG(llss->inode2->i_mode))
1750                GOTO(free, rc = -EINVAL);
1751
1752        if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
1753            ll_permission(llss->inode2, MAY_WRITE, NULL))
1754                GOTO(free, rc = -EPERM);
1755
1756        if (llss->inode2->i_sb != llss->inode1->i_sb)
1757                GOTO(free, rc = -EXDEV);
1758
1759        /* we use 2 bool because it is easier to swap than 2 bits */
1760        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1761                llss->check_dv1 = true;
1762
1763        if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1764                llss->check_dv2 = true;
1765
1766        /* we cannot use lsl->sl_dvX directly because we may swap them */
1767        llss->dv1 = lsl->sl_dv1;
1768        llss->dv2 = lsl->sl_dv2;
1769
1770        rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1771        if (rc == 0) /* same file, done! */
1772                GOTO(free, rc = 0);
1773
1774        if (rc < 0) { /* sequentialize it */
1775                swap(llss->inode1, llss->inode2);
1776                swap(file1, file2);
1777                swap(llss->dv1, llss->dv2);
1778                swap(llss->check_dv1, llss->check_dv2);
1779        }
1780
1781        gid = lsl->sl_gid;
1782        if (gid != 0) { /* application asks to flush dirty cache */
1783                rc = ll_get_grouplock(llss->inode1, file1, gid);
1784                if (rc < 0)
1785                        GOTO(free, rc);
1786
1787                rc = ll_get_grouplock(llss->inode2, file2, gid);
1788                if (rc < 0) {
1789                        ll_put_grouplock(llss->inode1, file1, gid);
1790                        GOTO(free, rc);
1791                }
1792        }
1793
1794        /* to be able to restore mtime and atime after swap
1795         * we need to first save them */
1796        if (lsl->sl_flags &
1797            (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
1798                llss->ia1.ia_mtime = llss->inode1->i_mtime;
1799                llss->ia1.ia_atime = llss->inode1->i_atime;
1800                llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
1801                llss->ia2.ia_mtime = llss->inode2->i_mtime;
1802                llss->ia2.ia_atime = llss->inode2->i_atime;
1803                llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
1804        }
1805
1806        /* ultimate check, before swaping the layouts we check if
1807         * dataversion has changed (if requested) */
1808        if (llss->check_dv1) {
1809                rc = ll_data_version(llss->inode1, &dv, 0);
1810                if (rc)
1811                        GOTO(putgl, rc);
1812                if (dv != llss->dv1)
1813                        GOTO(putgl, rc = -EAGAIN);
1814        }
1815
1816        if (llss->check_dv2) {
1817                rc = ll_data_version(llss->inode2, &dv, 0);
1818                if (rc)
1819                        GOTO(putgl, rc);
1820                if (dv != llss->dv2)
1821                        GOTO(putgl, rc = -EAGAIN);
1822        }
1823
1824        /* struct md_op_data is used to send the swap args to the mdt
1825         * only flags is missing, so we use struct mdc_swap_layouts
1826         * through the md_op_data->op_data */
1827        /* flags from user space have to be converted before they are send to
1828         * server, no flag is sent today, they are only used on the client */
1829        msl.msl_flags = 0;
1830        rc = -ENOMEM;
1831        op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
1832                                     0, LUSTRE_OPC_ANY, &msl);
1833        if (op_data != NULL) {
1834                rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
1835                                   ll_i2mdexp(llss->inode1),
1836                                   sizeof(*op_data), op_data, NULL);
1837                ll_finish_md_op_data(op_data);
1838        }
1839
1840putgl:
1841        if (gid != 0) {
1842                ll_put_grouplock(llss->inode2, file2, gid);
1843                ll_put_grouplock(llss->inode1, file1, gid);
1844        }
1845
1846        /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
1847        if (rc != 0)
1848                GOTO(free, rc);
1849
1850        /* clear useless flags */
1851        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
1852                llss->ia1.ia_valid &= ~ATTR_MTIME;
1853                llss->ia2.ia_valid &= ~ATTR_MTIME;
1854        }
1855
1856        if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
1857                llss->ia1.ia_valid &= ~ATTR_ATIME;
1858                llss->ia2.ia_valid &= ~ATTR_ATIME;
1859        }
1860
1861        /* update time if requested */
1862        rc = 0;
1863        if (llss->ia2.ia_valid != 0) {
1864                mutex_lock(&llss->inode1->i_mutex);
1865                rc = ll_setattr(file1->f_dentry, &llss->ia2);
1866                mutex_unlock(&llss->inode1->i_mutex);
1867        }
1868
1869        if (llss->ia1.ia_valid != 0) {
1870                int rc1;
1871
1872                mutex_lock(&llss->inode2->i_mutex);
1873                rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
1874                mutex_unlock(&llss->inode2->i_mutex);
1875                if (rc == 0)
1876                        rc = rc1;
1877        }
1878
1879free:
1880        if (llss != NULL)
1881                OBD_FREE_PTR(llss);
1882
1883        RETURN(rc);
1884}
1885
1886long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1887{
1888        struct inode            *inode = file->f_dentry->d_inode;
1889        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
1890        int                      flags, rc;
1891        ENTRY;
1892
1893        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1894               inode->i_generation, inode, cmd);
1895        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1896
1897        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1898        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1899                RETURN(-ENOTTY);
1900
1901        switch(cmd) {
1902        case LL_IOC_GETFLAGS:
1903                /* Get the current value of the file flags */
1904                return put_user(fd->fd_flags, (int *)arg);
1905        case LL_IOC_SETFLAGS:
1906        case LL_IOC_CLRFLAGS:
1907                /* Set or clear specific file flags */
1908                /* XXX This probably needs checks to ensure the flags are
1909                 *     not abused, and to handle any flag side effects.
1910                 */
1911                if (get_user(flags, (int *) arg))
1912                        RETURN(-EFAULT);
1913
1914                if (cmd == LL_IOC_SETFLAGS) {
1915                        if ((flags & LL_FILE_IGNORE_LOCK) &&
1916                            !(file->f_flags & O_DIRECT)) {
1917                                CERROR("%s: unable to disable locking on "
1918                                       "non-O_DIRECT file\n", current->comm);
1919                                RETURN(-EINVAL);
1920                        }
1921
1922                        fd->fd_flags |= flags;
1923                } else {
1924                        fd->fd_flags &= ~flags;
1925                }
1926                RETURN(0);
1927        case LL_IOC_LOV_SETSTRIPE:
1928                RETURN(ll_lov_setstripe(inode, file, arg));
1929        case LL_IOC_LOV_SETEA:
1930                RETURN(ll_lov_setea(inode, file, arg));
1931        case LL_IOC_LOV_SWAP_LAYOUTS: {
1932                struct file *file2;
1933                struct lustre_swap_layouts lsl;
1934
1935                if (copy_from_user(&lsl, (char *)arg,
1936                                       sizeof(struct lustre_swap_layouts)))
1937                        RETURN(-EFAULT);
1938
1939                if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
1940                        RETURN(-EPERM);
1941
1942                file2 = fget(lsl.sl_fd);
1943                if (file2 == NULL)
1944                        RETURN(-EBADF);
1945
1946                rc = -EPERM;
1947                if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
1948                        rc = ll_swap_layouts(file, file2, &lsl);
1949                fput(file2);
1950                RETURN(rc);
1951        }
1952        case LL_IOC_LOV_GETSTRIPE:
1953                RETURN(ll_lov_getstripe(inode, arg));
1954        case LL_IOC_RECREATE_OBJ:
1955                RETURN(ll_lov_recreate_obj(inode, arg));
1956        case LL_IOC_RECREATE_FID:
1957                RETURN(ll_lov_recreate_fid(inode, arg));
1958        case FSFILT_IOC_FIEMAP:
1959                RETURN(ll_ioctl_fiemap(inode, arg));
1960        case FSFILT_IOC_GETFLAGS:
1961        case FSFILT_IOC_SETFLAGS:
1962                RETURN(ll_iocontrol(inode, file, cmd, arg));
1963        case FSFILT_IOC_GETVERSION_OLD:
1964        case FSFILT_IOC_GETVERSION:
1965                RETURN(put_user(inode->i_generation, (int *)arg));
1966        case LL_IOC_GROUP_LOCK:
1967                RETURN(ll_get_grouplock(inode, file, arg));
1968        case LL_IOC_GROUP_UNLOCK:
1969                RETURN(ll_put_grouplock(inode, file, arg));
1970        case IOC_OBD_STATFS:
1971                RETURN(ll_obd_statfs(inode, (void *)arg));
1972
1973        /* We need to special case any other ioctls we want to handle,
1974         * to send them to the MDS/OST as appropriate and to properly
1975         * network encode the arg field.
1976        case FSFILT_IOC_SETVERSION_OLD:
1977        case FSFILT_IOC_SETVERSION:
1978        */
1979        case LL_IOC_FLUSHCTX:
1980                RETURN(ll_flush_ctx(inode));
1981        case LL_IOC_PATH2FID: {
1982                if (copy_to_user((void *)arg, ll_inode2fid(inode),
1983                                 sizeof(struct lu_fid)))
1984                        RETURN(-EFAULT);
1985
1986                RETURN(0);
1987        }
1988        case OBD_IOC_FID2PATH:
1989                RETURN(ll_fid2path(inode, (void *)arg));
1990        case LL_IOC_DATA_VERSION: {
1991                struct ioc_data_version idv;
1992                int                     rc;
1993
1994                if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
1995                        RETURN(-EFAULT);
1996
1997                rc = ll_data_version(inode, &idv.idv_version,
1998                                !(idv.idv_flags & LL_DV_NOFLUSH));
1999
2000                if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2001                        RETURN(-EFAULT);
2002
2003                RETURN(rc);
2004        }
2005
2006        case LL_IOC_GET_MDTIDX: {
2007                int mdtidx;
2008
2009                mdtidx = ll_get_mdt_idx(inode);
2010                if (mdtidx < 0)
2011                        RETURN(mdtidx);
2012
2013                if (put_user((int)mdtidx, (int*)arg))
2014                        RETURN(-EFAULT);
2015
2016                RETURN(0);
2017        }
2018        case OBD_IOC_GETDTNAME:
2019        case OBD_IOC_GETMDNAME:
2020                RETURN(ll_get_obd_name(inode, cmd, arg));
2021        case LL_IOC_HSM_STATE_GET: {
2022                struct md_op_data       *op_data;
2023                struct hsm_user_state   *hus;
2024                int                      rc;
2025
2026                OBD_ALLOC_PTR(hus);
2027                if (hus == NULL)
2028                        RETURN(-ENOMEM);
2029
2030                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2031                                             LUSTRE_OPC_ANY, hus);
2032                if (op_data == NULL) {
2033                        OBD_FREE_PTR(hus);
2034                        RETURN(-ENOMEM);
2035                }
2036
2037                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2038                                   op_data, NULL);
2039
2040                if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2041                        rc = -EFAULT;
2042
2043                ll_finish_md_op_data(op_data);
2044                OBD_FREE_PTR(hus);
2045                RETURN(rc);
2046        }
2047        case LL_IOC_HSM_STATE_SET: {
2048                struct md_op_data       *op_data;
2049                struct hsm_state_set    *hss;
2050                int                      rc;
2051
2052                OBD_ALLOC_PTR(hss);
2053                if (hss == NULL)
2054                        RETURN(-ENOMEM);
2055                if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2056                        OBD_FREE_PTR(hss);
2057                        RETURN(-EFAULT);
2058                }
2059
2060                /* Non-root users are forbidden to set or clear flags which are
2061                 * NOT defined in HSM_USER_MASK. */
2062                if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2063                    && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2064                        OBD_FREE_PTR(hss);
2065                        RETURN(-EPERM);
2066                }
2067
2068                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2069                                             LUSTRE_OPC_ANY, hss);
2070                if (op_data == NULL) {
2071                        OBD_FREE_PTR(hss);
2072                        RETURN(-ENOMEM);
2073                }
2074
2075                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2076                                   op_data, NULL);
2077
2078                ll_finish_md_op_data(op_data);
2079
2080                OBD_FREE_PTR(hss);
2081                RETURN(rc);
2082        }
2083        case LL_IOC_HSM_ACTION: {
2084                struct md_op_data               *op_data;
2085                struct hsm_current_action       *hca;
2086                int                              rc;
2087
2088                OBD_ALLOC_PTR(hca);
2089                if (hca == NULL)
2090                        RETURN(-ENOMEM);
2091
2092                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2093                                             LUSTRE_OPC_ANY, hca);
2094                if (op_data == NULL) {
2095                        OBD_FREE_PTR(hca);
2096                        RETURN(-ENOMEM);
2097                }
2098
2099                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2100                                   op_data, NULL);
2101
2102                if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2103                        rc = -EFAULT;
2104
2105                ll_finish_md_op_data(op_data);
2106                OBD_FREE_PTR(hca);
2107                RETURN(rc);
2108        }
2109        default: {
2110                int err;
2111
2112                if (LLIOC_STOP ==
2113                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2114                        RETURN(err);
2115
2116                RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2117                                     (void *)arg));
2118        }
2119        }
2120}
2121
2122
2123loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2124{
2125        struct inode *inode = file->f_dentry->d_inode;
2126        loff_t retval, eof = 0;
2127
2128        ENTRY;
2129        retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2130                           (origin == SEEK_CUR) ? file->f_pos : 0);
2131        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2132               inode->i_ino, inode->i_generation, inode, retval, retval,
2133               origin);
2134        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2135
2136        if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2137                retval = ll_glimpse_size(inode);
2138                if (retval != 0)
2139                        RETURN(retval);
2140                eof = i_size_read(inode);
2141        }
2142
2143        retval = ll_generic_file_llseek_size(file, offset, origin,
2144                                          ll_file_maxbytes(inode), eof);
2145        RETURN(retval);
2146}
2147
2148int ll_flush(struct file *file, fl_owner_t id)
2149{
2150        struct inode *inode = file->f_dentry->d_inode;
2151        struct ll_inode_info *lli = ll_i2info(inode);
2152        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2153        int rc, err;
2154
2155        LASSERT(!S_ISDIR(inode->i_mode));
2156
2157        /* catch async errors that were recorded back when async writeback
2158         * failed for pages in this mapping. */
2159        rc = lli->lli_async_rc;
2160        lli->lli_async_rc = 0;
2161        err = lov_read_and_clear_async_rc(lli->lli_clob);
2162        if (rc == 0)
2163                rc = err;
2164
2165        /* The application has been told write failure already.
2166         * Do not report failure again. */
2167        if (fd->fd_write_failed)
2168                return 0;
2169        return rc ? -EIO : 0;
2170}
2171
2172/**
2173 * Called to make sure a portion of file has been written out.
2174 * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2175 *
2176 * Return how many pages have been written.
2177 */
2178int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2179                       enum cl_fsync_mode mode, int ignore_layout)
2180{
2181        struct cl_env_nest nest;
2182        struct lu_env *env;
2183        struct cl_io *io;
2184        struct obd_capa *capa = NULL;
2185        struct cl_fsync_io *fio;
2186        int result;
2187        ENTRY;
2188
2189        if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2190            mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2191                RETURN(-EINVAL);
2192
2193        env = cl_env_nested_get(&nest);
2194        if (IS_ERR(env))
2195                RETURN(PTR_ERR(env));
2196
2197        capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2198
2199        io = ccc_env_thread_io(env);
2200        io->ci_obj = cl_i2info(inode)->lli_clob;
2201        io->ci_ignore_layout = ignore_layout;
2202
2203        /* initialize parameters for sync */
2204        fio = &io->u.ci_fsync;
2205        fio->fi_capa = capa;
2206        fio->fi_start = start;
2207        fio->fi_end = end;
2208        fio->fi_fid = ll_inode2fid(inode);
2209        fio->fi_mode = mode;
2210        fio->fi_nr_written = 0;
2211
2212        if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2213                result = cl_io_loop(env, io);
2214        else
2215                result = io->ci_result;
2216        if (result == 0)
2217                result = fio->fi_nr_written;
2218        cl_io_fini(env, io);
2219        cl_env_nested_put(&nest, env);
2220
2221        capa_put(capa);
2222
2223        RETURN(result);
2224}
2225
2226/*
2227 * When dentry is provided (the 'else' case), *file->f_dentry may be
2228 * null and dentry must be used directly rather than pulled from
2229 * *file->f_dentry as is done otherwise.
2230 */
2231
2232int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2233{
2234        struct dentry *dentry = file->f_dentry;
2235        struct inode *inode = dentry->d_inode;
2236        struct ll_inode_info *lli = ll_i2info(inode);
2237        struct ptlrpc_request *req;
2238        struct obd_capa *oc;
2239        int rc, err;
2240        ENTRY;
2241
2242        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2243               inode->i_generation, inode);
2244        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2245
2246        rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2247        mutex_lock(&inode->i_mutex);
2248
2249        /* catch async errors that were recorded back when async writeback
2250         * failed for pages in this mapping. */
2251        if (!S_ISDIR(inode->i_mode)) {
2252                err = lli->lli_async_rc;
2253                lli->lli_async_rc = 0;
2254                if (rc == 0)
2255                        rc = err;
2256                err = lov_read_and_clear_async_rc(lli->lli_clob);
2257                if (rc == 0)
2258                        rc = err;
2259        }
2260
2261        oc = ll_mdscapa_get(inode);
2262        err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2263                      &req);
2264        capa_put(oc);
2265        if (!rc)
2266                rc = err;
2267        if (!err)
2268                ptlrpc_req_finished(req);
2269
2270        if (datasync && S_ISREG(inode->i_mode)) {
2271                struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2272
2273                err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2274                                CL_FSYNC_ALL, 0);
2275                if (rc == 0 && err < 0)
2276                        rc = err;
2277                if (rc < 0)
2278                        fd->fd_write_failed = true;
2279                else
2280                        fd->fd_write_failed = false;
2281        }
2282
2283        mutex_unlock(&inode->i_mutex);
2284        RETURN(rc);
2285}
2286
2287int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2288{
2289        struct inode *inode = file->f_dentry->d_inode;
2290        struct ll_sb_info *sbi = ll_i2sbi(inode);
2291        struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2292                                           .ei_cb_cp =ldlm_flock_completion_ast,
2293                                           .ei_cbdata = file_lock };
2294        struct md_op_data *op_data;
2295        struct lustre_handle lockh = {0};
2296        ldlm_policy_data_t flock = {{0}};
2297        int flags = 0;
2298        int rc;
2299        int rc2 = 0;
2300        ENTRY;
2301
2302        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2303               inode->i_ino, file_lock);
2304
2305        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2306
2307        if (file_lock->fl_flags & FL_FLOCK) {
2308                LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2309                /* flocks are whole-file locks */
2310                flock.l_flock.end = OFFSET_MAX;
2311                /* For flocks owner is determined by the local file desctiptor*/
2312                flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2313        } else if (file_lock->fl_flags & FL_POSIX) {
2314                flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2315                flock.l_flock.start = file_lock->fl_start;
2316                flock.l_flock.end = file_lock->fl_end;
2317        } else {
2318                RETURN(-EINVAL);
2319        }
2320        flock.l_flock.pid = file_lock->fl_pid;
2321
2322        /* Somewhat ugly workaround for svc lockd.
2323         * lockd installs custom fl_lmops->lm_compare_owner that checks
2324         * for the fl_owner to be the same (which it always is on local node
2325         * I guess between lockd processes) and then compares pid.
2326         * As such we assign pid to the owner field to make it all work,
2327         * conflict with normal locks is unlikely since pid space and
2328         * pointer space for current->files are not intersecting */
2329        if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2330                flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2331
2332        switch (file_lock->fl_type) {
2333        case F_RDLCK:
2334                einfo.ei_mode = LCK_PR;
2335                break;
2336        case F_UNLCK:
2337                /* An unlock request may or may not have any relation to
2338                 * existing locks so we may not be able to pass a lock handle
2339                 * via a normal ldlm_lock_cancel() request. The request may even
2340                 * unlock a byte range in the middle of an existing lock. In
2341                 * order to process an unlock request we need all of the same
2342                 * information that is given with a normal read or write record
2343                 * lock request. To avoid creating another ldlm unlock (cancel)
2344                 * message we'll treat a LCK_NL flock request as an unlock. */
2345                einfo.ei_mode = LCK_NL;
2346                break;
2347        case F_WRLCK:
2348                einfo.ei_mode = LCK_PW;
2349                break;
2350        default:
2351                CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2352                        file_lock->fl_type);
2353                RETURN (-ENOTSUPP);
2354        }
2355
2356        switch (cmd) {
2357        case F_SETLKW:
2358#ifdef F_SETLKW64
2359        case F_SETLKW64:
2360#endif
2361                flags = 0;
2362                break;
2363        case F_SETLK:
2364#ifdef F_SETLK64
2365        case F_SETLK64:
2366#endif
2367                flags = LDLM_FL_BLOCK_NOWAIT;
2368                break;
2369        case F_GETLK:
2370#ifdef F_GETLK64
2371        case F_GETLK64:
2372#endif
2373                flags = LDLM_FL_TEST_LOCK;
2374                /* Save the old mode so that if the mode in the lock changes we
2375                 * can decrement the appropriate reader or writer refcount. */
2376                file_lock->fl_type = einfo.ei_mode;
2377                break;
2378        default:
2379                CERROR("unknown fcntl lock command: %d\n", cmd);
2380                RETURN (-EINVAL);
2381        }
2382
2383        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2384                                     LUSTRE_OPC_ANY, NULL);
2385        if (IS_ERR(op_data))
2386                RETURN(PTR_ERR(op_data));
2387
2388        CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2389               "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2390               flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2391
2392        rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2393                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2394
2395        if ((file_lock->fl_flags & FL_FLOCK) &&
2396            (rc == 0 || file_lock->fl_type == F_UNLCK))
2397                rc2  = flock_lock_file_wait(file, file_lock);
2398        if ((file_lock->fl_flags & FL_POSIX) &&
2399            (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2400            !(flags & LDLM_FL_TEST_LOCK))
2401                rc2  = posix_lock_file_wait(file, file_lock);
2402
2403        if (rc2 && file_lock->fl_type != F_UNLCK) {
2404                einfo.ei_mode = LCK_NL;
2405                md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2406                        op_data, &lockh, &flock, 0, NULL /* req */, flags);
2407                rc = rc2;
2408        }
2409
2410        ll_finish_md_op_data(op_data);
2411
2412        RETURN(rc);
2413}
2414
2415int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2416{
2417        ENTRY;
2418
2419        RETURN(-ENOSYS);
2420}
2421
2422/**
2423 * test if some locks matching bits and l_req_mode are acquired
2424 * - bits can be in different locks
2425 * - if found clear the common lock bits in *bits
2426 * - the bits not found, are kept in *bits
2427 * \param inode [IN]
2428 * \param bits [IN] searched lock bits [IN]
2429 * \param l_req_mode [IN] searched lock mode
2430 * \retval boolean, true iff all bits are found
2431 */
2432int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2433{
2434        struct lustre_handle lockh;
2435        ldlm_policy_data_t policy;
2436        ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2437                                (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2438        struct lu_fid *fid;
2439        __u64 flags;
2440        int i;
2441        ENTRY;
2442
2443        if (!inode)
2444               RETURN(0);
2445
2446        fid = &ll_i2info(inode)->lli_fid;
2447        CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2448               ldlm_lockname[mode]);
2449
2450        flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2451        for (i = 0; i < MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2452                policy.l_inodebits.bits = *bits & (1 << i);
2453                if (policy.l_inodebits.bits == 0)
2454                        continue;
2455
2456                if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2457                                  &policy, mode, &lockh)) {
2458                        struct ldlm_lock *lock;
2459
2460                        lock = ldlm_handle2lock(&lockh);
2461                        if (lock) {
2462                                *bits &=
2463                                      ~(lock->l_policy_data.l_inodebits.bits);
2464                                LDLM_LOCK_PUT(lock);
2465                        } else {
2466                                *bits &= ~policy.l_inodebits.bits;
2467                        }
2468                }
2469        }
2470        RETURN(*bits == 0);
2471}
2472
2473ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2474                            struct lustre_handle *lockh, __u64 flags)
2475{
2476        ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2477        struct lu_fid *fid;
2478        ldlm_mode_t rc;
2479        ENTRY;
2480
2481        fid = &ll_i2info(inode)->lli_fid;
2482        CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2483
2484        rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2485                           fid, LDLM_IBITS, &policy,
2486                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2487        RETURN(rc);
2488}
2489
2490static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2491{
2492        /* Already unlinked. Just update nlink and return success */
2493        if (rc == -ENOENT) {
2494                clear_nlink(inode);
2495                /* This path cannot be hit for regular files unless in
2496                 * case of obscure races, so no need to to validate
2497                 * size. */
2498                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2499                        return 0;
2500        } else if (rc != 0) {
2501                CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2502                       ll_get_fsname(inode->i_sb, NULL, 0),
2503                       PFID(ll_inode2fid(inode)), rc);
2504        }
2505
2506        return rc;
2507}
2508
2509int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2510                             __u64 ibits)
2511{
2512        struct inode *inode = dentry->d_inode;
2513        struct ptlrpc_request *req = NULL;
2514        struct obd_export *exp;
2515        int rc = 0;
2516        ENTRY;
2517
2518        LASSERT(inode != NULL);
2519
2520        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2521               inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2522
2523        exp = ll_i2mdexp(inode);
2524
2525        /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2526         *      But under CMD case, it caused some lock issues, should be fixed
2527         *      with new CMD ibits lock. See bug 12718 */
2528        if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2529                struct lookup_intent oit = { .it_op = IT_GETATTR };
2530                struct md_op_data *op_data;
2531
2532                if (ibits == MDS_INODELOCK_LOOKUP)
2533                        oit.it_op = IT_LOOKUP;
2534
2535                /* Call getattr by fid, so do not provide name at all. */
2536                op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2537                                             dentry->d_inode, NULL, 0, 0,
2538                                             LUSTRE_OPC_ANY, NULL);
2539                if (IS_ERR(op_data))
2540                        RETURN(PTR_ERR(op_data));
2541
2542                oit.it_create_mode |= M_CHECK_STALE;
2543                rc = md_intent_lock(exp, op_data, NULL, 0,
2544                                    /* we are not interested in name
2545                                       based lookup */
2546                                    &oit, 0, &req,
2547                                    ll_md_blocking_ast, 0);
2548                ll_finish_md_op_data(op_data);
2549                oit.it_create_mode &= ~M_CHECK_STALE;
2550                if (rc < 0) {
2551                        rc = ll_inode_revalidate_fini(inode, rc);
2552                        GOTO (out, rc);
2553                }
2554
2555                rc = ll_revalidate_it_finish(req, &oit, dentry);
2556                if (rc != 0) {
2557                        ll_intent_release(&oit);
2558                        GOTO(out, rc);
2559                }
2560
2561                /* Unlinked? Unhash dentry, so it is not picked up later by
2562                   do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2563                   here to preserve get_cwd functionality on 2.6.
2564                   Bug 10503 */
2565                if (!dentry->d_inode->i_nlink)
2566                        d_lustre_invalidate(dentry, 0);
2567
2568                ll_lookup_finish_locks(&oit, dentry);
2569        } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2570                struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2571                obd_valid valid = OBD_MD_FLGETATTR;
2572                struct md_op_data *op_data;
2573                int ealen = 0;
2574
2575                if (S_ISREG(inode->i_mode)) {
2576                        rc = ll_get_max_mdsize(sbi, &ealen);
2577                        if (rc)
2578                                RETURN(rc);
2579                        valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2580                }
2581
2582                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2583                                             0, ealen, LUSTRE_OPC_ANY,
2584                                             NULL);
2585                if (IS_ERR(op_data))
2586                        RETURN(PTR_ERR(op_data));
2587
2588                op_data->op_valid = valid;
2589                /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2590                 * capa for this inode. Because we only keep capas of dirs
2591                 * fresh. */
2592                rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2593                ll_finish_md_op_data(op_data);
2594                if (rc) {
2595                        rc = ll_inode_revalidate_fini(inode, rc);
2596                        RETURN(rc);
2597                }
2598
2599                rc = ll_prep_inode(&inode, req, NULL, NULL);
2600        }
2601out:
2602        ptlrpc_req_finished(req);
2603        return rc;
2604}
2605
2606int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2607                           __u64 ibits)
2608{
2609        struct inode *inode = dentry->d_inode;
2610        int rc;
2611        ENTRY;
2612
2613        rc = __ll_inode_revalidate_it(dentry, it, ibits);
2614        if (rc != 0)
2615                RETURN(rc);
2616
2617        /* if object isn't regular file, don't validate size */
2618        if (!S_ISREG(inode->i_mode)) {
2619                LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
2620                LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
2621                LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
2622        } else {
2623                rc = ll_glimpse_size(inode);
2624        }
2625        RETURN(rc);
2626}
2627
2628int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2629                  struct lookup_intent *it, struct kstat *stat)
2630{
2631        struct inode *inode = de->d_inode;
2632        struct ll_sb_info *sbi = ll_i2sbi(inode);
2633        struct ll_inode_info *lli = ll_i2info(inode);
2634        int res = 0;
2635
2636        res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
2637                                             MDS_INODELOCK_LOOKUP);
2638        ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
2639
2640        if (res)
2641                return res;
2642
2643        stat->dev = inode->i_sb->s_dev;
2644        if (ll_need_32bit_api(sbi))
2645                stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
2646        else
2647                stat->ino = inode->i_ino;
2648        stat->mode = inode->i_mode;
2649        stat->nlink = inode->i_nlink;
2650        stat->uid = inode->i_uid;
2651        stat->gid = inode->i_gid;
2652        stat->rdev = inode->i_rdev;
2653        stat->atime = inode->i_atime;
2654        stat->mtime = inode->i_mtime;
2655        stat->ctime = inode->i_ctime;
2656        stat->blksize = 1 << inode->i_blkbits;
2657
2658        stat->size = i_size_read(inode);
2659        stat->blocks = inode->i_blocks;
2660
2661        return 0;
2662}
2663int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2664{
2665        struct lookup_intent it = { .it_op = IT_GETATTR };
2666
2667        return ll_getattr_it(mnt, de, &it, stat);
2668}
2669
2670
2671struct posix_acl * ll_get_acl(struct inode *inode, int type)
2672{
2673        struct ll_inode_info *lli = ll_i2info(inode);
2674        struct posix_acl *acl = NULL;
2675        ENTRY;
2676
2677        spin_lock(&lli->lli_lock);
2678        /* VFS' acl_permission_check->check_acl will release the refcount */
2679        acl = posix_acl_dup(lli->lli_posix_acl);
2680        spin_unlock(&lli->lli_lock);
2681
2682        RETURN(acl);
2683}
2684
2685
2686int ll_inode_permission(struct inode *inode, int mask)
2687{
2688        int rc = 0;
2689        ENTRY;
2690
2691#ifdef MAY_NOT_BLOCK
2692        if (mask & MAY_NOT_BLOCK)
2693                return -ECHILD;
2694#endif
2695
2696       /* as root inode are NOT getting validated in lookup operation,
2697        * need to do it before permission check. */
2698
2699        if (inode == inode->i_sb->s_root->d_inode) {
2700                struct lookup_intent it = { .it_op = IT_LOOKUP };
2701
2702                rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2703                                              MDS_INODELOCK_LOOKUP);
2704                if (rc)
2705                        RETURN(rc);
2706        }
2707
2708        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2709               inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2710
2711        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2712                return lustre_check_remote_perm(inode, mask);
2713
2714        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2715        rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
2716
2717        RETURN(rc);
2718}
2719
2720#define READ_METHOD aio_read
2721#define READ_FUNCTION ll_file_aio_read
2722#define WRITE_METHOD aio_write
2723#define WRITE_FUNCTION ll_file_aio_write
2724
2725/* -o localflock - only provides locally consistent flock locks */
2726struct file_operations ll_file_operations = {
2727        .read      = ll_file_read,
2728        .READ_METHOD    = READ_FUNCTION,
2729        .write    = ll_file_write,
2730        .WRITE_METHOD   = WRITE_FUNCTION,
2731        .unlocked_ioctl = ll_file_ioctl,
2732        .open      = ll_file_open,
2733        .release        = ll_file_release,
2734        .mmap      = ll_file_mmap,
2735        .llseek  = ll_file_seek,
2736        .splice_read    = ll_file_splice_read,
2737        .fsync    = ll_fsync,
2738        .flush    = ll_flush
2739};
2740
2741struct file_operations ll_file_operations_flock = {
2742        .read      = ll_file_read,
2743        .READ_METHOD    = READ_FUNCTION,
2744        .write    = ll_file_write,
2745        .WRITE_METHOD   = WRITE_FUNCTION,
2746        .unlocked_ioctl = ll_file_ioctl,
2747        .open      = ll_file_open,
2748        .release        = ll_file_release,
2749        .mmap      = ll_file_mmap,
2750        .llseek  = ll_file_seek,
2751        .splice_read    = ll_file_splice_read,
2752        .fsync    = ll_fsync,
2753        .flush    = ll_flush,
2754        .flock    = ll_file_flock,
2755        .lock      = ll_file_flock
2756};
2757
2758/* These are for -o noflock - to return ENOSYS on flock calls */
2759struct file_operations ll_file_operations_noflock = {
2760        .read      = ll_file_read,
2761        .READ_METHOD    = READ_FUNCTION,
2762        .write    = ll_file_write,
2763        .WRITE_METHOD   = WRITE_FUNCTION,
2764        .unlocked_ioctl = ll_file_ioctl,
2765        .open      = ll_file_open,
2766        .release        = ll_file_release,
2767        .mmap      = ll_file_mmap,
2768        .llseek  = ll_file_seek,
2769        .splice_read    = ll_file_splice_read,
2770        .fsync    = ll_fsync,
2771        .flush    = ll_flush,
2772        .flock    = ll_file_noflock,
2773        .lock      = ll_file_noflock
2774};
2775
2776struct inode_operations ll_file_inode_operations = {
2777        .setattr        = ll_setattr,
2778        .getattr        = ll_getattr,
2779        .permission     = ll_inode_permission,
2780        .setxattr       = ll_setxattr,
2781        .getxattr       = ll_getxattr,
2782        .listxattr      = ll_listxattr,
2783        .removexattr    = ll_removexattr,
2784        .get_acl        = ll_get_acl,
2785};
2786
2787/* dynamic ioctl number support routins */
2788static struct llioc_ctl_data {
2789        struct rw_semaphore     ioc_sem;
2790        struct list_head              ioc_head;
2791} llioc = {
2792        __RWSEM_INITIALIZER(llioc.ioc_sem),
2793        LIST_HEAD_INIT(llioc.ioc_head)
2794};
2795
2796
2797struct llioc_data {
2798        struct list_head              iocd_list;
2799        unsigned int        iocd_size;
2800        llioc_callback_t        iocd_cb;
2801        unsigned int        iocd_count;
2802        unsigned int        iocd_cmd[0];
2803};
2804
2805void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2806{
2807        unsigned int size;
2808        struct llioc_data *in_data = NULL;
2809        ENTRY;
2810
2811        if (cb == NULL || cmd == NULL ||
2812            count > LLIOC_MAX_CMD || count < 0)
2813                RETURN(NULL);
2814
2815        size = sizeof(*in_data) + count * sizeof(unsigned int);
2816        OBD_ALLOC(in_data, size);
2817        if (in_data == NULL)
2818                RETURN(NULL);
2819
2820        memset(in_data, 0, sizeof(*in_data));
2821        in_data->iocd_size = size;
2822        in_data->iocd_cb = cb;
2823        in_data->iocd_count = count;
2824        memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2825
2826        down_write(&llioc.ioc_sem);
2827        list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2828        up_write(&llioc.ioc_sem);
2829
2830        RETURN(in_data);
2831}
2832
2833void ll_iocontrol_unregister(void *magic)
2834{
2835        struct llioc_data *tmp;
2836
2837        if (magic == NULL)
2838                return;
2839
2840        down_write(&llioc.ioc_sem);
2841        list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2842                if (tmp == magic) {
2843                        unsigned int size = tmp->iocd_size;
2844
2845                        list_del(&tmp->iocd_list);
2846                        up_write(&llioc.ioc_sem);
2847
2848                        OBD_FREE(tmp, size);
2849                        return;
2850                }
2851        }
2852        up_write(&llioc.ioc_sem);
2853
2854        CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2855}
2856
2857EXPORT_SYMBOL(ll_iocontrol_register);
2858EXPORT_SYMBOL(ll_iocontrol_unregister);
2859
2860enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2861                        unsigned int cmd, unsigned long arg, int *rcp)
2862{
2863        enum llioc_iter ret = LLIOC_CONT;
2864        struct llioc_data *data;
2865        int rc = -EINVAL, i;
2866
2867        down_read(&llioc.ioc_sem);
2868        list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2869                for (i = 0; i < data->iocd_count; i++) {
2870                        if (cmd != data->iocd_cmd[i])
2871                                continue;
2872
2873                        ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2874                        break;
2875                }
2876
2877                if (ret == LLIOC_STOP)
2878                        break;
2879        }
2880        up_read(&llioc.ioc_sem);
2881
2882        if (rcp)
2883                *rcp = rc;
2884        return ret;
2885}
2886
2887int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
2888{
2889        struct ll_inode_info *lli = ll_i2info(inode);
2890        struct cl_env_nest nest;
2891        struct lu_env *env;
2892        int result;
2893        ENTRY;
2894
2895        if (lli->lli_clob == NULL)
2896                RETURN(0);
2897
2898        env = cl_env_nested_get(&nest);
2899        if (IS_ERR(env))
2900                RETURN(PTR_ERR(env));
2901
2902        result = cl_conf_set(env, lli->lli_clob, conf);
2903        cl_env_nested_put(&nest, env);
2904
2905        if (conf->coc_opc == OBJECT_CONF_SET) {
2906                struct ldlm_lock *lock = conf->coc_lock;
2907
2908                LASSERT(lock != NULL);
2909                LASSERT(ldlm_has_layout(lock));
2910                if (result == 0) {
2911                        /* it can only be allowed to match after layout is
2912                         * applied to inode otherwise false layout would be
2913                         * seen. Applying layout shoud happen before dropping
2914                         * the intent lock. */
2915                        ldlm_lock_allow_match(lock);
2916                }
2917        }
2918        RETURN(result);
2919}
2920
2921/* Fetch layout from MDT with getxattr request, if it's not ready yet */
2922static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
2923
2924{
2925        struct ll_sb_info *sbi = ll_i2sbi(inode);
2926        struct obd_capa *oc;
2927        struct ptlrpc_request *req;
2928        struct mdt_body *body;
2929        void *lvbdata;
2930        void *lmm;
2931        int lmmsize;
2932        int rc;
2933        ENTRY;
2934
2935        if (lock->l_lvb_data != NULL)
2936                RETURN(0);
2937
2938        /* if layout lock was granted right away, the layout is returned
2939         * within DLM_LVB of dlm reply; otherwise if the lock was ever
2940         * blocked and then granted via completion ast, we have to fetch
2941         * layout here. Please note that we can't use the LVB buffer in
2942         * completion AST because it doesn't have a large enough buffer */
2943        oc = ll_mdscapa_get(inode);
2944        rc = ll_get_max_mdsize(sbi, &lmmsize);
2945        if (rc == 0)
2946                rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
2947                                OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
2948                                lmmsize, 0, &req);
2949        capa_put(oc);
2950        if (rc < 0)
2951                RETURN(rc);
2952
2953        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2954        if (body == NULL || body->eadatasize > lmmsize)
2955                GOTO(out, rc = -EPROTO);
2956
2957        lmmsize = body->eadatasize;
2958        if (lmmsize == 0) /* empty layout */
2959                GOTO(out, rc = 0);
2960
2961        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
2962        if (lmm == NULL)
2963                GOTO(out, rc = -EFAULT);
2964
2965        OBD_ALLOC_LARGE(lvbdata, lmmsize);
2966        if (lvbdata == NULL)
2967                GOTO(out, rc = -ENOMEM);
2968
2969        memcpy(lvbdata, lmm, lmmsize);
2970        lock_res_and_lock(lock);
2971        if (lock->l_lvb_data == NULL) {
2972                lock->l_lvb_data = lvbdata;
2973                lock->l_lvb_len = lmmsize;
2974                lvbdata = NULL;
2975        }
2976        unlock_res_and_lock(lock);
2977
2978        if (lvbdata != NULL)
2979                OBD_FREE_LARGE(lvbdata, lmmsize);
2980        EXIT;
2981
2982out:
2983        ptlrpc_req_finished(req);
2984        return rc;
2985}
2986
2987/**
2988 * Apply the layout to the inode. Layout lock is held and will be released
2989 * in this function.
2990 */
2991static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
2992                                struct inode *inode, __u32 *gen, bool reconf)
2993{
2994        struct ll_inode_info *lli = ll_i2info(inode);
2995        struct ll_sb_info    *sbi = ll_i2sbi(inode);
2996        struct ldlm_lock *lock;
2997        struct lustre_md md = { NULL };
2998        struct cl_object_conf conf;
2999        int rc = 0;
3000        bool lvb_ready;
3001        bool wait_layout = false;
3002        ENTRY;
3003
3004        LASSERT(lustre_handle_is_used(lockh));
3005
3006        lock = ldlm_handle2lock(lockh);
3007        LASSERT(lock != NULL);
3008        LASSERT(ldlm_has_layout(lock));
3009
3010        LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3011                inode, PFID(&lli->lli_fid), reconf);
3012
3013        /* in case this is a caching lock and reinstate with new inode */
3014        md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3015
3016        lock_res_and_lock(lock);
3017        lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3018        unlock_res_and_lock(lock);
3019        /* checking lvb_ready is racy but this is okay. The worst case is
3020         * that multi processes may configure the file on the same time. */
3021        if (lvb_ready || !reconf) {
3022                rc = -ENODATA;
3023                if (lvb_ready) {
3024                        /* layout_gen must be valid if layout lock is not
3025                         * cancelled and stripe has already set */
3026                        *gen = lli->lli_layout_gen;
3027                        rc = 0;
3028                }
3029                GOTO(out, rc);
3030        }
3031
3032        rc = ll_layout_fetch(inode, lock);
3033        if (rc < 0)
3034                GOTO(out, rc);
3035
3036        /* for layout lock, lmm is returned in lock's lvb.
3037         * lvb_data is immutable if the lock is held so it's safe to access it
3038         * without res lock. See the description in ldlm_lock_decref_internal()
3039         * for the condition to free lvb_data of layout lock */
3040        if (lock->l_lvb_data != NULL) {
3041                rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3042                                  lock->l_lvb_data, lock->l_lvb_len);
3043                if (rc >= 0) {
3044                        *gen = LL_LAYOUT_GEN_EMPTY;
3045                        if (md.lsm != NULL)
3046                                *gen = md.lsm->lsm_layout_gen;
3047                        rc = 0;
3048                } else {
3049                        CERROR("%s: file "DFID" unpackmd error: %d\n",
3050                                ll_get_fsname(inode->i_sb, NULL, 0),
3051                                PFID(&lli->lli_fid), rc);
3052                }
3053        }
3054        if (rc < 0)
3055                GOTO(out, rc);
3056
3057        /* set layout to file. Unlikely this will fail as old layout was
3058         * surely eliminated */
3059        memset(&conf, 0, sizeof conf);
3060        conf.coc_opc = OBJECT_CONF_SET;
3061        conf.coc_inode = inode;
3062        conf.coc_lock = lock;
3063        conf.u.coc_md = &md;
3064        rc = ll_layout_conf(inode, &conf);
3065
3066        if (md.lsm != NULL)
3067                obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3068
3069        /* refresh layout failed, need to wait */
3070        wait_layout = rc == -EBUSY;
3071        EXIT;
3072
3073out:
3074        LDLM_LOCK_PUT(lock);
3075        ldlm_lock_decref(lockh, mode);
3076
3077        /* wait for IO to complete if it's still being used. */
3078        if (wait_layout) {
3079                CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3080                        ll_get_fsname(inode->i_sb, NULL, 0),
3081                        inode, PFID(&lli->lli_fid));
3082
3083                memset(&conf, 0, sizeof conf);
3084                conf.coc_opc = OBJECT_CONF_WAIT;
3085                conf.coc_inode = inode;
3086                rc = ll_layout_conf(inode, &conf);
3087                if (rc == 0)
3088                        rc = -EAGAIN;
3089
3090                CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3091                        PFID(&lli->lli_fid), rc);
3092        }
3093        RETURN(rc);
3094}
3095
3096/**
3097 * This function checks if there exists a LAYOUT lock on the client side,
3098 * or enqueues it if it doesn't have one in cache.
3099 *
3100 * This function will not hold layout lock so it may be revoked any time after
3101 * this function returns. Any operations depend on layout should be redone
3102 * in that case.
3103 *
3104 * This function should be called before lov_io_init() to get an uptodate
3105 * layout version, the caller should save the version number and after IO
3106 * is finished, this function should be called again to verify that layout
3107 * is not changed during IO time.
3108 */
3109int ll_layout_refresh(struct inode *inode, __u32 *gen)
3110{
3111        struct ll_inode_info  *lli = ll_i2info(inode);
3112        struct ll_sb_info     *sbi = ll_i2sbi(inode);
3113        struct md_op_data     *op_data;
3114        struct lookup_intent   it;
3115        struct lustre_handle   lockh;
3116        ldlm_mode_t            mode;
3117        struct ldlm_enqueue_info einfo = { .ei_type = LDLM_IBITS,
3118                                           .ei_mode = LCK_CR,
3119                                           .ei_cb_bl = ll_md_blocking_ast,
3120                                           .ei_cb_cp = ldlm_completion_ast,
3121                                           .ei_cbdata = NULL };
3122        int rc;
3123        ENTRY;
3124
3125        *gen = lli->lli_layout_gen;
3126        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3127                RETURN(0);
3128
3129        /* sanity checks */
3130        LASSERT(fid_is_sane(ll_inode2fid(inode)));
3131        LASSERT(S_ISREG(inode->i_mode));
3132
3133        /* mostly layout lock is caching on the local side, so try to match
3134         * it before grabbing layout lock mutex. */
3135        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3136        if (mode != 0) { /* hit cached lock */
3137                rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3138                if (rc == 0)
3139                        RETURN(0);
3140
3141                /* better hold lli_layout_mutex to try again otherwise
3142                 * it will have starvation problem. */
3143        }
3144
3145        /* take layout lock mutex to enqueue layout lock exclusively. */
3146        mutex_lock(&lli->lli_layout_mutex);
3147
3148again:
3149        /* try again. Maybe somebody else has done this. */
3150        mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3151        if (mode != 0) { /* hit cached lock */
3152                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3153                if (rc == -EAGAIN)
3154                        goto again;
3155
3156                mutex_unlock(&lli->lli_layout_mutex);
3157                RETURN(rc);
3158        }
3159
3160        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3161                        0, 0, LUSTRE_OPC_ANY, NULL);
3162        if (IS_ERR(op_data)) {
3163                mutex_unlock(&lli->lli_layout_mutex);
3164                RETURN(PTR_ERR(op_data));
3165        }
3166
3167        /* have to enqueue one */
3168        memset(&it, 0, sizeof(it));
3169        it.it_op = IT_LAYOUT;
3170        lockh.cookie = 0ULL;
3171
3172        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3173                        ll_get_fsname(inode->i_sb, NULL, 0), inode,
3174                        PFID(&lli->lli_fid));
3175
3176        rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3177                        NULL, 0, NULL, 0);
3178        if (it.d.lustre.it_data != NULL)
3179                ptlrpc_req_finished(it.d.lustre.it_data);
3180        it.d.lustre.it_data = NULL;
3181
3182        ll_finish_md_op_data(op_data);
3183
3184        mode = it.d.lustre.it_lock_mode;
3185        it.d.lustre.it_lock_mode = 0;
3186        ll_intent_drop_lock(&it);
3187
3188        if (rc == 0) {
3189                /* set lock data in case this is a new lock */
3190                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3191                rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3192                if (rc == -EAGAIN)
3193                        goto again;
3194        }
3195        mutex_unlock(&lli->lli_layout_mutex);
3196
3197        RETURN(rc);
3198}
3199