linux/drivers/staging/lustre/lustre/llite/dir.c
<<
>>
Prefs
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.gnu.org/licenses/gpl-2.0.html
  19 *
  20 * GPL HEADER END
  21 */
  22/*
  23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24 * Use is subject to license terms.
  25 *
  26 * Copyright (c) 2011, 2015, Intel Corporation.
  27 */
  28/*
  29 * This file is part of Lustre, http://www.lustre.org/
  30 * Lustre is a trademark of Sun Microsystems, Inc.
  31 *
  32 * lustre/llite/dir.c
  33 *
  34 * Directory code for lustre client.
  35 */
  36
  37#include <linux/fs.h>
  38#include <linux/pagemap.h>
  39#include <linux/mm.h>
  40#include <linux/uaccess.h>
  41#include <linux/buffer_head.h>   /* for wait_on_buffer */
  42#include <linux/pagevec.h>
  43#include <linux/prefetch.h>
  44
  45#define DEBUG_SUBSYSTEM S_LLITE
  46
  47#include "../include/obd_support.h"
  48#include "../include/obd_class.h"
  49#include "../include/lustre/lustre_ioctl.h"
  50#include "../include/lustre_lib.h"
  51#include "../include/lustre_dlm.h"
  52#include "../include/lustre_fid.h"
  53#include "../include/lustre_kernelcomm.h"
  54#include "llite_internal.h"
  55
  56/*
  57 * (new) readdir implementation overview.
  58 *
  59 * Original lustre readdir implementation cached exact copy of raw directory
  60 * pages on the client. These pages were indexed in client page cache by
  61 * logical offset in the directory file. This design, while very simple and
  62 * intuitive had some inherent problems:
  63 *
  64 *     . it implies that byte offset to the directory entry serves as a
  65 *     telldir(3)/seekdir(3) cookie, but that offset is not stable: in
  66 *     ext3/htree directory entries may move due to splits, and more
  67 *     importantly,
  68 *
  69 *     . it is incompatible with the design of split directories for cmd3,
  70 *     that assumes that names are distributed across nodes based on their
  71 *     hash, and so readdir should be done in hash order.
  72 *
  73 * New readdir implementation does readdir in hash order, and uses hash of a
  74 * file name as a telldir/seekdir cookie. This led to a number of complications:
  75 *
  76 *     . hash is not unique, so it cannot be used to index cached directory
  77 *     pages on the client (note, that it requires a whole pageful of hash
  78 *     collided entries to cause two pages to have identical hashes);
  79 *
  80 *     . hash is not unique, so it cannot, strictly speaking, be used as an
  81 *     entry cookie. ext3/htree has the same problem and lustre implementation
  82 *     mimics their solution: seekdir(hash) positions directory at the first
  83 *     entry with the given hash.
  84 *
  85 * Client side.
  86 *
  87 * 0. caching
  88 *
  89 * Client caches directory pages using hash of the first entry as an index. As
  90 * noted above hash is not unique, so this solution doesn't work as is:
  91 * special processing is needed for "page hash chains" (i.e., sequences of
  92 * pages filled with entries all having the same hash value).
  93 *
  94 * First, such chains have to be detected. To this end, server returns to the
  95 * client the hash of the first entry on the page next to one returned. When
  96 * client detects that this hash is the same as hash of the first entry on the
  97 * returned page, page hash collision has to be handled. Pages in the
  98 * hash chain, except first one, are termed "overflow pages".
  99 *
 100 * Solution to index uniqueness problem is to not cache overflow
 101 * pages. Instead, when page hash collision is detected, all overflow pages
 102 * from emerging chain are immediately requested from the server and placed in
 103 * a special data structure (struct ll_dir_chain). This data structure is used
 104 * by ll_readdir() to process entries from overflow pages. When readdir
 105 * invocation finishes, overflow pages are discarded. If page hash collision
 106 * chain wasn't completely processed, the next call to readdir will again detect
 107 * page hash collision, again read overflow pages in, process next portion of
 108 * entries and again discard the pages. This is not as wasteful as it looks,
 109 * because, given reasonable hash, page hash collisions are extremely rare.
 110 *
 111 * 1. directory positioning
 112 *
 113 * When seekdir(hash) is called, original
 114 *
 115 *
 116 *
 117 *
 118 *
 119 *
 120 *
 121 *
 122 * Server.
 123 *
 124 * identification of and access to overflow pages
 125 *
 126 * page format
 127 *
 128 * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains
 129 * a header lu_dirpage which describes the start/end hash, and whether this
 130 * page is empty (contains no dir entry) or hash collide with next page.
 131 * After client receives reply, several pages will be integrated into dir page
 132 * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the lu_dirpage
 133 * for this integrated page will be adjusted. See lmv_adjust_dirpages().
 134 *
 135 */
 136struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data,
 137                             __u64 offset)
 138{
 139        struct md_callback cb_op;
 140        struct page *page;
 141        int rc;
 142
 143        cb_op.md_blocking_ast = ll_md_blocking_ast;
 144        rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page);
 145        if (rc)
 146                return ERR_PTR(rc);
 147
 148        return page;
 149}
 150
 151void ll_release_page(struct inode *inode, struct page *page, bool remove)
 152{
 153        kunmap(page);
 154
 155        /*
 156         * Always remove the page for striped dir, because the page is
 157         * built from temporarily in LMV layer
 158         */
 159        if (inode && S_ISDIR(inode->i_mode) &&
 160            ll_i2info(inode)->lli_lsm_md) {
 161                __free_page(page);
 162                return;
 163        }
 164
 165        if (remove) {
 166                lock_page(page);
 167                if (likely(page->mapping))
 168                        truncate_complete_page(page->mapping, page);
 169                unlock_page(page);
 170        }
 171        put_page(page);
 172}
 173
 174/**
 175 * return IF_* type for given lu_dirent entry.
 176 * IF_* flag shld be converted to particular OS file type in
 177 * platform llite module.
 178 */
 179static __u16 ll_dirent_type_get(struct lu_dirent *ent)
 180{
 181        __u16 type = 0;
 182        struct luda_type *lt;
 183        int len = 0;
 184
 185        if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) {
 186                const unsigned int align = sizeof(struct luda_type) - 1;
 187
 188                len = le16_to_cpu(ent->lde_namelen);
 189                len = (len + align) & ~align;
 190                lt = (void *)ent->lde_name + len;
 191                type = IFTODT(le16_to_cpu(lt->lt_type));
 192        }
 193        return type;
 194}
 195
 196int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data,
 197                struct dir_context *ctx)
 198{
 199        struct ll_sb_info    *sbi       = ll_i2sbi(inode);
 200        __u64              pos          = *ppos;
 201        int                is_api32 = ll_need_32bit_api(sbi);
 202        int                is_hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH;
 203        struct page       *page;
 204        bool               done = false;
 205        int                rc = 0;
 206
 207        page = ll_get_dir_page(inode, op_data, pos);
 208
 209        while (rc == 0 && !done) {
 210                struct lu_dirpage *dp;
 211                struct lu_dirent  *ent;
 212                __u64 hash;
 213                __u64 next;
 214
 215                if (IS_ERR(page)) {
 216                        rc = PTR_ERR(page);
 217                        break;
 218                }
 219
 220                hash = MDS_DIR_END_OFF;
 221                dp = page_address(page);
 222                for (ent = lu_dirent_start(dp); ent && !done;
 223                     ent = lu_dirent_next(ent)) {
 224                        __u16     type;
 225                        int         namelen;
 226                        struct lu_fid  fid;
 227                        __u64     lhash;
 228                        __u64     ino;
 229
 230                        hash = le64_to_cpu(ent->lde_hash);
 231                        if (hash < pos)
 232                                /*
 233                                 * Skip until we find target hash
 234                                 * value.
 235                                 */
 236                                continue;
 237
 238                        namelen = le16_to_cpu(ent->lde_namelen);
 239                        if (namelen == 0)
 240                                /*
 241                                 * Skip dummy record.
 242                                 */
 243                                continue;
 244
 245                        if (is_api32 && is_hash64)
 246                                lhash = hash >> 32;
 247                        else
 248                                lhash = hash;
 249                        fid_le_to_cpu(&fid, &ent->lde_fid);
 250                        ino = cl_fid_build_ino(&fid, is_api32);
 251                        type = ll_dirent_type_get(ent);
 252                        ctx->pos = lhash;
 253                        /* For 'll_nfs_get_name_filldir()', it will try
 254                         * to access the 'ent' through its 'lde_name',
 255                         * so the parameter 'name' for 'ctx->actor()'
 256                         * must be part of the 'ent'.
 257                         */
 258                        done = !dir_emit(ctx, ent->lde_name,
 259                                         namelen, ino, type);
 260                }
 261
 262                if (done) {
 263                        pos = hash;
 264                        ll_release_page(inode, page, false);
 265                        break;
 266                }
 267
 268                next = le64_to_cpu(dp->ldp_hash_end);
 269                pos = next;
 270                if (pos == MDS_DIR_END_OFF) {
 271                        /*
 272                         * End of directory reached.
 273                         */
 274                        done = 1;
 275                        ll_release_page(inode, page, false);
 276                } else {
 277                        /*
 278                         * Normal case: continue to the next
 279                         * page.
 280                         */
 281                        ll_release_page(inode, page,
 282                                        le32_to_cpu(dp->ldp_flags) &
 283                                        LDF_COLLIDE);
 284                        next = pos;
 285                        page = ll_get_dir_page(inode, op_data, pos);
 286                }
 287        }
 288
 289        ctx->pos = pos;
 290        return rc;
 291}
 292
 293static int ll_readdir(struct file *filp, struct dir_context *ctx)
 294{
 295        struct inode            *inode  = file_inode(filp);
 296        struct ll_file_data     *lfd    = LUSTRE_FPRIVATE(filp);
 297        struct ll_sb_info       *sbi    = ll_i2sbi(inode);
 298        __u64 pos = lfd ? lfd->lfd_pos : 0;
 299        int                     hash64  = sbi->ll_flags & LL_SBI_64BIT_HASH;
 300        int                     api32   = ll_need_32bit_api(sbi);
 301        struct md_op_data *op_data;
 302        int                     rc;
 303
 304        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) pos/size %lu/%llu 32bit_api %d\n",
 305               PFID(ll_inode2fid(inode)), inode, (unsigned long)pos,
 306               i_size_read(inode), api32);
 307
 308        if (pos == MDS_DIR_END_OFF) {
 309                /*
 310                 * end-of-file.
 311                 */
 312                rc = 0;
 313                goto out;
 314        }
 315
 316        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 317                                     LUSTRE_OPC_ANY, inode);
 318        if (IS_ERR(op_data)) {
 319                rc = PTR_ERR(op_data);
 320                goto out;
 321        }
 322
 323        if (unlikely(op_data->op_mea1)) {
 324                /*
 325                 * This is only needed for striped dir to fill ..,
 326                 * see lmv_read_page
 327                 */
 328                if (file_dentry(filp)->d_parent &&
 329                    file_dentry(filp)->d_parent->d_inode) {
 330                        __u64 ibits = MDS_INODELOCK_UPDATE;
 331                        struct inode *parent;
 332
 333                        parent = file_dentry(filp)->d_parent->d_inode;
 334                        if (ll_have_md_lock(parent, &ibits, LCK_MINMODE))
 335                                op_data->op_fid3 = *ll_inode2fid(parent);
 336                }
 337
 338                /*
 339                 * If it can not find in cache, do lookup .. on the master
 340                 * object
 341                 */
 342                if (fid_is_zero(&op_data->op_fid3)) {
 343                        rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3);
 344                        if (rc) {
 345                                ll_finish_md_op_data(op_data);
 346                                return rc;
 347                        }
 348                }
 349        }
 350        op_data->op_max_pages = sbi->ll_md_brw_pages;
 351        ctx->pos = pos;
 352        rc = ll_dir_read(inode, &pos, op_data, ctx);
 353        pos = ctx->pos;
 354        if (lfd)
 355                lfd->lfd_pos = pos;
 356
 357        if (pos == MDS_DIR_END_OFF) {
 358                if (api32)
 359                        pos = LL_DIR_END_OFF_32BIT;
 360                else
 361                        pos = LL_DIR_END_OFF;
 362        } else {
 363                if (api32 && hash64)
 364                        pos >>= 32;
 365        }
 366        ctx->pos = pos;
 367        ll_finish_md_op_data(op_data);
 368        filp->f_version = inode->i_version;
 369
 370out:
 371        if (!rc)
 372                ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1);
 373
 374        return rc;
 375}
 376
 377static int ll_send_mgc_param(struct obd_export *mgc, char *string)
 378{
 379        struct mgs_send_param *msp;
 380        int rc = 0;
 381
 382        msp = kzalloc(sizeof(*msp), GFP_NOFS);
 383        if (!msp)
 384                return -ENOMEM;
 385
 386        strlcpy(msp->mgs_param, string, sizeof(msp->mgs_param));
 387        rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO,
 388                                sizeof(struct mgs_send_param), msp, NULL);
 389        if (rc)
 390                CERROR("Failed to set parameter: %d\n", rc);
 391        kfree(msp);
 392
 393        return rc;
 394}
 395
 396/**
 397 * Create striped directory with specified stripe(@lump)
 398 *
 399 * param[in] parent     the parent of the directory.
 400 * param[in] lump       the specified stripes.
 401 * param[in] dirname    the name of the directory.
 402 * param[in] mode       the specified mode of the directory.
 403 *
 404 * retval               =0 if striped directory is being created successfully.
 405 *                      <0 if the creation is failed.
 406 */
 407static int ll_dir_setdirstripe(struct inode *parent, struct lmv_user_md *lump,
 408                               const char *dirname, umode_t mode)
 409{
 410        struct ptlrpc_request *request = NULL;
 411        struct md_op_data *op_data;
 412        struct ll_sb_info *sbi = ll_i2sbi(parent);
 413        int err;
 414
 415        if (unlikely(lump->lum_magic != LMV_USER_MAGIC))
 416                return -EINVAL;
 417
 418        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p) name %s stripe_offset %d, stripe_count: %u\n",
 419               PFID(ll_inode2fid(parent)), parent, dirname,
 420               (int)lump->lum_stripe_offset, lump->lum_stripe_count);
 421
 422        if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC))
 423                lustre_swab_lmv_user_md(lump);
 424
 425        if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent)))
 426                mode &= ~current_umask();
 427        mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 428        op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname,
 429                                     strlen(dirname), mode, LUSTRE_OPC_MKDIR,
 430                                     lump);
 431        if (IS_ERR(op_data)) {
 432                err = PTR_ERR(op_data);
 433                goto err_exit;
 434        }
 435
 436        op_data->op_cli_flags |= CLI_SET_MEA;
 437        err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode,
 438                        from_kuid(&init_user_ns, current_fsuid()),
 439                        from_kgid(&init_user_ns, current_fsgid()),
 440                        cfs_curproc_cap_pack(), 0, &request);
 441        ll_finish_md_op_data(op_data);
 442        if (err)
 443                goto err_exit;
 444err_exit:
 445        ptlrpc_req_finished(request);
 446        return err;
 447}
 448
 449int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
 450                     int set_default)
 451{
 452        struct ll_sb_info *sbi = ll_i2sbi(inode);
 453        struct md_op_data *op_data;
 454        struct ptlrpc_request *req = NULL;
 455        int rc = 0;
 456        struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
 457        struct obd_device *mgc = lsi->lsi_mgc;
 458        int lum_size;
 459
 460        if (lump) {
 461                /*
 462                 * This is coming from userspace, so should be in
 463                 * local endian.  But the MDS would like it in little
 464                 * endian, so we swab it before we send it.
 465                 */
 466                switch (lump->lmm_magic) {
 467                case LOV_USER_MAGIC_V1: {
 468                        if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
 469                                lustre_swab_lov_user_md_v1(lump);
 470                        lum_size = sizeof(struct lov_user_md_v1);
 471                        break;
 472                }
 473                case LOV_USER_MAGIC_V3: {
 474                        if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
 475                                lustre_swab_lov_user_md_v3(
 476                                        (struct lov_user_md_v3 *)lump);
 477                        lum_size = sizeof(struct lov_user_md_v3);
 478                        break;
 479                }
 480                case LMV_USER_MAGIC: {
 481                        if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC))
 482                                lustre_swab_lmv_user_md(
 483                                        (struct lmv_user_md *)lump);
 484                        lum_size = sizeof(struct lmv_user_md);
 485                        break;
 486                }
 487                default: {
 488                        CDEBUG(D_IOCTL, "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
 489                               lump->lmm_magic, LOV_USER_MAGIC_V1,
 490                               LOV_USER_MAGIC_V3);
 491                        return -EINVAL;
 492                }
 493                }
 494        } else {
 495                lum_size = sizeof(struct lov_user_md_v1);
 496        }
 497
 498        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
 499                                     LUSTRE_OPC_ANY, NULL);
 500        if (IS_ERR(op_data))
 501                return PTR_ERR(op_data);
 502
 503        /* swabbing is done in lov_setstripe() on server side */
 504        rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
 505                        NULL, 0, &req, NULL);
 506        ll_finish_md_op_data(op_data);
 507        ptlrpc_req_finished(req);
 508        if (rc) {
 509                if (rc != -EPERM && rc != -EACCES)
 510                        CERROR("mdc_setattr fails: rc = %d\n", rc);
 511        }
 512
 513        /* In the following we use the fact that LOV_USER_MAGIC_V1 and
 514         * LOV_USER_MAGIC_V3 have the same initial fields so we do not
 515         * need to make the distinction between the 2 versions
 516         */
 517        if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
 518                char *param = NULL;
 519                char *buf;
 520
 521                param = kzalloc(MGS_PARAM_MAXLEN, GFP_NOFS);
 522                if (!param)
 523                        return -ENOMEM;
 524
 525                buf = param;
 526                /* Get fsname and assume devname to be -MDT0000. */
 527                ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN);
 528                strcat(buf, "-MDT0000.lov");
 529                buf += strlen(buf);
 530
 531                /* Set root stripesize */
 532                sprintf(buf, ".stripesize=%u",
 533                        lump ? le32_to_cpu(lump->lmm_stripe_size) : 0);
 534                rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
 535                if (rc)
 536                        goto end;
 537
 538                /* Set root stripecount */
 539                sprintf(buf, ".stripecount=%hd",
 540                        lump ? le16_to_cpu(lump->lmm_stripe_count) : 0);
 541                rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
 542                if (rc)
 543                        goto end;
 544
 545                /* Set root stripeoffset */
 546                sprintf(buf, ".stripeoffset=%hd",
 547                        lump ? le16_to_cpu(lump->lmm_stripe_offset) :
 548                        (typeof(lump->lmm_stripe_offset))(-1));
 549                rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param);
 550
 551end:
 552                kfree(param);
 553        }
 554        return rc;
 555}
 556
 557/**
 558 * This function will be used to get default LOV/LMV/Default LMV
 559 * @valid will be used to indicate which stripe it will retrieve
 560 *      OBD_MD_MEA              LMV stripe EA
 561 *      OBD_MD_DEFAULT_MEA      Default LMV stripe EA
 562 *      otherwise               Default LOV EA.
 563 * Each time, it can only retrieve 1 stripe EA
 564 **/
 565int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size,
 566                     struct ptlrpc_request **request, u64 valid)
 567{
 568        struct ll_sb_info *sbi = ll_i2sbi(inode);
 569        struct mdt_body   *body;
 570        struct lov_mds_md *lmm = NULL;
 571        struct ptlrpc_request *req = NULL;
 572        int rc, lmmsize;
 573        struct md_op_data *op_data;
 574
 575        rc = ll_get_max_mdsize(sbi, &lmmsize);
 576        if (rc)
 577                return rc;
 578
 579        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
 580                                     0, lmmsize, LUSTRE_OPC_ANY,
 581                                     NULL);
 582        if (IS_ERR(op_data))
 583                return PTR_ERR(op_data);
 584
 585        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
 586        rc = md_getattr(sbi->ll_md_exp, op_data, &req);
 587        ll_finish_md_op_data(op_data);
 588        if (rc < 0) {
 589                CDEBUG(D_INFO, "md_getattr failed on inode "DFID": rc %d\n",
 590                       PFID(ll_inode2fid(inode)), rc);
 591                goto out;
 592        }
 593
 594        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 595
 596        lmmsize = body->mbo_eadatasize;
 597
 598        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
 599            lmmsize == 0) {
 600                rc = -ENODATA;
 601                goto out;
 602        }
 603
 604        lmm = req_capsule_server_sized_get(&req->rq_pill,
 605                                           &RMF_MDT_MD, lmmsize);
 606        LASSERT(lmm);
 607
 608        /*
 609         * This is coming from the MDS, so is probably in
 610         * little endian.  We convert it to host endian before
 611         * passing it to userspace.
 612         */
 613        /* We don't swab objects for directories */
 614        switch (le32_to_cpu(lmm->lmm_magic)) {
 615        case LOV_MAGIC_V1:
 616                if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC)
 617                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
 618                break;
 619        case LOV_MAGIC_V3:
 620                if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC)
 621                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
 622                break;
 623        case LMV_MAGIC_V1:
 624                if (cpu_to_le32(LMV_MAGIC) != LMV_MAGIC)
 625                        lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm);
 626                break;
 627        case LMV_USER_MAGIC:
 628                if (cpu_to_le32(LMV_USER_MAGIC) != LMV_USER_MAGIC)
 629                        lustre_swab_lmv_user_md((struct lmv_user_md *)lmm);
 630                break;
 631        default:
 632                CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
 633                rc = -EPROTO;
 634        }
 635out:
 636        *plmm = lmm;
 637        *plmm_size = lmmsize;
 638        *request = req;
 639        return rc;
 640}
 641
 642int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid)
 643{
 644        struct md_op_data *op_data;
 645        int mdt_index, rc;
 646
 647        op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
 648        if (!op_data)
 649                return -ENOMEM;
 650
 651        op_data->op_flags |= MF_GET_MDT_IDX;
 652        op_data->op_fid1 = *fid;
 653        rc = md_getattr(sbi->ll_md_exp, op_data, NULL);
 654        mdt_index = op_data->op_mds;
 655        kvfree(op_data);
 656        if (rc < 0)
 657                return rc;
 658
 659        return mdt_index;
 660}
 661
 662/*
 663 *  Get MDT index for the inode.
 664 */
 665int ll_get_mdt_idx(struct inode *inode)
 666{
 667        return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode));
 668}
 669
 670/**
 671 * Generic handler to do any pre-copy work.
 672 *
 673 * It sends a first hsm_progress (with extent length == 0) to coordinator as a
 674 * first information for it that real work has started.
 675 *
 676 * Moreover, for a ARCHIVE request, it will sample the file data version and
 677 * store it in \a copy.
 678 *
 679 * \return 0 on success.
 680 */
 681static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy)
 682{
 683        struct ll_sb_info               *sbi = ll_s2sbi(sb);
 684        struct hsm_progress_kernel       hpk;
 685        int                              rc;
 686
 687        /* Forge a hsm_progress based on data from copy. */
 688        hpk.hpk_fid = copy->hc_hai.hai_fid;
 689        hpk.hpk_cookie = copy->hc_hai.hai_cookie;
 690        hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset;
 691        hpk.hpk_extent.length = 0;
 692        hpk.hpk_flags = 0;
 693        hpk.hpk_errval = 0;
 694        hpk.hpk_data_version = 0;
 695
 696        /* For archive request, we need to read the current file version. */
 697        if (copy->hc_hai.hai_action == HSMA_ARCHIVE) {
 698                struct inode    *inode;
 699                __u64            data_version = 0;
 700
 701                /* Get inode for this fid */
 702                inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
 703                if (IS_ERR(inode)) {
 704                        hpk.hpk_flags |= HP_FLAG_RETRY;
 705                        /* hpk_errval is >= 0 */
 706                        hpk.hpk_errval = -PTR_ERR(inode);
 707                        rc = PTR_ERR(inode);
 708                        goto progress;
 709                }
 710
 711                /* Read current file data version */
 712                rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH);
 713                iput(inode);
 714                if (rc != 0) {
 715                        CDEBUG(D_HSM, "Could not read file data version of "
 716                                      DFID" (rc = %d). Archive request (%#llx) could not be done.\n",
 717                                      PFID(&copy->hc_hai.hai_fid), rc,
 718                                      copy->hc_hai.hai_cookie);
 719                        hpk.hpk_flags |= HP_FLAG_RETRY;
 720                        /* hpk_errval must be >= 0 */
 721                        hpk.hpk_errval = -rc;
 722                        goto progress;
 723                }
 724
 725                /* Store in the hsm_copy for later copytool use.
 726                 * Always modified even if no lsm.
 727                 */
 728                copy->hc_data_version = data_version;
 729        }
 730
 731progress:
 732        /* On error, the request should be considered as completed */
 733        if (hpk.hpk_errval > 0)
 734                hpk.hpk_flags |= HP_FLAG_COMPLETED;
 735        rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
 736                           &hpk, NULL);
 737
 738        return rc;
 739}
 740
 741/**
 742 * Generic handler to do any post-copy work.
 743 *
 744 * It will send the last hsm_progress update to coordinator to inform it
 745 * that copy is finished and whether it was successful or not.
 746 *
 747 * Moreover,
 748 * - for ARCHIVE request, it will sample the file data version and compare it
 749 *   with the version saved in ll_ioc_copy_start(). If they do not match, copy
 750 *   will be considered as failed.
 751 * - for RESTORE request, it will sample the file data version and send it to
 752 *   coordinator which is useful if the file was imported as 'released'.
 753 *
 754 * \return 0 on success.
 755 */
 756static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy)
 757{
 758        struct ll_sb_info               *sbi = ll_s2sbi(sb);
 759        struct hsm_progress_kernel       hpk;
 760        int                              rc;
 761
 762        /* If you modify the logic here, also check llapi_hsm_copy_end(). */
 763        /* Take care: copy->hc_hai.hai_action, len, gid and data are not
 764         * initialized if copy_end was called with copy == NULL.
 765         */
 766
 767        /* Forge a hsm_progress based on data from copy. */
 768        hpk.hpk_fid = copy->hc_hai.hai_fid;
 769        hpk.hpk_cookie = copy->hc_hai.hai_cookie;
 770        hpk.hpk_extent = copy->hc_hai.hai_extent;
 771        hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED;
 772        hpk.hpk_errval = copy->hc_errval;
 773        hpk.hpk_data_version = 0;
 774
 775        /* For archive request, we need to check the file data was not changed.
 776         *
 777         * For restore request, we need to send the file data version, this is
 778         * useful when the file was created using hsm_import.
 779         */
 780        if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) ||
 781             (copy->hc_hai.hai_action == HSMA_RESTORE)) &&
 782            (copy->hc_errval == 0)) {
 783                struct inode    *inode;
 784                __u64            data_version = 0;
 785
 786                /* Get lsm for this fid */
 787                inode = search_inode_for_lustre(sb, &copy->hc_hai.hai_fid);
 788                if (IS_ERR(inode)) {
 789                        hpk.hpk_flags |= HP_FLAG_RETRY;
 790                        /* hpk_errval must be >= 0 */
 791                        hpk.hpk_errval = -PTR_ERR(inode);
 792                        rc = PTR_ERR(inode);
 793                        goto progress;
 794                }
 795
 796                rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH);
 797                iput(inode);
 798                if (rc) {
 799                        CDEBUG(D_HSM, "Could not read file data version. Request could not be confirmed.\n");
 800                        if (hpk.hpk_errval == 0)
 801                                hpk.hpk_errval = -rc;
 802                        goto progress;
 803                }
 804
 805                /* Store in the hsm_copy for later copytool use.
 806                 * Always modified even if no lsm.
 807                 */
 808                hpk.hpk_data_version = data_version;
 809
 810                /* File could have been stripped during archiving, so we need
 811                 * to check anyway.
 812                 */
 813                if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) &&
 814                    (copy->hc_data_version != data_version)) {
 815                        CDEBUG(D_HSM, "File data version mismatched. File content was changed during archiving. "
 816                               DFID", start:%#llx current:%#llx\n",
 817                               PFID(&copy->hc_hai.hai_fid),
 818                               copy->hc_data_version, data_version);
 819                        /* File was changed, send error to cdt. Do not ask for
 820                         * retry because if a file is modified frequently,
 821                         * the cdt will loop on retried archive requests.
 822                         * The policy engine will ask for a new archive later
 823                         * when the file will not be modified for some tunable
 824                         * time
 825                         */
 826                        /* we do not notify caller */
 827                        hpk.hpk_flags &= ~HP_FLAG_RETRY;
 828                        /* hpk_errval must be >= 0 */
 829                        hpk.hpk_errval = EBUSY;
 830                }
 831        }
 832
 833progress:
 834        rc = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk),
 835                           &hpk, NULL);
 836
 837        return rc;
 838}
 839
 840static int copy_and_ioctl(int cmd, struct obd_export *exp,
 841                          const void __user *data, size_t size)
 842{
 843        void *copy;
 844        int rc;
 845
 846        copy = memdup_user(data, size);
 847        if (IS_ERR(copy))
 848                return PTR_ERR(copy);
 849
 850        rc = obd_iocontrol(cmd, exp, size, copy, NULL);
 851        kfree(copy);
 852
 853        return rc;
 854}
 855
 856static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl)
 857{
 858        int cmd = qctl->qc_cmd;
 859        int type = qctl->qc_type;
 860        int id = qctl->qc_id;
 861        int valid = qctl->qc_valid;
 862        int rc = 0;
 863
 864        switch (cmd) {
 865        case LUSTRE_Q_INVALIDATE:
 866        case LUSTRE_Q_FINVALIDATE:
 867        case Q_QUOTAON:
 868        case Q_QUOTAOFF:
 869        case Q_SETQUOTA:
 870        case Q_SETINFO:
 871                if (!capable(CFS_CAP_SYS_ADMIN))
 872                        return -EPERM;
 873                break;
 874        case Q_GETQUOTA:
 875                if (((type == USRQUOTA &&
 876                      !uid_eq(current_euid(), make_kuid(&init_user_ns, id))) ||
 877                     (type == GRPQUOTA &&
 878                      !in_egroup_p(make_kgid(&init_user_ns, id)))) &&
 879                      !capable(CFS_CAP_SYS_ADMIN))
 880                        return -EPERM;
 881                break;
 882        case Q_GETINFO:
 883                break;
 884        default:
 885                CERROR("unsupported quotactl op: %#x\n", cmd);
 886                return -ENOTTY;
 887        }
 888
 889        if (valid != QC_GENERAL) {
 890                if (cmd == Q_GETINFO)
 891                        qctl->qc_cmd = Q_GETOINFO;
 892                else if (cmd == Q_GETQUOTA)
 893                        qctl->qc_cmd = Q_GETOQUOTA;
 894                else
 895                        return -EINVAL;
 896
 897                switch (valid) {
 898                case QC_MDTIDX:
 899                        rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
 900                                           sizeof(*qctl), qctl, NULL);
 901                        break;
 902                case QC_OSTIDX:
 903                        rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
 904                                           sizeof(*qctl), qctl, NULL);
 905                        break;
 906                case QC_UUID:
 907                        rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
 908                                           sizeof(*qctl), qctl, NULL);
 909                        if (rc == -EAGAIN)
 910                                rc = obd_iocontrol(OBD_IOC_QUOTACTL,
 911                                                   sbi->ll_dt_exp,
 912                                                   sizeof(*qctl), qctl, NULL);
 913                        break;
 914                default:
 915                        rc = -EINVAL;
 916                        break;
 917                }
 918
 919                if (rc)
 920                        return rc;
 921
 922                qctl->qc_cmd = cmd;
 923        } else {
 924                struct obd_quotactl *oqctl;
 925
 926                oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS);
 927                if (!oqctl)
 928                        return -ENOMEM;
 929
 930                QCTL_COPY(oqctl, qctl);
 931                rc = obd_quotactl(sbi->ll_md_exp, oqctl);
 932                if (rc) {
 933                        if (rc != -EALREADY && cmd == Q_QUOTAON) {
 934                                oqctl->qc_cmd = Q_QUOTAOFF;
 935                                obd_quotactl(sbi->ll_md_exp, oqctl);
 936                        }
 937                        kfree(oqctl);
 938                        return rc;
 939                }
 940                /* If QIF_SPACE is not set, client should collect the
 941                 * space usage from OSSs by itself
 942                 */
 943                if (cmd == Q_GETQUOTA &&
 944                    !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) &&
 945                    !oqctl->qc_dqblk.dqb_curspace) {
 946                        struct obd_quotactl *oqctl_tmp;
 947
 948                        oqctl_tmp = kzalloc(sizeof(*oqctl_tmp), GFP_NOFS);
 949                        if (!oqctl_tmp) {
 950                                rc = -ENOMEM;
 951                                goto out;
 952                        }
 953
 954                        oqctl_tmp->qc_cmd = Q_GETOQUOTA;
 955                        oqctl_tmp->qc_id = oqctl->qc_id;
 956                        oqctl_tmp->qc_type = oqctl->qc_type;
 957
 958                        /* collect space usage from OSTs */
 959                        oqctl_tmp->qc_dqblk.dqb_curspace = 0;
 960                        rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp);
 961                        if (!rc || rc == -EREMOTEIO) {
 962                                oqctl->qc_dqblk.dqb_curspace =
 963                                        oqctl_tmp->qc_dqblk.dqb_curspace;
 964                                oqctl->qc_dqblk.dqb_valid |= QIF_SPACE;
 965                        }
 966
 967                        /* collect space & inode usage from MDTs */
 968                        oqctl_tmp->qc_dqblk.dqb_curspace = 0;
 969                        oqctl_tmp->qc_dqblk.dqb_curinodes = 0;
 970                        rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp);
 971                        if (!rc || rc == -EREMOTEIO) {
 972                                oqctl->qc_dqblk.dqb_curspace +=
 973                                        oqctl_tmp->qc_dqblk.dqb_curspace;
 974                                oqctl->qc_dqblk.dqb_curinodes =
 975                                        oqctl_tmp->qc_dqblk.dqb_curinodes;
 976                                oqctl->qc_dqblk.dqb_valid |= QIF_INODES;
 977                        } else {
 978                                oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE;
 979                        }
 980
 981                        kfree(oqctl_tmp);
 982                }
 983out:
 984                QCTL_COPY(qctl, oqctl);
 985                kfree(oqctl);
 986        }
 987
 988        return rc;
 989}
 990
 991/* This function tries to get a single name component,
 992 * to send to the server. No actual path traversal involved,
 993 * so we limit to NAME_MAX
 994 */
 995static char *ll_getname(const char __user *filename)
 996{
 997        int ret = 0, len;
 998        char *tmp;
 999
1000        tmp = kzalloc(NAME_MAX + 1, GFP_KERNEL);
1001        if (!tmp)
1002                return ERR_PTR(-ENOMEM);
1003
1004        len = strncpy_from_user(tmp, filename, NAME_MAX + 1);
1005        if (len < 0)
1006                ret = len;
1007        else if (len == 0)
1008                ret = -ENOENT;
1009        else if (len > NAME_MAX && tmp[NAME_MAX] != 0)
1010                ret = -ENAMETOOLONG;
1011
1012        if (ret) {
1013                kfree(tmp);
1014                tmp =  ERR_PTR(ret);
1015        }
1016        return tmp;
1017}
1018
/* Release a name buffer obtained from ll_getname(). */
#define ll_putname(name) kfree(name)
1020
1021static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1022{
1023        struct inode *inode = file_inode(file);
1024        struct ll_sb_info *sbi = ll_i2sbi(inode);
1025        struct obd_ioctl_data *data;
1026        int rc = 0;
1027
1028        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%#x\n",
1029               PFID(ll_inode2fid(inode)), inode, cmd);
1030
1031        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1032        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1033                return -ENOTTY;
1034
1035        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1036        switch (cmd) {
1037        case FSFILT_IOC_GETFLAGS:
1038        case FSFILT_IOC_SETFLAGS:
1039                return ll_iocontrol(inode, file, cmd, arg);
1040        case FSFILT_IOC_GETVERSION_OLD:
1041        case FSFILT_IOC_GETVERSION:
1042                return put_user(inode->i_generation, (int __user *)arg);
1043        /* We need to special case any other ioctls we want to handle,
1044         * to send them to the MDS/OST as appropriate and to properly
1045         * network encode the arg field.
1046        case FSFILT_IOC_SETVERSION_OLD:
1047        case FSFILT_IOC_SETVERSION:
1048        */
1049        case LL_IOC_GET_MDTIDX: {
1050                int mdtidx;
1051
1052                mdtidx = ll_get_mdt_idx(inode);
1053                if (mdtidx < 0)
1054                        return mdtidx;
1055
1056                if (put_user((int)mdtidx, (int __user *)arg))
1057                        return -EFAULT;
1058
1059                return 0;
1060        }
1061        case IOC_MDC_LOOKUP: {
1062                int namelen, len = 0;
1063                char *buf = NULL;
1064                char *filename;
1065
1066                rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg);
1067                if (rc)
1068                        return rc;
1069                data = (void *)buf;
1070
1071                filename = data->ioc_inlbuf1;
1072                namelen = strlen(filename);
1073
1074                if (namelen < 1) {
1075                        CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1076                        rc = -EINVAL;
1077                        goto out_free;
1078                }
1079
1080                rc = ll_get_fid_by_name(inode, filename, namelen, NULL);
1081                if (rc < 0) {
1082                        CERROR("%s: lookup %.*s failed: rc = %d\n",
1083                               ll_get_fsname(inode->i_sb, NULL, 0), namelen,
1084                               filename, rc);
1085                        goto out_free;
1086                }
1087out_free:
1088                obd_ioctl_freedata(buf, len);
1089                return rc;
1090        }
1091        case LL_IOC_LMV_SETSTRIPE: {
1092                struct lmv_user_md  *lum;
1093                char            *buf = NULL;
1094                char            *filename;
1095                int              namelen = 0;
1096                int              lumlen = 0;
1097                umode_t mode;
1098                int              len;
1099                int              rc;
1100
1101                rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg);
1102                if (rc)
1103                        return rc;
1104
1105                data = (void *)buf;
1106                if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
1107                    data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) {
1108                        rc = -EINVAL;
1109                        goto lmv_out_free;
1110                }
1111
1112                filename = data->ioc_inlbuf1;
1113                namelen = data->ioc_inllen1;
1114
1115                if (namelen < 1) {
1116                        CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n");
1117                        rc = -EINVAL;
1118                        goto lmv_out_free;
1119                }
1120                lum = (struct lmv_user_md *)data->ioc_inlbuf2;
1121                lumlen = data->ioc_inllen2;
1122
1123                if (lum->lum_magic != LMV_USER_MAGIC ||
1124                    lumlen != sizeof(*lum)) {
1125                        CERROR("%s: wrong lum magic %x or size %d: rc = %d\n",
1126                               filename, lum->lum_magic, lumlen, -EFAULT);
1127                        rc = -EINVAL;
1128                        goto lmv_out_free;
1129                }
1130
1131#if OBD_OCD_VERSION(2, 9, 50, 0) > LUSTRE_VERSION_CODE
1132                mode = data->ioc_type != 0 ? data->ioc_type : S_IRWXUGO;
1133#else
1134                mode = data->ioc_type;
1135#endif
1136                rc = ll_dir_setdirstripe(inode, lum, filename, mode);
1137lmv_out_free:
1138                obd_ioctl_freedata(buf, len);
1139                return rc;
1140        }
1141        case LL_IOC_LMV_SET_DEFAULT_STRIPE: {
1142                struct lmv_user_md __user *ulump;
1143                struct lmv_user_md lum;
1144                int rc;
1145
1146                ulump = (struct lmv_user_md __user *)arg;
1147                if (copy_from_user(&lum, ulump, sizeof(lum)))
1148                        return -EFAULT;
1149
1150                if (lum.lum_magic != LMV_USER_MAGIC)
1151                        return -EINVAL;
1152
1153                rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0);
1154
1155                return rc;
1156        }
1157        case LL_IOC_LOV_SETSTRIPE: {
1158                struct lov_user_md_v3 lumv3;
1159                struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1160                struct lov_user_md_v1 __user *lumv1p = (void __user *)arg;
1161                struct lov_user_md_v3 __user *lumv3p = (void __user *)arg;
1162
1163                int set_default = 0;
1164
1165                LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
1166                LASSERT(sizeof(lumv3.lmm_objects[0]) ==
1167                        sizeof(lumv3p->lmm_objects[0]));
1168                /* first try with v1 which is smaller than v3 */
1169                if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1)))
1170                        return -EFAULT;
1171
1172                if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1173                        if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3)))
1174                                return -EFAULT;
1175                }
1176
1177                if (is_root_inode(inode))
1178                        set_default = 1;
1179
1180                /* in v1 and v3 cases lumv1 points to data */
1181                rc = ll_dir_setstripe(inode, lumv1, set_default);
1182
1183                return rc;
1184        }
1185        case LL_IOC_LMV_GETSTRIPE: {
1186                struct lmv_user_md __user *ulmv;
1187                struct lmv_user_md lum;
1188                struct ptlrpc_request *request = NULL;
1189                struct lmv_user_md *tmp = NULL;
1190                union lmv_mds_md *lmm = NULL;
1191                u64 valid = 0;
1192                int stripe_count;
1193                int mdt_index;
1194                int lum_size;
1195                int lmmsize;
1196                int rc;
1197                int i;
1198
1199                ulmv = (struct lmv_user_md __user *)arg;
1200                if (copy_from_user(&lum, ulmv, sizeof(*ulmv)))
1201                        return -EFAULT;
1202
1203                /*
1204                 * lum_magic will indicate which stripe the ioctl will like
1205                 * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC
1206                 * is for default LMV stripe
1207                 */
1208                if (lum.lum_magic == LMV_MAGIC_V1)
1209                        valid |= OBD_MD_MEA;
1210                else if (lum.lum_magic == LMV_USER_MAGIC)
1211                        valid |= OBD_MD_DEFAULT_MEA;
1212                else
1213                        return -EINVAL;
1214
1215                rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request,
1216                                      valid);
1217                if (rc)
1218                        goto finish_req;
1219
1220                /* Get default LMV EA */
1221                if (lum.lum_magic == LMV_USER_MAGIC) {
1222                        if (rc)
1223                                goto finish_req;
1224
1225                        if (lmmsize > sizeof(*ulmv)) {
1226                                rc = -EINVAL;
1227                                goto finish_req;
1228                        }
1229
1230                        if (copy_to_user(ulmv, lmm, lmmsize))
1231                                rc = -EFAULT;
1232
1233                        goto finish_req;
1234                }
1235
1236                stripe_count = lmv_mds_md_stripe_count_get(lmm);
1237                lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1);
1238                tmp = kzalloc(lum_size, GFP_NOFS);
1239                if (!tmp) {
1240                        rc = -ENOMEM;
1241                        goto finish_req;
1242                }
1243
1244                mdt_index = ll_get_mdt_idx(inode);
1245                if (mdt_index < 0) {
1246                        rc = -ENOMEM;
1247                        goto out_tmp;
1248                }
1249                tmp->lum_magic = LMV_MAGIC_V1;
1250                tmp->lum_stripe_count = 0;
1251                tmp->lum_stripe_offset = mdt_index;
1252                for (i = 0; i < stripe_count; i++) {
1253                        struct lu_fid fid;
1254
1255                        fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]);
1256                        mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid);
1257                        if (mdt_index < 0) {
1258                                rc = mdt_index;
1259                                goto out_tmp;
1260                        }
1261                        tmp->lum_objects[i].lum_mds = mdt_index;
1262                        tmp->lum_objects[i].lum_fid = fid;
1263                        tmp->lum_stripe_count++;
1264                }
1265
1266                if (copy_to_user(ulmv, tmp, lum_size)) {
1267                        rc = -EFAULT;
1268                        goto out_tmp;
1269                }
1270out_tmp:
1271                kfree(tmp);
1272finish_req:
1273                ptlrpc_req_finished(request);
1274                return rc;
1275        }
1276
1277        case LL_IOC_LOV_SWAP_LAYOUTS:
1278                return -EPERM;
1279        case IOC_OBD_STATFS:
1280                return ll_obd_statfs(inode, (void __user *)arg);
1281        case LL_IOC_LOV_GETSTRIPE:
1282        case LL_IOC_MDC_GETINFO:
1283        case IOC_MDC_GETFILEINFO:
1284        case IOC_MDC_GETFILESTRIPE: {
1285                struct ptlrpc_request *request = NULL;
1286                struct lov_user_md __user *lump;
1287                struct lov_mds_md *lmm = NULL;
1288                struct mdt_body *body;
1289                char *filename = NULL;
1290                int lmmsize;
1291
1292                if (cmd == IOC_MDC_GETFILEINFO ||
1293                    cmd == IOC_MDC_GETFILESTRIPE) {
1294                        filename = ll_getname((const char __user *)arg);
1295                        if (IS_ERR(filename))
1296                                return PTR_ERR(filename);
1297
1298                        rc = ll_lov_getstripe_ea_info(inode, filename, &lmm,
1299                                                      &lmmsize, &request);
1300                } else {
1301                        rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize,
1302                                              &request, 0);
1303                }
1304
1305                if (request) {
1306                        body = req_capsule_server_get(&request->rq_pill,
1307                                                      &RMF_MDT_BODY);
1308                        LASSERT(body);
1309                } else {
1310                        goto out_req;
1311                }
1312
1313                if (rc < 0) {
1314                        if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO ||
1315                                               cmd == LL_IOC_MDC_GETINFO)) {
1316                                rc = 0;
1317                                goto skip_lmm;
1318                        } else {
1319                                goto out_req;
1320                        }
1321                }
1322
1323                if (cmd == IOC_MDC_GETFILESTRIPE ||
1324                    cmd == LL_IOC_LOV_GETSTRIPE) {
1325                        lump = (struct lov_user_md __user *)arg;
1326                } else {
1327                        struct lov_user_mds_data __user *lmdp;
1328
1329                        lmdp = (struct lov_user_mds_data __user *)arg;
1330                        lump = &lmdp->lmd_lmm;
1331                }
1332                if (copy_to_user(lump, lmm, lmmsize)) {
1333                        if (copy_to_user(lump, lmm, sizeof(*lump))) {
1334                                rc = -EFAULT;
1335                                goto out_req;
1336                        }
1337                        rc = -EOVERFLOW;
1338                }
1339skip_lmm:
1340                if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) {
1341                        struct lov_user_mds_data __user *lmdp;
1342                        lstat_t st = { 0 };
1343
1344                        st.st_dev     = inode->i_sb->s_dev;
1345                        st.st_mode    = body->mbo_mode;
1346                        st.st_nlink   = body->mbo_nlink;
1347                        st.st_uid     = body->mbo_uid;
1348                        st.st_gid     = body->mbo_gid;
1349                        st.st_rdev    = body->mbo_rdev;
1350                        st.st_size    = body->mbo_size;
1351                        st.st_blksize = PAGE_SIZE;
1352                        st.st_blocks  = body->mbo_blocks;
1353                        st.st_atime   = body->mbo_atime;
1354                        st.st_mtime   = body->mbo_mtime;
1355                        st.st_ctime   = body->mbo_ctime;
1356                        st.st_ino     = cl_fid_build_ino(&body->mbo_fid1,
1357                                                         sbi->ll_flags &
1358                                                         LL_SBI_32BIT_API);
1359
1360                        lmdp = (struct lov_user_mds_data __user *)arg;
1361                        if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) {
1362                                rc = -EFAULT;
1363                                goto out_req;
1364                        }
1365                }
1366
1367out_req:
1368                ptlrpc_req_finished(request);
1369                if (filename)
1370                        ll_putname(filename);
1371                return rc;
1372        }
1373        case IOC_LOV_GETINFO: {
1374                struct lov_user_mds_data __user *lumd;
1375                struct lov_stripe_md *lsm;
1376                struct lov_user_md __user *lum;
1377                struct lov_mds_md *lmm;
1378                int lmmsize;
1379                lstat_t st;
1380
1381                lumd = (struct lov_user_mds_data __user *)arg;
1382                lum = &lumd->lmd_lmm;
1383
1384                rc = ll_get_max_mdsize(sbi, &lmmsize);
1385                if (rc)
1386                        return rc;
1387
1388                lmm = libcfs_kvzalloc(lmmsize, GFP_NOFS);
1389                if (!lmm)
1390                        return -ENOMEM;
1391                if (copy_from_user(lmm, lum, lmmsize)) {
1392                        rc = -EFAULT;
1393                        goto free_lmm;
1394                }
1395
1396                switch (lmm->lmm_magic) {
1397                case LOV_USER_MAGIC_V1:
1398                        if (cpu_to_le32(LOV_USER_MAGIC_V1) == LOV_USER_MAGIC_V1)
1399                                break;
1400                        /* swab objects first so that stripes num will be sane */
1401                        lustre_swab_lov_user_md_objects(
1402                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1403                                ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1404                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1405                        break;
1406                case LOV_USER_MAGIC_V3:
1407                        if (cpu_to_le32(LOV_USER_MAGIC_V3) == LOV_USER_MAGIC_V3)
1408                                break;
1409                        /* swab objects first so that stripes num will be sane */
1410                        lustre_swab_lov_user_md_objects(
1411                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1412                                ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1413                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1414                        break;
1415                default:
1416                        rc = -EINVAL;
1417                        goto free_lmm;
1418                }
1419
1420                rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1421                if (rc < 0) {
1422                        rc = -ENOMEM;
1423                        goto free_lmm;
1424                }
1425
1426                /* Perform glimpse_size operation. */
1427                memset(&st, 0, sizeof(st));
1428
1429                rc = ll_glimpse_ioctl(sbi, lsm, &st);
1430                if (rc)
1431                        goto free_lsm;
1432
1433                if (copy_to_user(&lumd->lmd_st, &st, sizeof(st))) {
1434                        rc = -EFAULT;
1435                        goto free_lsm;
1436                }
1437
1438free_lsm:
1439                obd_free_memmd(sbi->ll_dt_exp, &lsm);
1440free_lmm:
1441                kvfree(lmm);
1442                return rc;
1443        }
1444        case OBD_IOC_QUOTACHECK: {
1445                struct obd_quotactl *oqctl;
1446                int error = 0;
1447
1448                if (!capable(CFS_CAP_SYS_ADMIN))
1449                        return -EPERM;
1450
1451                oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS);
1452                if (!oqctl)
1453                        return -ENOMEM;
1454                oqctl->qc_type = arg;
1455                rc = obd_quotacheck(sbi->ll_md_exp, oqctl);
1456                if (rc < 0) {
1457                        CDEBUG(D_INFO, "md_quotacheck failed: rc %d\n", rc);
1458                        error = rc;
1459                }
1460
1461                rc = obd_quotacheck(sbi->ll_dt_exp, oqctl);
1462                if (rc < 0)
1463                        CDEBUG(D_INFO, "obd_quotacheck failed: rc %d\n", rc);
1464
1465                kfree(oqctl);
1466                return error ?: rc;
1467        }
1468        case OBD_IOC_POLL_QUOTACHECK: {
1469                struct if_quotacheck *check;
1470
1471                if (!capable(CFS_CAP_SYS_ADMIN))
1472                        return -EPERM;
1473
1474                check = kzalloc(sizeof(*check), GFP_NOFS);
1475                if (!check)
1476                        return -ENOMEM;
1477
1478                rc = obd_iocontrol(cmd, sbi->ll_md_exp, 0, (void *)check,
1479                                   NULL);
1480                if (rc) {
1481                        CDEBUG(D_QUOTA, "mdc ioctl %d failed: %d\n", cmd, rc);
1482                        if (copy_to_user((void __user *)arg, check,
1483                                         sizeof(*check)))
1484                                CDEBUG(D_QUOTA, "copy_to_user failed\n");
1485                        goto out_poll;
1486                }
1487
1488                rc = obd_iocontrol(cmd, sbi->ll_dt_exp, 0, (void *)check,
1489                                   NULL);
1490                if (rc) {
1491                        CDEBUG(D_QUOTA, "osc ioctl %d failed: %d\n", cmd, rc);
1492                        if (copy_to_user((void __user *)arg, check,
1493                                         sizeof(*check)))
1494                                CDEBUG(D_QUOTA, "copy_to_user failed\n");
1495                        goto out_poll;
1496                }
1497out_poll:
1498                kfree(check);
1499                return rc;
1500        }
1501        case OBD_IOC_QUOTACTL: {
1502                struct if_quotactl *qctl;
1503
1504                qctl = kzalloc(sizeof(*qctl), GFP_NOFS);
1505                if (!qctl)
1506                        return -ENOMEM;
1507
1508                if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) {
1509                        rc = -EFAULT;
1510                        goto out_quotactl;
1511                }
1512
1513                rc = quotactl_ioctl(sbi, qctl);
1514
1515                if (rc == 0 && copy_to_user((void __user *)arg, qctl,
1516                                            sizeof(*qctl)))
1517                        rc = -EFAULT;
1518
1519out_quotactl:
1520                kfree(qctl);
1521                return rc;
1522        }
1523        case OBD_IOC_GETDTNAME:
1524        case OBD_IOC_GETMDNAME:
1525                return ll_get_obd_name(inode, cmd, arg);
1526        case LL_IOC_FLUSHCTX:
1527                return ll_flush_ctx(inode);
1528        case LL_IOC_GETOBDCOUNT: {
1529                int count, vallen;
1530                struct obd_export *exp;
1531
1532                if (copy_from_user(&count, (int __user *)arg, sizeof(int)))
1533                        return -EFAULT;
1534
1535                /* get ost count when count is zero, get mdt count otherwise */
1536                exp = count ? sbi->ll_md_exp : sbi->ll_dt_exp;
1537                vallen = sizeof(count);
1538                rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT),
1539                                  KEY_TGT_COUNT, &vallen, &count, NULL);
1540                if (rc) {
1541                        CERROR("get target count failed: %d\n", rc);
1542                        return rc;
1543                }
1544
1545                if (copy_to_user((int __user *)arg, &count, sizeof(int)))
1546                        return -EFAULT;
1547
1548                return 0;
1549        }
1550        case LL_IOC_PATH2FID:
1551                if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
1552                                 sizeof(struct lu_fid)))
1553                        return -EFAULT;
1554                return 0;
1555        case LL_IOC_GET_CONNECT_FLAGS: {
1556                return obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL,
1557                                     (void __user *)arg);
1558        }
1559        case OBD_IOC_CHANGELOG_SEND:
1560        case OBD_IOC_CHANGELOG_CLEAR:
1561                if (!capable(CFS_CAP_SYS_ADMIN))
1562                        return -EPERM;
1563
1564                rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg,
1565                                    sizeof(struct ioc_changelog));
1566                return rc;
1567        case OBD_IOC_FID2PATH:
1568                return ll_fid2path(inode, (void __user *)arg);
1569        case LL_IOC_GETPARENT:
1570                return ll_getparent(file, (void __user *)arg);
1571        case LL_IOC_FID2MDTIDX: {
1572                struct obd_export *exp = ll_i2mdexp(inode);
1573                struct lu_fid fid;
1574                __u32 index;
1575
1576                if (copy_from_user(&fid, (const struct lu_fid __user *)arg,
1577                                   sizeof(fid)))
1578                        return -EFAULT;
1579
1580                /* Call mdc_iocontrol */
1581                rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid,
1582                                   &index);
1583                if (rc)
1584                        return rc;
1585
1586                return index;
1587        }
1588        case LL_IOC_HSM_REQUEST: {
1589                struct hsm_user_request *hur;
1590                ssize_t                  totalsize;
1591
1592                hur = memdup_user((void __user *)arg, sizeof(*hur));
1593                if (IS_ERR(hur))
1594                        return PTR_ERR(hur);
1595
1596                /* Compute the whole struct size */
1597                totalsize = hur_len(hur);
1598                kfree(hur);
1599                if (totalsize < 0)
1600                        return -E2BIG;
1601
1602                /* Final size will be more than double totalsize */
1603                if (totalsize >= MDS_MAXREQSIZE / 3)
1604                        return -E2BIG;
1605
1606                hur = libcfs_kvzalloc(totalsize, GFP_NOFS);
1607                if (!hur)
1608                        return -ENOMEM;
1609
1610                /* Copy the whole struct */
1611                if (copy_from_user(hur, (void __user *)arg, totalsize)) {
1612                        kvfree(hur);
1613                        return -EFAULT;
1614                }
1615
1616                if (hur->hur_request.hr_action == HUA_RELEASE) {
1617                        const struct lu_fid *fid;
1618                        struct inode *f;
1619                        int i;
1620
1621                        for (i = 0; i < hur->hur_request.hr_itemcount; i++) {
1622                                fid = &hur->hur_user_item[i].hui_fid;
1623                                f = search_inode_for_lustre(inode->i_sb, fid);
1624                                if (IS_ERR(f)) {
1625                                        rc = PTR_ERR(f);
1626                                        break;
1627                                }
1628
1629                                rc = ll_hsm_release(f);
1630                                iput(f);
1631                                if (rc != 0)
1632                                        break;
1633                        }
1634                } else {
1635                        rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize,
1636                                           hur, NULL);
1637                }
1638
1639                kvfree(hur);
1640
1641                return rc;
1642        }
1643        case LL_IOC_HSM_PROGRESS: {
1644                struct hsm_progress_kernel      hpk;
1645                struct hsm_progress             hp;
1646
1647                if (copy_from_user(&hp, (void __user *)arg, sizeof(hp)))
1648                        return -EFAULT;
1649
1650                hpk.hpk_fid = hp.hp_fid;
1651                hpk.hpk_cookie = hp.hp_cookie;
1652                hpk.hpk_extent = hp.hp_extent;
1653                hpk.hpk_flags = hp.hp_flags;
1654                hpk.hpk_errval = hp.hp_errval;
1655                hpk.hpk_data_version = 0;
1656
1657                /* File may not exist in Lustre; all progress
1658                 * reported to Lustre root
1659                 */
1660                rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk,
1661                                   NULL);
1662                return rc;
1663        }
1664        case LL_IOC_HSM_CT_START:
1665                if (!capable(CFS_CAP_SYS_ADMIN))
1666                        return -EPERM;
1667
1668                rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg,
1669                                    sizeof(struct lustre_kernelcomm));
1670                return rc;
1671
1672        case LL_IOC_HSM_COPY_START: {
1673                struct hsm_copy *copy;
1674                int              rc;
1675
1676                copy = memdup_user((char __user *)arg, sizeof(*copy));
1677                if (IS_ERR(copy))
1678                        return PTR_ERR(copy);
1679
1680                rc = ll_ioc_copy_start(inode->i_sb, copy);
1681                if (copy_to_user((char __user *)arg, copy, sizeof(*copy)))
1682                        rc = -EFAULT;
1683
1684                kfree(copy);
1685                return rc;
1686        }
1687        case LL_IOC_HSM_COPY_END: {
1688                struct hsm_copy *copy;
1689                int              rc;
1690
1691                copy = memdup_user((char __user *)arg, sizeof(*copy));
1692                if (IS_ERR(copy))
1693                        return PTR_ERR(copy);
1694
1695                rc = ll_ioc_copy_end(inode->i_sb, copy);
1696                if (copy_to_user((char __user *)arg, copy, sizeof(*copy)))
1697                        rc = -EFAULT;
1698
1699                kfree(copy);
1700                return rc;
1701        }
1702        case LL_IOC_MIGRATE: {
1703                char *buf = NULL;
1704                const char *filename;
1705                int namelen = 0;
1706                int len;
1707                int rc;
1708                int mdtidx;
1709
1710                rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg);
1711                if (rc < 0)
1712                        return rc;
1713
1714                data = (struct obd_ioctl_data *)buf;
1715                if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 ||
1716                    !data->ioc_inllen1 || !data->ioc_inllen2) {
1717                        rc = -EINVAL;
1718                        goto migrate_free;
1719                }
1720
1721                filename = data->ioc_inlbuf1;
1722                namelen = data->ioc_inllen1;
1723                if (namelen < 1 || namelen != strlen(filename) + 1) {
1724                        rc = -EINVAL;
1725                        goto migrate_free;
1726                }
1727
1728                if (data->ioc_inllen2 != sizeof(mdtidx)) {
1729                        rc = -EINVAL;
1730                        goto migrate_free;
1731                }
1732                mdtidx = *(int *)data->ioc_inlbuf2;
1733
1734                rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1);
1735migrate_free:
1736                obd_ioctl_freedata(buf, len);
1737
1738                return rc;
1739        }
1740
1741        default:
1742                return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL,
1743                                     (void __user *)arg);
1744        }
1745}
1746
1747static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin)
1748{
1749        struct inode *inode = file->f_mapping->host;
1750        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1751        struct ll_sb_info *sbi = ll_i2sbi(inode);
1752        int api32 = ll_need_32bit_api(sbi);
1753        loff_t ret = -EINVAL;
1754
1755        switch (origin) {
1756        case SEEK_SET:
1757                break;
1758        case SEEK_CUR:
1759                offset += file->f_pos;
1760                break;
1761        case SEEK_END:
1762                if (offset > 0)
1763                        goto out;
1764                if (api32)
1765                        offset += LL_DIR_END_OFF_32BIT;
1766                else
1767                        offset += LL_DIR_END_OFF;
1768                break;
1769        default:
1770                goto out;
1771        }
1772
1773        if (offset >= 0 &&
1774            ((api32 && offset <= LL_DIR_END_OFF_32BIT) ||
1775             (!api32 && offset <= LL_DIR_END_OFF))) {
1776                if (offset != file->f_pos) {
1777                        if ((api32 && offset == LL_DIR_END_OFF_32BIT) ||
1778                            (!api32 && offset == LL_DIR_END_OFF))
1779                                fd->lfd_pos = MDS_DIR_END_OFF;
1780                        else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH)
1781                                fd->lfd_pos = offset << 32;
1782                        else
1783                                fd->lfd_pos = offset;
1784                        file->f_pos = offset;
1785                        file->f_version = 0;
1786                }
1787                ret = offset;
1788        }
1789        goto out;
1790
1791out:
1792        return ret;
1793}
1794
/*
 * open handler for directory files: hands the request to ll_file_open()
 * unchanged and returns its result.
 */
static int ll_dir_open(struct inode *inode, struct file *file)
{
        int rc = ll_file_open(inode, file);

        return rc;
}
1799
/*
 * release handler for directory files: hands the request to
 * ll_file_release() unchanged and returns its result.
 */
static int ll_dir_release(struct inode *inode, struct file *file)
{
        int rc = ll_file_release(inode, file);

        return rc;
}
1804
1805const struct file_operations ll_dir_operations = {
1806        .llseek   = ll_dir_seek,
1807        .open     = ll_dir_open,
1808        .release  = ll_dir_release,
1809        .read     = generic_read_dir,
1810        .iterate_shared  = ll_readdir,
1811        .unlocked_ioctl   = ll_dir_ioctl,
1812        .fsync    = ll_fsync,
1813};
1814