linux/drivers/staging/lustre/lustre/include/cl_object.h
   1/*
   2 * GPL HEADER START
   3 *
   4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5 *
   6 * This program is free software; you can redistribute it and/or modify
   7 * it under the terms of the GNU General Public License version 2 only,
   8 * as published by the Free Software Foundation.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 * General Public License version 2 for more details (a copy is included
  14 * in the LICENSE file that accompanied this code).
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * version 2 along with this program; If not, see
  18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19 *
  20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21 * CA 95054 USA or visit www.sun.com if you need additional information or
  22 * have any questions.
  23 *
  24 * GPL HEADER END
  25 */
  26/*
  27 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  28 * Use is subject to license terms.
  29 *
  30 * Copyright (c) 2011, 2012, Intel Corporation.
  31 */
  32/*
  33 * This file is part of Lustre, http://www.lustre.org/
  34 * Lustre is a trademark of Sun Microsystems, Inc.
  35 */
  36#ifndef _LUSTRE_CL_OBJECT_H
  37#define _LUSTRE_CL_OBJECT_H
  38
  39/** \defgroup clio clio
  40 *
  41 * Client objects implement io operations and cache pages.
  42 *
  43 * Examples: lov and osc are implementations of cl interface.
  44 *
  45 * Big Theory Statement.
  46 *
  47 * Layered objects.
  48 *
  49 * Client implementation is based on the following data-types:
  50 *
  51 *   - cl_object
  52 *
  53 *   - cl_page
  54 *
  55 *   - cl_lock     represents an extent lock on an object.
  56 *
   57 *   - cl_io       represents high-level i/o activity such as a whole read/write
   58 *               system call, or the write-out of pages from under a lock being
   59 *               canceled. cl_io has sub-ios that can be stopped and resumed
   60 *               independently, thus achieving a high degree of transfer
   61 *               parallelism. A single cl_io can be advanced forward by
   62 *               multiple threads (although in the most usual case of a
   63 *               read/write system call it is associated with the single user
   64 *               thread that issued the system call).
  65 *
  66 *   - cl_req      represents a collection of pages for a transfer. cl_req is
  67 *               constructed by req-forming engine that tries to saturate
  68 *               transport with large and continuous transfers.
  69 *
  70 * Terminology
  71 *
   72 *     - to avoid confusion, a high-level I/O operation like a read or write
   73 *     system call is referred to as "an io", whereas a low-level I/O operation,
   74 *     like an RPC, is referred to as "a transfer"
  75 *
  76 *     - "generic code" means generic (not file system specific) code in the
  77 *     hosting environment. "cl-code" means code (mostly in cl_*.c files) that
  78 *     is not layer specific.
  79 *
  80 * Locking.
  81 *
  82 *  - i_mutex
  83 *      - PG_locked
  84 *        - cl_object_header::coh_page_guard
  85 *        - cl_object_header::coh_lock_guard
  86 *        - lu_site::ls_guard
  87 *
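 *  When several of these locks are needed simultaneously, they nest in the
 *  top-to-bottom order shown above. An illustrative sketch ("inode", "vmpage"
 *  and "hdr" are hypothetical locals, not an actual code path in this file):
 *
 *      mutex_lock(&inode->i_mutex);
 *      lock_page(vmpage);
 *      spin_lock(&hdr->coh_page_guard);
 *      ...
 *      spin_unlock(&hdr->coh_page_guard);
 *      unlock_page(vmpage);
 *      mutex_unlock(&inode->i_mutex);
 *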
  88 * See the top comment in cl_object.c for the description of overall locking and
  89 * reference-counting design.
  90 *
  91 * See comments below for the description of i/o, page, and dlm-locking
  92 * design.
  93 *
  94 * @{
  95 */
  96
  97/*
  98 * super-class definitions.
  99 */
 100#include <lu_object.h>
 101#include <lvfs.h>
  102#include <linux/mutex.h>
  103#include <linux/radix-tree.h>
 104
 105struct inode;
 106
 107struct cl_device;
 108struct cl_device_operations;
 109
 110struct cl_object;
 111struct cl_object_page_operations;
 112struct cl_object_lock_operations;
 113
 114struct cl_page;
 115struct cl_page_slice;
 116struct cl_lock;
 117struct cl_lock_slice;
 118
 119struct cl_lock_operations;
 120struct cl_page_operations;
 121
 122struct cl_io;
 123struct cl_io_slice;
 124
 125struct cl_req;
 126struct cl_req_slice;
 127
 128/**
 129 * Operations for each data device in the client stack.
 130 *
 131 * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
 132 */
 133struct cl_device_operations {
 134        /**
 135         * Initialize cl_req. This method is called top-to-bottom on all
  136         * devices in the stack to give them a chance to allocate layer-private
 137         * data, and to attach them to the cl_req by calling
 138         * cl_req_slice_add().
 139         *
 140         * \see osc_req_init(), lov_req_init(), lovsub_req_init()
 141         * \see ccc_req_init()
 142         */
 143        int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
 144                            struct cl_req *req);
 145};
 146
 147/**
 148 * Device in the client stack.
 149 *
 150 * \see ccc_device, lov_device, lovsub_device, osc_device
 151 */
 152struct cl_device {
 153        /** Super-class. */
 154        struct lu_device                   cd_lu_dev;
 155        /** Per-layer operation vector. */
 156        const struct cl_device_operations *cd_ops;
 157};
 158
 159/** \addtogroup cl_object cl_object
 160 * @{ */
 161/**
 162 * "Data attributes" of cl_object. Data attributes can be updated
  163 * independently for a sub-object, and the top-object's attributes are
  164 * calculated from those of its sub-objects.
 165 */
 166struct cl_attr {
 167        /** Object size, in bytes */
 168        loff_t cat_size;
 169        /**
 170         * Known minimal size, in bytes.
 171         *
 172         * This is only valid when at least one DLM lock is held.
 173         */
 174        loff_t cat_kms;
 175        /** Modification time. Measured in seconds since epoch. */
 176        time_t cat_mtime;
 177        /** Access time. Measured in seconds since epoch. */
 178        time_t cat_atime;
 179        /** Change time. Measured in seconds since epoch. */
 180        time_t cat_ctime;
 181        /**
 182         * Blocks allocated to this cl_object on the server file system.
 183         *
 184         * \todo XXX An interface for block size is needed.
 185         */
 186        __u64  cat_blocks;
 187        /**
 188         * User identifier for quota purposes.
 189         */
 190        uid_t  cat_uid;
 191        /**
 192         * Group identifier for quota purposes.
 193         */
 194        gid_t  cat_gid;
 195};
 196
 197/**
 198 * Fields in cl_attr that are being set.
 199 */
 200enum cl_attr_valid {
 201        CAT_SIZE   = 1 << 0,
 202        CAT_KMS    = 1 << 1,
 203        CAT_MTIME  = 1 << 3,
 204        CAT_ATIME  = 1 << 4,
 205        CAT_CTIME  = 1 << 5,
 206        CAT_BLOCKS = 1 << 6,
 207        CAT_UID    = 1 << 7,
 208        CAT_GID    = 1 << 8
 209};
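
/*
 * Example (an illustrative sketch, not a prescribed calling convention):
 * a caller updating the size and times of an object fills a cl_attr and
 * passes a mask built from enum cl_attr_valid to the attr_set path, roughly:
 *
 *	struct cl_attr attr = { 0 };
 *
 *	attr.cat_size  = new_size;
 *	attr.cat_mtime = attr.cat_ctime = new_time;
 *	cl_object_attr_lock(obj);
 *	rc = cl_object_attr_set(env, obj, &attr,
 *				CAT_SIZE | CAT_MTIME | CAT_CTIME);
 *	cl_object_attr_unlock(obj);
 *
 * cl_object_attr_{lock,unlock}() and cl_object_attr_set() are the generic
 * helpers declared further down in this header; "new_size", "new_time" and
 * "rc" are hypothetical locals.
 */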
 210
 211/**
 212 * Sub-class of lu_object with methods common for objects on the client
 213 * stacks.
 214 *
 215 * cl_object: represents a regular file system object, both a file and a
 216 *    stripe. cl_object is based on lu_object: it is identified by a fid,
  217 *    layered, cached, hashed, and lrued. An important distinction from the server
  218 *    side, where md_object and dt_object are used, is that cl_object "fans out"
  219 *    at the lov/sns level: depending on the file layout, a single file is
 220 *    represented as a set of "sub-objects" (stripes). At the implementation
 221 *    level, struct lov_object contains an array of cl_objects. Each sub-object
 222 *    is a full-fledged cl_object, having its fid, living in the lru and hash
 223 *    table.
 224 *
 225 *    This leads to the next important difference with the server side: on the
  226 *    client, it's quite usual to have objects with different sequences of
  227 *    layers. For example, a typical top-object is composed of the following
 228 *    layers:
 229 *
 230 *      - vvp
 231 *      - lov
 232 *
 233 *    whereas its sub-objects are composed of
 234 *
 235 *      - lovsub
 236 *      - osc
 237 *
 238 *    layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
 239 *    track of the object-subobject relationship.
 240 *
  241 *    Sub-objects are not cached independently: when the top-object is about
  242 *    to be discarded from memory, all its sub-objects are torn down and
  243 *    destroyed too.
 244 *
 245 * \see ccc_object, lov_object, lovsub_object, osc_object
 246 */
 247struct cl_object {
 248        /** super class */
 249        struct lu_object                   co_lu;
 250        /** per-object-layer operations */
 251        const struct cl_object_operations *co_ops;
 252        /** offset of page slice in cl_page buffer */
 253        int                                co_slice_off;
 254};
 255
 256/**
 257 * Description of the client object configuration. This is used for the
  258 * creation of a new client object that is identified by more state than
  259 * just a fid.
 260 */
 261struct cl_object_conf {
 262        /** Super-class. */
 263        struct lu_object_conf     coc_lu;
 264        union {
 265                /**
 266                 * Object layout. This is consumed by lov.
 267                 */
 268                struct lustre_md *coc_md;
 269                /**
 270                 * Description of particular stripe location in the
 271                 * cluster. This is consumed by osc.
 272                 */
 273                struct lov_oinfo *coc_oinfo;
 274        } u;
 275        /**
 276         * VFS inode. This is consumed by vvp.
 277         */
 278        struct inode         *coc_inode;
 279        /**
 280         * Layout lock handle.
 281         */
 282        struct ldlm_lock         *coc_lock;
 283        /**
 284         * Operation to handle layout, OBJECT_CONF_XYZ.
 285         */
 286        int                       coc_opc;
 287};
 288
 289enum {
 290        /** configure layout, set up a new stripe, must be called while
 291         * holding layout lock. */
 292        OBJECT_CONF_SET = 0,
 293        /** invalidate the current stripe configuration due to losing
 294         * layout lock. */
 295        OBJECT_CONF_INVALIDATE = 1,
 296        /** wait for old layout to go away so that new layout can be
 297         * set up. */
 298        OBJECT_CONF_WAIT = 2
 299};
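
/*
 * Illustrative sketch (an assumption, simplified): installing a new layout
 * under the layout lock amounts to filling a cl_object_conf and pushing it
 * down the stack, roughly:
 *
 *	struct cl_object_conf conf = { .coc_opc = OBJECT_CONF_SET };
 *
 *	conf.u.coc_md  = md;		(layout obtained from the MDS)
 *	conf.coc_inode = inode;
 *	rc = cl_conf_set(env, obj, &conf);
 *
 * cl_conf_set() is the generic entry point for this, declared further down
 * in this header; "md", "inode" and "rc" are hypothetical locals.
 */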
 300
 301/**
 302 * Operations implemented for each cl object layer.
 303 *
 304 * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
 305 */
 306struct cl_object_operations {
 307        /**
 308         * Initialize page slice for this layer. Called top-to-bottom through
  309         * every object layer when a new cl_page is instantiated. A layer
  310         * keeping private per-page data, or requiring its own page operations
  311         * vector, should allocate this data here and attach it to the page
  312         * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
 313         * sense). Optional.
 314         *
 315         * \retval NULL success.
 316         *
 317         * \retval ERR_PTR(errno) failure code.
 318         *
 319         * \retval valid-pointer pointer to already existing referenced page
 320         *       to be used instead of newly created.
 321         */
 322        int  (*coo_page_init)(const struct lu_env *env, struct cl_object *obj,
 323                                struct cl_page *page, struct page *vmpage);
 324        /**
 325         * Initialize lock slice for this layer. Called top-to-bottom through
  326         * every object layer when a new cl_lock is instantiated. A layer
  327         * keeping private per-lock data, or requiring its own lock operations
  328         * vector, should allocate this data here and attach it to the lock
 329         * by calling cl_lock_slice_add(). Mandatory.
 330         */
 331        int  (*coo_lock_init)(const struct lu_env *env,
 332                              struct cl_object *obj, struct cl_lock *lock,
 333                              const struct cl_io *io);
 334        /**
 335         * Initialize io state for a given layer.
 336         *
  337         * Called top-to-bottom once per io existence to initialize io
  338         * state. If a layer wants to keep some state for this type of io, it
  339         * has to embed struct cl_io_slice in lu_env::le_ses, and register the
  340         * slice with cl_io_slice_add(). It is guaranteed that all threads
 341         * participating in this io share the same session.
 342         */
 343        int  (*coo_io_init)(const struct lu_env *env,
 344                            struct cl_object *obj, struct cl_io *io);
 345        /**
 346         * Fill portion of \a attr that this layer controls. This method is
 347         * called top-to-bottom through all object layers.
 348         *
 349         * \pre cl_object_header::coh_attr_guard of the top-object is locked.
 350         *
 351         * \return   0: to continue
 352         * \return +ve: to stop iterating through layers (but 0 is returned
 353         * from enclosing cl_object_attr_get())
 354         * \return -ve: to signal error
 355         */
 356        int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
 357                            struct cl_attr *attr);
 358        /**
 359         * Update attributes.
 360         *
  361         * \a valid is a bitmask composed from enum #cl_attr_valid,
  362         * indicating which attributes are to be set.
 363         *
 364         * \pre cl_object_header::coh_attr_guard of the top-object is locked.
 365         *
 366         * \return the same convention as for
 367         * cl_object_operations::coo_attr_get() is used.
 368         */
 369        int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
 370                            const struct cl_attr *attr, unsigned valid);
 371        /**
 372         * Update object configuration. Called top-to-bottom to modify object
 373         * configuration.
 374         *
 375         * XXX error conditions and handling.
 376         */
 377        int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
 378                            const struct cl_object_conf *conf);
 379        /**
 380         * Glimpse ast. Executed when glimpse ast arrives for a lock on this
 381         * object. Layers are supposed to fill parts of \a lvb that will be
 382         * shipped to the glimpse originator as a glimpse result.
 383         *
 384         * \see ccc_object_glimpse(), lovsub_object_glimpse(),
 385         * \see osc_object_glimpse()
 386         */
 387        int (*coo_glimpse)(const struct lu_env *env,
 388                           const struct cl_object *obj, struct ost_lvb *lvb);
 389};
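
/*
 * A layer typically provides a statically initialized operations vector for
 * its objects, e.g. (an illustrative sketch; the "my_*" names are
 * hypothetical):
 *
 *	static const struct cl_object_operations my_layer_ops = {
 *		.coo_page_init = my_object_page_init,
 *		.coo_lock_init = my_object_lock_init,
 *		.coo_io_init   = my_object_io_init,
 *		.coo_attr_get  = my_object_attr_get,
 *		.coo_attr_set  = my_object_attr_set
 *	};
 *
 * \see vvp_ops, lov_ops, lovsub_ops, osc_ops for the real instances.
 */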
 390
 391/**
 392 * Extended header for client object.
 393 */
 394struct cl_object_header {
 395        /** Standard lu_object_header. cl_object::co_lu::lo_header points
 396         * here. */
 397        struct lu_object_header  coh_lu;
 398        /** \name locks
 399         * \todo XXX move locks below to the separate cache-lines, they are
 400         * mostly useless otherwise.
 401         */
 402        /** @{ */
 403        /** Lock protecting page tree. */
 404        spinlock_t               coh_page_guard;
 405        /** Lock protecting lock list. */
 406        spinlock_t               coh_lock_guard;
 407        /** @} locks */
 408        /** Radix tree of cl_page's, cached for this object. */
 409        struct radix_tree_root   coh_tree;
 410        /** # of pages in radix tree. */
 411        unsigned long       coh_pages;
 412        /** List of cl_lock's granted for this object. */
 413        struct list_head               coh_locks;
 414
 415        /**
 416         * Parent object. It is assumed that an object has a well-defined
 417         * parent, but not a well-defined child (there may be multiple
 418         * sub-objects, for the same top-object). cl_object_header::coh_parent
 419         * field allows certain code to be written generically, without
 420         * limiting possible cl_object layouts unduly.
 421         */
 422        struct cl_object_header *coh_parent;
 423        /**
  424         * Protects consistency between the cl_attr of the parent object and
  425         * the attributes of the sub-objects from which the former is
  426         * calculated ("merged").
 427         *
 428         * \todo XXX this can be read/write lock if needed.
 429         */
 430        spinlock_t               coh_attr_guard;
 431        /**
 432         * Size of cl_page + page slices
 433         */
 434        unsigned short           coh_page_bufsize;
 435        /**
 436         * Number of objects above this one: 0 for a top-object, 1 for its
 437         * sub-object, etc.
 438         */
 439        unsigned char            coh_nesting;
 440};
 441
 442/**
 443 * Helper macro: iterate over all layers of the object \a obj, assigning every
 444 * layer top-to-bottom to \a slice.
 445 */
 446#define cl_object_for_each(slice, obj)                                \
 447        list_for_each_entry((slice),                                \
 448                                &(obj)->co_lu.lo_header->loh_layers,    \
 449                                co_lu.lo_linkage)
 450/**
 451 * Helper macro: iterate over all layers of the object \a obj, assigning every
 452 * layer bottom-to-top to \a slice.
 453 */
 454#define cl_object_for_each_reverse(slice, obj)                         \
 455        list_for_each_entry_reverse((slice),                         \
 456                                        &(obj)->co_lu.lo_header->loh_layers, \
 457                                        co_lu.lo_linkage)
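/*
 * Usage sketch (illustrative; "slice", "result" and "lvb" are hypothetical
 * locals): generic code dispatches a per-layer method by walking the layers,
 * roughly:
 *
 *	struct cl_object *slice;
 *
 *	cl_object_for_each(slice, obj)
 *		if (slice->co_ops->coo_glimpse != NULL)
 *			result = slice->co_ops->coo_glimpse(env, slice, lvb);
 */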
 458/** @} cl_object */
 459
 460#ifndef pgoff_t
 461#define pgoff_t unsigned long
 462#endif
 463
 464#define CL_PAGE_EOF ((pgoff_t)~0ull)
 465
 466/** \addtogroup cl_page cl_page
 467 * @{ */
 468
 469/** \struct cl_page
 470 * Layered client page.
 471 *
 472 * cl_page: represents a portion of a file, cached in the memory. All pages
 473 *    of the given file are of the same size, and are kept in the radix tree
 474 *    hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
 475 *    of the top-level file object are first class cl_objects, they have their
  476 *    own radix trees of pages and hence a page is implemented as a sequence of
  477 *    struct cl_page's, linked into a double-linked list through
 478 *    cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
 479 *    corresponding radix tree at the corresponding logical offset.
 480 *
  481 * cl_page is associated with a VM page of the hosting environment (struct
  482 *    page in the Linux kernel, for example). It is assumed that this
  483 *    association is implemented by one of the cl_page layers (the top layer
  484 *    in the current design) that
 485 *
 486 *      - intercepts per-VM-page call-backs made by the environment (e.g.,
 487 *        memory pressure),
 488 *
 489 *      - translates state (page flag bits) and locking between lustre and
 490 *        environment.
 491 *
 492 *    The association between cl_page and struct page is immutable and
 493 *    established when cl_page is created.
 494 *
 495 * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
  496 *    this io exclusive access to this page w.r.t. other io attempts and
 497 *    various events changing page state (such as transfer completion, or
 498 *    eviction of the page from the memory). Note, that in general cl_io
 499 *    cannot be identified with a particular thread, and page ownership is not
  500 *    exactly equal to the current thread holding a lock on the page. The
  501 *    layer implementing the association between cl_page and struct page has
  502 *    to implement ownership on top of available synchronization mechanisms.
 503 *
  504 *    While the lustre client maintains the notion of page ownership by io,
 505 *    hosting MM/VM usually has its own page concurrency control
 506 *    mechanisms. For example, in Linux, page access is synchronized by the
 507 *    per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
 508 *    takes care to acquire and release such locks as necessary around the
 509 *    calls to the file system methods (->readpage(), ->prepare_write(),
  510 *    ->commit_write(), etc.). This leads to the situation where there are two
 511 *    different ways to own a page in the client:
 512 *
  513 *      - client code explicitly and voluntarily owns the page (cl_page_own());
 514 *
  515 *      - the VM locks a page and then calls the client, which has to "assume"
 516 *        the ownership from the VM (cl_page_assume()).
 517 *
 518 *    Dual methods to release ownership are cl_page_disown() and
 519 *    cl_page_unassume().
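 *
 *    For example (an illustrative sketch, error handling omitted; the
 *    helpers named here are the generic cl_page entry points):
 *
 *      if (cl_page_own(env, io, pg) == 0) {
 *              ... io works on the page ...
 *              cl_page_disown(env, io, pg);
 *      }
 *
 *    versus, when the VM already holds the page lock:
 *
 *      cl_page_assume(env, io, pg);
 *      ... io works on the page ...
 *      cl_page_unassume(env, io, pg);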
 520 *
 521 * cl_page is reference counted (cl_page::cp_ref). When reference counter
 522 *    drops to 0, the page is returned to the cache, unless it is in
 523 *    cl_page_state::CPS_FREEING state, in which case it is immediately
 524 *    destroyed.
 525 *
 526 *    The general logic guaranteeing the absence of "existential races" for
 527 *    pages is the following:
 528 *
 529 *      - there are fixed known ways for a thread to obtain a new reference
 530 *        to a page:
 531 *
 532 *          - by doing a lookup in the cl_object radix tree, protected by the
 533 *            spin-lock;
 534 *
 535 *          - by starting from VM-locked struct page and following some
 536 *            hosting environment method (e.g., following ->private pointer in
 537 *            the case of Linux kernel), see cl_vmpage_page();
 538 *
 539 *      - when the page enters cl_page_state::CPS_FREEING state, all these
 540 *        ways are severed with the proper synchronization
 541 *        (cl_page_delete());
 542 *
 543 *      - entry into cl_page_state::CPS_FREEING is serialized by the VM page
 544 *        lock;
 545 *
 546 *      - no new references to the page in cl_page_state::CPS_FREEING state
 547 *        are allowed (checked in cl_page_get()).
 548 *
  549 *    Together this guarantees that when the last reference to a
  550 *    cl_page_state::CPS_FREEING page is released, it is safe to destroy the
  551 *    page, as no new references to it can be acquired at that point and
  552 *    none exist.
 553 *
 554 * cl_page is a state machine. States are enumerated in enum
 555 *    cl_page_state. Possible state transitions are enumerated in
 556 *    cl_page_state_set(). State transition process (i.e., actual changing of
 557 *    cl_page::cp_state field) is protected by the lock on the underlying VM
 558 *    page.
 559 *
 560 * Linux Kernel implementation.
 561 *
  562 *    Binding between cl_page and struct page is implemented in the vvp
  563 *    layer. cl_page is attached to the
 564 *    ->private pointer of the struct page, together with the setting of
 565 *    PG_private bit in page->flags, and acquiring additional reference on the
 566 *    struct page (much like struct buffer_head, or any similar file system
 567 *    private data structures).
 568 *
 569 *    PG_locked lock is used to implement both ownership and transfer
 570 *    synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
 571 *    states. No additional references are acquired for the duration of the
 572 *    transfer.
 573 *
 574 * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
 575 *        write-out is "protected" by the special PG_writeback bit.
 576 */
 577
 578/**
 579 * States of cl_page. cl_page.c assumes particular order here.
 580 *
 581 * The page state machine is rather crude, as it doesn't recognize finer page
 582 * states like "dirty" or "up to date". This is because such states are not
 583 * always well defined for the whole stack (see, for example, the
 584 * implementation of the read-ahead, that hides page up-to-dateness to track
 585 * cache hits accurately). Such sub-states are maintained by the layers that
 586 * are interested in them.
 587 */
 588enum cl_page_state {
 589        /**
 590         * Page is in the cache, un-owned. Page leaves cached state in the
 591         * following cases:
 592         *
 593         *     - [cl_page_state::CPS_OWNED] io comes across the page and
 594         *     owns it;
 595         *
 596         *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
 597         *     req-formation engine decides that it wants to include this page
  598         *     into a cl_req being constructed, and yanks it from the cache;
 599         *
 600         *     - [cl_page_state::CPS_FREEING] VM callback is executed to
  601         *     evict the page from memory;
 602         *
 603         * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
 604         */
 605        CPS_CACHED,
 606        /**
 607         * Page is exclusively owned by some cl_io. Page may end up in this
 608         * state as a result of
 609         *
 610         *     - io creating new page and immediately owning it;
 611         *
 612         *     - [cl_page_state::CPS_CACHED] io finding existing cached page
 613         *     and owning it;
 614         *
 615         *     - [cl_page_state::CPS_OWNED] io finding existing owned page
 616         *     and waiting for owner to release the page;
 617         *
 618         * Page leaves owned state in the following cases:
 619         *
 620         *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
 621         *     the cache, doing nothing;
 622         *
 623         *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
 624         *     this page;
 625         *
 626         *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
 627         *     transfer for this page;
 628         *
 629         *     - [cl_page_state::CPS_FREEING] io decides to destroy this
 630         *     page (e.g., as part of truncate or extent lock cancellation).
 631         *
 632         * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
 633         */
 634        CPS_OWNED,
 635        /**
 636         * Page is being written out, as a part of a transfer. This state is
 637         * entered when req-formation logic decided that it wants this page to
 638         * be sent through the wire _now_. Specifically, it means that once
 639         * this state is achieved, transfer completion handler (with either
 640         * success or failure indication) is guaranteed to be executed against
 641         * this page independently of any locks and any scheduling decisions
 642         * made by the hosting environment (that effectively means that the
 643         * page is never put into cl_page_state::CPS_PAGEOUT state "in
 644         * advance". This property is mentioned, because it is important when
 645         * reasoning about possible dead-locks in the system). The page can
 646         * enter this state as a result of
 647         *
 648         *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
 649         *     write-out of this page, or
 650         *
 651         *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
 652         *     that it has enough dirty pages cached to issue a "good"
 653         *     transfer.
 654         *
 655         * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
 656         * is completed---it is moved into cl_page_state::CPS_CACHED state.
 657         *
 658         * Underlying VM page is locked for the duration of transfer.
 659         *
 660         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
 661         */
 662        CPS_PAGEOUT,
 663        /**
 664         * Page is being read in, as a part of a transfer. This is quite
 665         * similar to the cl_page_state::CPS_PAGEOUT state, except that
  666         * read-in is always "immediate"---there is no such thing as a sudden
  667         * construction of a read cl_req from cached, presumably not up to date,
 668         * pages.
 669         *
 670         * Underlying VM page is locked for the duration of transfer.
 671         *
 672         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
 673         */
 674        CPS_PAGEIN,
 675        /**
 676         * Page is being destroyed. This state is entered when client decides
 677         * that page has to be deleted from its host object, as, e.g., a part
 678         * of truncate.
 679         *
 680         * Once this state is reached, there is no way to escape it.
 681         *
 682         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
 683         */
 684        CPS_FREEING,
 685        CPS_NR
 686};
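
/*
 * Compact summary of the transitions described above (an informal sketch
 * derived from the comments, not generated from the code):
 *
 *	CPS_CACHED  -> CPS_OWNED    (an io owns the page)
 *	CPS_CACHED  -> CPS_PAGEOUT  (req-formation engine yanks a dirty page)
 *	CPS_CACHED  -> CPS_FREEING  (VM evicts the page)
 *	CPS_OWNED   -> CPS_CACHED, CPS_PAGEIN, CPS_PAGEOUT or CPS_FREEING
 *	CPS_PAGEIN  -> CPS_CACHED   (transfer completion)
 *	CPS_PAGEOUT -> CPS_CACHED   (transfer completion)
 *	CPS_FREEING is terminal
 */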
 687
 688enum cl_page_type {
  689        /** Host page: the page comes from the host inode to which the cl_page
  690         * belongs. */
 691        CPT_CACHEABLE = 1,
 692
  693        /** Transient page: a transient cl_page is used to bind a cl_page
  694         *  to a vmpage that does not belong to the same object as the cl_page.
  695         *  It is used in DirectIO, lockless IO and liblustre. */
 696        CPT_TRANSIENT,
 697};
 698
 699/**
 700 * Flags maintained for every cl_page.
 701 */
 702enum cl_page_flags {
 703        /**
 704         * Set when pagein completes. Used for debugging (read completes at
 705         * most once for a page).
 706         */
 707        CPF_READ_COMPLETED = 1 << 0
 708};
 709
 710/**
 711 * Fields are protected by the lock on struct page, except for atomics and
 712 * immutables.
 713 *
 714 * \invariant Data type invariants are in cl_page_invariant(). Basically:
 715 * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
 716 * list, consistent with the parent/child pointers in the cl_page::cp_obj and
 717 * cl_page::cp_owner (when set).
 718 */
 719struct cl_page {
 720        /** Reference counter. */
 721        atomic_t             cp_ref;
 722        /** An object this page is a part of. Immutable after creation. */
 723        struct cl_object        *cp_obj;
 724        /** Logical page index within the object. Immutable after creation. */
 725        pgoff_t           cp_index;
 726        /** List of slices. Immutable after creation. */
 727        struct list_head               cp_layers;
 728        /** Parent page, NULL for top-level page. Immutable after creation. */
 729        struct cl_page    *cp_parent;
 730        /** Lower-layer page. NULL for bottommost page. Immutable after
 731         * creation. */
 732        struct cl_page    *cp_child;
 733        /**
 734         * Page state. This field is const to avoid accidental update, it is
 735         * modified only internally within cl_page.c. Protected by a VM lock.
 736         */
 737        const enum cl_page_state cp_state;
 738        /** Linkage of pages within group. Protected by cl_page::cp_mutex. */
 739        struct list_head                cp_batch;
 740        /** Mutex serializing membership of a page in a batch. */
 741        struct mutex            cp_mutex;
 742        /** Linkage of pages within cl_req. */
 743        struct list_head               cp_flight;
 744        /** Transfer error. */
 745        int                   cp_error;
 746
 747        /**
 748         * Page type. Only CPT_TRANSIENT is used so far. Immutable after
 749         * creation.
 750         */
 751        enum cl_page_type       cp_type;
 752
 753        /**
 754         * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
 755         * by sub-io. Protected by a VM lock.
 756         */
 757        struct cl_io        *cp_owner;
 758        /**
  759         * Debug information: the task owning the page.
 760         */
 761        task_t        *cp_task;
 762        /**
 763         * Owning IO request in cl_page_state::CPS_PAGEOUT and
 764         * cl_page_state::CPS_PAGEIN states. This field is maintained only in
 765         * the top-level pages. Protected by a VM lock.
 766         */
 767        struct cl_req      *cp_req;
 768        /** List of references to this page, for debugging. */
 769        struct lu_ref       cp_reference;
 770        /** Link to an object, for debugging. */
 771        struct lu_ref_link      *cp_obj_ref;
 772        /** Link to a queue, for debugging. */
 773        struct lu_ref_link      *cp_queue_ref;
 774        /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
 775        unsigned                 cp_flags;
 776        /** Assigned if doing a sync_io */
 777        struct cl_sync_io       *cp_sync_io;
 778};
 779
 780/**
 781 * Per-layer part of cl_page.
 782 *
 783 * \see ccc_page, lov_page, osc_page
 784 */
 785struct cl_page_slice {
 786        struct cl_page            *cpl_page;
 787        /**
 788         * Object slice corresponding to this page slice. Immutable after
 789         * creation.
 790         */
 791        struct cl_object                *cpl_obj;
 792        const struct cl_page_operations *cpl_ops;
 793        /** Linkage into cl_page::cp_layers. Immutable after creation. */
 794        struct list_head                       cpl_linkage;
 795};
 796
 797/**
 798 * Lock mode. For the client extent locks.
 799 *
 800 * \warning: cl_lock_mode_match() assumes particular ordering here.
 801 * \ingroup cl_lock
 802 */
 803enum cl_lock_mode {
 804        /**
 805         * Mode of a lock that protects no data, and exists only as a
 806         * placeholder. This is used for `glimpse' requests. A phantom lock
 807         * might get promoted to real lock at some point.
 808         */
 809        CLM_PHANTOM,
 810        CLM_READ,
 811        CLM_WRITE,
 812        CLM_GROUP
 813};
 814
 815/**
 816 * Requested transfer type.
 817 * \ingroup cl_req
 818 */
 819enum cl_req_type {
 820        CRT_READ,
 821        CRT_WRITE,
 822        CRT_NR
 823};
 824
 825/**
 826 * Per-layer page operations.
 827 *
 828 * Methods taking an \a io argument are for the activity happening in the
 829 * context of given \a io. Page is assumed to be owned by that io, except for
 830 * the obvious cases (like cl_page_operations::cpo_own()).
 831 *
 832 * \see vvp_page_ops, lov_page_ops, osc_page_ops
 833 */
 834struct cl_page_operations {
 835        /**
 836         * cl_page<->struct page methods. Only one layer in the stack has to
 837         * implement these. Current code assumes that this functionality is
 838         * provided by the topmost layer, see cl_page_disown0() as an example.
 839         */
 840
 841        /**
 842         * \return the underlying VM page. Optional.
 843         */
 844        struct page *(*cpo_vmpage)(const struct lu_env *env,
 845                                  const struct cl_page_slice *slice);
 846        /**
 847         * Called when \a io acquires this page into the exclusive
  848         * ownership. When this method returns, it is guaranteed that the page
  849         * is not owned by another io, and no transfer is going on against
 850         * it. Optional.
 851         *
 852         * \see cl_page_own()
 853         * \see vvp_page_own(), lov_page_own()
 854         */
 855        int  (*cpo_own)(const struct lu_env *env,
 856                        const struct cl_page_slice *slice,
 857                        struct cl_io *io, int nonblock);
  858        /** Called when ownership is yielded. Optional.
 859         *
 860         * \see cl_page_disown()
 861         * \see vvp_page_disown()
 862         */
 863        void (*cpo_disown)(const struct lu_env *env,
 864                           const struct cl_page_slice *slice, struct cl_io *io);
 865        /**
 866         * Called for a page that is already "owned" by \a io from VM point of
 867         * view. Optional.
 868         *
 869         * \see cl_page_assume()
 870         * \see vvp_page_assume(), lov_page_assume()
 871         */
 872        void (*cpo_assume)(const struct lu_env *env,
 873                           const struct cl_page_slice *slice, struct cl_io *io);
 874        /** Dual to cl_page_operations::cpo_assume(). Optional. Called
 875         * bottom-to-top when IO releases a page without actually unlocking
 876         * it.
 877         *
 878         * \see cl_page_unassume()
 879         * \see vvp_page_unassume()
 880         */
 881        void (*cpo_unassume)(const struct lu_env *env,
 882                             const struct cl_page_slice *slice,
 883                             struct cl_io *io);
 884        /**
  885         * Announces whether the page contains valid data, as indicated by \a uptodate.
 886         *
 887         * \see cl_page_export()
 888         * \see vvp_page_export()
 889         */
 890        void  (*cpo_export)(const struct lu_env *env,
 891                            const struct cl_page_slice *slice, int uptodate);
 892        /**
 893         * Unmaps page from the user space (if it is mapped).
 894         *
 895         * \see cl_page_unmap()
 896         * \see vvp_page_unmap()
 897         */
 898        int (*cpo_unmap)(const struct lu_env *env,
 899                         const struct cl_page_slice *slice, struct cl_io *io);
 900        /**
 901         * Checks whether underlying VM page is locked (in the suitable
 902         * sense). Used for assertions.
 903         *
 904         * \retval    -EBUSY: page is protected by a lock of a given mode;
 905         * \retval  -ENODATA: page is not protected by a lock;
 906         * \retval       0: this layer cannot decide. (Should never happen.)
 907         */
 908        int (*cpo_is_vmlocked)(const struct lu_env *env,
 909                               const struct cl_page_slice *slice);
 910        /**
 911         * Page destruction.
 912         */
 913
 914        /**
 915         * Called when page is truncated from the object. Optional.
 916         *
 917         * \see cl_page_discard()
 918         * \see vvp_page_discard(), osc_page_discard()
 919         */
 920        void (*cpo_discard)(const struct lu_env *env,
 921                            const struct cl_page_slice *slice,
 922                            struct cl_io *io);
 923        /**
  924         * Called when the page is removed from the cache, and is about to be
  925         * destroyed. Optional.
 926         *
 927         * \see cl_page_delete()
 928         * \see vvp_page_delete(), osc_page_delete()
 929         */
 930        void (*cpo_delete)(const struct lu_env *env,
 931                           const struct cl_page_slice *slice);
 932        /** Destructor. Frees resources and slice itself. */
 933        void (*cpo_fini)(const struct lu_env *env,
 934                         struct cl_page_slice *slice);
 935
 936        /**
 937         * Checks whether the page is protected by a cl_lock. This is a
 938         * per-layer method, because certain layers have ways to check for the
 939         * lock much more efficiently than through the generic locks scan, or
 940         * implement locking mechanisms separate from cl_lock, e.g.,
 941         * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
 942         * being canceled, or scheduled for cancellation as soon as the last
 943         * user goes away, too.
 944         *
 945         * \retval    -EBUSY: page is protected by a lock of a given mode;
 946         * \retval  -ENODATA: page is not protected by a lock;
 947         * \retval       0: this layer cannot decide.
 948         *
 949         * \see cl_page_is_under_lock()
 950         */
 951        int (*cpo_is_under_lock)(const struct lu_env *env,
 952                                 const struct cl_page_slice *slice,
 953                                 struct cl_io *io);
 954
 955        /**
 956         * Optional debugging helper. Prints given page slice.
 957         *
 958         * \see cl_page_print()
 959         */
 960        int (*cpo_print)(const struct lu_env *env,
 961                         const struct cl_page_slice *slice,
 962                         void *cookie, lu_printer_t p);
 963        /**
 964         * \name transfer
 965         *
 966         * Transfer methods. See comment on cl_req for a description of
 967         * transfer formation and life-cycle.
 968         *
 969         * @{
 970         */
 971        /**
 972         * Request type dependent vector of operations.
 973         *
 974         * Transfer operations depend on transfer mode (cl_req_type). To avoid
 975         * passing transfer mode to each and every of these methods, and to
 976         * avoid branching on request type inside of the methods, separate
 977         * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are
 978         * provided. That is, method invocation usually looks like
 979         *
 980         *       slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...);
 981         */
 982        struct {
 983                /**
 984                 * Called when a page is submitted for a transfer as a part of
 985                 * cl_page_list.
 986                 *
 987                 * \return    0  : page is eligible for submission;
 988                 * \return    -EALREADY : skip this page;
 989                 * \return    -ve       : error.
 990                 *
 991                 * \see cl_page_prep()
 992                 */
 993                int  (*cpo_prep)(const struct lu_env *env,
 994                                 const struct cl_page_slice *slice,
 995                                 struct cl_io *io);
 996                /**
 997                 * Completion handler. This is guaranteed to be eventually
 998                 * fired after cl_page_operations::cpo_prep() or
 999                 * cl_page_operations::cpo_make_ready() call.
1000                 *
1001                 * This method can be called in a non-blocking context. It is
1002                 * guaranteed however, that the page involved and its object
1003                 * are pinned in memory (and, hence, calling cl_page_put() is
1004                 * safe).
1005                 *
1006                 * \see cl_page_completion()
1007                 */
1008                void (*cpo_completion)(const struct lu_env *env,
1009                                       const struct cl_page_slice *slice,
1010                                       int ioret);
1011                /**
1012                 * Called when cached page is about to be added to the
1013                 * cl_req as a part of req formation.
1014                 *
1015                 * \return    0       : proceed with this page;
1016                 * \return    -EAGAIN : skip this page;
1017                 * \return    -ve     : error.
1018                 *
1019                 * \see cl_page_make_ready()
1020                 */
1021                int  (*cpo_make_ready)(const struct lu_env *env,
1022                                       const struct cl_page_slice *slice);
1023                /**
1024                 * Announce that this page is to be written out
 1025                 * opportunistically, that is, the page is dirty and it is not
 1026                 * necessary to start the write-out transfer right now, but
 1027                 * eventually the page has to be written out.
1028                 *
1029                 * Main caller of this is the write path (see
1030                 * vvp_io_commit_write()), using this method to build a
1031                 * "transfer cache" from which large transfers are then
1032                 * constructed by the req-formation engine.
1033                 *
1034                 * \todo XXX it would make sense to add page-age tracking
1035                 * semantics here, and to oblige the req-formation engine to
 1036                 * send the page out before it becomes too old.
1037                 *
1038                 * \see cl_page_cache_add()
1039                 */
1040                int  (*cpo_cache_add)(const struct lu_env *env,
1041                                      const struct cl_page_slice *slice,
1042                                      struct cl_io *io);
1043        } io[CRT_NR];
1044        /**
 1045         * Tell the transfer engine that only the [from, to] part of a page
 1046         * should be transmitted.
1047         *
1048         * This is used for immediate transfers.
1049         *
1050         * \todo XXX this is not very good interface. It would be much better
1051         * if all transfer parameters were supplied as arguments to
1052         * cl_io_operations::cio_submit() call, but it is not clear how to do
1053         * this for page queues.
1054         *
1055         * \see cl_page_clip()
1056         */
1057        void (*cpo_clip)(const struct lu_env *env,
1058                         const struct cl_page_slice *slice,
1059                         int from, int to);
1060        /**
1061         * \pre  the page was queued for transferring.
1062         * \post page is removed from client's pending list, or -EBUSY
 1063         *       is returned if it is already being transferred.
1064         *
 1065         * This is one of the few page operations that:
 1066         * 0. is called from the top level;
 1067         * 1. does not have the vmpage locked;
 1068         * 2. requires every layer to synchronize execution of its ->cpo_cancel()
 1069         *    with completion handlers. Osc uses the client obd lock for this
 1070         *    purpose. Since there is no vvp_page_cancel() and no
 1071         *    lov_page_cancel(), cpo_cancel() is de facto protected by that lock.
1072         *
1073         * \see osc_page_cancel().
1074         */
1075        int (*cpo_cancel)(const struct lu_env *env,
1076                          const struct cl_page_slice *slice);
1077        /**
 1078         * Write out a page on behalf of the kernel. This is only called by
 1079         * ll_writepage right now.
1080         *
1081         * \see cl_page_flush()
1082         */
1083        int (*cpo_flush)(const struct lu_env *env,
1084                         const struct cl_page_slice *slice,
1085                         struct cl_io *io);
1086        /** @} transfer */
1087};
1088
1089/**
1090 * Helper macro, dumping detailed information about \a page into a log.
1091 */
1092#define CL_PAGE_DEBUG(mask, env, page, format, ...)                  \
1093do {                                                                \
1094        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
1095                                                                        \
1096        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
1097                cl_page_print(env, &msgdata, lu_cdebug_printer, page);  \
1098                CDEBUG(mask, format , ## __VA_ARGS__);            \
1099        }                                                              \
1100} while (0)
1101
1102/**
1103 * Helper macro, dumping shorter information about \a page into a log.
1104 */
1105#define CL_PAGE_HEADER(mask, env, page, format, ...)                      \
1106do {                                                                      \
1107        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                      \
1108                                                                              \
1109        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {                    \
1110                cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \
1111                CDEBUG(mask, format , ## __VA_ARGS__);                  \
1112        }                                                                    \
1113} while (0)
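
/*
 * Usage sketch (illustrative; "state" is a hypothetical local): both macros
 * are used like CDEBUG(), with the page dumped before the message, e.g.
 *
 *	CL_PAGE_HEADER(D_TRACE, env, pg, "unexpected state: %d\n", state);
 */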
1114
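/*
 * Returns true iff \a page has references beyond the \a refc references
 * accounted for by the caller; a CPT_CACHEABLE page is assumed to hold one
 * additional reference for the cache, which is discounted as well.
 */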
1115static inline int __page_in_use(const struct cl_page *page, int refc)
1116{
1117        if (page->cp_type == CPT_CACHEABLE)
1118                ++refc;
1119        LASSERT(atomic_read(&page->cp_ref) > 0);
1120        return (atomic_read(&page->cp_ref) > refc);
1121}
1122#define cl_page_in_use(pg)       __page_in_use(pg, 1)
1123#define cl_page_in_use_noref(pg) __page_in_use(pg, 0)
1124
1125/** @} cl_page */
1126
1127/** \addtogroup cl_lock cl_lock
1128 * @{ */
1129/** \struct cl_lock
1130 *
1131 * Extent locking on the client.
1132 *
1133 * LAYERING
1134 *
1135 * The locking model of the new client code is built around
1136 *
1137 *      struct cl_lock
1138 *
1139 * data-type representing an extent lock on a regular file. cl_lock is a
1140 * layered object (much like cl_object and cl_page), it consists of a header
1141 * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
1142 * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
1143 *
1144 * All locks for a given object are linked into cl_object_header::coh_locks
1145 * list (protected by cl_object_header::coh_lock_guard spin-lock) through
1146 * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can
 1147 * sort it by starting lock offset, or use an altogether different data
 1148 * structure such as a tree.
1149 *
 1150 * A typical cl_lock consists of two layers:
1151 *
1152 *     - vvp_lock (vvp specific data), and
1153 *     - lov_lock (lov specific data).
1154 *
1155 * lov_lock contains an array of sub-locks. Each of these sub-locks is a
1156 * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
1157 *
1158 *     - lovsub_lock, and
1159 *     - osc_lock
1160 *
1161 * Each sub-lock is associated with a cl_object (representing stripe
 1162 * sub-object or the file with which the top-level cl_lock is associated), and is
1163 * linked into that cl_object::coh_locks. In this respect cl_lock is similar to
1164 * cl_object (that at lov layer also fans out into multiple sub-objects), and
1165 * is different from cl_page, that doesn't fan out (there is usually exactly
1166 * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
1167 * a "top-lock" and its lovsub-osc portion a "sub-lock".
1168 *
1169 * LIFE CYCLE
1170 *
1171 * cl_lock is reference counted. When reference counter drops to 0, lock is
1172 * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
1173 * lock is destroyed when last reference is released. Referencing between
1174 * top-lock and its sub-locks is described in the lov documentation module.
1175 *
1176 * STATE MACHINE
1177 *
1178 * Also, cl_lock is a state machine. This requires some clarification. One of
1179 * the goals of client IO re-write was to make IO path non-blocking, or at
1180 * least to make it easier to make it non-blocking in the future. Here
1181 * `non-blocking' means that when a system call (read, write, truncate)
1182 * reaches a situation where it has to wait for a communication with the
1183 * server, it should --instead of waiting-- remember its current state and
 1184 * switch to some other work.  E.g., instead of waiting for a lock enqueue,
 1185 * the client should proceed doing IO on the next stripe, etc. Obviously this
 1186 * is a rather radical redesign, and it is not planned to be fully implemented
 1187 * at this time; instead we are putting some infrastructure in place that
 1188 * would make it easier to do asynchronous non-blocking IO in the
1189 * future. Specifically, where old locking code goes to sleep (waiting for
1190 * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When
1191 * enqueue reply comes, its completion handler signals that lock state-machine
1192 * is ready to transit to the next state. There is some generic code in
1193 * cl_lock.c that sleeps, waiting for these signals. As a result, for users of
1194 * this cl_lock.c code, it looks like locking is done in normal blocking
 1195 * fashion, and at the same time it is possible to switch to the non-blocking
1196 * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c
1197 * functions).
1198 *
1199 * For a description of state machine states and transitions see enum
1200 * cl_lock_state.
1201 *
1202 * There are two ways to restrict a set of states which lock might move to:
1203 *
1204 *     - placing a "hold" on a lock guarantees that lock will not be moved
1205 *       into cl_lock_state::CLS_FREEING state until hold is released. Hold
1206 *       can be only acquired on a lock that is not in
1207 *       cl_lock_state::CLS_FREEING. All holds on a lock are counted in
1208 *       cl_lock::cll_holds. Hold protects lock from cancellation and
1209 *       destruction. Requests to cancel and destroy a lock on hold will be
1210 *       recorded, but only honored when last hold on a lock is released;
1211 *
1212 *     - placing a "user" on a lock guarantees that lock will not leave
1213 *       cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
1214 *       cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
1215 *       states, once it enters this set. That is, if a user is added onto a
 1216 *       lock in a state not from this set, it doesn't immediately force the
 1217 *       lock to move into this set, but once the lock enters this set it will
1218 *       remain there until all users are removed. Lock users are counted in
1219 *       cl_lock::cll_users.
1220 *
1221 *       User is used to assure that lock is not canceled or destroyed while
1222 *       it is being enqueued, or actively used by some IO.
1223 *
1224 *       Currently, a user always comes with a hold (cl_lock_invariant()
 1225 *       checks that the number of holds is not less than the number of users).
1226 *
1227 * CONCURRENCY
1228 *
1229 * This is how lock state-machine operates. struct cl_lock contains a mutex
1230 * cl_lock::cll_guard that protects struct fields.
1231 *
1232 *     - mutex is taken, and cl_lock::cll_state is examined.
1233 *
1234 *     - for every state there are possible target states where lock can move
1235 *       into. They are tried in order. Attempts to move into next state are
1236 *       done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try().
1237 *
1238 *     - if the transition can be performed immediately, state is changed,
1239 *       and mutex is released.
1240 *
1241 *     - if the transition requires blocking, _try() function returns
1242 *       cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
1243 *       sleep, waiting for possibility of lock state change. It is woken
1244 *       up when some event occurs, that makes lock state change possible
1245 *       (e.g., the reception of the reply from the server), and repeats
1246 *       the loop.
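 *
 *     A sketch of that loop (simplified; the real implementation lives in
 *     the generic cl_lock.c code and differs in details):
 *
 *       cl_lock_mutex_get(env, lock);
 *       while ((result = cl_wait_try(env, lock)) == CLO_WAIT) {
 *               cl_lock_mutex_put(env, lock);
 *               ... sleep until the lock state may have changed ...
 *               cl_lock_mutex_get(env, lock);
 *       }
 *       cl_lock_mutex_put(env, lock);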
1247 *
 1248 * Top-lock and sub-lock have separate mutexes and the latter has to be taken
1249 * first to avoid dead-lock.
1250 *
1251 * To see an example of interaction of all these issues, take a look at the
1252 * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
1253 * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
1254 * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
1255 * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
1256 * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
1257 * done in parallel, rather than one after another (this is used for glimpse
1258 * locks, that cannot dead-lock).
1259 *
1260 * INTERFACE AND USAGE
1261 *
 1262 * struct cl_lock_operations provides a number of call-backs that are invoked
 1263 * when events of interest occur. Layers can intercept and handle glimpse,
1264 * blocking, cancel ASTs and a reception of the reply from the server.
1265 *
1266 * One important difference with the old client locking model is that new
1267 * client has a representation for the top-lock, whereas in the old code only
 1268 * sub-locks existed as real data structures and file-level locks were
 1269 * represented by "request sets" that were created and destroyed on each and
1270 * every lock creation.
1271 *
1272 * Top-locks are cached, and can be found in the cache by the system calls. It
1273 * is possible that top-lock is in cache, but some of its sub-locks were
1274 * canceled and destroyed. In that case top-lock has to be enqueued again
1275 * before it can be used.
1276 *
 1277 * The overall process of locking during an IO operation is as follows:
1278 *
 1279 *     - once parameters for IO are set up in cl_io, cl_io_operations::cio_lock()
 1280 *       is called on each layer. The responsibility of this method is to add
 1281 *       locks needed by a given layer to cl_io.ci_lockset.
1282 *
 1283 *     - once locks for all layers have been collected, they are sorted to avoid
1284 *       dead-locks (cl_io_locks_sort()), and enqueued.
1285 *
1286 *     - when all locks are acquired, IO is performed;
1287 *
1288 *     - locks are released into cache.
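 *
 * A hedged sketch of a per-layer cl_io_operations::cio_lock() method that
 * posts a single extent lock for the current iteration (it assumes the
 * cl_io_lock_alloc_add() helper declared later in this header; the "demo_"
 * name and the whole-file extent are purely illustrative):
 *
 * \code
 * static int demo_io_lock(const struct lu_env *env,
 *                         const struct cl_io_slice *ios)
 * {
 *         struct cl_lock_descr descr = {
 *                 .cld_obj       = ios->cis_obj,
 *                 .cld_start     = 0,
 *                 .cld_end       = CL_PAGE_EOF,
 *                 .cld_mode      = CLM_WRITE,
 *                 .cld_enq_flags = 0,
 *         };
 *
 *         // queue the descriptor on ci_lockset.cls_todo; generic code will
 *         // later sort and enqueue all collected locks
 *         return cl_io_lock_alloc_add(env, ios->cis_io, &descr);
 * }
 * \endcode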
1289 *
1290 * Striping introduces major additional complexity into locking. The
1291 * fundamental problem is that it is generally unsafe to actively use (hold)
1292 * two locks on the different OST servers at the same time, as this introduces
1293 * inter-server dependency and can lead to cascading evictions.
1294 *
1295 * Basic solution is to sub-divide large read/write IOs into smaller pieces so
1296 * that no multi-stripe locks are taken (note that this design abandons POSIX
1297 * read/write semantics). Such pieces ideally can be executed concurrently. At
1298 * the same time, certain types of IO cannot be sub-divided without
1299 * sacrificing correctness. This includes:
1300 *
1301 *  - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
1302 *  atomicity;
1303 *
1304 *  - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
1305 *
1306 * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
1307 * buf is a part of a memory mapped Lustre file, a lock or locks protecting buf
1308 * have to be held together with the usual lock on [offset, offset + count].
1309 *
1310 * As multi-stripe locks have to be allowed, it makes sense to cache them, so
1311 * that, for example, a sequence of O_APPEND writes can proceed quickly
1312 * without going down to the individual stripes to do lock matching. On the
1313 * other hand, multi-stripe locks shouldn't be used by normal read/write
1314 * calls. To achieve this, every layer can implement the ->clo_fits_into()
1315 * method, which is called by the lock matching code (cl_lock_lookup()) and can
1316 * be used to selectively disable matching of certain locks for certain IOs. For
1317 * example, the lov layer implements lov_lock_fits_into(), which allows
1318 * multi-stripe locks to be matched only for truncates and O_APPEND writes.
1319 *
1320 * Interaction with DLM
1321 *
1322 * In the expected setup, cl_lock is ultimately backed by a collection of
1323 * DLM locks (struct ldlm_lock). The association between cl_lock and DLM lock is
1324 * implemented in the osc layer, which also translates DLM events (ASTs,
1325 * cancellation, etc.) into cl_lock_operations calls. See struct osc_lock for a
1326 * more detailed description of the interaction with DLM.
1327 */
1328
1329/**
1330 * Lock description.
1331 */
1332struct cl_lock_descr {
1333        /** Object this lock is granted for. */
1334        struct cl_object *cld_obj;
1335        /** Index of the first page protected by this lock. */
1336        pgoff_t    cld_start;
1337        /** Index of the last page (inclusive) protected by this lock. */
1338        pgoff_t    cld_end;
1339        /** Group ID, for group lock */
1340        __u64        cld_gid;
1341        /** Lock mode. */
1342        enum cl_lock_mode cld_mode;
1343        /**
1344         * flags to enqueue lock. A combination of bit-flags from
1345         * enum cl_enq_flags.
1346         */
1347        __u32        cld_enq_flags;
1348};
1349
1350#define DDESCR "%s(%d):[%lu, %lu]"
1351#define PDESCR(descr)                                              \
1352        cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,        \
1353        (descr)->cld_start, (descr)->cld_end
1354
1355const char *cl_lock_mode_name(const enum cl_lock_mode mode);
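
/*
 * Usage sketch for the format helpers above (CDEBUG and D_DLMTRACE are the
 * usual libcfs debugging facilities):
 *
 *   CDEBUG(D_DLMTRACE, "lock: " DDESCR "\n", PDESCR(&lock->cll_descr));
 */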
1356
1357/**
1358 * Lock state-machine states.
1359 *
1360 * \htmlonly
1361 * <pre>
1362 *
1363 * Possible state transitions:
1364 *
1365 *              +------------------>NEW
1366 *              |                    |
1367 *              |                    | cl_enqueue_try()
1368 *              |                    |
1369 *              |    cl_unuse_try()  V
1370 *              |  +--------------QUEUING (*)
1371 *              |  |                 |
1372 *              |  |                 | cl_enqueue_try()
1373 *              |  |                 |
1374 *              |  | cl_unuse_try()  V
1375 *    sub-lock  |  +-------------ENQUEUED (*)
1376 *    canceled  |  |                 |
1377 *              |  |                 | cl_wait_try()
1378 *              |  |                 |
1379 *              |  |                (R)
1380 *              |  |                 |
1381 *              |  |                 V
1382 *              |  |                HELD<---------+
1383 *              |  |                 |            |
1384 *              |  |                 |            | cl_use_try()
1385 *              |  |  cl_unuse_try() |            |
1386 *              |  |                 |            |
1387 *              |  |                 V         ---+
1388 *              |  +------------>INTRANSIT (D) <--+
1389 *              |                    |            |
1390 *              |     cl_unuse_try() |            | cached lock found
1391 *              |                    |            | cl_use_try()
1392 *              |                    |            |
1393 *              |                    V            |
1394 *              +------------------CACHED---------+
1395 *                                   |
1396 *                                  (C)
1397 *                                   |
1398 *                                   V
1399 *                                FREEING
1400 *
1401 * Legend:
1402 *
1403 *       In states marked with (*) transition to the same state (i.e., a loop
1404 *       in the diagram) is possible.
1405 *
1406 *       (R) is the point where Receive call-back is invoked: it allows layers
1407 *       to handle arrival of lock reply.
1408 *
1409 *       (C) is the point where Cancellation call-back is invoked.
1410 *
1411 *       (D) is the transit state which means the lock is changing.
1412 *
1413 *       Transition to FREEING state is possible from any other state in the
1414 *       diagram in case of unrecoverable error.
1415 * </pre>
1416 * \endhtmlonly
1417 *
1418 * These states are for an individual cl_lock object. A top-lock and its
1419 * sub-locks can be in different states. Another way to say this is that we have
1420 * nested state-machines.
1421 *
1422 * Separate QUEUING and ENQUEUED states are needed to support non-blocking
1423 * operation for locks with multiple sub-locks. Imagine a lock on a file F that
1424 * intersects 3 stripes S0, S1, and S2. To enqueue F, the client has to send an
1425 * enqueue to S0, wait for its completion, then send an enqueue for S1, wait for
1426 * its completion, and finally enqueue the lock for S2 and wait for its
1427 * completion. In that case, the top-lock is in QUEUING state while S0 and S1 are
1428 * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note
1429 * that in this case, sub-locks move from state to state, and top-lock remains
1430 * in the same state).
1431 */
1432enum cl_lock_state {
1433        /**
1434         * Lock that wasn't yet enqueued
1435         */
1436        CLS_NEW,
1437        /**
1438         * Enqueue is in progress, blocking for some intermediate interaction
1439         * with the other side.
1440         */
1441        CLS_QUEUING,
1442        /**
1443         * Lock is fully enqueued, waiting for server to reply when it is
1444         * granted.
1445         */
1446        CLS_ENQUEUED,
1447        /**
1448         * Lock granted, actively used by some IO.
1449         */
1450        CLS_HELD,
1451        /**
1452         * This state marks that the lock is in transition: being used or
1453         * unused. We need this state because the lock may have several
1454         * sublocks, so there is no atomic way to bring all sublocks into
1455         * CLS_HELD state in the use case, or all sublocks into CLS_CACHED
1456         * in the unuse case.
1457         * If a thread referring to a lock sees the lock in this state, it
1458         * must wait for the lock to leave it.
1459         * See state diagram for details.
1460         */
1461        CLS_INTRANSIT,
1462        /**
1463         * Lock granted, not used.
1464         */
1465        CLS_CACHED,
1466        /**
1467         * Lock is being destroyed.
1468         */
1469        CLS_FREEING,
1470        CLS_NR
1471};
1472
1473enum cl_lock_flags {
1474        /**
1475         * lock has been cancelled. This flag is never cleared once set (by
1476         * cl_lock_cancel0()).
1477         */
1478        CLF_CANCELLED  = 1 << 0,
1479        /** cancellation is pending for this lock. */
1480        CLF_CANCELPEND = 1 << 1,
1481        /** destruction is pending for this lock. */
1482        CLF_DOOMED     = 1 << 2,
1483        /** from enqueue RPC reply upcall. */
1484        CLF_FROM_UPCALL = 1 << 3,
1485};
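
/*
 * These flags live in cl_lock::cll_flags and are updated under
 * cl_lock::cll_guard; a minimal sketch (the lock mutex is assumed to be held
 * by the caller):
 *
 *   lock->cll_flags |= CLF_CANCELPEND;           // cancellation requested
 *   doomed = !!(lock->cll_flags & CLF_DOOMED);   // is destruction pending?
 */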
1486
1487/**
1488 * Lock closure.
1489 *
1490 * Lock closure is a collection of locks (both top-locks and sub-locks) that
1491 * might be updated as a result of an operation on a certain lock (the lock
1492 * this is a closure of).
1493 *
1494 * Closures are needed to guarantee dead-lock freedom in the presence of
1495 *
1496 *     - nested state-machines (top-lock state-machine composed of sub-lock
1497 *       state-machines), and
1498 *
1499 *     - shared sub-locks.
1500 *
1501 * Specifically, many operations, such as lock enqueue, wait, unlock,
1502 * etc. start from a top-lock, and then operate on the sub-locks of this
1503 * top-lock, holding a top-lock mutex. When sub-lock state changes as a result
1504 * of such operation, this change has to be propagated to all top-locks that
1505 * share this sub-lock. Obviously, no natural lock ordering (e.g.,
1506 * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has
1507 * to be used. Lock closure systematizes this try-and-repeat logic.
1508 */
1509struct cl_lock_closure {
1510        /**
1511         * Lock that is mutexed when closure construction is started. When
1512         * closure is in `wait' mode (cl_lock_closure::clc_wait), the mutex on
1513         * origin is released before waiting.
1514         */
1515        struct cl_lock   *clc_origin;
1516        /**
1517         * List of enclosed locks, so far. Locks are linked here through
1518         * cl_lock::cll_inclosure.
1519         */
1520        struct list_head        clc_list;
1521        /**
1522         * True iff closure is in a `wait' mode. This determines what
1523         * cl_lock_enclosure() does when a lock L to be added to the closure
1524         * is currently mutexed by some other thread.
1525         *
1526         * If cl_lock_closure::clc_wait is not set, then closure construction
1527         * fails with CLO_REPEAT immediately.
1528         *
1529         * In wait mode, cl_lock_enclosure() waits until next attempt to build
1530         * a closure might succeed. To this end it releases an origin mutex
1531         * (cl_lock_closure::clc_origin), that has to be the only lock mutex
1532         * owned by the current thread, and then waits on L mutex (by grabbing
1533         * it and immediately releasing), before returning CLO_REPEAT to the
1534         * caller.
1535         */
1536        int            clc_wait;
1537        /** Number of locks in the closure. */
1538        int            clc_nr;
1539};
1540
1541/**
1542 * Layered client lock.
1543 */
1544struct cl_lock {
1545        /** Reference counter. */
1546        atomic_t          cll_ref;
1547        /** List of slices. Immutable after creation. */
1548        struct list_head            cll_layers;
1549        /**
1550         * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
1551         * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
1552         */
1553        struct list_head            cll_linkage;
1554        /**
1555         * Parameters of this lock. Protected by
1556         * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
1557         * cl_lock::cll_guard. Modified only on lock creation and in
1558         * cl_lock_modify().
1559         */
1560        struct cl_lock_descr  cll_descr;
1561        /** Protected by cl_lock::cll_guard. */
1562        enum cl_lock_state    cll_state;
1563        /** signals state changes. */
1564        wait_queue_head_t          cll_wq;
1565        /**
1566         * Recursive lock, most fields in cl_lock{} are protected by this.
1567         *
1568         * Locking rules: this mutex is never held across network
1569         * communication, except when lock is being canceled.
1570         *
1571         * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
1572         * on a top-lock. Other direction is implemented through a
1573         * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
1574         * by try-locking.
1575         *
1576         * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
1577         */
1578        struct mutex            cll_guard;
1579        task_t     *cll_guarder;
1580        int                cll_depth;
1581
1582        /**
1583         * the owner for INTRANSIT state
1584         */
1585        task_t     *cll_intransit_owner;
1586        int                cll_error;
1587        /**
1588         * Number of holds on a lock. A hold prevents a lock from being
1589         * canceled and destroyed. Protected by cl_lock::cll_guard.
1590         *
1591         * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
1592         */
1593        int                cll_holds;
1594         /**
1595          * Number of lock users. Valid in cl_lock_state::CLS_HELD state
1596          * only. Lock user pins lock in CLS_HELD state. Protected by
1597          * cl_lock::cll_guard.
1598          *
1599          * \see cl_wait(), cl_unuse().
1600          */
1601        int                cll_users;
1602        /**
1603         * Flag bit-mask. Values from enum cl_lock_flags. Updates are
1604         * protected by cl_lock::cll_guard.
1605         */
1606        unsigned long    cll_flags;
1607        /**
1608         * A linkage into a list of locks in a closure.
1609         *
1610         * \see cl_lock_closure
1611         */
1612        struct list_head            cll_inclosure;
1613        /**
1614         * Conflicting lock at queuing time.
1615         */
1616        struct cl_lock       *cll_conflict;
1617        /**
1618         * A list of references to this lock, for debugging.
1619         */
1620        struct lu_ref    cll_reference;
1621        /**
1622         * A list of holds on this lock, for debugging.
1623         */
1624        struct lu_ref    cll_holders;
1625        /**
1626         * A reference for cl_lock::cll_descr::cld_obj. For debugging.
1627         */
1628        struct lu_ref_link   *cll_obj_ref;
1629#ifdef CONFIG_LOCKDEP
1630        /* "dep_map" name is assumed by lockdep.h macros. */
1631        struct lockdep_map    dep_map;
1632#endif
1633};
1634
1635/**
1636 * Per-layer part of cl_lock
1637 *
1638 * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
1639 */
1640struct cl_lock_slice {
1641        struct cl_lock            *cls_lock;
1642        /** Object slice corresponding to this lock slice. Immutable after
1643         * creation. */
1644        struct cl_object                *cls_obj;
1645        const struct cl_lock_operations *cls_ops;
1646        /** Linkage into cl_lock::cll_layers. Immutable after creation. */
1647        struct list_head                       cls_linkage;
1648};
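
/*
 * A layer typically embeds cl_lock_slice into its private lock structure and
 * recovers the container with container_of(). A minimal sketch (the "demo_"
 * names are illustrative, not part of the interface):
 *
 * \code
 * struct demo_lock {
 *         struct cl_lock_slice dl_cl;      // generic part, on cll_layers
 *         int                  dl_private; // layer-private state
 * };
 *
 * static inline struct demo_lock *cl2demo_lock(const struct cl_lock_slice *slice)
 * {
 *         return container_of(slice, struct demo_lock, dl_cl);
 * }
 * \endcode
 */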
1649
1650/**
1651 * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}().
1652 *
1653 * NOTE: lov_subresult() depends on ordering here.
1654 */
1655enum cl_lock_transition {
1656        /** operation cannot be completed immediately. Wait for state change. */
1657        CLO_WAIT        = 1,
1658        /** operation had to release lock mutex, restart. */
1659        CLO_REPEAT      = 2,
1660        /** lower layer re-enqueued. */
1661        CLO_REENQUEUED  = 3,
1662};
1663
1664/**
1665 *
1666 * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
1667 */
1668struct cl_lock_operations {
1669        /**
1670         * \name statemachine
1671         *
1672         * State machine transitions. These 3 methods are called to transfer
1673         * lock from one state to another, as described in the commentary
1674         * above enum #cl_lock_state.
1675         *
1676         * \retval 0      this layer has nothing more to do before the
1677         *                     transition to the target state happens;
1678         *
1679         * \retval CLO_REPEAT method had to release and re-acquire cl_lock
1680         *                  mutex, repeat invocation of transition method
1681         *                  across all layers;
1682         *
1683         * \retval CLO_WAIT   this layer cannot move to the target state
1684         *                  immediately, as it has to wait for certain event
1685         *                  (e.g., the communication with the server). It
1686         *                  is guaranteed, that when the state transfer
1687         *                  becomes possible, cl_lock::cll_wq wait-queue
1688         *                  is signaled. Caller can wait for this event by
1689         *                  calling cl_lock_state_wait();
1690         *
1691         * \retval -ve  failure, abort state transition, move the lock
1692         *                  into cl_lock_state::CLS_FREEING state, and set
1693         *                  cl_lock::cll_error.
1694         *
1695         * Once all layers voted to agree to transition (by returning 0), lock
1696         * is moved into corresponding target state. All state transition
1697         * methods are optional.
1698         */
1699        /** @{ */
1700        /**
1701         * Attempts to enqueue the lock. Called top-to-bottom.
1702         *
1703         * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
1704         * \see osc_lock_enqueue()
1705         */
1706        int  (*clo_enqueue)(const struct lu_env *env,
1707                            const struct cl_lock_slice *slice,
1708                            struct cl_io *io, __u32 enqflags);
1709        /**
1710         * Attempts to wait for enqueue result. Called top-to-bottom.
1711         *
1712         * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
1713         */
1714        int  (*clo_wait)(const struct lu_env *env,
1715                         const struct cl_lock_slice *slice);
1716        /**
1717         * Attempts to unlock the lock. Called bottom-to-top. In addition to
1718         * usual return values of lock state-machine methods, this can return
1719         * -ESTALE to indicate that lock cannot be returned to the cache, and
1720         * has to be re-initialized.
1721         * unuse is a one-shot operation, so it must NOT return CLO_WAIT.
1722         *
1723         * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse()
1724         */
1725        int  (*clo_unuse)(const struct lu_env *env,
1726                          const struct cl_lock_slice *slice);
1727        /**
1728         * Notifies layer that cached lock is started being used.
1729         *
1730         * \pre lock->cll_state == CLS_CACHED
1731         *
1732         * \see lov_lock_use(), osc_lock_use()
1733         */
1734        int  (*clo_use)(const struct lu_env *env,
1735                        const struct cl_lock_slice *slice);
1736        /** @} statemachine */
1737        /**
1738         * A method invoked when lock state is changed (as a result of state
1739         * transition). This is used, for example, to track when the state of
1740         * a sub-lock changes, to propagate this change to the corresponding
1741         * top-lock. Optional
1742         *
1743         * \see lovsub_lock_state()
1744         */
1745        void (*clo_state)(const struct lu_env *env,
1746                          const struct cl_lock_slice *slice,
1747                          enum cl_lock_state st);
1748        /**
1749         * Returns true iff the given lock is suitable for the given io, the
1750         * idea being that there are certain "unsafe" locks, e.g., ones acquired
1751         * for O_APPEND writes, that we don't want to re-use for a normal
1752         * write, to avoid the danger of cascading evictions. Optional. Runs
1753         * under cl_object_header::coh_lock_guard.
1754         *
1755         * XXX this should take more information about lock needed by
1756         * io. Probably lock description or something similar.
1757         *
1758         * \see lov_fits_into()
1759         */
1760        int (*clo_fits_into)(const struct lu_env *env,
1761                             const struct cl_lock_slice *slice,
1762                             const struct cl_lock_descr *need,
1763                             const struct cl_io *io);
1764        /**
1765         * \name ast
1766         * Asynchronous System Traps. All of them are optional; all are
1767         * executed bottom-to-top.
1768         */
1769        /** @{ */
1770
1771        /**
1772         * Cancellation callback. Cancel a lock voluntarily, or at the
1773         * request of the server.
1774         */
1775        void (*clo_cancel)(const struct lu_env *env,
1776                           const struct cl_lock_slice *slice);
1777        /**
1778         * Lock weighting ast. Executed to estimate how precious this lock
1779         * is. The sum of results across all layers is used to determine
1780         * whether the lock is worth keeping in cache given present memory usage.
1781         *
1782         * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
1783         */
1784        unsigned long (*clo_weigh)(const struct lu_env *env,
1785                                   const struct cl_lock_slice *slice);
1786        /** @} ast */
1787
1788        /**
1789         * \see lovsub_lock_closure()
1790         */
1791        int (*clo_closure)(const struct lu_env *env,
1792                           const struct cl_lock_slice *slice,
1793                           struct cl_lock_closure *closure);
1794        /**
1795         * Executed bottom-to-top when lock description changes (e.g., as a
1796         * result of server granting more generous lock than was requested).
1797         *
1798         * \see lovsub_lock_modify()
1799         */
1800        int (*clo_modify)(const struct lu_env *env,
1801                          const struct cl_lock_slice *slice,
1802                          const struct cl_lock_descr *updated);
1803        /**
1804         * Notifies layers (bottom-to-top) that lock is going to be
1805         * destroyed. Responsibility of layers is to prevent new references on
1806         * this lock from being acquired once this method returns.
1807         *
1808         * This can be called multiple times due to races.
1809         *
1810         * \see cl_lock_delete()
1811         * \see osc_lock_delete(), lovsub_lock_delete()
1812         */
1813        void (*clo_delete)(const struct lu_env *env,
1814                           const struct cl_lock_slice *slice);
1815        /**
1816         * Destructor. Frees resources and the slice.
1817         *
1818         * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
1819         * \see osc_lock_fini()
1820         */
1821        void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
1822        /**
1823         * Optional debugging helper. Prints given lock slice.
1824         */
1825        int (*clo_print)(const struct lu_env *env,
1826                         void *cookie, lu_printer_t p,
1827                         const struct cl_lock_slice *slice);
1828};
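
/*
 * A hedged sketch of a layer's clo_fits_into() method that refuses to match a
 * cached whole-file lock (e.g. one taken for an O_APPEND write) against
 * ordinary reads and writes, in the spirit of lov_lock_fits_into() described
 * above (the "demo_" name is illustrative; CL_PAGE_EOF is the whole-file end
 * index defined earlier in this header):
 *
 * \code
 * static int demo_lock_fits_into(const struct lu_env *env,
 *                                const struct cl_lock_slice *slice,
 *                                const struct cl_lock_descr *need,
 *                                const struct cl_io *io)
 * {
 *         const struct cl_lock_descr *has = &slice->cls_lock->cll_descr;
 *         int whole_file = has->cld_start == 0 && has->cld_end == CL_PAGE_EOF;
 *
 *         // match a whole-file lock only when the io itself asks for the
 *         // whole file (truncate, O_APPEND write); otherwise require plain
 *         // extent containment
 *         if (whole_file &&
 *             !(need->cld_start == 0 && need->cld_end == CL_PAGE_EOF))
 *                 return 0;
 *         return need->cld_start >= has->cld_start &&
 *                need->cld_end <= has->cld_end;
 * }
 * \endcode
 */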
1829
1830#define CL_LOCK_DEBUG(mask, env, lock, format, ...)                  \
1831do {                                                                \
1832        LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);                \
1833                                                                        \
1834        if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {              \
1835                cl_lock_print(env, &msgdata, lu_cdebug_printer, lock);  \
1836                CDEBUG(mask, format , ## __VA_ARGS__);            \
1837        }                                                              \
1838} while (0)
1839
1840#define CL_LOCK_ASSERT(expr, env, lock) do {                        \
1841        if (likely(expr))                                              \
1842                break;                                            \
1843                                                                        \
1844        CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr);    \
1845        LBUG();                                                  \
1846} while (0)
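
/*
 * Usage sketch for the two macros above (the invariant shown is the
 * holds-versus-users relation documented for cl_lock):
 *
 *   CL_LOCK_DEBUG(D_DLMTRACE, env, lock, "state: %d\n", lock->cll_state);
 *   CL_LOCK_ASSERT(lock->cll_holds >= lock->cll_users, env, lock);
 */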
1847
1848/** @} cl_lock */
1849
1850/** \addtogroup cl_page_list cl_page_list
1851 * Page list used to perform collective operations on a group of pages.
1852 *
1853 * Pages are added to the list one by one. cl_page_list acquires a reference
1854 * for every page in it. Page list is used to perform collective operations on
1855 * pages:
1856 *
1857 *     - submit pages for an immediate transfer,
1858 *
1859 *     - own pages on behalf of certain io (waiting for each page in turn),
1860 *
1861 *     - discard pages.
1862 *
1863 * When list is finalized, it releases references on all pages it still has.
1864 *
1865 * \todo XXX concurrency control.
1866 *
1867 * @{
1868 */
1869struct cl_page_list {
1870        unsigned             pl_nr;
1871        struct list_head           pl_pages;
1872        task_t    *pl_owner;
1873};
1874
1875/**
1876 * A 2-queue of pages. A convenience data-type for common use case, 2-queue
1877 * contains an incoming page list and an outgoing page list.
1878 */
1879struct cl_2queue {
1880        struct cl_page_list c2_qin;
1881        struct cl_page_list c2_qout;
1882};
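
/*
 * A minimal sketch of collecting pages into a cl_page_list and disposing of
 * it afterwards (it assumes the cl_page_list_init(), cl_page_list_add() and
 * cl_page_list_fini() helpers declared later in this header):
 *
 * \code
 * struct cl_page_list plist;
 *
 * cl_page_list_init(&plist);       // empty list, pl_nr == 0
 * cl_page_list_add(&plist, page);  // list takes a reference on the page
 * // ... submit, own or discard the collected pages ...
 * cl_page_list_fini(env, &plist);  // releases the remaining references
 * \endcode
 */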
1883
1884/** @} cl_page_list */
1885
1886/** \addtogroup cl_io cl_io
1887 * @{ */
1888/** \struct cl_io
1889 * I/O
1890 *
1891 * cl_io represents a high level I/O activity like
1892 * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
1893 * lock.
1894 *
1895 * cl_io is a layered object, much like cl_{object,page,lock} but with one
1896 * important distinction. We want to minimize the number of calls to the allocator
1897 * in the fast path, e.g., in the case of read(2) when everything is cached:
1898 * client already owns the lock over region being read, and data are cached
1899 * due to read-ahead. To avoid allocation of cl_io layers in such situations,
1900 * per-layer io state is stored in the session, associated with the io, see
1901 * struct {vvp,lov,osc}_io for example. Session allocation is amortized
1902 * by using free-lists, see cl_env_get().
1903 *
1904 * There is a small predefined number of possible io types, enumerated in enum
1905 * cl_io_type.
1906 *
1907 * cl_io is a state machine that can be advanced concurrently by multiple
1908 * threads. It is up to these threads to control the concurrency and,
1909 * specifically, to detect when io is done, and its state can be safely
1910 * released.
1911 *
1912 * For read/write io the overall execution plan is as follows:
1913 *
1914 *     (0) initialize io state through all layers;
1915 *
1916 *     (1) loop: prepare chunk of work to do
1917 *
1918 *     (2) call all layers to collect locks they need to process current chunk
1919 *
1920 *     (3) sort all locks to avoid dead-locks, and acquire them
1921 *
1922 *     (4) process the chunk: call per-page methods
1923 *       (cl_io_operations::cio_read_page() for read,
1924 *       cl_io_operations::cio_prepare_write(),
1925 *       cl_io_operations::cio_commit_write() for write)
1926 *
1927 *     (5) release locks
1928 *
1929 *     (6) repeat loop.
1930 *
1931 * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to
1932 * address allocation efficiency issues mentioned above), and returns with the
1933 * special error condition from a per-page method when the current sub-io has to
1934 * block. This causes the io loop to be repeated, and lov switches to the next
1935 * sub-io in its cl_io_operations::cio_iter_init() implementation.
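 *
 * A hedged sketch of the caller side of this plan for a read (cl_io_rw_init(),
 * cl_io_loop() and cl_io_fini() are declared later in this header; the io
 * itself comes from per-thread session storage as explained above, and error
 * handling is trimmed):
 *
 * \code
 * io->ci_obj = obj;
 * result = cl_io_rw_init(env, io, CIT_READ, pos, count);  // step (0)
 * if (result == 0)
 *         result = cl_io_loop(env, io);                   // steps (1)-(6)
 * cl_io_fini(env, io);
 * \endcode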
1936 */
1937
1938/** IO types */
1939enum cl_io_type {
1940        /** read system call */
1941        CIT_READ,
1942        /** write system call */
1943        CIT_WRITE,
1944        /** truncate, utime system calls */
1945        CIT_SETATTR,
1946        /**
1947         * page fault handling
1948         */
1949        CIT_FAULT,
1950        /**
1951         * fsync system call handling
1952         * To write out a range of file
1953         */
1954        CIT_FSYNC,
1955        /**
1956         * Miscellaneous io. This is used for occasional io activity that
1957         * doesn't fit into other types. Currently this is used for:
1958         *
1959         *     - cancellation of an extent lock. This io exists as a context
1960         *     to write dirty pages from under the lock being canceled back
1961         *     to the server;
1962         *
1963         *     - VM induced page write-out. An io context for writing page out
1964         *     for memory cleansing;
1965         *
1966         *     - glimpse. An io context to acquire glimpse lock.
1967         *
1968         *     - grouplock. An io context to acquire group lock.
1969         *
1970         * CIT_MISC io is used simply as a context in which locks and pages
1971         * are manipulated. Such io has no internal "process", that is,
1972         * cl_io_loop() is never called for it.
1973         */
1974        CIT_MISC,
1975        CIT_OP_NR
1976};
1977
1978/**
1979 * States of cl_io state machine
1980 */
1981enum cl_io_state {
1982        /** Not initialized. */
1983        CIS_ZERO,
1984        /** Initialized. */
1985        CIS_INIT,
1986        /** IO iteration started. */
1987        CIS_IT_STARTED,
1988        /** Locks taken. */
1989        CIS_LOCKED,
1990        /** Actual IO is in progress. */
1991        CIS_IO_GOING,
1992        /** IO for the current iteration finished. */
1993        CIS_IO_FINISHED,
1994        /** Locks released. */
1995        CIS_UNLOCKED,
1996        /** Iteration completed. */
1997        CIS_IT_ENDED,
1998        /** cl_io finalized. */
1999        CIS_FINI
2000};
2001
2002/**
2003 * IO state private for a layer.
2004 *
2005 * This is usually embedded into layer session data, rather than allocated
2006 * dynamically.
2007 *
2008 * \see vvp_io, lov_io, osc_io, ccc_io
2009 */
2010struct cl_io_slice {
2011        struct cl_io              *cis_io;
2012        /** corresponding object slice. Immutable after creation. */
2013        struct cl_object              *cis_obj;
2014        /** io operations. Immutable after creation. */
2015        const struct cl_io_operations *cis_iop;
2016        /**
2017         * linkage into a list of all slices for a given cl_io, hanging off
2018         * cl_io::ci_layers. Immutable after creation.
2019         */
2020        struct list_head                     cis_linkage;
2021};
2022
2023
2024/**
2025 * Per-layer io operations.
2026 * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
2027 */
2028struct cl_io_operations {
2029        /**
2030         * Vector of io state transition methods for every io type.
2031         *
2032         * \see cl_page_operations::io
2033         */
2034        struct {
2035                /**
2036                 * Prepare io iteration at a given layer.
2037                 *
2038                 * Called top-to-bottom at the beginning of each iteration of
2039                 * "io loop" (if it makes sense for this type of io). Here
2040                 * layer selects what work it will do during this iteration.
2041                 *
2042                 * \see cl_io_operations::cio_iter_fini()
2043                 */
2044                int (*cio_iter_init) (const struct lu_env *env,
2045                                      const struct cl_io_slice *slice);
2046                /**
2047                 * Finalize io iteration.
2048                 *
2049                 * Called bottom-to-top at the end of each iteration of "io
2050                 * loop". Here layers can decide whether IO has to be
2051                 * continued.
2052                 *
2053                 * \see cl_io_operations::cio_iter_init()
2054                 */
2055                void (*cio_iter_fini) (const struct lu_env *env,
2056                                       const struct cl_io_slice *slice);
2057                /**
2058                 * Collect locks for the current iteration of io.
2059                 *
2060                 * Called top-to-bottom to collect all locks necessary for
2061                 * this iteration. This method shouldn't actually enqueue
2062                 * anything, instead it should post a lock through
2063                 * cl_io_lock_add(). Once all locks are collected, they are
2064                 * sorted and enqueued in the proper order.
2065                 */
2066                int  (*cio_lock) (const struct lu_env *env,
2067                                  const struct cl_io_slice *slice);
2068                /**
2069                 * Finalize unlocking.
2070                 *
2071                 * Called bottom-to-top to finish layer specific unlocking
2072                 * functionality, after generic code released all locks
2073                 * acquired by cl_io_operations::cio_lock().
2074                 */
2075                void  (*cio_unlock)(const struct lu_env *env,
2076                                    const struct cl_io_slice *slice);
2077                /**
2078                 * Start io iteration.
2079                 *
2080                 * Once all locks are acquired, called top-to-bottom to
2081                 * commence actual IO. In the current implementation,
2082                 * top-level vvp_io_{read,write}_start() does all the work
2083                 * synchronously by calling generic_file_*(), so other layers
2084                 * are called when everything is done.
2085                 */
2086                int  (*cio_start)(const struct lu_env *env,
2087                                  const struct cl_io_slice *slice);
2088                /**
2089                 * Called top-to-bottom at the end of io loop. Here layer
2090                 * might wait for an unfinished asynchronous io.
2091                 */
2092                void (*cio_end)  (const struct lu_env *env,
2093                                  const struct cl_io_slice *slice);
2094                /**
2095                 * Called bottom-to-top to notify layers that read/write IO
2096                 * iteration finished, with \a nob bytes transferred.
2097                 */
2098                void (*cio_advance)(const struct lu_env *env,
2099                                    const struct cl_io_slice *slice,
2100                                    size_t nob);
2101                /**
2102                 * Called once per io, bottom-to-top to release io resources.
2103                 */
2104                void (*cio_fini) (const struct lu_env *env,
2105                                  const struct cl_io_slice *slice);
2106        } op[CIT_OP_NR];
2107        struct {
2108                /**
2109                 * Submit pages from \a queue->c2_qin for IO, and move
2110                 * successfully submitted pages into \a queue->c2_qout. Return
2111                 * non-zero if it failed to submit even a single page. If
2112                 * submission failed after some pages were moved into \a
2113                 * queue->c2_qout, completion callback with non-zero ioret is
2114                 * executed on them.
2115                 */
2116                int  (*cio_submit)(const struct lu_env *env,
2117                                   const struct cl_io_slice *slice,
2118                                   enum cl_req_type crt,
2119                                   struct cl_2queue *queue);
2120        } req_op[CRT_NR];
2121        /**
2122         * Read missing page.
2123         *
2124         * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
2125         * method when it hits a not-up-to-date page in the range. Optional.
2126         *
2127         * \pre io->ci_type == CIT_READ
2128         */
2129        int (*cio_read_page)(const struct lu_env *env,
2130                             const struct cl_io_slice *slice,
2131                             const struct cl_page_slice *page);
2132        /**
2133         * Prepare write of a \a page. Called bottom-to-top by a top-level
2134         * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare the page to
2135         * receive data from a user-level buffer.
2136         *
2137         * \pre io->ci_type == CIT_WRITE
2138         *
2139         * \see vvp_io_prepare_write(), lov_io_prepare_write(),
2140         * osc_io_prepare_write().
2141         */
2142        int (*cio_prepare_write)(const struct lu_env *env,
2143                                 const struct cl_io_slice *slice,
2144                                 const struct cl_page_slice *page,
2145                                 unsigned from, unsigned to);
2146        /**
2147         *
2148         * \pre io->ci_type == CIT_WRITE
2149         *
2150         * \see vvp_io_commit_write(), lov_io_commit_write(),
2151         * osc_io_commit_write().
2152         */
2153        int (*cio_commit_write)(const struct lu_env *env,
2154                                const struct cl_io_slice *slice,
2155                                const struct cl_page_slice *page,
2156                                unsigned from, unsigned to);
2157        /**
2158         * Optional debugging helper. Print given io slice.
2159         */
2160        int (*cio_print)(const struct lu_env *env, void *cookie,
2161                         lu_printer_t p, const struct cl_io_slice *slice);
2162};
2163
2164/**
2165 * Flags to lock enqueue procedure.
2166 * \ingroup cl_lock
2167 */
2168enum cl_enq_flags {
2169        /**
2170         * instruct the server not to block if a conflicting lock is found;
2171         * instead, -EWOULDBLOCK is returned immediately.
2172         */
2173        CEF_NONBLOCK     = 0x00000001,
2174        /**
2175         * take lock asynchronously (out of order), as it cannot
2176         * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
2177         */
2178        CEF_ASYNC       = 0x00000002,
2179        /**
2180         * tell the server to instruct (through a flag in the blocking ast) an
2181         * owner of the conflicting lock, that it can drop dirty pages
2182         * protected by this lock, without sending them to the server.
2183         */
2184        CEF_DISCARD_DATA = 0x00000004,
2185        /**
2186         * tell the sub layers that it must be a `real' lock. This is used for
2187         * mmapped-buffer locks and glimpse locks that must never be converted
2188         * into lockless mode.
2189         *
2190         * \see vvp_mmap_locks(), cl_glimpse_lock().
2191         */
2192        CEF_MUST         = 0x00000008,
2193        /**
2194         * tell the sub layers to never request a `real' lock. This flag is
2195         * not used currently.
2196         *
2197         * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless
2198         * conversion policy: ci_lockreq describes generic information of lock
2199         * requirement for this IO, especially for locks which belong to the
2200         * object doing IO; however, lock itself may have precise requirements
2201         * that are described by the enqueue flags.
2202         */
2203        CEF_NEVER       = 0x00000010,
2204        /**
2205         * for async glimpse lock.
2206         */
2207        CEF_AGL   = 0x00000020,
2208        /**
2209         * mask of enq_flags.
2210         */
2211        CEF_MASK         = 0x0000003f,
2212};
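
/*
 * Enqueue flags are combined into cl_lock_descr::cld_enq_flags; for instance,
 * a glimpse-style lock that may be enqueued out of order and must stay a
 * `real' (non-lockless) lock could be described as (illustrative only):
 *
 *   descr->cld_enq_flags = CEF_ASYNC | CEF_MUST;
 */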
2213
2214/**
2215 * Link between lock and io. Intermediate structure is needed, because the
2216 * same lock can be part of multiple io's simultaneously.
2217 */
2218struct cl_io_lock_link {
2219        /** linkage into one of cl_lockset lists. */
2220        struct list_head           cill_linkage;
2221        struct cl_lock_descr cill_descr;
2222        struct cl_lock      *cill_lock;
2223        /** optional destructor */
2224        void           (*cill_fini)(const struct lu_env *env,
2225                                        struct cl_io_lock_link *link);
2226};
2227
2228/**
2229 * Lock-set represents a collection of locks that an io needs at a
2230 * time. Generally speaking, client tries to avoid holding multiple locks when
2231 * possible, because
2232 *
2233 *      - holding extent locks over multiple ost's introduces the danger of
2234 *      "cascading timeouts";
2235 *
2236 *      - holding multiple locks over the same ost is still dead-lock prone,
2237 *      see comment in osc_lock_enqueue(),
2238 *
2239 * but there are certain situations where this is unavoidable:
2240 *
2241 *      - O_APPEND writes have to take [0, EOF] lock for correctness;
2242 *
2243 *      - truncate has to take [new-size, EOF] lock for correctness;
2244 *
2245 *      - SNS has to take locks across full stripe for correctness;
2246 *
2247 *      - in the case when a user level buffer, supplied to {read,write}(file0),
2248 *      is a part of a memory mapped lustre file, the client has to take dlm
2249 *      locks on file0 and on all files that back the buffer (or the part of
2250 *      the buffer that is being processed in the current chunk); in any
2251 *      case, there are situations where at least 2 locks are necessary.
2252 *
2253 * In such cases we at least try to take locks in the same consistent
2254 * order. To this end, all locks are first collected, then sorted, and then
2255 * enqueued.
2256 */
2257struct cl_lockset {
2258        /** locks to be acquired. */
2259        struct list_head  cls_todo;
2260        /** locks currently being processed. */
2261        struct list_head  cls_curr;
2262        /** locks acquired. */
2263        struct list_head  cls_done;
2264};
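
/*
 * Walking the locks of an io is a plain list traversal over
 * cl_io_lock_link::cill_linkage; for example, over the already acquired ones
 * (sketch):
 *
 *   struct cl_io_lock_link *link;
 *
 *   list_for_each_entry(link, &io->ci_lockset.cls_done, cill_linkage)
 *           inspect(link->cill_lock);   // "inspect" is a placeholder
 */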
2265
2266/**
2267 * Lock requirements (demand) for IO. It should be cl_io_lock_req,
2268 * but 'req' is always to be thought of as 'request' :-)
2269 */
2270enum cl_io_lock_dmd {
2271        /** Always lock data (e.g., O_APPEND). */
2272        CILR_MANDATORY = 0,
2273        /** Layers are free to decide between local and global locking. */
2274        CILR_MAYBE,
2275        /** Never lock: there is no cache (e.g., liblustre). */
2276        CILR_NEVER
2277};
2278
2279enum cl_fsync_mode {
2280        /** start writeback, do not wait for them to finish */
2281        CL_FSYNC_NONE  = 0,
2282        /** start writeback and wait for them to finish */
2283        CL_FSYNC_LOCAL = 1,
2284        /** discard all of dirty pages in a specific file range */
2285        CL_FSYNC_DISCARD = 2,
2286        /** start writeback and make sure the data has reached storage before
2287         * returning. An OST_SYNC RPC must be issued and finished */
2288        CL_FSYNC_ALL   = 3
2289};
2290
2291struct cl_io_rw_common {
2292        loff_t      crw_pos;
2293        size_t      crw_count;
2294        int      crw_nonblock;
2295};
2296
2297
2298/**
2299 * State for io.
2300 *
2301 * cl_io is shared by all threads participating in this IO (in the current
2302 * implementation only one thread advances IO, but the parallel IO design and
2303 * concurrent copy_*_user() require multiple threads acting on the same IO).
2304 * It is up to these threads to serialize their activities, including updates to
2305 * mutable cl_io fields.
2306 */
2307struct cl_io {
2308        /** type of this IO. Immutable after creation. */
2309        enum cl_io_type         ci_type;
2310        /** current state of cl_io state machine. */
2311        enum cl_io_state               ci_state;
2312        /** main object this io is against. Immutable after creation. */
2313        struct cl_object              *ci_obj;
2314        /**
2315         * Upper layer io, of which this io is a part of. Immutable after
2316         * creation.
2317         */
2318        struct cl_io              *ci_parent;
2319        /** List of slices. Immutable after creation. */
2320        struct list_head                     ci_layers;
2321        /** list of locks (to be) acquired by this io. */
2322        struct cl_lockset             ci_lockset;
2323        /** lock requirements, this is just a help info for sublayers. */
2324        enum cl_io_lock_dmd         ci_lockreq;
2325        union {
2326                struct cl_rd_io {
2327                        struct cl_io_rw_common rd;
2328                } ci_rd;
2329                struct cl_wr_io {
2330                        struct cl_io_rw_common wr;
2331                        int                 wr_append;
2332                        int                 wr_sync;
2333                } ci_wr;
2334                struct cl_io_rw_common ci_rw;
2335                struct cl_setattr_io {
2336                        struct ost_lvb   sa_attr;
2337                        unsigned int     sa_valid;
2338                        struct obd_capa *sa_capa;
2339                } ci_setattr;
2340                struct cl_fault_io {
2341                        /** page index within file. */
2342                        pgoff_t  ft_index;
2343                        /** number of valid bytes on a faulted page. */
2344                        int          ft_nob;
2345                        /** writable page? for nopage() only */
2346                        int          ft_writable;
2347                        /** page of an executable? */
2348                        int          ft_executable;
2349                        /** page_mkwrite() */
2350                        int          ft_mkwrite;
2351                        /** resulting page */
2352                        struct cl_page *ft_page;
2353                } ci_fault;
2354                struct cl_fsync_io {
2355                        loff_t       fi_start;
2356                        loff_t       fi_end;
2357                        struct obd_capa   *fi_capa;
2358                        /** file system level fid */
2359                        struct lu_fid     *fi_fid;
2360                        enum cl_fsync_mode fi_mode;
2361                        /* how many pages were written/discarded */
2362                        unsigned int       fi_nr_written;
2363                } ci_fsync;
2364        } u;
2365        struct cl_2queue     ci_queue;
2366        size_t         ci_nob;
2367        int               ci_result;
2368        unsigned int     ci_continue:1,
2369        /**
2370         * This io holds a grouplock, to inform sublayers that they
2371         * should not do lockless i/o.
2372         */
2373                             ci_no_srvlock:1,
2374        /**
2375         * The whole IO needs to be restarted because the layout has changed
2376         */
2377                             ci_need_restart:1,
2378        /**
2379         * to not refresh the layout - the IO issuer knows that the layout won't
2380         * change (page operations; a layout change causes all pages to be
2381         * discarded), or it doesn't matter if it changes (sync).
2382         */
2383                             ci_ignore_layout:1,
2384        /**
2385         * Check if the layout changed after the IO finishes. Mainly for the HSM
2386         * requirement. If IO occurs on open files, it doesn't need to
2387         * verify the layout because HSM won't release open files.
2388         * Right now, only two operations need to verify the layout: glimpse
2389         * and setattr.
2390         */
2391                             ci_verify_layout:1;
2392        /**
2393         * Number of pages owned by this IO. For invariant checking.
2394         */
2395        unsigned             ci_owned_nr;
2396};
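
/*
 * A hedged sketch of describing an fsync-style io with the union above; the
 * type and layers are set up by cl_io_init(env, io, CIT_FSYNC, obj), declared
 * later in this header, and the surrounding loop/fini calls are the same as
 * for read/write:
 *
 *   io->u.ci_fsync.fi_start      = start;
 *   io->u.ci_fsync.fi_end        = end;
 *   io->u.ci_fsync.fi_mode       = CL_FSYNC_LOCAL;
 *   io->u.ci_fsync.fi_nr_written = 0;
 */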
2397
2398/** @} cl_io */
2399
2400/** \addtogroup cl_req cl_req
2401 * @{ */
2402/** \struct cl_req
2403 * Transfer.
2404 *
2405 * There are two possible modes of transfer initiation on the client:
2406 *
2407 *     - immediate transfer: this is started when a high level io wants a page
2408 *       or a collection of pages to be transferred right away. Examples:
2409 *       read-ahead, synchronous read in the case of non-page aligned write,
2410 *       page write-out as a part of extent lock cancellation, page write-out
2411 *       as a part of memory cleansing. Immediate transfer can be both
2412 *       cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
2413 *
2414 *     - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
2415 *       when io wants to transfer a page to the server some time later, when
2416 *       it can be done efficiently. Example: pages dirtied by the write(2)
2417 *       path.
2418 *
2419 * In any case, transfer takes place in the form of a cl_req, which is a
2420 * representation for a network RPC.
2421 *
2422 * Pages queued for an opportunistic transfer are cached until it is decided
2423 * that efficient RPC can be composed of them. This decision is made by "a
2424 * req-formation engine", currently implemented as a part of the osc
2425 * layer. Req-formation depends on many factors: the size of the resulting
2426 * RPC, whether or not multi-object RPCs are supported by the server,
2427 * max-rpc-in-flight limitations, size of the dirty cache, etc.
2428 *
2429 * For the immediate transfer io submits a cl_page_list, which the req-formation
2430 * engine slices into cl_req's, possibly adding cached pages to some of
2431 * the resulting req's.
2432 *
2433 * Whenever a page from cl_page_list is added to a newly constructed req, its
2434 * cl_page_operations::cpo_prep() layer methods are called. At that moment,
2435 * page state is atomically changed from cl_page_state::CPS_OWNED to
2436 * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
2437 * is zeroed, and cl_page::cp_req is set to the
2438 * req. cl_page_operations::cpo_prep() method at the particular layer might
2439 * return -EALREADY to indicate that it does not need to submit this page
2440 * at all. This is possible, for example, if a page submitted for read
2441 * became up-to-date in the meantime; or, for write, the page doesn't have
2442 * the dirty bit set. \see cl_io_submit_rw()
2443 *
2444 * Whenever a cached page is added to a newly constructed req, its
2445 * cl_page_operations::cpo_make_ready() layer methods are called. At that
2446 * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
2447 * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
2448 * req. cl_page_operations::cpo_make_ready() method at the particular layer
2449 * might return -EAGAIN to indicate that this page is not eligible for the
2450 * transfer right now.
2451 *
2452 * FUTURE
2453 *
2454 * Plan is to divide transfers into "priority bands" (indicated when
2455 * submitting cl_page_list, and queuing a page for the opportunistic transfer)
2456 * and allow gluing of cached pages to immediate transfers only within a single
2457 * band. This would make high priority transfers (like lock cancellation or
2458 * memory pressure induced write-out) really high priority.
2459 *
2460 */
2461
2462/**
2463 * Per-transfer attributes.
2464 */
2465struct cl_req_attr {
2466        /** Generic attributes for the server consumption. */
2467        struct obdo     *cra_oa;
2468        /** Capability. */
2469        struct obd_capa *cra_capa;
2470        /** Jobid */
2471        char             cra_jobid[JOBSTATS_JOBID_SIZE];
2472};
2473
2474/**
2475 * Transfer request operations definable at every layer.
2476 *
2477 * Concurrency: transfer formation engine synchronizes calls to all transfer
2478 * methods.
2479 */
2480struct cl_req_operations {
2481        /**
2482         * Invoked top-to-bottom by cl_req_prep() when transfer formation is
2483         * complete (all pages are added).
2484         *
2485         * \see osc_req_prep()
2486         */
2487        int  (*cro_prep)(const struct lu_env *env,
2488                         const struct cl_req_slice *slice);
2489        /**
2490         * Called top-to-bottom to fill in \a oa fields. This is called twice
2491         * with different flags, see bug 10150 and osc_build_req().
2492         *
2493         * \param obj an object from cl_req whose attributes are to be set in
2494         *          \a oa.
2495         *
2496         * \param oa struct obdo where attributes are placed
2497         *
2498         * \param flags \a oa fields to be filled.
2499         */
2500        void (*cro_attr_set)(const struct lu_env *env,
2501                             const struct cl_req_slice *slice,
2502                             const struct cl_object *obj,
2503                             struct cl_req_attr *attr, obd_valid flags);
2504        /**
2505         * Called top-to-bottom from cl_req_completion() to notify layers that
2506         * transfer completed. Has to free all state allocated by
2507         * cl_device_operations::cdo_req_init().
2508         */
2509        void (*cro_completion)(const struct lu_env *env,
2510                               const struct cl_req_slice *slice, int ioret);
2511};
2512
2513/**
2514 * A per-object state that (potentially multi-object) transfer request keeps.
2515 */
2516struct cl_req_obj {
2517        /** object itself */
2518        struct cl_object   *ro_obj;
2519        /** reference to cl_req_obj::ro_obj. For debugging. */
2520        struct lu_ref_link *ro_obj_ref;
2521        /* something else? Number of pages for a given object? */
2522};
2523
2524/**
2525 * Transfer request.
2526 *
2527 * Transfer requests are not reference counted, because IO sub-system owns
2528 * them exclusively and knows when to free them.
2529 *
2530 * Life cycle.
2531 *
2532 * cl_req is created by cl_req_alloc() that calls
2533 * cl_device_operations::cdo_req_init() device methods to allocate per-req
2534 * state in every layer.
2535 *
2536 * Then pages are added (cl_req_page_add()), req keeps track of all objects it
2537 * contains pages for.
2538 *
2539 * Once all pages were collected, cl_page_operations::cpo_prep() method is
2540 * called top-to-bottom. At that point layers can modify req, let it pass, or
2541 * deny it completely. This is to support things like SNS that have transfer
2542 * ordering requirements invisible to the individual req-formation engine.
2543 *
2544 * On transfer completion (or transfer timeout, or failure to initiate the
2545 * transfer of an allocated req), cl_req_operations::cro_completion() method
2546 * is called, after execution of cl_page_operations::cpo_completion() of all
2547 * req's pages.
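 *
 * A hedged sketch of that life cycle from the req-formation engine's point of
 * view (cl_req_alloc(), cl_req_page_add(), cl_req_prep() and
 * cl_req_completion() are declared later in this header; error handling is
 * trimmed):
 *
 * \code
 * req = cl_req_alloc(env, page, CRT_WRITE, 1);  // one object in this req
 * cl_req_page_add(env, req, page);              // may be repeated for more pages
 * result = cl_req_prep(env, req);               // cro_prep() top-to-bottom
 * // ... transfer takes place ...
 * cl_req_completion(env, req, ioret);           // cro_completion(), frees req
 * \endcode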
2548 */
2549struct cl_req {
2550        enum cl_req_type      crq_type;
2551        /** A list of pages being transferred */
2552        struct list_head            crq_pages;
2553        /** Number of pages in cl_req::crq_pages */
2554        unsigned              crq_nrpages;
2555        /** An array of objects whose pages are in ->crq_pages */
2556        struct cl_req_obj    *crq_o;
2557        /** Number of elements in cl_req::crq_o[] */
2558        unsigned              crq_nrobjs;
2559        struct list_head            crq_layers;
2560};
2561
2562/**
2563 * Per-layer state for request.
2564 */
2565struct cl_req_slice {
2566        struct cl_req    *crs_req;
2567        struct cl_device *crs_dev;
2568        struct list_head        crs_linkage;
2569        const struct cl_req_operations *crs_ops;
2570};
2571
2572/* @} cl_req */
2573
2574enum cache_stats_item {
2575        /** how many cache lookups were performed */
2576        CS_lookup = 0,
2577        /** how many times cache lookup resulted in a hit */
2578        CS_hit,
2579        /** how many entities are in the cache right now */
2580        CS_total,
2581        /** how many entities in the cache are actively used (and cannot be
2582         * evicted) right now */
2583        CS_busy,
2584        /** how many entities were created at all */
2585        CS_create,
2586        CS_NR
2587};
2588
2589#define CS_NAMES { "lookup", "hit", "total", "busy", "create" }
2590
2591/**
2592 * Stats for a generic cache (similar to inode, lu_object, etc. caches).
2593 */
2594struct cache_stats {
2595        const char    *cs_name;
2596        atomic_t   cs_stats[CS_NR];
2597};
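
/*
 * Counters are bumped with the usual atomic helpers; e.g. a cache lookup that
 * hits could do (illustrative):
 *
 *   atomic_inc(&cs->cs_stats[CS_lookup]);
 *   if (found)
 *           atomic_inc(&cs->cs_stats[CS_hit]);
 */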
2598
2599/** These are not exported so far */
2600void cache_stats_init (struct cache_stats *cs, const char *name);
2601
2602/**
2603 * Client-side site. This represents a particular client stack. "Global"
2604 * variables should (directly or indirectly) be added here to allow multiple
2605 * clients to co-exist in the single address space.
2606 */
2607struct cl_site {
2608        struct lu_site  cs_lu;
2609        /**
2610         * Statistical counters. Atomics do not scale; something better, such
2611         * as per-cpu counters, is needed.
2612         *
2613         * These are exported as /proc/fs/lustre/llite/.../site
2614         *
2615         * When interpreting these, keep in mind that both sub-locks (and
2616         * sub-pages) and top-locks (and top-pages) are accounted here.
2617         */
2618        struct cache_stats    cs_pages;
2619        struct cache_stats    cs_locks;
2620        atomic_t          cs_pages_state[CPS_NR];
2621        atomic_t          cs_locks_state[CLS_NR];
2622};
2623
2624int  cl_site_init (struct cl_site *s, struct cl_device *top);
2625void cl_site_fini (struct cl_site *s);
2626void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
2627
2628/**
2629 * Print client site statistical counters to a seq_file. Suitable for
2630 * ll_rd_*()-style functions.
2631 */
2632int cl_site_stats_print(const struct cl_site *site, struct seq_file *m);
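
/*
 * Illustrative sketch (editorial): dumping the counters of a cl_site from a
 * seq_file "show" callback. How the cl_site pointer reaches m->private is an
 * assumption of this sketch.
 *
 * \code
 * static int foo_site_stats_seq_show(struct seq_file *m, void *v)
 * {
 *         struct cl_site *site = m->private;
 *
 *         return cl_site_stats_print(site, m);
 * }
 * \endcode
 */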
2633
2634/**
2635 * \name helpers
2636 *
2637 * Type conversion and accessor functions.
2638 */
2639/** @{ */
2640
2641static inline struct cl_site *lu2cl_site(const struct lu_site *site)
2642{
2643        return container_of(site, struct cl_site, cs_lu);
2644}
2645
2646static inline int lu_device_is_cl(const struct lu_device *d)
2647{
2648        return d->ld_type->ldt_tags & LU_DEVICE_CL;
2649}
2650
2651static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
2652{
2653        LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
2654        return container_of0(d, struct cl_device, cd_lu_dev);
2655}
2656
2657static inline struct lu_device *cl2lu_dev(struct cl_device *d)
2658{
2659        return &d->cd_lu_dev;
2660}
2661
2662static inline struct cl_object *lu2cl(const struct lu_object *o)
2663{
2664        LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
2665        return container_of0(o, struct cl_object, co_lu);
2666}
2667
2668static inline const struct cl_object_conf *
2669lu2cl_conf(const struct lu_object_conf *conf)
2670{
2671        return container_of0(conf, struct cl_object_conf, coc_lu);
2672}
2673
2674static inline struct cl_object *cl_object_next(const struct cl_object *obj)
2675{
2676        return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
2677}
2678
2679static inline struct cl_device *cl_object_device(const struct cl_object *o)
2680{
2681        LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
2682        return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
2683}
2684
2685static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
2686{
2687        return container_of0(h, struct cl_object_header, coh_lu);
2688}
2689
2690static inline struct cl_site *cl_object_site(const struct cl_object *obj)
2691{
2692        return lu2cl_site(obj->co_lu.lo_dev->ld_site);
2693}
2694
2695static inline
2696struct cl_object_header *cl_object_header(const struct cl_object *obj)
2697{
2698        return luh2coh(obj->co_lu.lo_header);
2699}
2700
2701static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
2702{
2703        return lu_device_init(&d->cd_lu_dev, t);
2704}
2705
2706static inline void cl_device_fini(struct cl_device *d)
2707{
2708        lu_device_fini(&d->cd_lu_dev);
2709}
2710
2711void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
2712                       struct cl_object *obj,
2713                       const struct cl_page_operations *ops);
2714void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
2715                       struct cl_object *obj,
2716                       const struct cl_lock_operations *ops);
2717void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
2718                     struct cl_object *obj, const struct cl_io_operations *ops);
2719void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
2720                      struct cl_device *dev,
2721                      const struct cl_req_operations *ops);
2722/** @} helpers */
2723
2724/** \defgroup cl_object cl_object
2725 * @{ */
2726struct cl_object *cl_object_top (struct cl_object *o);
2727struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
2728                                 const struct lu_fid *fid,
2729                                 const struct cl_object_conf *c);
2730
2731int  cl_object_header_init(struct cl_object_header *h);
2732void cl_object_header_fini(struct cl_object_header *h);
2733void cl_object_put      (const struct lu_env *env, struct cl_object *o);
2734void cl_object_get      (struct cl_object *o);
2735void cl_object_attr_lock  (struct cl_object *o);
2736void cl_object_attr_unlock(struct cl_object *o);
2737int  cl_object_attr_get   (const struct lu_env *env, struct cl_object *obj,
2738                           struct cl_attr *attr);
2739int  cl_object_attr_set   (const struct lu_env *env, struct cl_object *obj,
2740                           const struct cl_attr *attr, unsigned valid);
2741int  cl_object_glimpse    (const struct lu_env *env, struct cl_object *obj,
2742                           struct ost_lvb *lvb);
2743int  cl_conf_set          (const struct lu_env *env, struct cl_object *obj,
2744                           const struct cl_object_conf *conf);
2745void cl_object_prune      (const struct lu_env *env, struct cl_object *obj);
2746void cl_object_kill       (const struct lu_env *env, struct cl_object *obj);
2747int  cl_object_has_locks  (struct cl_object *obj);
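
/*
 * Illustrative sketch (editorial): reading object attributes. That the
 * attribute lock must be held around cl_object_attr_get()/_set() is an
 * assumption of this sketch, based on the cl_object_attr_lock() interface.
 *
 * \code
 * cl_object_attr_lock(obj);
 * rc = cl_object_attr_get(env, obj, attr);
 * cl_object_attr_unlock(obj);
 * \endcode
 */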
2748
2749/**
2750 * Returns true iff \a o0 and \a o1 are slices of the same object.
2751 */
2752static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
2753{
2754        return cl_object_header(o0) == cl_object_header(o1);
2755}
2756
2757static inline void cl_object_page_init(struct cl_object *clob, int size)
2758{
2759        clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize;
2760        cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8);
2761}
2762
2763static inline void *cl_object_page_slice(struct cl_object *clob,
2764                                         struct cl_page *page)
2765{
2766        return (void *)((char *)page + clob->co_slice_off);
2767}
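
/*
 * Illustrative sketch (editorial): a layer reserving room for its per-page
 * slice when its object is set up and locating that slice later. The foo_*
 * names are hypothetical.
 *
 * \code
 * struct foo_page {
 *         struct cl_page_slice fp_cl;
 * };
 *
 * static void foo_object_setup(struct cl_object *clob)
 * {
 *         cl_object_page_init(clob, sizeof(struct foo_page));
 * }
 *
 * static struct foo_page *foo_page_slice(struct cl_object *clob,
 *                                        struct cl_page *page)
 * {
 *         return cl_object_page_slice(clob, page);
 * }
 * \endcode
 */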
2768
2769/** @} cl_object */
2770
2771/** \defgroup cl_page cl_page
2772 * @{ */
2773enum {
2774        CLP_GANG_OKAY = 0,
2775        CLP_GANG_RESCHED,
2776        CLP_GANG_AGAIN,
2777        CLP_GANG_ABORT
2778};
2779
2780/* callback of cl_page_gang_lookup() */
2781typedef int   (*cl_page_gang_cb_t)  (const struct lu_env *, struct cl_io *,
2782                                     struct cl_page *, void *);
2783int          cl_page_gang_lookup (const struct lu_env *env,
2784                                     struct cl_object *obj,
2785                                     struct cl_io *io,
2786                                     pgoff_t start, pgoff_t end,
2787                                     cl_page_gang_cb_t cb, void *cbdata);
2788struct cl_page *cl_page_lookup      (struct cl_object_header *hdr,
2789                                     pgoff_t index);
2790struct cl_page *cl_page_find    (const struct lu_env *env,
2791                                     struct cl_object *obj,
2792                                     pgoff_t idx, struct page *vmpage,
2793                                     enum cl_page_type type);
2794struct cl_page *cl_page_find_sub    (const struct lu_env *env,
2795                                     struct cl_object *obj,
2796                                     pgoff_t idx, struct page *vmpage,
2797                                     struct cl_page *parent);
2798void        cl_page_get  (struct cl_page *page);
2799void        cl_page_put  (const struct lu_env *env,
2800                                     struct cl_page *page);
2801void        cl_page_print       (const struct lu_env *env, void *cookie,
2802                                     lu_printer_t printer,
2803                                     const struct cl_page *pg);
2804void        cl_page_header_print(const struct lu_env *env, void *cookie,
2805                                     lu_printer_t printer,
2806                                     const struct cl_page *pg);
2807struct page     *cl_page_vmpage      (const struct lu_env *env,
2808                                     struct cl_page *page);
2809struct cl_page *cl_vmpage_page      (struct page *vmpage, struct cl_object *obj);
2810struct cl_page *cl_page_top      (struct cl_page *page);
2811
2812const struct cl_page_slice *cl_page_at(const struct cl_page *page,
2813                                       const struct lu_device_type *dtype);
2814
2815/**
2816 * \name ownership
2817 *
2818 * Functions dealing with the ownership of a page by an io.
2819 */
2820/** @{ */
2821
2822int  cl_page_own        (const struct lu_env *env,
2823                         struct cl_io *io, struct cl_page *page);
2824int  cl_page_own_try    (const struct lu_env *env,
2825                         struct cl_io *io, struct cl_page *page);
2826void cl_page_assume     (const struct lu_env *env,
2827                         struct cl_io *io, struct cl_page *page);
2828void cl_page_unassume   (const struct lu_env *env,
2829                         struct cl_io *io, struct cl_page *pg);
2830void cl_page_disown     (const struct lu_env *env,
2831                         struct cl_io *io, struct cl_page *page);
2832int  cl_page_is_owned   (const struct cl_page *pg, const struct cl_io *io);
2833
2834/** @} ownership */
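
/*
 * Illustrative sketch (editorial): the usual own/act/disown pattern for a
 * page within an io; foo_process_page() is hypothetical.
 *
 * \code
 * rc = cl_page_own(env, io, page);
 * if (rc == 0) {
 *         foo_process_page(env, io, page);
 *         cl_page_disown(env, io, page);
 * }
 * \endcode
 */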
2835
2836/**
2837 * \name transfer
2838 *
2839 * Functions dealing with the preparation of a page for a transfer, and
2840 * tracking transfer state.
2841 */
2842/** @{ */
2843int  cl_page_prep       (const struct lu_env *env, struct cl_io *io,
2844                         struct cl_page *pg, enum cl_req_type crt);
2845void cl_page_completion (const struct lu_env *env,
2846                         struct cl_page *pg, enum cl_req_type crt, int ioret);
2847int  cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
2848                         enum cl_req_type crt);
2849int  cl_page_cache_add  (const struct lu_env *env, struct cl_io *io,
2850                         struct cl_page *pg, enum cl_req_type crt);
2851void cl_page_clip       (const struct lu_env *env, struct cl_page *pg,
2852                         int from, int to);
2853int  cl_page_cancel     (const struct lu_env *env, struct cl_page *page);
2854int  cl_page_flush      (const struct lu_env *env, struct cl_io *io,
2855                         struct cl_page *pg);
2856
2857/** @} transfer */
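
/*
 * Illustrative sketch (editorial, a rough approximation): handing an owned
 * page to the transfer machinery and completing it later. Queueing and the
 * actual RPC are left to the transfer engine; "ioret" is the transfer result.
 *
 * \code
 * rc = cl_page_prep(env, io, page, CRT_READ);
 * if (rc == 0) {
 *         ... transfer the page ...
 *         cl_page_completion(env, page, CRT_READ, ioret);
 * }
 * \endcode
 */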
2858
2859
2860/**
2861 * \name helper routines
2862 * Functions to discard, delete and export a cl_page.
2863 */
2864/** @{ */
2865void    cl_page_discard      (const struct lu_env *env, struct cl_io *io,
2866                              struct cl_page *pg);
2867void    cl_page_delete       (const struct lu_env *env, struct cl_page *pg);
2868int     cl_page_unmap   (const struct lu_env *env, struct cl_io *io,
2869                              struct cl_page *pg);
2870int     cl_page_is_vmlocked  (const struct lu_env *env,
2871                              const struct cl_page *pg);
2872void    cl_page_export       (const struct lu_env *env,
2873                              struct cl_page *pg, int uptodate);
2874int     cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
2875                              struct cl_page *page);
2876loff_t  cl_offset           (const struct cl_object *obj, pgoff_t idx);
2877pgoff_t cl_index             (const struct cl_object *obj, loff_t offset);
2878int     cl_page_size     (const struct cl_object *obj);
2879int     cl_pages_prune       (const struct lu_env *env, struct cl_object *obj);
2880
2881void cl_lock_print      (const struct lu_env *env, void *cookie,
2882                         lu_printer_t printer, const struct cl_lock *lock);
2883void cl_lock_descr_print(const struct lu_env *env, void *cookie,
2884                         lu_printer_t printer,
2885                         const struct cl_lock_descr *descr);
2886/* @} helper */
2887
2888/** @} cl_page */
2889
2890/** \defgroup cl_lock cl_lock
2891 * @{ */
2892
2893struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
2894                             const struct cl_lock_descr *need,
2895                             const char *scope, const void *source);
2896struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
2897                             const struct cl_lock_descr *need,
2898                             const char *scope, const void *source);
2899struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
2900                                const struct cl_lock_descr *need,
2901                                const char *scope, const void *source);
2902struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
2903                                 struct cl_object *obj, pgoff_t index,
2904                                 struct cl_lock *except, int pending,
2905                                 int canceld);
2906static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
2907                                              struct cl_object *obj,
2908                                              struct cl_page *page,
2909                                              struct cl_lock *except,
2910                                              int pending, int canceld)
2911{
2912        LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj));
2913        return cl_lock_at_pgoff(env, obj, page->cp_index, except,
2914                                pending, canceld);
2915}
2916
2917const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
2918                                       const struct lu_device_type *dtype);
2919
2920void  cl_lock_get       (struct cl_lock *lock);
2921void  cl_lock_get_trust (struct cl_lock *lock);
2922void  cl_lock_put       (const struct lu_env *env, struct cl_lock *lock);
2923void  cl_lock_hold_add  (const struct lu_env *env, struct cl_lock *lock,
2924                         const char *scope, const void *source);
2925void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
2926                          const char *scope, const void *source);
2927void  cl_lock_unhold    (const struct lu_env *env, struct cl_lock *lock,
2928                         const char *scope, const void *source);
2929void  cl_lock_release   (const struct lu_env *env, struct cl_lock *lock,
2930                         const char *scope, const void *source);
2931void  cl_lock_user_add  (const struct lu_env *env, struct cl_lock *lock);
2932void  cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
2933
2934enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
2935                                     struct cl_lock *lock);
2936void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
2937                       enum cl_lock_state state);
2938int cl_lock_is_intransit(struct cl_lock *lock);
2939
2940int cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock,
2941                         int keep_mutex);
2942
2943/** \name statemachine statemachine
2944 * The interface to the lock state machine consists of 3 parts:
2945 *
2946 *     - "try" functions that attempt to effect a state transition. If state
2947 *     transition is not possible right now (e.g., if it has to wait for some
2948 *     asynchronous event to occur), these functions return
2949 *     cl_lock_transition::CLO_WAIT.
2950 *
2951 *     - "non-try" functions that implement synchronous blocking interface on
2952 *     top of non-blocking "try" functions. These functions repeatedly call
2953 *     corresponding "try" versions, and if state transition is not possible
2954 *     immediately, wait for lock state change.
2955 *
2956 *     - methods from cl_lock_operations, called by "try" functions. A lock can
2957 *     be advanced to the target state only when all layers have voted that they
2958 *     are ready for this transition. "Try" functions call methods under lock
2959 *     mutex. If a layer had to release a mutex, it re-acquires it and returns
2960 *     cl_lock_transition::CLO_REPEAT, causing "try" function to call all
2961 *     layers again.
2962 *
2963 * TRY        NON-TRY      METHOD                           FINAL STATE
2964 *
2965 * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
2966 *
2967 * cl_wait_try()    cl_wait()    cl_lock_operations::clo_wait()    CLS_HELD
2968 *
2969 * cl_unuse_try()   cl_unuse()   cl_lock_operations::clo_unuse()   CLS_CACHED
2970 *
2971 * cl_use_try()     NONE         cl_lock_operations::clo_use()     CLS_HELD
2972 *
2973 * @{ */
2974
2975int   cl_enqueue    (const struct lu_env *env, struct cl_lock *lock,
2976                     struct cl_io *io, __u32 flags);
2977int   cl_wait       (const struct lu_env *env, struct cl_lock *lock);
2978void  cl_unuse      (const struct lu_env *env, struct cl_lock *lock);
2979int   cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
2980                     struct cl_io *io, __u32 flags);
2981int   cl_unuse_try  (const struct lu_env *env, struct cl_lock *lock);
2982int   cl_wait_try   (const struct lu_env *env, struct cl_lock *lock);
2983int   cl_use_try    (const struct lu_env *env, struct cl_lock *lock, int atomic);
2984
2985/** @} statemachine */
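
/*
 * Illustrative sketch (editorial, a rough approximation): driving a lock
 * through the states in the table above with the blocking calls. Mutexing
 * details and error paths are simplified; higher-level helpers such as
 * cl_lock_request() normally hide this sequence from callers.
 *
 * \code
 * lock = cl_lock_hold(env, io, need, "sketch", current);
 * if (!IS_ERR(lock)) {
 *         rc = cl_enqueue(env, lock, io, flags);
 *         if (rc == 0)
 *                 rc = cl_wait(env, lock);
 *         if (rc == 0)
 *                 cl_unuse(env, lock);
 *         cl_lock_release(env, lock, "sketch", current);
 * }
 * \endcode
 */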
2986
2987void cl_lock_signal      (const struct lu_env *env, struct cl_lock *lock);
2988int  cl_lock_state_wait  (const struct lu_env *env, struct cl_lock *lock);
2989void cl_lock_state_set   (const struct lu_env *env, struct cl_lock *lock,
2990                          enum cl_lock_state state);
2991int  cl_queue_match      (const struct list_head *queue,
2992                          const struct cl_lock_descr *need);
2993
2994void cl_lock_mutex_get  (const struct lu_env *env, struct cl_lock *lock);
2995int  cl_lock_mutex_try  (const struct lu_env *env, struct cl_lock *lock);
2996void cl_lock_mutex_put  (const struct lu_env *env, struct cl_lock *lock);
2997int  cl_lock_is_mutexed (struct cl_lock *lock);
2998int  cl_lock_nr_mutexed (const struct lu_env *env);
2999int  cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
3000int  cl_lock_ext_match  (const struct cl_lock_descr *has,
3001                         const struct cl_lock_descr *need);
3002int  cl_lock_descr_match(const struct cl_lock_descr *has,
3003                         const struct cl_lock_descr *need);
3004int  cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
3005int  cl_lock_modify     (const struct lu_env *env, struct cl_lock *lock,
3006                         const struct cl_lock_descr *desc);
3007
3008void cl_lock_closure_init (const struct lu_env *env,
3009                           struct cl_lock_closure *closure,
3010                           struct cl_lock *origin, int wait);
3011void cl_lock_closure_fini (struct cl_lock_closure *closure);
3012int  cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
3013                           struct cl_lock_closure *closure);
3014void cl_lock_disclosure   (const struct lu_env *env,
3015                           struct cl_lock_closure *closure);
3016int  cl_lock_enclosure    (const struct lu_env *env, struct cl_lock *lock,
3017                           struct cl_lock_closure *closure);
3018
3019void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
3020void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
3021void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
3022void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
3023
3024unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
3025
3026/** @} cl_lock */
3027
3028/** \defgroup cl_io cl_io
3029 * @{ */
3030
3031int   cl_io_init         (const struct lu_env *env, struct cl_io *io,
3032                          enum cl_io_type iot, struct cl_object *obj);
3033int   cl_io_sub_init     (const struct lu_env *env, struct cl_io *io,
3034                          enum cl_io_type iot, struct cl_object *obj);
3035int   cl_io_rw_init      (const struct lu_env *env, struct cl_io *io,
3036                          enum cl_io_type iot, loff_t pos, size_t count);
3037int   cl_io_loop         (const struct lu_env *env, struct cl_io *io);
3038
3039void  cl_io_fini         (const struct lu_env *env, struct cl_io *io);
3040int   cl_io_iter_init    (const struct lu_env *env, struct cl_io *io);
3041void  cl_io_iter_fini    (const struct lu_env *env, struct cl_io *io);
3042int   cl_io_lock         (const struct lu_env *env, struct cl_io *io);
3043void  cl_io_unlock       (const struct lu_env *env, struct cl_io *io);
3044int   cl_io_start       (const struct lu_env *env, struct cl_io *io);
3045void  cl_io_end   (const struct lu_env *env, struct cl_io *io);
3046int   cl_io_lock_add     (const struct lu_env *env, struct cl_io *io,
3047                          struct cl_io_lock_link *link);
3048int   cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
3049                           struct cl_lock_descr *descr);
3050int   cl_io_read_page    (const struct lu_env *env, struct cl_io *io,
3051                          struct cl_page *page);
3052int   cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
3053                          struct cl_page *page, unsigned from, unsigned to);
3054int   cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
3055                          struct cl_page *page, unsigned from, unsigned to);
3056int   cl_io_submit_rw    (const struct lu_env *env, struct cl_io *io,
3057                          enum cl_req_type iot, struct cl_2queue *queue);
3058int   cl_io_submit_sync  (const struct lu_env *env, struct cl_io *io,
3059                          enum cl_req_type iot, struct cl_2queue *queue,
3060                          long timeout);
3061void  cl_io_rw_advance   (const struct lu_env *env, struct cl_io *io,
3062                          size_t nob);
3063int   cl_io_cancel       (const struct lu_env *env, struct cl_io *io,
3064                          struct cl_page_list *queue);
3065int   cl_io_is_going     (const struct lu_env *env);
3066
3067/**
3068 * True, iff \a io is an O_APPEND write(2).
3069 */
3070static inline int cl_io_is_append(const struct cl_io *io)
3071{
3072        return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
3073}
3074
3075static inline int cl_io_is_sync_write(const struct cl_io *io)
3076{
3077        return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync;
3078}
3079
3080static inline int cl_io_is_mkwrite(const struct cl_io *io)
3081{
3082        return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite;
3083}
3084
3085/**
3086 * True, iff \a io is a truncate(2).
3087 */
3088static inline int cl_io_is_trunc(const struct cl_io *io)
3089{
3090        return io->ci_type == CIT_SETATTR &&
3091                (io->u.ci_setattr.sa_valid & ATTR_SIZE);
3092}
3093
3094struct cl_io *cl_io_top(struct cl_io *io);
3095
3096void cl_io_print(const struct lu_env *env, void *cookie,
3097                 lu_printer_t printer, const struct cl_io *io);
3098
3099#define CL_IO_SLICE_CLEAN(foo_io, base)                          \
3100do {                                                                \
3101        typeof(foo_io) __foo_io = (foo_io);                          \
3102                                                                        \
3103        CLASSERT(offsetof(typeof(*__foo_io), base) == 0);              \
3104        memset(&__foo_io->base + 1, 0,                            \
3105               (sizeof *__foo_io) - sizeof __foo_io->base);          \
3106} while (0)
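
/*
 * Illustrative sketch (editorial): intended use of CL_IO_SLICE_CLEAN() with a
 * hypothetical per-layer io structure (all foo_* names are made up). The
 * embedded cl_io_slice must be the first member, which is what the CLASSERT()
 * above checks; everything after it is zeroed.
 *
 * \code
 * struct foo_io {
 *         struct cl_io_slice fi_cl;
 *         int                fi_flags;
 * };
 *
 * struct foo_io *fio = foo_env_io(env);
 *
 * CL_IO_SLICE_CLEAN(fio, fi_cl);
 * \endcode
 */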
3107
3108/** @} cl_io */
3109
3110/** \defgroup cl_page_list cl_page_list
3111 * @{ */
3112
3113/**
3114 * Last page in the page list.
3115 */
3116static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
3117{
3118        LASSERT(plist->pl_nr > 0);
3119        return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
3120}
3121
3122/**
3123 * Iterate over pages in a page list.
3124 */
3125#define cl_page_list_for_each(page, list)                              \
3126        list_for_each_entry((page), &(list)->pl_pages, cp_batch)
3127
3128/**
3129 * Iterate over pages in a page list, taking possible removals into account.
3130 */
3131#define cl_page_list_for_each_safe(page, temp, list)                \
3132        list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
3133
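/*
 * Illustrative sketch (editorial): walking a page list with the iterator
 * above. cl_page_list_for_each_safe() would be used instead if pages may be
 * removed from the list while iterating.
 *
 * \code
 * struct cl_page *page;
 * int count = 0;
 *
 * cl_page_list_for_each(page, plist)
 *         count++;
 * \endcode
 */
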
3134void cl_page_list_init   (struct cl_page_list *plist);
3135void cl_page_list_add    (struct cl_page_list *plist, struct cl_page *page);
3136void cl_page_list_move   (struct cl_page_list *dst, struct cl_page_list *src,
3137                          struct cl_page *page);
3138void cl_page_list_splice (struct cl_page_list *list,
3139                          struct cl_page_list *head);
3140void cl_page_list_del    (const struct lu_env *env,
3141                          struct cl_page_list *plist, struct cl_page *page);
3142void cl_page_list_disown (const struct lu_env *env,
3143                          struct cl_io *io, struct cl_page_list *plist);
3144int  cl_page_list_own    (const struct lu_env *env,
3145                          struct cl_io *io, struct cl_page_list *plist);
3146void cl_page_list_assume (const struct lu_env *env,
3147                          struct cl_io *io, struct cl_page_list *plist);
3148void cl_page_list_discard(const struct lu_env *env,
3149                          struct cl_io *io, struct cl_page_list *plist);
3150int  cl_page_list_unmap  (const struct lu_env *env,
3151                          struct cl_io *io, struct cl_page_list *plist);
3152void cl_page_list_fini   (const struct lu_env *env, struct cl_page_list *plist);
3153
3154void cl_2queue_init     (struct cl_2queue *queue);
3155void cl_2queue_add      (struct cl_2queue *queue, struct cl_page *page);
3156void cl_2queue_disown   (const struct lu_env *env,
3157                         struct cl_io *io, struct cl_2queue *queue);
3158void cl_2queue_assume   (const struct lu_env *env,
3159                         struct cl_io *io, struct cl_2queue *queue);
3160void cl_2queue_discard  (const struct lu_env *env,
3161                         struct cl_io *io, struct cl_2queue *queue);
3162void cl_2queue_fini     (const struct lu_env *env, struct cl_2queue *queue);
3163void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
3164
3165/** @} cl_page_list */
3166
3167/** \defgroup cl_req cl_req
3168 * @{ */
3169struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
3170                            enum cl_req_type crt, int nr_objects);
3171
3172void cl_req_page_add  (const struct lu_env *env, struct cl_req *req,
3173                       struct cl_page *page);
3174void cl_req_page_done (const struct lu_env *env, struct cl_page *page);
3175int  cl_req_prep      (const struct lu_env *env, struct cl_req *req);
3176void cl_req_attr_set  (const struct lu_env *env, struct cl_req *req,
3177                       struct cl_req_attr *attr, obd_valid flags);
3178void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
3179
3180/** \defgroup cl_sync_io cl_sync_io
3181 * @{ */
3182
3183/**
3184 * Anchor for synchronous transfer. This is allocated on the stack by the
3185 * thread doing a synchronous transfer; a pointer to it is set up in every
3186 * page submitted for transfer. The transfer completion routine updates the
3187 * anchor and wakes up the waiting thread when the transfer is complete.
3188 */
3189struct cl_sync_io {
3190        /** number of pages yet to be transferred. */
3191        atomic_t                csi_sync_nr;
3192        /** error code. */
3193        int                     csi_sync_rc;
3194        /** barrier protecting the destruction of this structure */
3195        atomic_t                csi_barrier;
3196        /** completion to be signaled when transfer is complete. */
3197        wait_queue_head_t               csi_waitq;
3198};
3199
3200void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
3201int  cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
3202                     struct cl_page_list *queue, struct cl_sync_io *anchor,
3203                     long timeout);
3204void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
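
/*
 * Illustrative sketch (editorial, a rough approximation): synchronous
 * submission of a queue of owned pages with a stack-allocated anchor, along
 * the lines of what cl_io_submit_sync() is expected to do. Pointing each
 * submitted page at the anchor and starting the transfer are elided; the
 * completion path calls cl_sync_io_note() once per page.
 *
 * \code
 * struct cl_sync_io anchor;
 * int rc;
 *
 * cl_sync_io_init(&anchor, queue->pl_nr);
 * ... submit the pages in queue ...
 * rc = cl_sync_io_wait(env, io, queue, &anchor, timeout);
 * \endcode
 */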
3205
3206/** @} cl_sync_io */
3207
3208/** @} cl_req */
3209
3210/** \defgroup cl_env cl_env
3211 *
3212 * lu_env handling for a client.
3213 *
3214 * lu_env is an environment within which lustre code executes. Its major part
3215 * is lu_context---a fast memory allocation mechanism that is used to conserve
3216 * precious kernel stack space. Originally lu_env was designed for a server,
3217 * where
3218 *
3219 *     - there is a (mostly) fixed number of threads, and
3220 *
3221 *     - call chains have no non-lustre portions interleaved with lustre code.
3222 *
3223 * On a client both of these assumptions fail, because every user thread can
3224 * potentially execute lustre code as part of a system call, and lustre calls
3225 * into the VFS or MM, which call back into lustre.
3226 *
3227 * To deal with that, cl_env wrapper functions implement the following
3228 * optimizations:
3229 *
3230 *     - allocation and destruction of environments is amortized by caching
3231 *     environments that are no longer used instead of destroying them;
3232 *
3233 *     - there is a notion of "current" environment, attached to the kernel
3234 *     data structure representing the current thread. Top-level lustre code
3235 *     allocates an environment and makes it current, then calls into
3236 *     non-lustre code, which in turn calls back into lustre. Low-level lustre
3237 *     code thus called can fetch the environment created by the top-level
3238 *     code and reuse it, avoiding an additional environment allocation.
3239 *       Right now, three interfaces can attach a cl_env to the running thread:
3240 *       - cl_env_get
3241 *       - cl_env_implant
3242 *       - cl_env_reexit (cl_env_reenter must have been called beforehand)
3243 *
3244 * \see lu_env, lu_context, lu_context_key
3245 * @{ */
3246
3247struct cl_env_nest {
3248        int   cen_refcheck;
3249        void *cen_cookie;
3250};
3251
3252struct lu_env *cl_env_peek       (int *refcheck);
3253struct lu_env *cl_env_get       (int *refcheck);
3254struct lu_env *cl_env_alloc      (int *refcheck, __u32 tags);
3255struct lu_env *cl_env_nested_get (struct cl_env_nest *nest);
3256void       cl_env_put   (struct lu_env *env, int *refcheck);
3257void       cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env);
3258void      *cl_env_reenter    (void);
3259void       cl_env_reexit     (void *cookie);
3260void       cl_env_implant    (struct lu_env *env, int *refcheck);
3261void       cl_env_unplant    (struct lu_env *env, int *refcheck);
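
/*
 * Illustrative sketch (editorial): the common borrow/return pattern for an
 * environment in client code. cl_env_get() reuses the current or a cached
 * environment when possible; the refcheck token it fills in must be handed
 * back to cl_env_put().
 *
 * \code
 * struct lu_env *env;
 * int refcheck;
 *
 * env = cl_env_get(&refcheck);
 * if (IS_ERR(env))
 *         return PTR_ERR(env);
 * ... use env ...
 * cl_env_put(env, &refcheck);
 * \endcode
 */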
3262
3263/** @} cl_env */
3264
3265/*
3266 * Misc
3267 */
3268void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
3269void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
3270
3271struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
3272                                struct lu_device_type *ldt,
3273                                struct lu_device *next);
3274/** @} clio */
3275
3276int cl_global_init(void);
3277void cl_global_fini(void);
3278
3279#endif /* _LUSTRE_CL_OBJECT_H */
3280