linux/fs/nfs/pnfs.c
/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include <linux/sort.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
#include "nfs42.h"
#include "nfs4_fs.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
                struct list_head *free_me,
                const struct pnfs_layout_range *range,
                u32 seq);
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                                struct list_head *tmp_list);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        if (local != NULL && !try_module_get(local->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                local = NULL;
        }
        spin_unlock(&pnfs_spinlock);
        return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                if (nfss->pnfs_curr_ld->clear_layoutdriver)
                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                /* Decrement the MDS count. Purge the deviceid cache if zero */
                if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
                        nfs4_deviceid_purge_client(nfss->nfs_client);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * When the server sends a list of layout types, we choose one in the order
 * given in the list below.
 *
 * FIXME: should this list be configurable in some fashion? module param?
 *        mount option? something else?
 */
static const u32 ld_prefs[] = {
        LAYOUT_SCSI,
        LAYOUT_BLOCK_VOLUME,
        LAYOUT_OSD2_OBJECTS,
        LAYOUT_FLEX_FILES,
        LAYOUT_NFSV4_1_FILES,
        0
};

static int
ld_cmp(const void *e1, const void *e2)
{
        u32 ld1 = *((u32 *)e1);
        u32 ld2 = *((u32 *)e2);
        int i;

        for (i = 0; ld_prefs[i] != 0; i++) {
                if (ld1 == ld_prefs[i])
                        return -1;

                if (ld2 == ld_prefs[i])
                        return 1;
        }
        return 0;
}
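
/*
 * Worked example: if the server advertises {LAYOUT_NFSV4_1_FILES,
 * LAYOUT_FLEX_FILES, LAYOUT_SCSI}, sorting with ld_cmp reorders the
 * array to {LAYOUT_SCSI, LAYOUT_FLEX_FILES, LAYOUT_NFSV4_1_FILES},
 * matching the order of ld_prefs above. Layout types that do not
 * appear in ld_prefs compare equal to each other and sort after all
 * listed types, in no particular order.
 */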

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @fsinfo carries the array of layout types supported by the MDS.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
                      struct nfs_fsinfo *fsinfo)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;
        u32 id;
        int i;

        if (fsinfo->nlayouttypes == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
                        __func__, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }

        sort(fsinfo->layouttype, fsinfo->nlayouttypes,
                sizeof(*fsinfo->layouttype), ld_cmp, NULL);

        for (i = 0; i < fsinfo->nlayouttypes; i++) {
                id = fsinfo->layouttype[i];
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
                                        id);
                        ld_type = find_pnfs_driver(id);
                }
                if (ld_type)
                        break;
        }

        if (!ld_type) {
                dprintk("%s: No pNFS module found!\n", __func__);
                goto out_no_driver;
        }

        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
                printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
                        "driver %u.\n", __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        /* Bump the MDS count */
        atomic_inc(&server->nfs_client->cl_mds_count);

        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "NFS: %s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
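
/*
 * A layout driver module typically registers itself on load and
 * unregisters on unload. Sketch (the "mylayout" names are illustrative,
 * not a real driver):
 *
 *	static struct pnfs_layoutdriver_type mylayout_type = {
 *		.id		= LAYOUT_NFSV4_1_FILES,
 *		.name		= "mylayout",
 *		.owner		= THIS_MODULE,
 *		.alloc_lseg	= mylayout_alloc_lseg,
 *		.free_lseg	= mylayout_free_lseg,
 *	};
 *
 *	pnfs_register_layoutdriver(&mylayout_type);	from module_init
 *	pnfs_unregister_layoutdriver(&mylayout_type);	from module_exit
 *
 * Registration fails with -EINVAL if the id is 0 or if alloc_lseg or
 * free_lseg is missing, as enforced above.
 */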

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        refcount_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        return ld->alloc_layout_hdr(ino, gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_server *server = NFS_SERVER(lo->plh_inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
                struct nfs_client *clp = server->nfs_client;

                spin_lock(&clp->cl_lock);
                list_del_rcu(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        put_cred(lo->plh_lc_cred);
        return ld->free_layout_hdr(lo);
}

static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        nfsi->layout = NULL;
        /* Reset MDS Threshold I/O counters */
        nfsi->write_io = 0;
        nfsi->read_io = 0;
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode;

        if (!lo)
                return;
        inode = lo->plh_inode;
        pnfs_layoutreturn_before_put_layout_hdr(lo);

        if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                if (!list_empty(&lo->plh_segs))
                        WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
                pnfs_free_layout_hdr(lo);
        }
}

static struct inode *
pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = igrab(lo->plh_inode);
        if (inode)
                return inode;
        set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
        return NULL;
}

static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
                         u32 seq)
{
        if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
                iomode = IOMODE_ANY;
        lo->plh_return_iomode = iomode;
        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        if (seq != 0) {
                WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
                lo->plh_return_seq = seq;
        }
}

static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *lseg;
        lo->plh_return_iomode = 0;
        lo->plh_return_seq = 0;
        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
                        continue;
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
        }
}

static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
{
        clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
        clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
        smp_mb__after_atomic();
        wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}

static void
pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
                struct list_head *free_me)
{
        clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
        clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
                pnfs_lseg_dec_and_remove_zero(lseg, free_me);
        if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                pnfs_lseg_dec_and_remove_zero(lseg, free_me);
}

/*
 * Update the seqid of a layout stateid after receiving
 * NFS4ERR_OLD_STATEID
 */
bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
                struct pnfs_layout_range *dst_range,
                struct inode *inode)
{
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_range range = {
                .iomode = IOMODE_ANY,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        bool ret = false;
        LIST_HEAD(head);
        int err;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo && pnfs_layout_is_valid(lo) &&
            nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
                /* Is our call using the most recent seqid? If so, bump it */
                if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
                        nfs4_stateid_seqid_inc(dst);
                        ret = true;
                        goto out;
                }
                /* Try to update the seqid to the most recent */
                err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
                if (err != -EBUSY) {
                        dst->seqid = lo->plh_stateid.seqid;
                        *dst_range = range;
                        ret = true;
                }
        }
out:
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        return ret;
}

/*
 * Mark a pnfs_layout_hdr and all associated layout segments as invalid
 *
 * In order to continue using the pnfs_layout_hdr, a full recovery
 * is required.
 * Note that caller must hold inode->i_lock.
 */
int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
                struct list_head *lseg_list)
{
        struct pnfs_layout_range range = {
                .iomode = IOMODE_ANY,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        struct pnfs_layout_segment *lseg, *next;

        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                pnfs_clear_lseg_state(lseg, lseg_list);
        pnfs_clear_layoutreturn_info(lo);
        pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
            !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
                pnfs_clear_layoutreturn_waitbit(lo);
        return !list_empty(&lo->plh_segs);
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
        return iomode == IOMODE_RW ?
                NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        lo->plh_retry_timestamp = jiffies;
        if (!test_and_set_bit(fail_bit, &lo->plh_flags))
                refcount_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        if (test_and_clear_bit(fail_bit, &lo->plh_flags))
                refcount_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layout_range range = {
                .iomode = iomode,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        LIST_HEAD(head);

        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
        pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
                        iomode == IOMODE_RW ? "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        unsigned long start, end;
        int fail_bit = pnfs_iomode_to_fail_bit(iomode);

        if (test_bit(fail_bit, &lo->plh_flags) == 0)
                return false;
        end = jiffies;
        start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
        if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
                /* It is time to retry the failed layoutgets */
                pnfs_layout_clear_fail_bit(lo, fail_bit);
                return false;
        }
        return true;
}
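
/*
 * Example of the retry window: if a layoutget for IOMODE_RW fails at
 * time T, pnfs_layout_io_set_failed() records T in plh_retry_timestamp
 * and sets NFS_LAYOUT_RW_FAILED. Any pnfs_layout_io_test_failed() call
 * within PNFS_LAYOUTGET_RETRY_TIMEOUT (120 seconds) of T reports the
 * iomode as failed; the first call after that window clears the fail
 * bit, allowing layoutgets to be retried.
 */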

static void
pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *range,
                const nfs4_stateid *stateid)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        INIT_LIST_HEAD(&lseg->pls_commits);
        refcount_set(&lseg->pls_refcount, 1);
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
        lseg->pls_range = *range;
        lseg->pls_seq = be32_to_cpu(stateid->seqid);
}

static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
        if (lseg != NULL) {
                struct inode *inode = lseg->pls_layout->plh_inode;
                NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
        }
}

static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
        refcount_dec(&lo->plh_refcount);
        if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
                return;
        if (list_empty(&lo->plh_segs) &&
            !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
            !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
                if (atomic_read(&lo->plh_outstanding) == 0)
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
        }
}

static bool
pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
            pnfs_layout_is_valid(lo)) {
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
                list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
                return true;
        }
        return false;
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;

        if (!lseg)
                return;

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                refcount_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));

        lo = lseg->pls_layout;
        inode = lo->plh_inode;

        if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                        spin_unlock(&inode->i_lock);
                        return;
                }
                pnfs_get_layout_hdr(lo);
                pnfs_layout_remove_lseg(lo, lseg);
                if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
                        lseg = NULL;
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg(lseg);
                pnfs_put_layout_hdr(lo);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
                 const struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = pnfs_end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = pnfs_end_offset(start2, l2->length);

        return (start1 <= start2) && (end1 >= end2);
}
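
/*
 * E.g. l1 = {offset 0, length 8192} contains l2 = {offset 4096,
 * length 4096}, since end2 == 8192 == end1; it does not contain
 * l2 = {offset 4096, length 8192}, whose end extends past end1.
 */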

static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                struct list_head *tmp_list)
{
        if (!refcount_dec_and_test(&lseg->pls_refcount))
                return false;
        pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
        list_add(&lseg->pls_list, tmp_list);
        return true;
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding io is finished.
                 */
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        refcount_read(&lseg->pls_refcount));
                if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
                        rv = 1;
        }
        return rv;
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
 */
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
        return (s32)(s1 - s2) > 0;
}
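
/*
 * The signed subtraction handles wraparound: with s1 = 0x00000002 and
 * s2 = 0xfffffffe, (s32)(s1 - s2) == 4 > 0, so s1 is correctly treated
 * as newer even though it is numerically smaller.
 */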

static bool
pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
                 const struct pnfs_layout_range *recall_range)
{
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
               pnfs_lseg_range_intersecting(lseg_range, recall_range);
}

static bool
pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *recall_range,
                u32 seq)
{
        if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
                return false;
        if (recall_range == NULL)
                return true;
        return pnfs_should_free_range(&lseg->pls_range, recall_range);
}

/**
 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
 * @lo: layout header containing the lsegs
 * @tmp_list: list head where doomed lsegs should go
 * @recall_range: optional recall range argument to match (may be NULL)
 * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
 *
 * Walk the list of lsegs in the layout header, and tear down any that should
 * be destroyed. If "recall_range" is specified then the segment must match
 * that range. If "seq" is non-zero, then only match segments that were handed
 * out at or before that sequence.
 *
 * Returns number of matching invalid lsegs remaining in list after scanning
 * it and purging them.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            const struct pnfs_layout_range *recall_range,
                            u32 seq)
{
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        if (list_empty(&lo->plh_segs))
                return 0;
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
                        dprintk("%s: freeing lseg %p iomode %d seq %u "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_seq,
                                lseg->pls_range.offset, lseg->pls_range.length);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
                }
        dprintk("%s:Return %i\n", __func__, remaining);
        return remaining;
}

static void
pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
                struct list_head *free_me,
                const struct pnfs_layout_range *range,
                u32 seq)
{
        struct pnfs_layout_segment *lseg, *next;

        list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
                if (pnfs_match_lseg_recall(lseg, range, seq))
                        list_move_tail(&lseg->pls_list, free_me);
        }
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;

        if (list_empty(free_me))
                return;

        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                pnfs_free_lseg(lseg);
        }
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                pnfs_get_layout_hdr(lo);
                pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
                spin_unlock(&nfsi->vfs_inode.i_lock);
                pnfs_free_lseg_list(&tmp_list);
                nfs_commit_inode(&nfsi->vfs_inode, 0);
                pnfs_put_layout_hdr(lo);
        } else
                spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
                struct list_head *layout_list)
{
        struct pnfs_layout_hdr *lo;
        bool ret = false;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
                pnfs_get_layout_hdr(lo);
                list_add(&lo->plh_bulk_destroy, layout_list);
                ret = true;
        }
        spin_unlock(&inode->i_lock);
        return ret;
}

/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
                struct nfs_server *server,
                struct list_head *layout_list)
        __must_hold(&clp->cl_lock)
        __must_hold(RCU)
{
        struct pnfs_layout_hdr *lo, *next;
        struct inode *inode;

        list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
                if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
                    test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
                    !list_empty(&lo->plh_bulk_destroy))
                        continue;
                /* If the sb is being destroyed, just bail */
                if (!nfs_sb_active(server->super))
                        break;
                inode = pnfs_grab_inode_layout_hdr(lo);
                if (inode != NULL) {
                        if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
                                list_del_rcu(&lo->plh_layouts);
                        if (pnfs_layout_add_bulk_destroy_list(inode,
                                                layout_list))
                                continue;
                        rcu_read_unlock();
                        spin_unlock(&clp->cl_lock);
                        iput(inode);
                } else {
                        rcu_read_unlock();
                        spin_unlock(&clp->cl_lock);
                }
                nfs_sb_deactive(server->super);
                spin_lock(&clp->cl_lock);
                rcu_read_lock();
                return -EAGAIN;
        }
        return 0;
}

static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
                bool is_bulk_recall)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;
        LIST_HEAD(lseg_list);
        int ret = 0;

        while (!list_empty(layout_list)) {
                lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
                                plh_bulk_destroy);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                inode = lo->plh_inode;

                pnfs_layoutcommit_inode(inode, false);

                spin_lock(&inode->i_lock);
                list_del_init(&lo->plh_bulk_destroy);
                if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
                        if (is_bulk_recall)
                                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
                        ret = -EAGAIN;
                }
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&lseg_list);
                /* Free all lsegs that are attached to commit buckets */
                nfs_commit_inode(inode, 0);
                pnfs_put_layout_hdr(lo);
                nfs_iput_and_deactive(inode);
        }
        return ret;
}

int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
                struct nfs_fsid *fsid,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
                        continue;
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                server,
                                &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                        server,
                                        &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        nfs4_deviceid_mark_client_invalid(clp);
        nfs4_deviceid_purge_client(clp);

        pnfs_destroy_layouts_byclid(clp, false);
}

static void
pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
{
        const struct cred *old;

        if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
                old = xchg(&lo->plh_lc_cred, get_cred(cred));
                put_cred(old);
        }
}

/* Update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        const struct cred *cred, bool update_barrier)
{
        u32 oldseq, newseq, new_barrier = 0;

        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);

        if (!pnfs_layout_is_valid(lo)) {
                pnfs_set_layout_cred(lo, cred);
                nfs4_stateid_copy(&lo->plh_stateid, new);
                lo->plh_barrier = newseq;
                pnfs_clear_layoutreturn_info(lo);
                clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                return;
        }
        if (pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                /*
                 * Because of wraparound, we want to keep the barrier
                 * "close" to the current seqids.
                 */
                new_barrier = newseq - atomic_read(&lo->plh_outstanding);
        }
        if (update_barrier)
                new_barrier = be32_to_cpu(new->seqid);
        else if (new_barrier == 0)
                return;
        if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
                lo->plh_barrier = new_barrier;
}
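
/*
 * Barrier example: if the current seqid is 10 with 3 layoutgets still
 * outstanding and the server returns seqid 12, the barrier becomes
 * 12 - 3 = 9, so replies carrying seqids 10..12 are still accepted
 * while anything at or below the barrier is treated as stale (see
 * pnfs_layout_stateid_blocked() below). The barrier only ever moves
 * forward.
 */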

static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid)
{
        u32 seqid = be32_to_cpu(stateid->seqid);

        return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}

/* Return true if new layoutgets are currently blocked for this layout */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}

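/*
 * Note that @inode may be NULL: a layoutget can be prepared before the
 * file's inode is available (typically when LAYOUTGET is issued as part
 * of an OPEN compound), in which case the server is derived from the
 * parent directory of the open context's dentry.
 */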
static struct nfs_server *
pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx)
{
        struct nfs_server *server;

        if (inode) {
                server = NFS_SERVER(inode);
        } else {
                struct dentry *parent_dir = dget_parent(ctx->dentry);
                server = NFS_SERVER(parent_dir->d_inode);
                dput(parent_dir);
        }
        return server;
}

static void nfs4_free_pages(struct page **pages, size_t size)
{
        int i;

        if (!pages)
                return;

        for (i = 0; i < size; i++) {
                if (!pages[i])
                        break;
                __free_page(pages[i]);
        }
        kfree(pages);
}

static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
{
        struct page **pages;
        int i;

        pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
        if (!pages) {
                dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
                return NULL;
        }

        for (i = 0; i < size; i++) {
                pages[i] = alloc_page(gfp_flags);
                if (!pages[i]) {
                        dprintk("%s: failed to allocate page\n", __func__);
                        nfs4_free_pages(pages, i);
                        return NULL;
                }
        }

        return pages;
}

static struct nfs4_layoutget *
pnfs_alloc_init_layoutget_args(struct inode *ino,
           struct nfs_open_context *ctx,
           const nfs4_stateid *stateid,
           const struct pnfs_layout_range *range,
           gfp_t gfp_flags)
{
        struct nfs_server *server = pnfs_find_server(ino, ctx);
        size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
        size_t max_pages = max_response_pages(server);
        struct nfs4_layoutget *lgp;

        dprintk("--> %s\n", __func__);

        lgp = kzalloc(sizeof(*lgp), gfp_flags);
        if (lgp == NULL)
                return NULL;

        if (max_reply_sz) {
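                /*
                 * Cap the reply buffer at the driver's maximum layoutget
                 * response size: e.g. with 4K pages and max_reply_sz of
                 * 10000 bytes, npages is (10000 + 4095) >> 12 == 3, so 3
                 * pages are used instead of the session maximum.
                 */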
                size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (npages < max_pages)
                        max_pages = npages;
        }

        lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
        if (!lgp->args.layout.pages) {
                kfree(lgp);
                return NULL;
        }
        lgp->args.layout.pglen = max_pages * PAGE_SIZE;
        lgp->res.layoutp = &lgp->args.layout;

        /* Don't confuse uninitialised result and success */
        lgp->res.status = -NFS4ERR_DELAY;

        lgp->args.minlength = PAGE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
        if (ino) {
                loff_t i_size = i_size_read(ino);

                if (range->iomode == IOMODE_READ) {
                        if (range->offset >= i_size)
                                lgp->args.minlength = 0;
                        else if (i_size - range->offset < lgp->args.minlength)
                                lgp->args.minlength = i_size - range->offset;
                }
        }
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        pnfs_copy_range(&lgp->args.range, range);
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        nfs4_stateid_copy(&lgp->args.stateid, stateid);
        lgp->gfp_flags = gfp_flags;
        lgp->cred = ctx->cred;
        return lgp;
}

void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
{
        size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;

        nfs4_free_pages(lgp->args.layout.pages, max_pages);
        if (lgp->args.inode)
                pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
        put_nfs_open_context(lgp->args.ctx);
        kfree(lgp);
}

static void pnfs_clear_layoutcommit(struct inode *inode,
                struct list_head *head)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        struct pnfs_layout_segment *lseg, *tmp;

        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
                return;
        list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
                if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                        continue;
                pnfs_lseg_dec_and_remove_zero(lseg, head);
        }
}

void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
                const nfs4_stateid *arg_stateid,
                const struct pnfs_layout_range *range,
                const nfs4_stateid *stateid)
{
        struct inode *inode = lo->plh_inode;
        LIST_HEAD(freeme);

        spin_lock(&inode->i_lock);
        if (!pnfs_layout_is_valid(lo) || !arg_stateid ||
            !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
                goto out_unlock;
        if (stateid) {
                u32 seq = be32_to_cpu(arg_stateid->seqid);

                pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
                pnfs_free_returned_lsegs(lo, &freeme, range, seq);
                pnfs_set_layout_stateid(lo, stateid, NULL, true);
        } else
                pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
        pnfs_clear_layoutreturn_waitbit(lo);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&freeme);
}

static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
                nfs4_stateid *stateid,
                const struct cred **cred,
                enum pnfs_iomode *iomode)
{
        /* Serialise LAYOUTGET/LAYOUTRETURN */
        if (atomic_read(&lo->plh_outstanding) != 0)
                return false;
        if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
                return false;
        set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
                nfs4_stateid_copy(stateid, &lo->plh_stateid);
                *cred = get_cred(lo->plh_lc_cred);
                if (lo->plh_return_seq != 0)
                        stateid->seqid = cpu_to_be32(lo->plh_return_seq);
                if (iomode != NULL)
                        *iomode = lo->plh_return_iomode;
                pnfs_clear_layoutreturn_info(lo);
                return true;
        }
        nfs4_stateid_copy(stateid, &lo->plh_stateid);
        *cred = get_cred(lo->plh_lc_cred);
        if (iomode != NULL)
                *iomode = IOMODE_ANY;
        return true;
}

static void
pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
                struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid,
                enum pnfs_iomode iomode)
{
        struct inode *inode = lo->plh_inode;

        args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
        args->inode = inode;
        args->range.iomode = iomode;
        args->range.offset = 0;
        args->range.length = NFS4_MAX_UINT64;
        args->layout = lo;
        nfs4_stateid_copy(&args->stateid, stateid);
}

static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
                       const nfs4_stateid *stateid,
                       const struct cred **pcred,
                       enum pnfs_iomode iomode,
                       bool sync)
{
        struct inode *ino = lo->plh_inode;
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        struct nfs4_layoutreturn *lrp;
        const struct cred *cred = *pcred;
        int status = 0;

        *pcred = NULL;
        lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                spin_lock(&ino->i_lock);
                pnfs_clear_layoutreturn_waitbit(lo);
                spin_unlock(&ino->i_lock);
                put_cred(cred);
                pnfs_put_layout_hdr(lo);
                goto out;
        }

        pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
        lrp->args.ld_private = &lrp->ld_private;
        lrp->clp = NFS_SERVER(ino)->nfs_client;
        lrp->cred = cred;
        if (ld->prepare_layoutreturn)
                ld->prepare_layoutreturn(&lrp->args);

        status = nfs4_proc_layoutreturn(lrp, sync);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *s;
        enum pnfs_iomode iomode;
        u32 seq;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return false;

        seq = lo->plh_return_seq;
        iomode = lo->plh_return_iomode;

        /* Defer layoutreturn until all recalled lsegs are done */
        list_for_each_entry(s, &lo->plh_segs, pls_list) {
                if (seq && pnfs_seqid_is_newer(s->pls_seq, seq))
                        continue;
                if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode)
                        continue;
                if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
                        return false;
        }

        return true;
}

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return;
        spin_lock(&inode->i_lock);
        if (pnfs_layout_need_return(lo)) {
                const struct cred *cred;
                nfs4_stateid stateid;
                enum pnfs_iomode iomode;
                bool send;

                send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we don't deadlock */
                        pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
                }
        } else
                spin_unlock(&inode->i_lock);
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        LIST_HEAD(tmp_list);
        const struct cred *cred;
        nfs4_stateid stateid;
        int status = 0;
        bool send, valid_layout;

        dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        /* Is there an outstanding layoutreturn ? */
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
                if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                        TASK_UNINTERRUPTIBLE))
                        goto out_put_layout_hdr;
                spin_lock(&ino->i_lock);
        }
        valid_layout = pnfs_layout_is_valid(lo);
        pnfs_clear_layoutcommit(ino, &tmp_list);
        pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);

        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
                        .iomode         = IOMODE_ANY,
                        .offset         = 0,
                        .length         = NFS4_MAX_UINT64,
                };
                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
        }

        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
                        !valid_layout) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out_wait_layoutreturn;
        }

        send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
        spin_unlock(&ino->i_lock);
        if (send)
                status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
out_wait_layoutreturn:
        wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
out_put_layout_hdr:
        pnfs_free_lseg_list(&tmp_list);
        pnfs_put_layout_hdr(lo);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

int
pnfs_commit_and_return_layout(struct inode *inode)
{
        struct pnfs_layout_hdr *lo;
        int ret;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo == NULL) {
                spin_unlock(&inode->i_lock);
                return 0;
        }
        pnfs_get_layout_hdr(lo);
        /* Block new layoutgets and read/write to ds */
        lo->plh_block_lgets++;
        spin_unlock(&inode->i_lock);
        filemap_fdatawait(inode->i_mapping);
        ret = pnfs_layoutcommit_inode(inode, true);
        if (ret == 0)
                ret = _pnfs_return_layout(inode);
        spin_lock(&inode->i_lock);
        lo->plh_block_lgets--;
        spin_unlock(&inode->i_lock);
        pnfs_put_layout_hdr(lo);
        return ret;
}

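/*
 * Return-on-close (ROC): called on the last close of a file to decide
 * whether the layout can be returned as part of the CLOSE compound.
 * Returns true (and fills in @args) only when the marked lsegs can be
 * returned, no delegation or conflicting open state requires keeping
 * the layout, and the layoutreturn credential matches @cred; otherwise
 * any layoutreturn that was prepared is sent as a standalone RPC below.
 */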
bool pnfs_roc(struct inode *ino,
                struct nfs4_layoutreturn_args *args,
                struct nfs4_layoutreturn_res *res,
                const struct cred *cred)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *next;
        const struct cred *lc_cred;
        nfs4_stateid stateid;
        enum pnfs_iomode iomode = 0;
        bool layoutreturn = false, roc = false;
        bool skip_read = false;

        if (!nfs_have_layout(ino))
                return false;
retry:
        rcu_read_lock();
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || !pnfs_layout_is_valid(lo) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                lo = NULL;
                goto out_noroc;
        }
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
                spin_unlock(&ino->i_lock);
                rcu_read_unlock();
                wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                TASK_UNINTERRUPTIBLE);
                pnfs_put_layout_hdr(lo);
                goto retry;
        }

        /* no roc if we hold a delegation */
        if (nfs4_check_delegation(ino, FMODE_READ)) {
                if (nfs4_check_delegation(ino, FMODE_WRITE))
                        goto out_noroc;
                skip_read = true;
        }

        list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                if (state == NULL)
                        continue;
                /* Don't return layout if there is open file state */
                if (state->state & FMODE_WRITE)
                        goto out_noroc;
                if (state->state & FMODE_READ)
                        skip_read = true;
        }

        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
                if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
                        continue;
                /* If we are sending layoutreturn, invalidate all valid lsegs */
                if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
                        continue;
                /*
                 * Note: mark lseg for return so pnfs_layout_remove_lseg
                 * doesn't invalidate the layout for us.
                 */
                set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
                if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
                        continue;
                pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
        }

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                goto out_noroc;

        /* ROC in two conditions:
         * 1. there are ROC lsegs
         * 2. we don't send layoutreturn
         */
        /* lo ref dropped in pnfs_roc_release() */
        layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
        /* If the creds don't match, we can't compound the layoutreturn */
        if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0)
                goto out_noroc;

        roc = layoutreturn;
        pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
        res->lrs_present = 0;
        layoutreturn = false;
        put_cred(lc_cred);

out_noroc:
        spin_unlock(&ino->i_lock);
        rcu_read_unlock();
        pnfs_layoutcommit_inode(ino, true);
        if (roc) {
                struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
                if (ld->prepare_layoutreturn)
                        ld->prepare_layoutreturn(args);
                pnfs_put_layout_hdr(lo);
                return true;
        }
        if (layoutreturn)
                pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
        pnfs_put_layout_hdr(lo);
        return false;
}
1486
1487int pnfs_roc_done(struct rpc_task *task, struct inode *inode,
1488                struct nfs4_layoutreturn_args **argpp,
1489                struct nfs4_layoutreturn_res **respp,
1490                int *ret)
1491{
1492        struct nfs4_layoutreturn_args *arg = *argpp;
1493        int retval = -EAGAIN;
1494
1495        if (!arg)
1496                return 0;
1497        /* Handle Layoutreturn errors */
1498        switch (*ret) {
1499        case 0:
1500                retval = 0;
1501                break;
1502        case -NFS4ERR_NOMATCHING_LAYOUT:
1503                /* Was there an RPC level error? If not, retry */
1504                if (task->tk_rpc_status == 0)
1505                        break;
1506                /* If the call was not sent, let caller handle it */
1507                if (!RPC_WAS_SENT(task))
1508                        return 0;
1509                /*
1510                 * Otherwise, assume the call succeeded and
1511                 * that we need to release the layout
1512                 */
1513                *ret = 0;
1514                (*respp)->lrs_present = 0;
1515                retval = 0;
1516                break;
1517        case -NFS4ERR_DELAY:
1518                /* Let the caller handle the retry */
1519                *ret = -NFS4ERR_NOMATCHING_LAYOUT;
1520                return 0;
1521        case -NFS4ERR_OLD_STATEID:
1522                if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
1523                                        &arg->range, inode))
1524                        break;
1525                *ret = -NFS4ERR_NOMATCHING_LAYOUT;
1526                return -EAGAIN;
1527        }
1528        *argpp = NULL;
1529        *respp = NULL;
1530        return retval;
1531}
1532
1533void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
1534                struct nfs4_layoutreturn_res *res,
1535                int ret)
1536{
1537        struct pnfs_layout_hdr *lo = args->layout;
1538        const nfs4_stateid *arg_stateid = NULL;
1539        const nfs4_stateid *res_stateid = NULL;
1540        struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
1541
1542        switch (ret) {
1543        case -NFS4ERR_NOMATCHING_LAYOUT:
1544                break;
1545        case 0:
1546                if (res->lrs_present)
1547                        res_stateid = &res->stateid;
1548                /* Fallthrough */
1549        default:
1550                arg_stateid = &args->stateid;
1551        }
1552        pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
1553                        res_stateid);
1554        if (ld_private && ld_private->ops && ld_private->ops->free)
1555                ld_private->ops->free(ld_private);
1556        pnfs_put_layout_hdr(lo);
1557        trace_nfs4_layoutreturn_on_close(args->inode, 0);
1558}
1559
1560bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
1561{
1562        struct nfs_inode *nfsi = NFS_I(ino);
1563        struct pnfs_layout_hdr *lo;
1564        bool sleep = false;
1565
1566        /* We might not have grabbed a reference to the layout header,
1567         * so we need to check it under i_lock. */
1568        spin_lock(&ino->i_lock);
1569        lo = nfsi->layout;
1570        if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1571                rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1572                sleep = true;
1573        }
1574        spin_unlock(&ino->i_lock);
1575        return sleep;
1576}
1577
1578/*
1579 * Compare two layout segments for sorting into layout cache.
1580 * We want to preferentially return RW over RO layouts, so ensure those
1581 * are seen first.
1582 */
1583static s64
1584pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
1585           const struct pnfs_layout_range *l2)
1586{
1587        s64 d;
1588
1589        /* high offset > low offset */
1590        d = l1->offset - l2->offset;
1591        if (d)
1592                return d;
1593
1594        /* short length > long length */
1595        d = l2->length - l1->length;
1596        if (d)
1597                return d;
1598
1599        /* read > read/write */
1600        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
1601}
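/*
 * A worked example of the ordering above (illustrative only): for two
 * whole-file ranges differing only in iomode,
 *
 *	l1 = { .offset = 0, .length = NFS4_MAX_UINT64, .iomode = IOMODE_READ }
 *	l2 = { .offset = 0, .length = NFS4_MAX_UINT64, .iomode = IOMODE_RW }
 *
 * the offset and length terms are both zero, so the iomode term decides:
 * pnfs_lseg_range_cmp(&l1, &l2) == 1, meaning the READ range sorts after
 * the RW range and lookups see RW segments first.
 */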
1602
1603static bool
1604pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
1605                const struct pnfs_layout_range *l2)
1606{
1607        return pnfs_lseg_range_cmp(l1, l2) > 0;
1608}
1609
1610static bool
1611pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
1612                struct pnfs_layout_segment *old)
1613{
1614        return false;
1615}
1616
1617void
1618pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1619                   struct pnfs_layout_segment *lseg,
1620                   bool (*is_after)(const struct pnfs_layout_range *,
1621                           const struct pnfs_layout_range *),
1622                   bool (*do_merge)(struct pnfs_layout_segment *,
1623                           struct pnfs_layout_segment *),
1624                   struct list_head *free_me)
1625{
1626        struct pnfs_layout_segment *lp, *tmp;
1627
1628        dprintk("%s:Begin\n", __func__);
1629
1630        list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
1631                if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
1632                        continue;
1633                if (do_merge(lseg, lp)) {
1634                        mark_lseg_invalid(lp, free_me);
1635                        continue;
1636                }
1637                if (is_after(&lseg->pls_range, &lp->pls_range))
1638                        continue;
1639                list_add_tail(&lseg->pls_list, &lp->pls_list);
1640                dprintk("%s: inserted lseg %p "
1641                        "iomode %d offset %llu length %llu before "
1642                        "lp %p iomode %d offset %llu length %llu\n",
1643                        __func__, lseg, lseg->pls_range.iomode,
1644                        lseg->pls_range.offset, lseg->pls_range.length,
1645                        lp, lp->pls_range.iomode, lp->pls_range.offset,
1646                        lp->pls_range.length);
1647                goto out;
1648        }
1649        list_add_tail(&lseg->pls_list, &lo->plh_segs);
1650        dprintk("%s: inserted lseg %p "
1651                "iomode %d offset %llu length %llu at tail\n",
1652                __func__, lseg, lseg->pls_range.iomode,
1653                lseg->pls_range.offset, lseg->pls_range.length);
1654out:
1655        pnfs_get_layout_hdr(lo);
1656
1657        dprintk("%s:Return\n", __func__);
1658}
1659EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
1660
1661static void
1662pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1663                   struct pnfs_layout_segment *lseg,
1664                   struct list_head *free_me)
1665{
1666        struct inode *inode = lo->plh_inode;
1667        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1668
1669        if (ld->add_lseg != NULL)
1670                ld->add_lseg(lo, lseg, free_me);
1671        else
1672                pnfs_generic_layout_insert_lseg(lo, lseg,
1673                                pnfs_lseg_range_is_after,
1674                                pnfs_lseg_no_merge,
1675                                free_me);
1676}
1677
1678static struct pnfs_layout_hdr *
1679alloc_init_layout_hdr(struct inode *ino,
1680                      struct nfs_open_context *ctx,
1681                      gfp_t gfp_flags)
1682{
1683        struct pnfs_layout_hdr *lo;
1684
1685        lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
1686        if (!lo)
1687                return NULL;
1688        refcount_set(&lo->plh_refcount, 1);
1689        INIT_LIST_HEAD(&lo->plh_layouts);
1690        INIT_LIST_HEAD(&lo->plh_segs);
1691        INIT_LIST_HEAD(&lo->plh_return_segs);
1692        INIT_LIST_HEAD(&lo->plh_bulk_destroy);
1693        lo->plh_inode = ino;
1694        lo->plh_lc_cred = get_cred(ctx->cred);
1695        lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
1696        return lo;
1697}
1698
1699static struct pnfs_layout_hdr *
1700pnfs_find_alloc_layout(struct inode *ino,
1701                       struct nfs_open_context *ctx,
1702                       gfp_t gfp_flags)
1703        __releases(&ino->i_lock)
1704        __acquires(&ino->i_lock)
1705{
1706        struct nfs_inode *nfsi = NFS_I(ino);
1707        struct pnfs_layout_hdr *new = NULL;
1708
1709        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
1710
1711        if (nfsi->layout != NULL)
1712                goto out_existing;
1713        spin_unlock(&ino->i_lock);
1714        new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
1715        spin_lock(&ino->i_lock);
1716
1717        if (likely(nfsi->layout == NULL)) {     /* Won the race? */
1718                nfsi->layout = new;
1719                return new;
1720        } else if (new != NULL)
1721                pnfs_free_layout_hdr(new);
1722out_existing:
1723        pnfs_get_layout_hdr(nfsi->layout);
1724        return nfsi->layout;
1725}
1726
1727/*
1728 * iomode matching rules:
1729 *
1730 * range iomode   lseg iomode   strict   match
1731 * ------------   -----------   ------   -----
1732 * ANY            READ          N/A      true
1733 * ANY            RW            N/A      true
1734 * RW             READ          N/A      false
1735 * RW             RW            N/A      true
1736 * READ           READ          N/A      true
1737 * READ           RW            true     false
1738 * READ           RW            false    true
1739 */
1740static bool
1741pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1742                 const struct pnfs_layout_range *range,
1743                 bool strict_iomode)
1744{
1745        struct pnfs_layout_range range1;
1746
1747        if ((range->iomode == IOMODE_RW &&
1748             ls_range->iomode != IOMODE_RW) ||
1749            (range->iomode != ls_range->iomode &&
1750             strict_iomode) ||
1751            !pnfs_lseg_range_intersecting(ls_range, range))
1752                return false;
1753
1754        /* range1 covers only the first byte in the range */
1755        range1 = *range;
1756        range1.length = 1;
1757        return pnfs_lseg_range_contained(ls_range, &range1);
1758}
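/*
 * For example (illustrative only): a READ request matches a cached RW
 * lseg when strict_iomode is false, but an RW request never matches a
 * READ-only lseg. Beyond the iomode check, only the first byte of the
 * requested range must be covered: a request starting at offset 4096
 * matches an lseg covering [0, 8192) even if the request extends past
 * the lseg's end.
 */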
1759
1760/*
1761 * lookup range in layout
1762 */
1763static struct pnfs_layout_segment *
1764pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1765                struct pnfs_layout_range *range,
1766                bool strict_iomode)
1767{
1768        struct pnfs_layout_segment *lseg, *ret = NULL;
1769
1770        dprintk("%s:Begin\n", __func__);
1771
1772        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1773                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1774                    pnfs_lseg_range_match(&lseg->pls_range, range,
1775                                          strict_iomode)) {
1776                        ret = pnfs_get_lseg(lseg);
1777                        break;
1778                }
1779        }
1780
1781        dprintk("%s:Return lseg %p ref %d\n",
1782                __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0);
1783        return ret;
1784}
1785
1786/*
1787 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
1788 * to the MDS or over pNFS
1789 *
1790 * The nfs_inode read_io and write_io fields are cumulative counters reset
1791 * when there are no layout segments. Note that in pnfs_update_layout iomode
1792 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
1793 * WRITE request.
1794 *
1795 * A return of true means use MDS I/O.
1796 *
1797 * From rfc 5661:
1798 * If a file's size is smaller than the file size threshold, data accesses
1799 * SHOULD be sent to the metadata server.  If an I/O request has a length that
1800 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
1801 * server.  If both file size and I/O size are provided, the client SHOULD
1802 * reach or exceed both thresholds before sending its read or write
1803 * requests to the data server.
1804 */
1805static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1806                                     struct inode *ino, int iomode)
1807{
1808        struct nfs4_threshold *t = ctx->mdsthreshold;
1809        struct nfs_inode *nfsi = NFS_I(ino);
1810        loff_t fsize = i_size_read(ino);
1811        bool size = false, size_set = false, io = false, io_set = false, ret = false;
1812
1813        if (t == NULL)
1814                return ret;
1815
1816        dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
1817                __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
1818
1819        switch (iomode) {
1820        case IOMODE_READ:
1821                if (t->bm & THRESHOLD_RD) {
1822                        dprintk("%s fsize %llu\n", __func__, fsize);
1823                        size_set = true;
1824                        if (fsize < t->rd_sz)
1825                                size = true;
1826                }
1827                if (t->bm & THRESHOLD_RD_IO) {
1828                        dprintk("%s nfsi->read_io %llu\n", __func__,
1829                                nfsi->read_io);
1830                        io_set = true;
1831                        if (nfsi->read_io < t->rd_io_sz)
1832                                io = true;
1833                }
1834                break;
1835        case IOMODE_RW:
1836                if (t->bm & THRESHOLD_WR) {
1837                        dprintk("%s fsize %llu\n", __func__, fsize);
1838                        size_set = true;
1839                        if (fsize < t->wr_sz)
1840                                size = true;
1841                }
1842                if (t->bm & THRESHOLD_WR_IO) {
1843                        dprintk("%s nfsi->write_io %llu\n", __func__,
1844                                nfsi->write_io);
1845                        io_set = true;
1846                        if (nfsi->write_io < t->wr_io_sz)
1847                                io = true;
1848                }
1849                break;
1850        }
1851        if (size_set && io_set) {
1852                if (size && io)
1853                        ret = true;
1854        } else if (size || io)
1855                ret = true;
1856
1857        dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
1858        return ret;
1859}
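/*
 * Worked example (illustrative numbers): with t->bm containing
 * THRESHOLD_RD | THRESHOLD_RD_IO, rd_sz = 65536 and rd_io_sz = 32768, a
 * READ of a 4096-byte file with nfsi->read_io == 0 sets both 'size' and
 * 'io', so the function returns true and the I/O is sent to the MDS.
 * Once the file size or the cumulative read_io counter reaches its
 * threshold, the corresponding flag stays false and the I/O goes over
 * pNFS instead.
 */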
1860
1861static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1862{
1863        /*
1864         * send layoutcommit as it can hold up layoutreturn due to lseg
1865         * reference
1866         */
1867        pnfs_layoutcommit_inode(lo->plh_inode, false);
1868        return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1869                                   nfs_wait_bit_killable,
1870                                   TASK_KILLABLE);
1871}
1872
1873static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
1874{
1875        atomic_inc(&lo->plh_outstanding);
1876}
1877
1878static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
1879{
1880        if (atomic_dec_and_test(&lo->plh_outstanding))
1881                wake_up_var(&lo->plh_outstanding);
1882}
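/*
 * Note: the plh_outstanding counter incremented in nfs_layoutget_begin()
 * pairs with the wait_var_event_killable() in pnfs_update_layout(): when
 * the segment list is empty but older layoutgets are still in flight,
 * new callers wait, since those outstanding replies may be subject to a
 * layoutrecall.
 */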
1883
1884static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1885{
1886        unsigned long *bitlock = &lo->plh_flags;
1887
1888        clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1889        smp_mb__after_atomic();
1890        wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1891}
1892
1893static void _add_to_server_list(struct pnfs_layout_hdr *lo,
1894                                struct nfs_server *server)
1895{
1896        if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
1897                struct nfs_client *clp = server->nfs_client;
1898
1899                /* The lo must be on the clp list if there is any
1900                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1901                 */
1902                spin_lock(&clp->cl_lock);
1903                list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
1904                spin_unlock(&clp->cl_lock);
1905        }
1906}
1907
1908/*
1909 * Layout segment is retrieved from the server if not cached.
1910 * The appropriate layout segment is referenced and returned to the caller.
1911 */
1912struct pnfs_layout_segment *
1913pnfs_update_layout(struct inode *ino,
1914                   struct nfs_open_context *ctx,
1915                   loff_t pos,
1916                   u64 count,
1917                   enum pnfs_iomode iomode,
1918                   bool strict_iomode,
1919                   gfp_t gfp_flags)
1920{
1921        struct pnfs_layout_range arg = {
1922                .iomode = iomode,
1923                .offset = pos,
1924                .length = count,
1925        };
1926        unsigned pg_offset;
1927        struct nfs_server *server = NFS_SERVER(ino);
1928        struct nfs_client *clp = server->nfs_client;
1929        struct pnfs_layout_hdr *lo = NULL;
1930        struct pnfs_layout_segment *lseg = NULL;
1931        struct nfs4_layoutget *lgp;
1932        nfs4_stateid stateid;
1933        long timeout = 0;
1934        unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
1935        bool first;
1936
1937        if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1938                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1939                                 PNFS_UPDATE_LAYOUT_NO_PNFS);
1940                goto out;
1941        }
1942
1943        if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1944                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1945                                 PNFS_UPDATE_LAYOUT_MDSTHRESH);
1946                goto out;
1947        }
1948
1949lookup_again:
1950        lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
1951        if (IS_ERR(lseg))
1952                goto out;
1953        first = false;
1954        spin_lock(&ino->i_lock);
1955        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1956        if (lo == NULL) {
1957                spin_unlock(&ino->i_lock);
1958                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1959                                 PNFS_UPDATE_LAYOUT_NOMEM);
1960                goto out;
1961        }
1962
1963        /* Do we even need to bother with this? */
1964        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1965                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1966                                 PNFS_UPDATE_LAYOUT_BULK_RECALL);
1967                dprintk("%s matches recall, use MDS\n", __func__);
1968                goto out_unlock;
1969        }
1970
1971        /* if LAYOUTGET already failed once we don't try again */
1972        if (pnfs_layout_io_test_failed(lo, iomode)) {
1973                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1974                                 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1975                goto out_unlock;
1976        }
1977
1978        /*
1979         * If the layout segment list is empty, but there are outstanding
1980         * layoutget calls, then they might be subject to a layoutrecall.
1981         */
1982        if (list_empty(&lo->plh_segs) &&
1983            atomic_read(&lo->plh_outstanding) != 0) {
1984                spin_unlock(&ino->i_lock);
1985                lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
1986                                        !atomic_read(&lo->plh_outstanding)));
1987                if (IS_ERR(lseg))
1988                        goto out_put_layout_hdr;
1989                pnfs_put_layout_hdr(lo);
1990                goto lookup_again;
1991        }
1992
1993        lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
1994        if (lseg) {
1995                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1996                                PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1997                goto out_unlock;
1998        }
1999
2000        /*
2001         * Choose a stateid for the LAYOUTGET. If we don't have a layout
2002         * stateid, or it has been invalidated, then we must use the open
2003         * stateid.
2004         */
2005        if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
2006                int status;
2007
2008                /*
2009                 * The first layoutget for the file. Need to serialize per
2010                 * RFC 5661 Errata 3208.
2011                 */
2012                if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
2013                                     &lo->plh_flags)) {
2014                        spin_unlock(&ino->i_lock);
2015                        lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
2016                                                NFS_LAYOUT_FIRST_LAYOUTGET,
2017                                                TASK_KILLABLE));
2018                        if (IS_ERR(lseg))
2019                                goto out_put_layout_hdr;
2020                        pnfs_put_layout_hdr(lo);
2021                        dprintk("%s retrying\n", __func__);
2022                        goto lookup_again;
2023                }
2024
2025                spin_unlock(&ino->i_lock);
2026                first = true;
2027                status = nfs4_select_rw_stateid(ctx->state,
2028                                        iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
2029                                        NULL, &stateid, NULL);
2030                if (status != 0) {
2031                        lseg = ERR_PTR(status);
2032                        trace_pnfs_update_layout(ino, pos, count,
2033                                        iomode, lo, lseg,
2034                                        PNFS_UPDATE_LAYOUT_INVALID_OPEN);
2035                        nfs4_schedule_stateid_recovery(server, ctx->state);
2036                        pnfs_clear_first_layoutget(lo);
2037                        pnfs_put_layout_hdr(lo);
2038                        goto lookup_again;
2039                }
2040                spin_lock(&ino->i_lock);
2041        } else {
2042                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
2043        }
2044
2045        /*
2046         * Because we free lsegs before sending LAYOUTRETURN, we need to wait
2047         * for LAYOUTRETURN even if first is true.
2048         */
2049        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
2050                spin_unlock(&ino->i_lock);
2051                dprintk("%s wait for layoutreturn\n", __func__);
2052                lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
2053                if (!IS_ERR(lseg)) {
2054                        if (first)
2055                                pnfs_clear_first_layoutget(lo);
2056                        pnfs_put_layout_hdr(lo);
2057                        dprintk("%s retrying\n", __func__);
2058                        trace_pnfs_update_layout(ino, pos, count, iomode, lo,
2059                                        lseg, PNFS_UPDATE_LAYOUT_RETRY);
2060                        goto lookup_again;
2061                }
2062                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2063                                PNFS_UPDATE_LAYOUT_RETURN);
2064                goto out_put_layout_hdr;
2065        }
2066
2067        if (pnfs_layoutgets_blocked(lo)) {
2068                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2069                                PNFS_UPDATE_LAYOUT_BLOCKED);
2070                goto out_unlock;
2071        }
2072        nfs_layoutget_begin(lo);
2073        spin_unlock(&ino->i_lock);
2074
2075        _add_to_server_list(lo, server);
2076
2077        pg_offset = arg.offset & ~PAGE_MASK;
2078        if (pg_offset) {
2079                arg.offset -= pg_offset;
2080                arg.length += pg_offset;
2081        }
2082        if (arg.length != NFS4_MAX_UINT64)
2083                arg.length = PAGE_ALIGN(arg.length);
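        /*
         * Example (illustrative): with 4K pages, pos = 5000 and count = 3000
         * yield pg_offset = 904, so the LAYOUTGET range becomes
         * offset = 4096, length = PAGE_ALIGN(3904) = 4096, i.e. whole pages
         * covering bytes 4096-8191 of the file.
         */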
2084
2085        lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
2086        if (!lgp) {
2087                trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
2088                                         PNFS_UPDATE_LAYOUT_NOMEM);
2089                nfs_layoutget_end(lo);
2090                goto out_put_layout_hdr;
2091        }
2092
2093        lseg = nfs4_proc_layoutget(lgp, &timeout);
2094        trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2095                                 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
2096        nfs_layoutget_end(lo);
2097        if (IS_ERR(lseg)) {
2098                switch (PTR_ERR(lseg)) {
2099                case -EBUSY:
2100                        if (time_after(jiffies, giveup))
2101                                lseg = NULL;
2102                        break;
2103                case -ERECALLCONFLICT:
2104                case -EAGAIN:
2105                        break;
2106                default:
2107                        if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
2108                                pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
2109                                lseg = NULL;
2110                        }
2111                        goto out_put_layout_hdr;
2112                }
2113                if (lseg) {
2114                        if (first)
2115                                pnfs_clear_first_layoutget(lo);
2116                        trace_pnfs_update_layout(ino, pos, count,
2117                                iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
2118                        pnfs_put_layout_hdr(lo);
2119                        goto lookup_again;
2120                }
2121        } else {
2122                pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
2123        }
2124
2125out_put_layout_hdr:
2126        if (first)
2127                pnfs_clear_first_layoutget(lo);
2128        trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
2129                                 PNFS_UPDATE_LAYOUT_EXIT);
2130        pnfs_put_layout_hdr(lo);
2131out:
2132        dprintk("%s: inode %s/%llu pNFS layout segment %s for "
2133                        "(%s, offset: %llu, length: %llu)\n",
2134                        __func__, ino->i_sb->s_id,
2135                        (unsigned long long)NFS_FILEID(ino),
2136                        IS_ERR_OR_NULL(lseg) ? "not found" : "found",
2137                        iomode==IOMODE_RW ?  "read/write" : "read-only",
2138                        iomode == IOMODE_RW ? "read/write" : "read-only",
2139                        (unsigned long long)count);
2140        return lseg;
2141out_unlock:
2142        spin_unlock(&ino->i_lock);
2143        goto out_put_layout_hdr;
2144}
2145EXPORT_SYMBOL_GPL(pnfs_update_layout);
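/*
 * A typical caller (sketch, mirroring pnfs_generic_pg_init_read() below):
 *
 *	lseg = pnfs_update_layout(inode, nfs_req_openctx(req),
 *				  req_offset(req), rd_size,
 *				  IOMODE_READ, false, GFP_KERNEL);
 *	if (IS_ERR(lseg))
 *		record the error and fall back to I/O through the MDS;
 *
 * i.e. errors are recorded in the pageio descriptor and the request is
 * simply redirected to the metadata server.
 */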
2146
2147static bool
2148pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
2149{
2150        switch (range->iomode) {
2151        case IOMODE_READ:
2152        case IOMODE_RW:
2153                break;
2154        default:
2155                return false;
2156        }
2157        if (range->offset == NFS4_MAX_UINT64)
2158                return false;
2159        if (range->length == 0)
2160                return false;
2161        if (range->length != NFS4_MAX_UINT64 &&
2162            range->length > NFS4_MAX_UINT64 - range->offset)
2163                return false;
2164        return true;
2165}
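/*
 * For instance (illustrative): a LAYOUTGET reply whose range has a zero
 * length, an offset of NFS4_MAX_UINT64, or an offset + length that would
 * wrap past NFS4_MAX_UINT64 (the "to EOF" length of NFS4_MAX_UINT64
 * itself excepted) fails this check, and pnfs_layout_process() rejects
 * the reply with -EINVAL instead of caching a bogus segment.
 */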
2166
2167static struct pnfs_layout_hdr *
2168_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
2169{
2170        struct pnfs_layout_hdr *lo;
2171
2172        spin_lock(&ino->i_lock);
2173        lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL);
2174        if (!lo)
2175                goto out_unlock;
2176        if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
2177                goto out_unlock;
2178        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
2179                goto out_unlock;
2180        if (pnfs_layoutgets_blocked(lo))
2181                goto out_unlock;
2182        if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags))
2183                goto out_unlock;
2184        nfs_layoutget_begin(lo);
2185        spin_unlock(&ino->i_lock);
2186        _add_to_server_list(lo, NFS_SERVER(ino));
2187        return lo;
2188
2189out_unlock:
2190        spin_unlock(&ino->i_lock);
2191        pnfs_put_layout_hdr(lo);
2192        return NULL;
2193}
2194
2195static void _lgopen_prepare_attached(struct nfs4_opendata *data,
2196                                     struct nfs_open_context *ctx)
2197{
2198        struct inode *ino = data->dentry->d_inode;
2199        struct pnfs_layout_range rng = {
2200                .iomode = (data->o_arg.fmode & FMODE_WRITE) ?
2201                          IOMODE_RW: IOMODE_READ,
2202                .offset = 0,
2203                .length = NFS4_MAX_UINT64,
2204        };
2205        struct nfs4_layoutget *lgp;
2206        struct pnfs_layout_hdr *lo;
2207
2208        /* Heuristic: don't send layoutget if we have cached data */
2209        if (rng.iomode == IOMODE_READ &&
2210           (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0))
2211                return;
2212
2213        lo = _pnfs_grab_empty_layout(ino, ctx);
2214        if (!lo)
2215                return;
2216        lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
2217                                             &rng, GFP_KERNEL);
2218        if (!lgp) {
2219                pnfs_clear_first_layoutget(lo);
2220                pnfs_put_layout_hdr(lo);
2221                return;
2222        }
2223        data->lgp = lgp;
2224        data->o_arg.lg_args = &lgp->args;
2225        data->o_res.lg_res = &lgp->res;
2226}
2227
2228static void _lgopen_prepare_floating(struct nfs4_opendata *data,
2229                                     struct nfs_open_context *ctx)
2230{
2231        struct pnfs_layout_range rng = {
2232                .iomode = (data->o_arg.fmode & FMODE_WRITE) ?
2233                          IOMODE_RW: IOMODE_READ,
2234                .offset = 0,
2235                .length = NFS4_MAX_UINT64,
2236        };
2237        struct nfs4_layoutget *lgp;
2238
2239        lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid,
2240                                             &rng, GFP_KERNEL);
2241        if (!lgp)
2242                return;
2243        data->lgp = lgp;
2244        data->o_arg.lg_args = &lgp->args;
2245        data->o_res.lg_res = &lgp->res;
2246}
2247
2248void pnfs_lgopen_prepare(struct nfs4_opendata *data,
2249                         struct nfs_open_context *ctx)
2250{
2251        struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
2252
2253        if (!(pnfs_enabled_sb(server) &&
2254              server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN))
2255                return;
2256        /* Could check on max_ops, but currently hardcoded high enough */
2257        if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN))
2258                return;
2259        if (data->state)
2260                _lgopen_prepare_attached(data, ctx);
2261        else
2262                _lgopen_prepare_floating(data, ctx);
2263}
2264
2265void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
2266                       struct nfs_open_context *ctx)
2267{
2268        struct pnfs_layout_hdr *lo;
2269        struct pnfs_layout_segment *lseg;
2270        struct nfs_server *srv = NFS_SERVER(ino);
2271        u32 iomode;
2272
2273        if (!lgp)
2274                return;
2275        dprintk("%s: entered with status %i\n", __func__, lgp->res.status);
2276        if (lgp->res.status) {
2277                switch (lgp->res.status) {
2278                default:
2279                        break;
2280                /*
2281                 * Halt lgopen attempts if the server doesn't recognise
2282                 * the "current stateid" value, the layout type, or the
2283                 * layoutget operation as being valid.
2284                 * Also if it complains about too many ops in the compound
2285                 * or of the request/reply being too big.
2286                 */
2287                case -NFS4ERR_BAD_STATEID:
2288                case -NFS4ERR_NOTSUPP:
2289                case -NFS4ERR_REP_TOO_BIG:
2290                case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
2291                case -NFS4ERR_REQ_TOO_BIG:
2292                case -NFS4ERR_TOO_MANY_OPS:
2293                case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
2294                        srv->caps &= ~NFS_CAP_LGOPEN;
2295                }
2296                return;
2297        }
2298        if (!lgp->args.inode) {
2299                lo = _pnfs_grab_empty_layout(ino, ctx);
2300                if (!lo)
2301                        return;
2302                lgp->args.inode = ino;
2303        } else
2304                lo = NFS_I(lgp->args.inode)->layout;
2305
2306        lseg = pnfs_layout_process(lgp);
2307        if (!IS_ERR(lseg)) {
2308                iomode = lgp->args.range.iomode;
2309                pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
2310                pnfs_put_lseg(lseg);
2311        }
2312}
2313
2314void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
2315{
2316        if (lgp != NULL) {
2317                struct inode *inode = lgp->args.inode;
2318                if (inode) {
2319                        struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
2320                        pnfs_clear_first_layoutget(lo);
2321                        nfs_layoutget_end(lo);
2322                }
2323                pnfs_layoutget_free(lgp);
2324        }
2325}
2326
2327struct pnfs_layout_segment *
2328pnfs_layout_process(struct nfs4_layoutget *lgp)
2329{
2330        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
2331        struct nfs4_layoutget_res *res = &lgp->res;
2332        struct pnfs_layout_segment *lseg;
2333        struct inode *ino = lo->plh_inode;
2334        LIST_HEAD(free_me);
2335
2336        if (!pnfs_sanity_check_layout_range(&res->range))
2337                return ERR_PTR(-EINVAL);
2338
2339        /* Inject layout blob into I/O device driver */
2340        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
2341        if (IS_ERR_OR_NULL(lseg)) {
2342                if (!lseg)
2343                        lseg = ERR_PTR(-ENOMEM);
2344
2345                dprintk("%s: Could not allocate layout: error %ld\n",
2346                       __func__, PTR_ERR(lseg));
2347                return lseg;
2348        }
2349
2350        pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
2351
2352        spin_lock(&ino->i_lock);
2353        if (pnfs_layoutgets_blocked(lo)) {
2354                dprintk("%s forget reply due to state\n", __func__);
2355                goto out_forget;
2356        }
2357
2358        if (!pnfs_layout_is_valid(lo)) {
2359                /* We have a completely new layout */
2360                pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
2361        } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
2362                /* existing state ID, make sure the sequence number matches. */
2363                if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
2364                        dprintk("%s forget reply due to sequence\n", __func__);
2365                        goto out_forget;
2366                }
2367                pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
2368        } else {
2369                /*
2370                 * We got an entirely new state ID.  Mark all segments for the
2371                 * inode invalid, and retry the layoutget
2372                 */
2373                pnfs_mark_layout_stateid_invalid(lo, &free_me);
2374                goto out_forget;
2375        }
2376
2377        pnfs_get_lseg(lseg);
2378        pnfs_layout_insert_lseg(lo, lseg, &free_me);
2379
2381        if (res->return_on_close)
2382                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
2383
2384        spin_unlock(&ino->i_lock);
2385        pnfs_free_lseg_list(&free_me);
2386        return lseg;
2387
2388out_forget:
2389        spin_unlock(&ino->i_lock);
2390        lseg->pls_layout = lo;
2391        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
2392        return ERR_PTR(-EAGAIN);
2393}
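/*
 * A quick summary of the stateid handling above: a completely new layout
 * installs the returned stateid wholesale; a reply matching the cached
 * stateid's "other" field is accepted only if its sequence number is not
 * blocked (otherwise the reply is forgotten); and an entirely new stateid
 * invalidates every cached segment and makes the caller retry via
 * -EAGAIN.
 */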
2394
2395static int
2396mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg,
2397                struct list_head *tmp_list)
2398{
2399        if (!mark_lseg_invalid(lseg, tmp_list))
2400                return 0;
2401        pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg);
2402        return 1;
2403}
2404
2405/**
2406 * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
2407 * @lo: pointer to layout header
2408 * @tmp_list: list header to be used with pnfs_free_lseg_list()
2409 * @return_range: describe layout segment ranges to be returned
2410 * @seq: stateid seqid to match
2411 *
2412 * This function is mainly intended for use by layoutrecall. It attempts
2413 * to free the layout segment immediately, or else to mark it for return
2414 * as soon as its reference count drops to zero.
2415 *
2416 * Returns
2417 * - 0: a layoutreturn needs to be scheduled.
2418 * - -EBUSY: there are layout segments that are still in use.
2419 * - -ENOENT: there are no layout segments that need to be returned.
2420 */
2421int
2422pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
2423                                struct list_head *tmp_list,
2424                                const struct pnfs_layout_range *return_range,
2425                                u32 seq)
2426{
2427        struct pnfs_layout_segment *lseg, *next;
2428        int remaining = 0;
2429
2430        dprintk("%s:Begin lo %p\n", __func__, lo);
2431
2432        assert_spin_locked(&lo->plh_inode->i_lock);
2433
2434        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
2435                if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
2436                        dprintk("%s: marking lseg %p iomode %d "
2437                                "offset %llu length %llu\n", __func__,
2438                                lseg, lseg->pls_range.iomode,
2439                                lseg->pls_range.offset,
2440                                lseg->pls_range.length);
2441                        if (mark_lseg_invalid_or_return(lseg, tmp_list))
2442                                continue;
2443                        remaining++;
2444                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
2445                }
2446
2447        if (remaining) {
2448                pnfs_set_plh_return_info(lo, return_range->iomode, seq);
2449                return -EBUSY;
2450        }
2451
2452        if (!list_empty(&lo->plh_return_segs)) {
2453                pnfs_set_plh_return_info(lo, return_range->iomode, seq);
2454                return 0;
2455        }
2456
2457        return -ENOENT;
2458}
2459
2460static void
2461pnfs_mark_layout_for_return(struct inode *inode,
2462                            const struct pnfs_layout_range *range)
2463{
2464        struct pnfs_layout_hdr *lo;
2465        bool return_now = false;
2466
2467        spin_lock(&inode->i_lock);
2468        lo = NFS_I(inode)->layout;
2469        if (!pnfs_layout_is_valid(lo)) {
2470                spin_unlock(&inode->i_lock);
2471                return;
2472        }
2473        pnfs_set_plh_return_info(lo, range->iomode, 0);
2474        /*
2475         * mark all matching lsegs so that we are sure to have no live
2476         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
2477         * for how it works.
2478         */
2479        if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
2480                const struct cred *cred;
2481                nfs4_stateid stateid;
2482                enum pnfs_iomode iomode;
2483
2484                return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
2485                spin_unlock(&inode->i_lock);
2486                if (return_now)
2487                        pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
2488        } else {
2489                spin_unlock(&inode->i_lock);
2490                nfs_commit_inode(inode, 0);
2491        }
2492}
2493
2494void pnfs_error_mark_layout_for_return(struct inode *inode,
2495                                       struct pnfs_layout_segment *lseg)
2496{
2497        struct pnfs_layout_range range = {
2498                .iomode = lseg->pls_range.iomode,
2499                .offset = 0,
2500                .length = NFS4_MAX_UINT64,
2501        };
2502
2503        pnfs_mark_layout_for_return(inode, &range);
2504}
2505EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
2506
2507static bool
2508pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
2509{
2510        return pnfs_layout_is_valid(lo) &&
2511                !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
2512                !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
2513}
2514
2515static struct pnfs_layout_segment *
2516pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
2517                     const struct pnfs_layout_range *range,
2518                     enum pnfs_iomode iomode)
2519{
2520        struct pnfs_layout_segment *lseg;
2521
2522        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
2523                if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
2524                        continue;
2525                if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
2526                        continue;
2527                if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
2528                        continue;
2529                if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
2530                        return lseg;
2531        }
2532        return NULL;
2533}
2534
2535/* Find open file states whose mode matches that of the range */
2536static bool
2537pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
2538                                 const struct pnfs_layout_range *range)
2539{
2540        struct list_head *head;
2541        struct nfs_open_context *ctx;
2542        fmode_t mode = 0;
2543
2544        if (!pnfs_layout_can_be_returned(lo) ||
2545            !pnfs_find_first_lseg(lo, range, range->iomode))
2546                return false;
2547
2548        head = &NFS_I(lo->plh_inode)->open_files;
2549        list_for_each_entry_rcu(ctx, head, list) {
2550                if (ctx->state)
2551                        mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
2552        }
2553
2554        switch (range->iomode) {
2555        default:
2556                break;
2557        case IOMODE_READ:
2558                mode &= ~FMODE_WRITE;
2559                break;
2560        case IOMODE_RW:
2561                if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
2562                        mode &= ~FMODE_READ;
2563        }
2564        return mode == 0;
2565}
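/*
 * Example (illustrative): if every remaining open context on the inode is
 * read-only, an IOMODE_RW range counts as unused provided a separate READ
 * lseg exists to serve the readers (FMODE_READ is masked off in that
 * case), so the RW layout may be returned while the READ layout stays for
 * as long as there are readers.
 */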
2566
2567static int
2568pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
2569{
2570        const struct pnfs_layout_range *range = data;
2571        struct pnfs_layout_hdr *lo;
2572        struct inode *inode;
2573restart:
2574        rcu_read_lock();
2575        list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
2576                if (!pnfs_layout_can_be_returned(lo) ||
2577                    test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
2578                        continue;
2579                inode = lo->plh_inode;
2580                spin_lock(&inode->i_lock);
2581                if (!pnfs_should_return_unused_layout(lo, range)) {
2582                        spin_unlock(&inode->i_lock);
2583                        continue;
2584                }
2585                spin_unlock(&inode->i_lock);
2586                inode = pnfs_grab_inode_layout_hdr(lo);
2587                if (!inode)
2588                        continue;
2589                rcu_read_unlock();
2590                pnfs_mark_layout_for_return(inode, range);
2591                iput(inode);
2592                cond_resched();
2593                goto restart;
2594        }
2595        rcu_read_unlock();
2596        return 0;
2597}
2598
2599void
2600pnfs_layout_return_unused_byclid(struct nfs_client *clp,
2601                                 enum pnfs_iomode iomode)
2602{
2603        struct pnfs_layout_range range = {
2604                .iomode = iomode,
2605                .offset = 0,
2606                .length = NFS4_MAX_UINT64,
2607        };
2608
2609        nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
2610                        &range);
2611}
2612
2613void
2614pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
2615{
2616        if (pgio->pg_lseg == NULL ||
2617            test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
2618                return;
2619        pnfs_put_lseg(pgio->pg_lseg);
2620        pgio->pg_lseg = NULL;
2621}
2622EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
2623
2624/*
2625 * Check for any intersection between the request and the pgio->pg_lseg,
2626 * and if none, put this pgio->pg_lseg away.
2627 */
2628void
2629pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2630{
2631        if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
2632                pnfs_put_lseg(pgio->pg_lseg);
2633                pgio->pg_lseg = NULL;
2634        }
2635}
2636EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
2637
2638void
2639pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2640{
2641        u64 rd_size = req->wb_bytes;
2642
2643        pnfs_generic_pg_check_layout(pgio);
2644        pnfs_generic_pg_check_range(pgio, req);
2645        if (pgio->pg_lseg == NULL) {
2646                if (pgio->pg_dreq == NULL)
2647                        rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
2648                else
2649                        rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
2650
2651                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2652                                                   nfs_req_openctx(req),
2653                                                   req_offset(req),
2654                                                   rd_size,
2655                                                   IOMODE_READ,
2656                                                   false,
2657                                                   GFP_KERNEL);
2658                if (IS_ERR(pgio->pg_lseg)) {
2659                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2660                        pgio->pg_lseg = NULL;
2661                        return;
2662                }
2663        }
2664        /* If no lseg, fall back to read through mds */
2665        if (pgio->pg_lseg == NULL)
2666                nfs_pageio_reset_read_mds(pgio);
2668}
2669EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
2670
2671void
2672pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
2673                           struct nfs_page *req, u64 wb_size)
2674{
2675        pnfs_generic_pg_check_layout(pgio);
2676        pnfs_generic_pg_check_range(pgio, req);
2677        if (pgio->pg_lseg == NULL) {
2678                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2679                                                   nfs_req_openctx(req),
2680                                                   req_offset(req),
2681                                                   wb_size,
2682                                                   IOMODE_RW,
2683                                                   false,
2684                                                   GFP_KERNEL);
2685                if (IS_ERR(pgio->pg_lseg)) {
2686                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2687                        pgio->pg_lseg = NULL;
2688                        return;
2689                }
2690        }
2691        /* If no lseg, fall back to write through mds */
2692        if (pgio->pg_lseg == NULL)
2693                nfs_pageio_reset_write_mds(pgio);
2694}
2695EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
2696
2697void
2698pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
2699{
2700        if (desc->pg_lseg) {
2701                pnfs_put_lseg(desc->pg_lseg);
2702                desc->pg_lseg = NULL;
2703        }
2704}
2705EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
2706
2707/*
2708 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
2709 * of bytes (maximum @req->wb_bytes) that can be coalesced.
2710 */
2711size_t
2712pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
2713                     struct nfs_page *prev, struct nfs_page *req)
2714{
2715        unsigned int size;
2716        u64 seg_end, req_start, seg_left;
2717
2718        size = nfs_generic_pg_test(pgio, prev, req);
2719        if (!size)
2720                return 0;
2721
2722        /*
2723         * 'size' contains the number of bytes left in the current page (up
2724         * to the original size asked for in @req->wb_bytes).
2725         *
2726         * Calculate how many bytes are left in the layout segment
2727         * and if there are less bytes than 'size', return that instead.
2728         *
2729         * Please also note that 'seg_end' below is computed as the offset
2730         * of the first byte that lies outside the pnfs_layout_range.
2731         *
2732         */
2733        if (pgio->pg_lseg) {
2734                seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
2735                                     pgio->pg_lseg->pls_range.length);
2736                req_start = req_offset(req);
2737
2738                /* start of request is past the last byte of this segment */
2739                if (req_start >= seg_end)
2740                        return 0;
2741
2742                /* adjust 'size' iff there are fewer bytes left in the
2743                 * segment than what nfs_generic_pg_test returned */
2744                seg_left = seg_end - req_start;
2745                if (seg_left < size)
2746                        size = (unsigned int)seg_left;
2747        }
2748
2749        return size;
2750}
2751EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
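/*
 * Example (illustrative numbers): if nfs_generic_pg_test() allows 8192
 * bytes but pgio->pg_lseg covers [0, 8192) and @req starts at offset
 * 4096, seg_left is 4096 and the coalescing size is clamped to 4096. A
 * request starting at or beyond offset 8192 gets 0 and cannot be added
 * to this descriptor at all.
 */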
2752
2753int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
2754{
2755        struct nfs_pageio_descriptor pgio;
2756
2757        /* Resend all requests through the MDS */
2758        nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
2759                              hdr->completion_ops);
2760        set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
2761        return nfs_pageio_resend(&pgio, hdr);
2762}
2763EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
2764
2765static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
2766{
2768        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
2769        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2770            PNFS_LAYOUTRET_ON_ERROR) {
2771                pnfs_return_layout(hdr->inode);
2772        }
2773        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2774                hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
2775}
2776
2777/*
2778 * Called by non rpc-based layout drivers
2779 */
2780void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
2781{
2782        if (likely(!hdr->pnfs_error)) {
2783                pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
2784                                hdr->mds_offset + hdr->res.count);
2785                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2786        }
2787        trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
2788        if (unlikely(hdr->pnfs_error))
2789                pnfs_ld_handle_write_error(hdr);
2790        hdr->mds_ops->rpc_release(hdr);
2791}
2792EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
2793
2794static void
2795pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
2796                struct nfs_pgio_header *hdr)
2797{
2798        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2799
2800        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2801                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2802                nfs_pageio_reset_write_mds(desc);
2803                mirror->pg_recoalesce = 1;
2804        }
2805        hdr->completion_ops->completion(hdr);
2806}
2807
2808static enum pnfs_try_status
2809pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
2810                        const struct rpc_call_ops *call_ops,
2811                        struct pnfs_layout_segment *lseg,
2812                        int how)
2813{
2814        struct inode *inode = hdr->inode;
2815        enum pnfs_try_status trypnfs;
2816        struct nfs_server *nfss = NFS_SERVER(inode);
2817
2818        hdr->mds_ops = call_ops;
2819
2820        dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
2821                inode->i_ino, hdr->args.count, hdr->args.offset, how);
2822        trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
2823        if (trypnfs != PNFS_NOT_ATTEMPTED)
2824                nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
2825        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2826        return trypnfs;
2827}
2828
2829static void
2830pnfs_do_write(struct nfs_pageio_descriptor *desc,
2831              struct nfs_pgio_header *hdr, int how)
2832{
2833        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2834        struct pnfs_layout_segment *lseg = desc->pg_lseg;
2835        enum pnfs_try_status trypnfs;
2836
2837        trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
2838        switch (trypnfs) {
2839        case PNFS_NOT_ATTEMPTED:
2840                pnfs_write_through_mds(desc, hdr);
                break;
2841        case PNFS_ATTEMPTED:
2842                break;
2843        case PNFS_TRY_AGAIN:
2844                /* cleanup hdr and prepare to redo pnfs */
2845                if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2846                        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2847                        list_splice_init(&hdr->pages, &mirror->pg_list);
2848                        mirror->pg_recoalesce = 1;
2849                }
2850                hdr->mds_ops->rpc_release(hdr);
2851        }
2852}
2853
2854static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
2855{
2856        pnfs_put_lseg(hdr->lseg);
2857        nfs_pgio_header_free(hdr);
2858}
2859
2860int
2861pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
2862{
2863        struct nfs_pgio_header *hdr;
2864        int ret;
2865
2866        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2867        if (!hdr) {
2868                desc->pg_error = -ENOMEM;
2869                return desc->pg_error;
2870        }
2871        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
2872
2873        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2874        ret = nfs_generic_pgio(desc, hdr);
2875        if (!ret)
2876                pnfs_do_write(desc, hdr, desc->pg_ioflags);
2877
2878        return ret;
2879}
2880EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2881
2882int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
2883{
2884        struct nfs_pageio_descriptor pgio;
2885
2886        /* Resend all requests through the MDS */
2887        nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
2888        return nfs_pageio_resend(&pgio, hdr);
2889}
2890EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2891
2892static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
2893{
2894        dprintk("pnfs read error = %d\n", hdr->pnfs_error);
2895        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2896            PNFS_LAYOUTRET_ON_ERROR) {
2897                pnfs_return_layout(hdr->inode);
2898        }
2899        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2900                hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
2901}
2902
2903/*
2904 * Called by non rpc-based layout drivers
2905 */
2906void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
2907{
2908        if (likely(!hdr->pnfs_error))
2909                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2910        trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
2911        if (unlikely(hdr->pnfs_error))
2912                pnfs_ld_handle_read_error(hdr);
2913        hdr->mds_ops->rpc_release(hdr);
2914}
2915EXPORT_SYMBOL_GPL(pnfs_ld_read_done);

static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
                struct nfs_pgio_header *hdr)
{
        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);

        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
                nfs_pageio_reset_read_mds(desc);
                mirror->pg_recoalesce = 1;
        }
        hdr->completion_ops->completion(hdr);
}

/*
 * Call the appropriate parallel I/O subsystem read function.
 */
static enum pnfs_try_status
pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
                       const struct rpc_call_ops *call_ops,
                       struct pnfs_layout_segment *lseg)
{
        struct inode *inode = hdr->inode;
        struct nfs_server *nfss = NFS_SERVER(inode);
        enum pnfs_try_status trypnfs;

        hdr->mds_ops = call_ops;

        dprintk("%s: Reading ino:%lu %u@%llu\n",
                __func__, inode->i_ino, hdr->args.count, hdr->args.offset);

        trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
        if (trypnfs != PNFS_NOT_ATTEMPTED)
                nfs_inc_stats(inode, NFSIOS_PNFS_READ);
        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
        return trypnfs;
}

/* Resend all requests through pnfs. */
void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
{
        struct nfs_pageio_descriptor pgio;

        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                /* Prevent deadlocks with layoutreturn! */
                pnfs_put_lseg(hdr->lseg);
                hdr->lseg = NULL;

                nfs_pageio_init_read(&pgio, hdr->inode, false,
                                        hdr->completion_ops);
                hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
        }
}
EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
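
/*
 * Unlike pnfs_read_done_resend_to_mds() above, this passes force_mds ==
 * false to nfs_pageio_init_read(), so the resend recoalesces through the
 * pNFS ->pg_init path and may pick up a fresh layout segment; the stale
 * lseg is dropped first so an outstanding layoutreturn cannot block on
 * the reference we hold.
 */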

static void
pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
{
        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
        struct pnfs_layout_segment *lseg = desc->pg_lseg;
        enum pnfs_try_status trypnfs;

        trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
        switch (trypnfs) {
        case PNFS_NOT_ATTEMPTED:
                pnfs_read_through_mds(desc, hdr);
                break;
        case PNFS_ATTEMPTED:
                break;
        case PNFS_TRY_AGAIN:
                /* Clean up hdr and prepare to resend through pNFS */
                if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
                        list_splice_init(&hdr->pages, &mirror->pg_list);
                        mirror->pg_recoalesce = 1;
                }
                hdr->mds_ops->rpc_release(hdr);
        }
}

/* Drop the lseg reference taken in pnfs_generic_pg_readpages() */
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
        pnfs_put_lseg(hdr->lseg);
        nfs_pgio_header_free(hdr);
}

int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
        struct nfs_pgio_header *hdr;
        int ret;

        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
        if (!hdr) {
                desc->pg_error = -ENOMEM;
                return desc->pg_error;
        }
        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
        ret = nfs_generic_pgio(desc, hdr);
        if (!ret)
                pnfs_do_read(desc, hdr);
        return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);

/* Release NFS_INO_LAYOUTCOMMITTING and wake up any waiters */
static void pnfs_clear_layoutcommitting(struct inode *inode)
{
        unsigned long *bitlock = &NFS_I(inode)->flags;

        clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
        smp_mb__after_atomic();
        wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
}

/*
 * There can be multiple RW segments.
 */
static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
{
        struct pnfs_layout_segment *lseg;

        list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
                if (lseg->pls_range.iomode == IOMODE_RW &&
                    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                        list_add(&lseg->pls_lc_list, listp);
        }
}

static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
{
        struct pnfs_layout_segment *lseg, *tmp;

        /* Matched by references in pnfs_set_layoutcommit */
        list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
                list_del_init(&lseg->pls_lc_list);
                pnfs_put_lseg(lseg);
        }

        pnfs_clear_layoutcommitting(inode);
}

void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
{
        pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
}
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);

void
pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
                loff_t end_pos)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        bool mark_as_dirty = false;

        spin_lock(&inode->i_lock);
        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
                nfsi->layout->plh_lwb = end_pos;
                mark_as_dirty = true;
                dprintk("%s: Set layoutcommit for inode %lu\n",
                        __func__, inode->i_ino);
        } else if (end_pos > nfsi->layout->plh_lwb)
                nfsi->layout->plh_lwb = end_pos;
        if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
                /* references matched in nfs4_layoutcommit_release */
                pnfs_get_lseg(lseg);
        }
        spin_unlock(&inode->i_lock);
        dprintk("%s: lseg %p end_pos %llu\n",
                __func__, lseg, nfsi->layout->plh_lwb);

        /* If pnfs_layoutcommit_inode() ran between the unlock above and
         * here, the layoutcommit triggered by the sync below will be a
         * no-op, since NFS_INO_LAYOUTCOMMIT will no longer be set. */
        if (mark_as_dirty)
                mark_inode_dirty_sync(inode);
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
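
/*
 * Callers invoke pnfs_set_layoutcommit() when an uncommitted write
 * through the layout completes, passing the offset just past the last
 * byte written. Hedged sketch (hypothetical write-completion handler):
 *
 *	static void example_write_done(struct nfs_pgio_header *hdr)
 *	{
 *		if (hdr->res.verf->committed != NFS_FILE_SYNC)
 *			pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
 *					      hdr->mds_offset + hdr->res.count);
 *	}
 */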

void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
{
        struct nfs_server *nfss = NFS_SERVER(data->args.inode);

        if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
                nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
        pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
}

/*
 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
 * data to disk to allow the server to recover the data if it crashes.
 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
 * is off, and a COMMIT is sent to a data server, or
 * if WRITEs to a data server return NFS_DATA_SYNC.
 */
int
pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
        struct nfs4_layoutcommit_data *data;
        struct nfs_inode *nfsi = NFS_I(inode);
        loff_t end_pos;
        int status;

        if (!pnfs_layoutcommit_outstanding(inode))
                return 0;

        dprintk("--> %s inode %lu\n", __func__, inode->i_ino);

        status = -EAGAIN;
        if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
                if (!sync)
                        goto out;
                status = wait_on_bit_lock_action(&nfsi->flags,
                                NFS_INO_LAYOUTCOMMITTING,
                                nfs_wait_bit_killable,
                                TASK_KILLABLE);
                if (status)
                        goto out;
        }

        status = -ENOMEM;
        /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
        data = kzalloc(sizeof(*data), GFP_NOFS);
        if (!data)
                goto clear_layoutcommitting;

        status = 0;
        spin_lock(&inode->i_lock);
        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
                goto out_unlock;

        INIT_LIST_HEAD(&data->lseg_list);
        pnfs_list_write_lseg(inode, &data->lseg_list);

        end_pos = nfsi->layout->plh_lwb;

        nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
        data->cred = get_cred(nfsi->layout->plh_lc_cred);
        spin_unlock(&inode->i_lock);

        data->args.inode = inode;
        nfs_fattr_init(&data->fattr);
        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
        data->res.fattr = &data->fattr;
        if (end_pos != 0)
                data->args.lastbytewritten = end_pos - 1;
        else
                data->args.lastbytewritten = U64_MAX;
        data->res.server = NFS_SERVER(inode);

        if (ld->prepare_layoutcommit) {
                status = ld->prepare_layoutcommit(&data->args);
                if (status) {
                        put_cred(data->cred);
                        spin_lock(&inode->i_lock);
                        set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
                        if (end_pos > nfsi->layout->plh_lwb)
                                nfsi->layout->plh_lwb = end_pos;
                        goto out_unlock;
                }
        }

        status = nfs4_proc_layoutcommit(data, sync);
out:
        if (status)
                mark_inode_dirty_sync(inode);
        dprintk("<-- %s status %d\n", __func__, status);
        return status;
out_unlock:
        spin_unlock(&inode->i_lock);
        kfree(data);
clear_layoutcommitting:
        pnfs_clear_layoutcommitting(inode);
        goto out;
}
EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
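
/*
 * Note the error contract above: any nonzero status re-marks the inode
 * dirty, so a failed or skipped LAYOUTCOMMIT is simply retried on the
 * next sync. A minimal caller therefore needs no retry loop of its own
 * (hypothetical helper, illustrative only):
 *
 *	static int example_commit_layout(struct inode *inode)
 *	{
 *		return pnfs_layoutcommit_inode(inode, true);
 *	}
 */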

int
pnfs_generic_sync(struct inode *inode, bool datasync)
{
        return pnfs_layoutcommit_inode(inode, true);
}
EXPORT_SYMBOL_GPL(pnfs_generic_sync);

struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
        struct nfs4_threshold *thp;

        thp = kzalloc(sizeof(*thp), GFP_NOFS);
        if (!thp) {
                dprintk("%s mdsthreshold allocation failed\n", __func__);
                return NULL;
        }
        return thp;
}

#if IS_ENABLED(CONFIG_NFS_V4_2)
int
pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
        struct nfs_server *server = NFS_SERVER(inode);
        struct nfs_inode *nfsi = NFS_I(inode);
        struct nfs42_layoutstat_data *data;
        struct pnfs_layout_hdr *hdr;
        int status = 0;

        if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
                goto out;

        if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
                goto out;

        if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
                goto out;

        spin_lock(&inode->i_lock);
        if (!NFS_I(inode)->layout) {
                spin_unlock(&inode->i_lock);
                goto out_clear_layoutstats;
        }
        hdr = NFS_I(inode)->layout;
        pnfs_get_layout_hdr(hdr);
        spin_unlock(&inode->i_lock);

        data = kzalloc(sizeof(*data), gfp_flags);
        if (!data) {
                status = -ENOMEM;
                goto out_put;
        }

        data->args.fh = NFS_FH(inode);
        data->args.inode = inode;
        status = ld->prepare_layoutstats(&data->args);
        if (status)
                goto out_free;

        status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);

out:
        dprintk("%s returns %d\n", __func__, status);
        return status;

out_free:
        kfree(data);
out_put:
        pnfs_put_layout_hdr(hdr);
out_clear_layoutstats:
        smp_mb__before_atomic();
        clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
        smp_mb__after_atomic();
        goto out;
}
EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
#endif
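
/*
 * Layout drivers that implement ->prepare_layoutstats are expected to
 * call this periodically, typically from an I/O accounting path. Hedged
 * sketch (hypothetical rate limiter; the GFP flags depend on calling
 * context):
 *
 *	if (time_after(jiffies, example_next_report))
 *		pnfs_report_layoutstat(inode, GFP_NOIO);
 */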

/* LAYOUTSTATS reporting interval; 0 selects the layout driver's default */
unsigned int layoutstats_timer;
module_param(layoutstats_timer, uint, 0644);
EXPORT_SYMBOL_GPL(layoutstats_timer);

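/*
 * The 0644 mode above makes the parameter runtime-tunable. Assuming this
 * file is built into the nfsv4 module, an administrator could adjust it
 * with, e.g.:
 *
 *	# echo 60 > /sys/module/nfsv4/parameters/layoutstats_timer
 */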