linux/fs/nfs/pnfs.c
/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
#include "nfs42.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        if (local != NULL && !try_module_get(local->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                local = NULL;
        }
        spin_unlock(&pnfs_spinlock);
        return local;
}
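
/*
 * Note: find_pnfs_driver() takes a reference on the module backing the
 * layout driver (try_module_get) so the module cannot be unloaded while
 * a mount is using it.  The matching module_put() happens in
 * unset_pnfs_layoutdriver() and on the error path of
 * set_pnfs_layoutdriver() below.
 */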

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                if (nfss->pnfs_curr_ld->clear_layoutdriver)
                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                /* Decrement the MDS count. Purge the deviceid cache if zero */
                if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
                        nfs4_deviceid_purge_client(nfss->nfs_client);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
                      u32 id)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;

        if (id == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
                        __func__, id, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }
        ld_type = find_pnfs_driver(id);
        if (!ld_type) {
                request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        dprintk("%s: No pNFS module found for %u.\n",
                                __func__, id);
                        goto out_no_driver;
                }
        }
        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
                printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
                        "driver %u.\n", __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        /* Bump the MDS count */
        atomic_inc(&server->nfs_client->cl_mds_count);

        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}
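
/*
 * Illustration of the autoload convention above: LAYOUT_NFSV4_1_MODULE_PREFIX
 * is "nfs-layouttype4", so for the files layout type (id 1) request_module()
 * asks for "nfs-layouttype4-1".  Layout driver modules are expected to
 * declare a matching MODULE_ALIAS() so they can be loaded by layout type id.
 */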

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "NFS: %s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "NFS: %s Module with id %u already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        return ld->alloc_layout_hdr(ino, gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_server *server = NFS_SERVER(lo->plh_inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (!list_empty(&lo->plh_layouts)) {
                struct nfs_client *clp = server->nfs_client;

                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        put_rpccred(lo->plh_lc_cred);
        return ld->free_layout_hdr(lo);
}

static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        nfsi->layout = NULL;
        /* Reset MDS Threshold I/O counters */
        nfsi->write_io = 0;
        nfsi->read_io = 0;
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        pnfs_layoutreturn_before_put_layout_hdr(lo);

        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                if (!list_empty(&lo->plh_segs))
                        WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
                pnfs_free_layout_hdr(lo);
        }
}
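
/*
 * The atomic_dec_and_lock() above is the usual "free under lock" pattern:
 * it decrements plh_refcount and only takes i_lock when the count drops
 * to zero, so the final reference holder detaches the header from the
 * inode before freeing it, without serialising every put on i_lock.
 */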

/*
 * Mark a pnfs_layout_hdr and all associated layout segments as invalid
 *
 * In order to continue using the pnfs_layout_hdr, a full recovery
 * is required.
 * Note that caller must hold inode->i_lock.
 */
int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
                struct list_head *lseg_list)
{
        struct pnfs_layout_range range = {
                .iomode = IOMODE_ANY,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };

        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
        return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0);
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
        return iomode == IOMODE_RW ?
                NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        lo->plh_retry_timestamp = jiffies;
        if (!test_and_set_bit(fail_bit, &lo->plh_flags))
                atomic_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        if (test_and_clear_bit(fail_bit, &lo->plh_flags))
                atomic_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layout_range range = {
                .iomode = iomode,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        LIST_HEAD(head);

        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
        pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
                        iomode == IOMODE_RW ? "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        unsigned long start, end;
        int fail_bit = pnfs_iomode_to_fail_bit(iomode);

        if (test_bit(fail_bit, &lo->plh_flags) == 0)
                return false;
        end = jiffies;
        start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
        if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
                /* It is time to retry the failed layoutgets */
                pnfs_layout_clear_fail_bit(lo, fail_bit);
                return false;
        }
        return true;
}
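
/*
 * Worked example (illustrative timings): with HZ == 1000, a LAYOUTGET
 * failure at jiffies == 500000 records plh_retry_timestamp.  Any check
 * before jiffies reaches 620000 (PNFS_LAYOUTGET_RETRY_TIMEOUT == 120*HZ
 * later) finds the timestamp inside [end - timeout, end] and keeps
 * failing fast; after that the fail bit is cleared and LAYOUTGET is
 * retried.
 */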

static void
pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *range,
                const nfs4_stateid *stateid)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
        lseg->pls_range = *range;
        lseg->pls_seq = be32_to_cpu(stateid->seqid);
}

static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
        struct inode *ino = lseg->pls_layout->plh_inode;

        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
}

static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        struct inode *inode = lo->plh_inode;

        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
        atomic_dec(&lo->plh_refcount);
        if (list_empty(&lo->plh_segs)) {
                if (atomic_read(&lo->plh_outstanding) == 0)
                        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
        }
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;

        if (!lseg)
                return;

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));

        lo = lseg->pls_layout;
        inode = lo->plh_inode;

        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                        spin_unlock(&inode->i_lock);
                        return;
                }
                pnfs_get_layout_hdr(lo);
                pnfs_layout_remove_lseg(lo, lseg);
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg(lseg);
                pnfs_put_layout_hdr(lo);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

static void pnfs_free_lseg_async_work(struct work_struct *work)
{
        struct pnfs_layout_segment *lseg;
        struct pnfs_layout_hdr *lo;

        lseg = container_of(work, struct pnfs_layout_segment, pls_work);
        lo = lseg->pls_layout;

        pnfs_free_lseg(lseg);
        pnfs_put_layout_hdr(lo);
}

static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
{
        INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
        schedule_work(&lseg->pls_work);
}

void
pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
{
        if (!lseg)
                return;

        assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        if (atomic_dec_and_test(&lseg->pls_refcount)) {
                struct pnfs_layout_hdr *lo = lseg->pls_layout;
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
                        return;
                pnfs_get_layout_hdr(lo);
                pnfs_layout_remove_lseg(lo, lseg);
                pnfs_free_lseg_async(lseg);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);

static u64
end_offset(u64 start, u64 len)
{
        u64 end;

        end = start + len;
        return end >= start ? end : NFS4_MAX_UINT64;
}
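
/*
 * end_offset() saturates on u64 overflow: e.g. start == 4096 with
 * len == NFS4_MAX_UINT64 wraps around, so the result is clamped to
 * NFS4_MAX_UINT64, which the range helpers below treat as "to end of
 * file".
 */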

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
                 const struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (start1 <= start2) && (end1 >= end2);
}

/*
 * are l1 and l2 intersecting?
 *   start1                             end1
 *   [----------------------------------)
 *                              start2           end2
 *                              [----------------)
 */
static bool
pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
                    const struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
               (end2 == NFS4_MAX_UINT64 || end2 > start1);
}

static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                struct list_head *tmp_list)
{
        if (!atomic_dec_and_test(&lseg->pls_refcount))
                return false;
        pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
        list_add(&lseg->pls_list, tmp_list);
        return true;
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding io is finished.
                 */
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        atomic_read(&lseg->pls_refcount));
                if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
                        rv = 1;
        }
        return rv;
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
 */
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
        return (s32)(s1 - s2) > 0;
}
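
/*
 * Worked example: sequence ids compare via signed 32-bit difference, so
 * wraparound is handled naturally.  pnfs_seqid_is_newer(1, 0xffffffff)
 * computes (s32)(1 - 0xffffffff) == 2 > 0, i.e. seqid 1 is "newer" than
 * 0xffffffff, whereas a plain unsigned compare would get this wrong.
 */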

static bool
pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
                 const struct pnfs_layout_range *recall_range)
{
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
               pnfs_lseg_range_intersecting(lseg_range, recall_range);
}

static bool
pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
                const struct pnfs_layout_range *recall_range,
                u32 seq)
{
        if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
                return false;
        if (recall_range == NULL)
                return true;
        return pnfs_should_free_range(&lseg->pls_range, recall_range);
}

/**
 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
 * @lo: layout header containing the lsegs
 * @tmp_list: list head where doomed lsegs should go
 * @recall_range: optional recall range argument to match (may be NULL)
 * @seq: only invalidate lsegs obtained at or before this sequence (may be 0)
 *
 * Walk the list of lsegs in the layout header, and tear down any that should
 * be destroyed. If "recall_range" is specified then the segment must match
 * that range. If "seq" is non-zero, then only match segments that were handed
 * out at or before that sequence.
 *
 * Returns number of matching invalid lsegs remaining in list after scanning
 * it and purging them.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            const struct pnfs_layout_range *recall_range,
                            u32 seq)
{
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        if (list_empty(&lo->plh_segs))
                return 0;
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
                        dprintk("%s: freeing lseg %p iomode %d seq %u "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_seq,
                                lseg->pls_range.offset, lseg->pls_range.length);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
                }
        dprintk("%s:Return %i\n", __func__, remaining);
        return remaining;
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;

        if (list_empty(free_me))
                return;

        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                pnfs_free_lseg(lseg);
        }
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                pnfs_get_layout_hdr(lo);
                pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
                spin_unlock(&nfsi->vfs_inode.i_lock);
                pnfs_free_lseg_list(&tmp_list);
                pnfs_put_layout_hdr(lo);
        } else
                spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
                struct list_head *layout_list)
{
        struct pnfs_layout_hdr *lo;
        bool ret = false;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
                pnfs_get_layout_hdr(lo);
                list_add(&lo->plh_bulk_destroy, layout_list);
                ret = true;
        }
        spin_unlock(&inode->i_lock);
        return ret;
}

/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
                struct nfs_server *server,
                struct list_head *layout_list)
{
        struct pnfs_layout_hdr *lo, *next;
        struct inode *inode;

        list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
                inode = igrab(lo->plh_inode);
                if (inode == NULL)
                        continue;
                list_del_init(&lo->plh_layouts);
                if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
                        continue;
                rcu_read_unlock();
                spin_unlock(&clp->cl_lock);
                iput(inode);
                spin_lock(&clp->cl_lock);
                rcu_read_lock();
                return -EAGAIN;
        }
        return 0;
}
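
/*
 * Returning -EAGAIN above makes the callers restart their walk of
 * server->layouts: iput() can sleep, so cl_lock and the RCU read lock
 * must be dropped before calling it, and once they have been dropped
 * the list may have changed under us.
 */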

static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
                bool is_bulk_recall)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;
        LIST_HEAD(lseg_list);
        int ret = 0;

        while (!list_empty(layout_list)) {
                lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
                                plh_bulk_destroy);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                inode = lo->plh_inode;

                pnfs_layoutcommit_inode(inode, false);

                spin_lock(&inode->i_lock);
                list_del_init(&lo->plh_bulk_destroy);
                if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
                        if (is_bulk_recall)
                                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
                        ret = -EAGAIN;
                }
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&lseg_list);
                /* Free all lsegs that are attached to commit buckets */
                nfs_commit_inode(inode, 0);
                pnfs_put_layout_hdr(lo);
                iput(inode);
        }
        return ret;
}

int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
                struct nfs_fsid *fsid,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
                        continue;
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                server,
                                &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                        server,
                                        &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        nfs4_deviceid_mark_client_invalid(clp);
        nfs4_deviceid_purge_client(clp);

        pnfs_destroy_layouts_byclid(clp, false);
}

static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
        lo->plh_return_iomode = 0;
        lo->plh_return_seq = 0;
        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
}

/* update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
{
        u32 oldseq, newseq, new_barrier = 0;

        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);

        if (!pnfs_layout_is_valid(lo)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                lo->plh_barrier = newseq;
                pnfs_clear_layoutreturn_info(lo);
                clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
                return;
        }
        if (pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                /*
                 * Because of wraparound, we want to keep the barrier
                 * "close" to the current seqids.
                 */
                new_barrier = newseq - atomic_read(&lo->plh_outstanding);
        }
        if (update_barrier)
                new_barrier = be32_to_cpu(new->seqid);
        else if (new_barrier == 0)
                return;
        if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
                lo->plh_barrier = new_barrier;
}
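
/*
 * Illustrative barrier arithmetic: if the server returns seqid 10 while
 * 3 LAYOUTGETs are still outstanding, the barrier becomes 10 - 3 == 7,
 * so a recalled stateid carrying seqid <= 7 is treated as already
 * superseded (see pnfs_layout_stateid_blocked() below).
 */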

static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid)
{
        u32 seqid = be32_to_cpu(stateid->seqid);

        return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}

/* Return true if new LAYOUTGETs are currently blocked for this layout */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           nfs4_stateid *stateid,
           const struct pnfs_layout_range *range,
           long *timeout, gfp_t gfp_flags)
{
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        loff_t i_size;

        dprintk("--> %s\n", __func__);

        /*
         * Synchronously retrieve layout information from server and
         * store in lseg. If we race with a concurrent seqid morphing
         * op, then re-send the LAYOUTGET.
         */
        lgp = kzalloc(sizeof(*lgp), gfp_flags);
        if (lgp == NULL)
                return ERR_PTR(-ENOMEM);

        i_size = i_size_read(ino);

        lgp->args.minlength = PAGE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
        if (range->iomode == IOMODE_READ) {
                if (range->offset >= i_size)
                        lgp->args.minlength = 0;
                else if (i_size - range->offset < lgp->args.minlength)
                        lgp->args.minlength = i_size - range->offset;
        }
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        pnfs_copy_range(&lgp->args.range, range);
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        nfs4_stateid_copy(&lgp->args.stateid, stateid);
        lgp->gfp_flags = gfp_flags;
        lgp->cred = lo->plh_lc_cred;

        return nfs4_proc_layoutget(lgp, timeout, gfp_flags);
}

static void pnfs_clear_layoutcommit(struct inode *inode,
                struct list_head *head)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        struct pnfs_layout_segment *lseg, *tmp;

        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
                return;
        list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
                if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                        continue;
                pnfs_lseg_dec_and_remove_zero(lseg, head);
        }
}

void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
{
        clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
        smp_mb__after_atomic();
        wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}

static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
                nfs4_stateid *stateid,
                enum pnfs_iomode *iomode)
{
        /* Serialise LAYOUTGET/LAYOUTRETURN */
        if (atomic_read(&lo->plh_outstanding) != 0)
                return false;
        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                return false;
        pnfs_get_layout_hdr(lo);
        if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
                if (stateid != NULL) {
                        nfs4_stateid_copy(stateid, &lo->plh_stateid);
                        if (lo->plh_return_seq != 0)
                                stateid->seqid = cpu_to_be32(lo->plh_return_seq);
                }
                if (iomode != NULL)
                        *iomode = lo->plh_return_iomode;
                pnfs_clear_layoutreturn_info(lo);
                return true;
        }
        if (stateid != NULL)
                nfs4_stateid_copy(stateid, &lo->plh_stateid);
        if (iomode != NULL)
                *iomode = IOMODE_ANY;
        return true;
}

static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
                       enum pnfs_iomode iomode, bool sync)
{
        struct inode *ino = lo->plh_inode;
        struct nfs4_layoutreturn *lrp;
        int status = 0;

        lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                spin_lock(&ino->i_lock);
                pnfs_clear_layoutreturn_waitbit(lo);
                spin_unlock(&ino->i_lock);
                pnfs_put_layout_hdr(lo);
                goto out;
        }

        nfs4_stateid_copy(&lrp->args.stateid, stateid);
        lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
        lrp->args.inode = ino;
        lrp->args.range.iomode = iomode;
        lrp->args.range.offset = 0;
        lrp->args.range.length = NFS4_MAX_UINT64;
        lrp->args.layout = lo;
        lrp->clp = NFS_SERVER(ino)->nfs_client;
        lrp->cred = lo->plh_lc_cred;

        status = nfs4_proc_layoutreturn(lrp, sync);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *s;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return false;

        /* Defer layoutreturn until all lsegs are done */
        list_for_each_entry(s, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
                        return false;
        }

        return true;
}

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return;
        spin_lock(&inode->i_lock);
        if (pnfs_layout_need_return(lo)) {
                nfs4_stateid stateid;
                enum pnfs_iomode iomode;
                bool send;

                send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we don't deadlock */
                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
                }
        } else
                spin_unlock(&inode->i_lock);
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        LIST_HEAD(tmp_list);
        nfs4_stateid stateid;
        int status = 0, empty;
        bool send;

        dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
        pnfs_clear_layoutcommit(ino, &tmp_list);
        pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0);

        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
                        .iomode         = IOMODE_ANY,
                        .offset         = 0,
                        .length         = NFS4_MAX_UINT64,
                };
                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
        }

        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (empty) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out_put_layout_hdr;
        }

        send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        if (send)
                status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
        pnfs_put_layout_hdr(lo);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}
EXPORT_SYMBOL_GPL(_pnfs_return_layout);

int
pnfs_commit_and_return_layout(struct inode *inode)
{
        struct pnfs_layout_hdr *lo;
        int ret;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo == NULL) {
                spin_unlock(&inode->i_lock);
                return 0;
        }
        pnfs_get_layout_hdr(lo);
        /* Block new layoutgets and read/write to ds */
        lo->plh_block_lgets++;
        spin_unlock(&inode->i_lock);
        filemap_fdatawait(inode->i_mapping);
        ret = pnfs_layoutcommit_inode(inode, true);
        if (ret == 0)
                ret = _pnfs_return_layout(inode);
        spin_lock(&inode->i_lock);
        lo->plh_block_lgets--;
        spin_unlock(&inode->i_lock);
        pnfs_put_layout_hdr(lo);
        return ret;
}

bool pnfs_roc(struct inode *ino)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *tmp;
        nfs4_stateid stateid;
        LIST_HEAD(tmp_list);
        bool found = false, layoutreturn = false, roc = false;

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
                goto out_noroc;

        /* no roc if we hold a delegation */
        if (nfs4_check_delegation(ino, FMODE_READ))
                goto out_noroc;

        list_for_each_entry(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                /* Don't return layout if there is open file state */
                if (state != NULL && state->state != 0)
                        goto out_noroc;
        }

        /* always send layoutreturn if it has been requested */
        if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                layoutreturn = pnfs_prepare_layoutreturn(lo,
                                &stateid, NULL);

        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
                /* If we are sending layoutreturn, invalidate all valid lsegs */
                if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        mark_lseg_invalid(lseg, &tmp_list);
                        found = true;
                }
        /* ROC in two conditions:
         * 1. there are ROC lsegs
         * 2. we don't send layoutreturn
         */
        if (found && !layoutreturn) {
                /* lo ref dropped in pnfs_roc_release() */
                pnfs_get_layout_hdr(lo);
                roc = true;
        }

out_noroc:
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        pnfs_layoutcommit_inode(ino, true);
        if (layoutreturn)
                pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
        return roc;
}

void pnfs_roc_release(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        pnfs_clear_layoutreturn_waitbit(lo);
        if (atomic_dec_and_test(&lo->plh_refcount)) {
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&ino->i_lock);
                pnfs_free_layout_hdr(lo);
        } else
                spin_unlock(&ino->i_lock);
}

void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
        trace_nfs4_layoutreturn_on_close(ino, 0);
}

void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *lo;
        u32 current_seqid;

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        current_seqid = be32_to_cpu(lo->plh_stateid.seqid);

        /* Since close does not return a layout stateid for use as
         * a barrier, we choose the worst-case barrier.
         */
        *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
        spin_unlock(&ino->i_lock);
}
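
/*
 * Example of the worst case above (illustrative values): with current
 * seqid 5 and 2 LAYOUTGETs in flight, the computed barrier is 5 + 2 == 7,
 * since the in-flight requests may legitimately bump the layout stateid
 * up to that value before they complete.
 */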

bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *lo;
        bool sleep = false;

        /* We might not have grabbed a reference to lo, so we need to
         * check it under i_lock */
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                sleep = true;
        spin_unlock(&ino->i_lock);

        if (sleep)
                rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);

        return sleep;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
 */
static s64
pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
           const struct pnfs_layout_range *l2)
{
        s64 d;

        /* high offset > low offset */
        d = l1->offset - l2->offset;
        if (d)
                return d;

        /* short length > long length */
        d = l2->length - l1->length;
        if (d)
                return d;

        /* read > read/write */
        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
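
/*
 * Resulting sort order (illustrative): for ranges A = {0, 4096, RW} and
 * B = {0, 4096, READ}, equal offsets and lengths fall through to the
 * iomode test, which yields pnfs_lseg_range_cmp(A, B) < 0, so A sorts
 * ahead of B and RW segments are seen before READ segments, as the
 * comment above requires.
 */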

static bool
pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
                const struct pnfs_layout_range *l2)
{
        return pnfs_lseg_range_cmp(l1, l2) > 0;
}

static bool
pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
                struct pnfs_layout_segment *old)
{
        return false;
}

void
pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg,
                   bool (*is_after)(const struct pnfs_layout_range *,
                           const struct pnfs_layout_range *),
                   bool (*do_merge)(struct pnfs_layout_segment *,
                           struct pnfs_layout_segment *),
                   struct list_head *free_me)
{
        struct pnfs_layout_segment *lp, *tmp;

        dprintk("%s:Begin\n", __func__);

        list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
                        continue;
                if (do_merge(lseg, lp)) {
                        mark_lseg_invalid(lp, free_me);
                        continue;
                }
                if (is_after(&lseg->pls_range, &lp->pls_range))
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu before "
                        "lp %p iomode %d offset %llu length %llu\n",
                        __func__, lseg, lseg->pls_range.iomode,
                        lseg->pls_range.offset, lseg->pls_range.length,
                        lp, lp->pls_range.iomode, lp->pls_range.offset,
                        lp->pls_range.length);
                goto out;
        }
        list_add_tail(&lseg->pls_list, &lo->plh_segs);
        dprintk("%s: inserted lseg %p "
                "iomode %d offset %llu length %llu at tail\n",
                __func__, lseg, lseg->pls_range.iomode,
                lseg->pls_range.offset, lseg->pls_range.length);
out:
        pnfs_get_layout_hdr(lo);

        dprintk("%s:Return\n", __func__);
}
EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);

static void
pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg,
                   struct list_head *free_me)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;

        if (ld->add_lseg != NULL)
                ld->add_lseg(lo, lseg, free_me);
        else
                pnfs_generic_layout_insert_lseg(lo, lseg,
                                pnfs_lseg_range_is_after,
                                pnfs_lseg_no_merge,
                                free_me);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
                      struct nfs_open_context *ctx,
                      gfp_t gfp_flags)
{
        struct pnfs_layout_hdr *lo;

        lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
        if (!lo)
                return NULL;
        atomic_set(&lo->plh_refcount, 1);
        INIT_LIST_HEAD(&lo->plh_layouts);
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_destroy);
        lo->plh_inode = ino;
        lo->plh_lc_cred = get_rpccred(ctx->cred);
        lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
        return lo;
}

static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
                       struct nfs_open_context *ctx,
                       gfp_t gfp_flags)
        __releases(&ino->i_lock)
        __acquires(&ino->i_lock)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *new = NULL;

        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

        if (nfsi->layout != NULL)
                goto out_existing;
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
        spin_lock(&ino->i_lock);

        if (likely(nfsi->layout == NULL)) {     /* Won the race? */
                nfsi->layout = new;
                return new;
        } else if (new != NULL)
                pnfs_free_layout_hdr(new);
out_existing:
        pnfs_get_layout_hdr(nfsi->layout);
        return nfsi->layout;
}
/*
 * iomode matching rules:
 *
 * iomode       lseg    strict_iomode   match
 * ------       -----   -------------   -----
 * ANY          READ    N/A             true
 * ANY          RW      N/A             true
 * RW           READ    N/A             false
 * RW           RW      N/A             true
 * READ         READ    N/A             true
 * READ         RW      true            false
 * READ         RW      false           true
 */
static bool
pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
                 const struct pnfs_layout_range *range,
                 bool strict_iomode)
{
        struct pnfs_layout_range range1;

        if ((range->iomode == IOMODE_RW &&
             ls_range->iomode != IOMODE_RW) ||
            (range->iomode != ls_range->iomode &&
             strict_iomode) ||
            !pnfs_lseg_range_intersecting(ls_range, range))
                return false;

        /* range1 covers only the first byte in the range */
        range1 = *range;
        range1.length = 1;
        return pnfs_lseg_range_contained(ls_range, &range1);
}

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_range *range,
                bool strict_iomode)
{
        struct pnfs_layout_segment *lseg, *ret = NULL;

        dprintk("%s:Begin\n", __func__);

        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
                    pnfs_lseg_range_match(&lseg->pls_range, range,
                                          strict_iomode)) {
                        ret = pnfs_get_lseg(lseg);
                        break;
                }
        }

        dprintk("%s:Return lseg %p ref %d\n",
                __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
        return ret;
}

/*
 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
 * to the MDS or over pNFS.
 *
 * The nfs_inode read_io and write_io fields are cumulative counters reset
 * when there are no layout segments. Note that in pnfs_update_layout iomode
 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
 * WRITE request.
 *
 * A return of true means use MDS I/O.
 *
 * From RFC 5661:
 * If a file's size is smaller than the file size threshold, data accesses
 * SHOULD be sent to the metadata server.  If an I/O request has a length that
 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
 * server.  If both file size and I/O size are provided, the client SHOULD
 * reach or exceed both thresholds before sending its read or write
 * requests to the data server.
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
                                     struct inode *ino, int iomode)
{
        struct nfs4_threshold *t = ctx->mdsthreshold;
        struct nfs_inode *nfsi = NFS_I(ino);
        loff_t fsize = i_size_read(ino);
        bool size = false, size_set = false, io = false, io_set = false, ret = false;

        if (t == NULL)
                return ret;

        dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
                __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

        switch (iomode) {
        case IOMODE_READ:
                if (t->bm & THRESHOLD_RD) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->rd_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_RD_IO) {
                        dprintk("%s nfsi->read_io %llu\n", __func__,
                                nfsi->read_io);
                        io_set = true;
                        if (nfsi->read_io < t->rd_io_sz)
                                io = true;
                }
                break;
        case IOMODE_RW:
                if (t->bm & THRESHOLD_WR) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->wr_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_WR_IO) {
                        dprintk("%s nfsi->write_io %llu\n", __func__,
                                nfsi->write_io);
                        io_set = true;
                        if (nfsi->write_io < t->wr_io_sz)
                                io = true;
                }
                break;
        }
        if (size_set && io_set) {
                if (size && io)
                        ret = true;
        } else if (size || io)
                ret = true;

        dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
        return ret;
}
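
/*
 * Worked example (hypothetical hint values): if OPEN returned a hint with
 * THRESHOLD_RD set and rd_sz == 65536 while the file is 4096 bytes, a READ
 * hits the "fsize < t->rd_sz" case above, so the function returns true and
 * pnfs_update_layout() sends the I/O to the MDS instead of requesting a
 * layout.
 */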
1500
1501static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1502{
1503        /*
1504         * send layoutcommit as it can hold up layoutreturn due to lseg
1505         * reference
1506         */
1507        pnfs_layoutcommit_inode(lo->plh_inode, false);
1508        return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1509                                   nfs_wait_bit_killable,
1510                                   TASK_UNINTERRUPTIBLE);
1511}
1512
1513static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1514{
1515        unsigned long *bitlock = &lo->plh_flags;
1516
1517        clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1518        smp_mb__after_atomic();
1519        wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1520}
1521
1522/*
1523 * Layout segment is retreived from the server if not cached.
1524 * The appropriate layout segment is referenced and returned to the caller.
1525 */
1526struct pnfs_layout_segment *
1527pnfs_update_layout(struct inode *ino,
1528                   struct nfs_open_context *ctx,
1529                   loff_t pos,
1530                   u64 count,
1531                   enum pnfs_iomode iomode,
1532                   bool strict_iomode,
1533                   gfp_t gfp_flags)
1534{
1535        struct pnfs_layout_range arg = {
1536                .iomode = iomode,
1537                .offset = pos,
1538                .length = count,
1539        };
1540        unsigned pg_offset, seq;
1541        struct nfs_server *server = NFS_SERVER(ino);
1542        struct nfs_client *clp = server->nfs_client;
1543        struct pnfs_layout_hdr *lo = NULL;
1544        struct pnfs_layout_segment *lseg = NULL;
1545        nfs4_stateid stateid;
1546        long timeout = 0;
1547        unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
1548        bool first;
1549
1550        if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1551                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1552                                 PNFS_UPDATE_LAYOUT_NO_PNFS);
1553                goto out;
1554        }
1555
1556        if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
1557                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1558                                 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
1559                goto out;
1560        }
1561
1562        if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1563                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1564                                 PNFS_UPDATE_LAYOUT_MDSTHRESH);
1565                goto out;
1566        }
1567
1568lookup_again:
1569        nfs4_client_recover_expired_lease(clp);
1570        first = false;
1571        spin_lock(&ino->i_lock);
1572        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1573        if (lo == NULL) {
1574                spin_unlock(&ino->i_lock);
1575                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1576                                 PNFS_UPDATE_LAYOUT_NOMEM);
1577                goto out;
1578        }
1579
1580        /* Do we even need to bother with this? */
1581        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1582                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1583                                 PNFS_UPDATE_LAYOUT_BULK_RECALL);
1584                dprintk("%s matches recall, use MDS\n", __func__);
1585                goto out_unlock;
1586        }
1587
1588        /* If LAYOUTGET already failed once, we don't try again */
1589        if (pnfs_layout_io_test_failed(lo, iomode)) {
1590                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1591                                 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1592                goto out_unlock;
1593        }
1594
1595        lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
1596        if (lseg) {
1597                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1598                                PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1599                goto out_unlock;
1600        }
1601
1602        if (!nfs4_valid_open_stateid(ctx->state)) {
1603                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1604                                PNFS_UPDATE_LAYOUT_INVALID_OPEN);
1605                goto out_unlock;
1606        }
1607
1608        /*
1609         * Choose a stateid for the LAYOUTGET. If we don't have a layout
1610         * stateid, or it has been invalidated, then we must use the open
1611         * stateid.
1612         */
1613        if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
1615                /*
1616                 * The first layoutget for the file. Need to serialize per
1617                 * RFC 5661 Errata 3208.
1618                 */
1619                if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1620                                     &lo->plh_flags)) {
1621                        spin_unlock(&ino->i_lock);
1622                        wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1623                                    TASK_UNINTERRUPTIBLE);
1624                        pnfs_put_layout_hdr(lo);
1625                        dprintk("%s retrying\n", __func__);
1626                        goto lookup_again;
1627                }
1628
1629                first = true;
1630                do {
1631                        seq = read_seqbegin(&ctx->state->seqlock);
1632                        nfs4_stateid_copy(&stateid, &ctx->state->stateid);
1633                } while (read_seqretry(&ctx->state->seqlock, seq));
1634        } else {
1635                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
1636        }
1637
1638        /*
1639         * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1640         * for LAYOUTRETURN even if first is true.
1641         */
1642        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1643                spin_unlock(&ino->i_lock);
1644                dprintk("%s wait for layoutreturn\n", __func__);
1645                if (pnfs_prepare_to_retry_layoutget(lo)) {
1646                        if (first)
1647                                pnfs_clear_first_layoutget(lo);
1648                        pnfs_put_layout_hdr(lo);
1649                        dprintk("%s retrying\n", __func__);
1650                        trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1651                                        lseg, PNFS_UPDATE_LAYOUT_RETRY);
1652                        goto lookup_again;
1653                }
1654                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1655                                PNFS_UPDATE_LAYOUT_RETURN);
1656                goto out_put_layout_hdr;
1657        }
1658
1659        if (pnfs_layoutgets_blocked(lo)) {
1660                trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1661                                PNFS_UPDATE_LAYOUT_BLOCKED);
1662                goto out_unlock;
1663        }
1664        atomic_inc(&lo->plh_outstanding);
1665        spin_unlock(&ino->i_lock);
1666
1667        if (list_empty(&lo->plh_layouts)) {
1668                /* The lo must be on the clp list if there is any
1669                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1670                 */
1671                spin_lock(&clp->cl_lock);
1672                if (list_empty(&lo->plh_layouts))
1673                        list_add_tail(&lo->plh_layouts, &server->layouts);
1674                spin_unlock(&clp->cl_lock);
1675        }
1676
1677        pg_offset = arg.offset & ~PAGE_MASK;
1678        if (pg_offset) {
1679                arg.offset -= pg_offset;
1680                arg.length += pg_offset;
1681        }
1682        if (arg.length != NFS4_MAX_UINT64)
1683                arg.length = PAGE_ALIGN(arg.length);
1684
1685        lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
1686        trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
1687                                 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1688        atomic_dec(&lo->plh_outstanding);
1689        if (IS_ERR(lseg)) {
1690                switch (PTR_ERR(lseg)) {
1691                case -EBUSY:
1692                        if (time_after(jiffies, giveup))
1693                                lseg = NULL;
1694                        break;
1695                case -ERECALLCONFLICT:
1696                        /* Huh? We hold no layouts, how is there a recall? */
1697                        if (first) {
1698                                lseg = NULL;
1699                                break;
1700                        }
1701                        /* Destroy the existing layout and start over */
1702                        if (time_after(jiffies, giveup))
1703                                pnfs_destroy_layout(NFS_I(ino));
1704                        /* Fallthrough */
1705                case -EAGAIN:
1706                        break;
1707                default:
1708                        if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
1709                                pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1710                                lseg = NULL;
1711                        }
1712                        goto out_put_layout_hdr;
1713                }
1714                if (lseg) {
1715                        if (first)
1716                                pnfs_clear_first_layoutget(lo);
1717                        trace_pnfs_update_layout(ino, pos, count,
1718                                iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
1719                        pnfs_put_layout_hdr(lo);
1720                        goto lookup_again;
1721                }
1722        } else {
1723                pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
1724        }
1725
1726out_put_layout_hdr:
1727        if (first)
1728                pnfs_clear_first_layoutget(lo);
1729        pnfs_put_layout_hdr(lo);
1730out:
1731        dprintk("%s: inode %s/%llu pNFS layout segment %s for "
1732                        "(%s, offset: %llu, length: %llu)\n",
1733                        __func__, ino->i_sb->s_id,
1734                        (unsigned long long)NFS_FILEID(ino),
1735                        IS_ERR_OR_NULL(lseg) ? "not found" : "found",
1736                        iomode == IOMODE_RW ? "read/write" : "read-only",
1737                        (unsigned long long)pos,
1738                        (unsigned long long)count);
1739        return lseg;
1740out_unlock:
1741        spin_unlock(&ino->i_lock);
1742        goto out_put_layout_hdr;
1743}
1744EXPORT_SYMBOL_GPL(pnfs_update_layout);
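
/*
 * Illustrative caller (a sketch, not taken from this file; compare the
 * real callers pnfs_generic_pg_init_read/write below): look up a layout
 * segment for a buffered read and fall back to the MDS when none is
 * available.
 *
 *        struct pnfs_layout_segment *lseg;
 *
 *        lseg = pnfs_update_layout(inode, ctx, pos, count,
 *                                  IOMODE_READ, false, GFP_KERNEL);
 *        if (IS_ERR(lseg))
 *                return PTR_ERR(lseg);
 *        if (lseg == NULL)
 *                fall_back_to_mds();        <- hypothetical helper
 */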
1745
1746static bool
1747pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
1748{
1749        switch (range->iomode) {
1750        case IOMODE_READ:
1751        case IOMODE_RW:
1752                break;
1753        default:
1754                return false;
1755        }
1756        if (range->offset == NFS4_MAX_UINT64)
1757                return false;
1758        if (range->length == 0)
1759                return false;
1760        if (range->length != NFS4_MAX_UINT64 &&
1761            range->length > NFS4_MAX_UINT64 - range->offset)
1762                return false;
1763        return true;
1764}
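
/*
 * For example, the whole-file range { IOMODE_RW, .offset = 0,
 * .length = NFS4_MAX_UINT64 } passes, while a range whose offset plus
 * length would run past NFS4_MAX_UINT64 is rejected by the overflow
 * check above.
 */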
1765
1766struct pnfs_layout_segment *
1767pnfs_layout_process(struct nfs4_layoutget *lgp)
1768{
1769        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
1770        struct nfs4_layoutget_res *res = &lgp->res;
1771        struct pnfs_layout_segment *lseg;
1772        struct inode *ino = lo->plh_inode;
1773        LIST_HEAD(free_me);
1774
1775        if (!pnfs_sanity_check_layout_range(&res->range))
1776                return ERR_PTR(-EINVAL);
1777
1778        /* Inject layout blob into I/O device driver */
1779        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
1780        if (IS_ERR_OR_NULL(lseg)) {
1781                if (!lseg)
1782                        lseg = ERR_PTR(-ENOMEM);
1783
1784                dprintk("%s: Could not allocate layout: error %ld\n",
1785                       __func__, PTR_ERR(lseg));
1786                return lseg;
1787        }
1788
1789        pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
1790
1791        spin_lock(&ino->i_lock);
1792        if (pnfs_layoutgets_blocked(lo)) {
1793                dprintk("%s forget reply due to state\n", __func__);
1794                goto out_forget;
1795        }
1796
1797        if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1798                /* existing state ID, make sure the sequence number matches. */
1799                if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1800                        dprintk("%s forget reply due to sequence\n", __func__);
1801                        goto out_forget;
1802                }
1803                pnfs_set_layout_stateid(lo, &res->stateid, false);
1804        } else {
1805                /*
1806                 * We got an entirely new state ID.  Mark all segments for the
1807                 * inode invalid, and don't bother validating the stateid
1808                 * sequence number.
1809                 */
1810                pnfs_mark_layout_stateid_invalid(lo, &free_me);
1811
1812                pnfs_set_layout_stateid(lo, &res->stateid, true);
1813        }
1814
1815        pnfs_get_lseg(lseg);
1816        pnfs_layout_insert_lseg(lo, lseg, &free_me);
1817
1819        if (res->return_on_close)
1820                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
1821
1822        spin_unlock(&ino->i_lock);
1823        pnfs_free_lseg_list(&free_me);
1824        return lseg;
1825
1826out_forget:
1827        spin_unlock(&ino->i_lock);
1828        lseg->pls_layout = lo;
1829        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
1830        return ERR_PTR(-EAGAIN);
1831}
1832
1833static void
1834pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
1835                         u32 seq)
1836{
1837        if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
1838                iomode = IOMODE_ANY;
1839        lo->plh_return_iomode = iomode;
1840        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
1841        if (seq != 0) {
1842                WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
1843                lo->plh_return_seq = seq;
1844        }
1845}
1846
1847/**
1848 * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
1849 * @lo: pointer to layout header
1850 * @tmp_list: list header to be used with pnfs_free_lseg_list()
1851 * @return_range: describe layout segment ranges to be returned
 * @seq: stateid seqid to match
1852 *
1853 * This function is mainly intended for use by layoutrecall. It attempts
1854 * to free the layout segment immediately, or else to mark it for return
1855 * as soon as its reference count drops to zero.
 *
 * Returns the number of matching lsegs that were still in use and could
 * therefore only be marked for later return.
1856 */
1857int
1858pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1859                                struct list_head *tmp_list,
1860                                const struct pnfs_layout_range *return_range,
1861                                u32 seq)
1862{
1863        struct pnfs_layout_segment *lseg, *next;
1864        int remaining = 0;
1865
1866        dprintk("%s:Begin lo %p\n", __func__, lo);
1867
1868        if (list_empty(&lo->plh_segs))
1869                return 0;
1870
1871        assert_spin_locked(&lo->plh_inode->i_lock);
1872
1873        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1874                if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
1875                        dprintk("%s: marking lseg %p iomode %d "
1876                                "offset %llu length %llu\n", __func__,
1877                                lseg, lseg->pls_range.iomode,
1878                                lseg->pls_range.offset,
1879                                lseg->pls_range.length);
1880                        if (mark_lseg_invalid(lseg, tmp_list))
1881                                continue;
1882                        remaining++;
1883                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1884                }
1885
1886        if (remaining)
1887                pnfs_set_plh_return_info(lo, return_range->iomode, seq);
1888
1889        return remaining;
1890}
1891
1892void pnfs_error_mark_layout_for_return(struct inode *inode,
1893                                       struct pnfs_layout_segment *lseg)
1894{
1895        struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1896        struct pnfs_layout_range range = {
1897                .iomode = lseg->pls_range.iomode,
1898                .offset = 0,
1899                .length = NFS4_MAX_UINT64,
1900        };
1901        LIST_HEAD(free_me);
1902        bool return_now = false;
1903
1904        spin_lock(&inode->i_lock);
1905        pnfs_set_plh_return_info(lo, range.iomode, 0);
1906        /*
1907         * mark all matching lsegs so that we are sure to have no live
1908         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1909         * for how it works.
1910         */
1911        if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
1912                nfs4_stateid stateid;
1913                enum pnfs_iomode iomode;
1914
1915                return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
1916                spin_unlock(&inode->i_lock);
1917                if (return_now)
1918                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
1919        } else {
1920                spin_unlock(&inode->i_lock);
1921                nfs_commit_inode(inode, 0);
1922        }
1923        pnfs_free_lseg_list(&free_me);
1924}
1925EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
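
/*
 * Layout drivers typically invoke this from their I/O error paths.  A
 * hypothetical sketch:
 *
 *        if (data_server_io_failed)
 *                pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
 */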
1926
1927void
1928pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1929{
1930        u64 rd_size = req->wb_bytes;
1931
1932        if (pgio->pg_lseg == NULL) {
1933                if (pgio->pg_dreq == NULL)
1934                        rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1935                else
1936                        rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1937
1938                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1939                                                   req->wb_context,
1940                                                   req_offset(req),
1941                                                   rd_size,
1942                                                   IOMODE_READ,
1943                                                   false,
1944                                                   GFP_KERNEL);
1945                if (IS_ERR(pgio->pg_lseg)) {
1946                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
1947                        pgio->pg_lseg = NULL;
1948                        return;
1949                }
1950        }
1951        /* If no lseg, fall back to read through mds */
1952        if (pgio->pg_lseg == NULL)
1953                nfs_pageio_reset_read_mds(pgio);
1955}
1956EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1957
1958void
1959pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1960                           struct nfs_page *req, u64 wb_size)
1961{
1962        if (pgio->pg_lseg == NULL) {
1963                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1964                                                   req->wb_context,
1965                                                   req_offset(req),
1966                                                   wb_size,
1967                                                   IOMODE_RW,
1968                                                   false,
1969                                                   GFP_NOFS);
1970                if (IS_ERR(pgio->pg_lseg)) {
1971                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
1972                        pgio->pg_lseg = NULL;
1973                        return;
1974                }
1975        }
1976        /* If no lseg, fall back to write through mds */
1977        if (pgio->pg_lseg == NULL)
1978                nfs_pageio_reset_write_mds(pgio);
1979}
1980EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
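
/*
 * Note the allocation flags above: the read path may use GFP_KERNEL,
 * while the write path passes GFP_NOFS because it can be entered from
 * writeback, where recursing into filesystem reclaim could deadlock.
 */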
1981
1982void
1983pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1984{
1985        if (desc->pg_lseg) {
1986                pnfs_put_lseg(desc->pg_lseg);
1987                desc->pg_lseg = NULL;
1988        }
1989}
1990EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1991
1992/*
1993 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1994 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1995 */
1996size_t
1997pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
1998                     struct nfs_page *prev, struct nfs_page *req)
1999{
2000        unsigned int size;
2001        u64 seg_end, req_start, seg_left;
2002
2003        size = nfs_generic_pg_test(pgio, prev, req);
2004        if (!size)
2005                return 0;
2006
2007        /*
2008         * 'size' contains the number of bytes left in the current page (up
2009         * to the original size asked for in @req->wb_bytes).
2010         *
2011         * Calculate how many bytes are left in the layout segment
2012         * and if there are fewer bytes than 'size', return that instead.
2013         *
2014         * Note that end_offset() actually returns the offset of the
2015         * first byte that lies outside the pnfs_layout_range. FIXME?
2016         */
2018        if (pgio->pg_lseg) {
2019                seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
2020                                     pgio->pg_lseg->pls_range.length);
2021                req_start = req_offset(req);
2022                WARN_ON_ONCE(req_start >= seg_end);
2023                /* start of request is past the last byte of this segment */
2024                if (req_start >= seg_end) {
2025                        /* reference the new lseg */
2026                        if (pgio->pg_ops->pg_cleanup)
2027                                pgio->pg_ops->pg_cleanup(pgio);
2028                        if (pgio->pg_ops->pg_init)
2029                                pgio->pg_ops->pg_init(pgio, req);
2030                        return 0;
2031                }
2032
2033                /* adjust 'size' iff there are fewer bytes left in the
2034                 * segment than what nfs_generic_pg_test returned */
2035                seg_left = seg_end - req_start;
2036                if (seg_left < size)
2037                        size = (unsigned int)seg_left;
2038        }
2039
2040        return size;
2041}
2042EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
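
/*
 * Worked example (hypothetical numbers): with a layout segment covering
 * [0, 1048576) and a request starting at offset 1046528, seg_left is
 * 2048.  If nfs_generic_pg_test() allowed 4096 bytes, the return value
 * is clamped to 2048 so the coalesced I/O never crosses the segment
 * boundary.
 */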
2043
2044int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
2045{
2046        struct nfs_pageio_descriptor pgio;
2047
2048        /* Resend all requests through the MDS */
2049        nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
2050                              hdr->completion_ops);
2051        set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
2052        return nfs_pageio_resend(&pgio, hdr);
2053}
2054EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
2055
2056static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
2057{
2059        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
2060        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2061            PNFS_LAYOUTRET_ON_ERROR) {
2062                pnfs_return_layout(hdr->inode);
2063        }
2064        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2065                hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
2066}
2067
2068/*
2069 * Called by non-RPC-based layout drivers
2070 */
2071void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
2072{
2073        if (likely(!hdr->pnfs_error)) {
2074                pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
2075                                hdr->mds_offset + hdr->res.count);
2076                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2077        }
2078        trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
2079        if (unlikely(hdr->pnfs_error))
2080                pnfs_ld_handle_write_error(hdr);
2081        hdr->mds_ops->rpc_release(hdr);
2082}
2083EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
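
/*
 * A non-RPC-based layout driver would call pnfs_ld_write_done() from its
 * own completion path, roughly (hypothetical sketch):
 *
 *        static void example_write_end_io(struct nfs_pgio_header *hdr, int err)
 *        {
 *                if (err)
 *                        hdr->pnfs_error = err;
 *                pnfs_ld_write_done(hdr);
 *        }
 */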
2084
2085static void
2086pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
2087                struct nfs_pgio_header *hdr)
2088{
2089        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2090
2091        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2092                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2093                nfs_pageio_reset_write_mds(desc);
2094                mirror->pg_recoalesce = 1;
2095        }
2096        nfs_pgio_data_destroy(hdr);
2097        hdr->release(hdr);
2098}
2099
2100static enum pnfs_try_status
2101pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
2102                        const struct rpc_call_ops *call_ops,
2103                        struct pnfs_layout_segment *lseg,
2104                        int how)
2105{
2106        struct inode *inode = hdr->inode;
2107        enum pnfs_try_status trypnfs;
2108        struct nfs_server *nfss = NFS_SERVER(inode);
2109
2110        hdr->mds_ops = call_ops;
2111
2112        dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
2113                inode->i_ino, hdr->args.count, hdr->args.offset, how);
2114        trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
2115        if (trypnfs != PNFS_NOT_ATTEMPTED)
2116                nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
2117        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2118        return trypnfs;
2119}
2120
2121static void
2122pnfs_do_write(struct nfs_pageio_descriptor *desc,
2123              struct nfs_pgio_header *hdr, int how)
2124{
2125        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2126        struct pnfs_layout_segment *lseg = desc->pg_lseg;
2127        enum pnfs_try_status trypnfs;
2128
2129        trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
2130        if (trypnfs == PNFS_NOT_ATTEMPTED)
2131                pnfs_write_through_mds(desc, hdr);
2132}
2133
2134static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
2135{
2136        pnfs_put_lseg(hdr->lseg);
2137        nfs_pgio_header_free(hdr);
2138}
2139
2140int
2141pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
2142{
2143        struct nfs_pgio_header *hdr;
2144        int ret;
2145
2146        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2147        if (!hdr) {
2148                desc->pg_error = -ENOMEM;
2149                return desc->pg_error;
2150        }
2151        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
2152
2153        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2154        ret = nfs_generic_pgio(desc, hdr);
2155        if (!ret)
2156                pnfs_do_write(desc, hdr, desc->pg_ioflags);
2157
2158        return ret;
2159}
2160EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2161
2162int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
2163{
2164        struct nfs_pageio_descriptor pgio;
2165
2166        /* Resend all requests through the MDS */
2167        nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
2168        return nfs_pageio_resend(&pgio, hdr);
2169}
2170EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2171
2172static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
2173{
2174        dprintk("pnfs read error = %d\n", hdr->pnfs_error);
2175        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2176            PNFS_LAYOUTRET_ON_ERROR) {
2177                pnfs_return_layout(hdr->inode);
2178        }
2179        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2180                hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
2181}
2182
2183/*
2184 * Called by non-RPC-based layout drivers
2185 */
2186void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
2187{
2188        if (likely(!hdr->pnfs_error)) {
2189                __nfs4_read_done_cb(hdr);
2190                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2191        }
2192        trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
2193        if (unlikely(hdr->pnfs_error))
2194                pnfs_ld_handle_read_error(hdr);
2195        hdr->mds_ops->rpc_release(hdr);
2196}
2197EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
2198
2199static void
2200pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
2201                struct nfs_pgio_header *hdr)
2202{
2203        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2204
2205        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2206                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2207                nfs_pageio_reset_read_mds(desc);
2208                mirror->pg_recoalesce = 1;
2209        }
2210        nfs_pgio_data_destroy(hdr);
2211        hdr->release(hdr);
2212}
2213
2214/*
2215 * Call the appropriate parallel I/O subsystem read function.
2216 */
2217static enum pnfs_try_status
2218pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
2219                       const struct rpc_call_ops *call_ops,
2220                       struct pnfs_layout_segment *lseg)
2221{
2222        struct inode *inode = hdr->inode;
2223        struct nfs_server *nfss = NFS_SERVER(inode);
2224        enum pnfs_try_status trypnfs;
2225
2226        hdr->mds_ops = call_ops;
2227
2228        dprintk("%s: Reading ino:%lu %u@%llu\n",
2229                __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
2230
2231        trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
2232        if (trypnfs != PNFS_NOT_ATTEMPTED)
2233                nfs_inc_stats(inode, NFSIOS_PNFS_READ);
2234        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2235        return trypnfs;
2236}
2237
2238/* Resend all requests through pnfs. */
2239void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2240{
2241        struct nfs_pageio_descriptor pgio;
2242
2243        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2244                nfs_pageio_init_read(&pgio, hdr->inode, false,
2245                                        hdr->completion_ops);
2246                hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
2247        }
2248}
2249EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2250
2251static void
2252pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
2253{
2254        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2255        struct pnfs_layout_segment *lseg = desc->pg_lseg;
2256        enum pnfs_try_status trypnfs;
2257
2258        trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
2259        if (trypnfs == PNFS_TRY_AGAIN)
2260                pnfs_read_resend_pnfs(hdr);
2261        if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status)
2262                pnfs_read_through_mds(desc, hdr);
2263}
2264
2265static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
2266{
2267        pnfs_put_lseg(hdr->lseg);
2268        nfs_pgio_header_free(hdr);
2269}
2270
2271int
2272pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
2273{
2274        struct nfs_pgio_header *hdr;
2275        int ret;
2276
2277        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2278        if (!hdr) {
2279                desc->pg_error = -ENOMEM;
2280                return desc->pg_error;
2281        }
2282        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
2283        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2284        ret = nfs_generic_pgio(desc, hdr);
2285        if (!ret)
2286                pnfs_do_read(desc, hdr);
2287        return ret;
2288}
2289EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
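
/*
 * The generic pg_* helpers above are wired into a layout driver through
 * its struct nfs_pageio_ops.  A sketch of a read-side table (real
 * drivers usually wrap pg_init with their own setup):
 *
 *        static const struct nfs_pageio_ops example_pg_read_ops = {
 *                .pg_init        = pnfs_generic_pg_init_read,
 *                .pg_test        = pnfs_generic_pg_test,
 *                .pg_doio        = pnfs_generic_pg_readpages,
 *                .pg_cleanup     = pnfs_generic_pg_cleanup,
 *        };
 */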
2290
2291static void pnfs_clear_layoutcommitting(struct inode *inode)
2292{
2293        unsigned long *bitlock = &NFS_I(inode)->flags;
2294
2295        clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
2296        smp_mb__after_atomic();
2297        wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
2298}
2299
2300/*
2301 * There can be multiple RW segments.
2302 */
2303static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
2304{
2305        struct pnfs_layout_segment *lseg;
2306
2307        list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
2308                if (lseg->pls_range.iomode == IOMODE_RW &&
2309                    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
2310                        list_add(&lseg->pls_lc_list, listp);
2311        }
2312}
2313
2314static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
2315{
2316        struct pnfs_layout_segment *lseg, *tmp;
2317
2318        /* Matched by references in pnfs_set_layoutcommit */
2319        list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
2320                list_del_init(&lseg->pls_lc_list);
2321                pnfs_put_lseg(lseg);
2322        }
2323
2324        pnfs_clear_layoutcommitting(inode);
2325}
2326
2327void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
2328{
2329        pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
2330}
2331EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
2332
2333void
2334pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
2335                loff_t end_pos)
2336{
2337        struct nfs_inode *nfsi = NFS_I(inode);
2338        bool mark_as_dirty = false;
2339
2340        spin_lock(&inode->i_lock);
2341        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
2342                nfsi->layout->plh_lwb = end_pos;
2343                mark_as_dirty = true;
2344                dprintk("%s: Set layoutcommit for inode %lu ",
2345                        __func__, inode->i_ino);
2346        } else if (end_pos > nfsi->layout->plh_lwb)
2347                nfsi->layout->plh_lwb = end_pos;
2348        if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
2349                /* references matched in nfs4_layoutcommit_release */
2350                pnfs_get_lseg(lseg);
2351        }
2352        spin_unlock(&inode->i_lock);
2353        dprintk("%s: lseg %p end_pos %llu\n",
2354                __func__, lseg, nfsi->layout->plh_lwb);
2355
2356        /* If pnfs_layoutcommit_inode() ran between the inode locks above,
2357         * the layoutcommit triggered here will be a noop because
2358         * NFS_INO_LAYOUTCOMMIT will not be set */
2358        if (mark_as_dirty)
2359                mark_inode_dirty_sync(inode);
2360}
2361EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
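
/*
 * For example, a layout driver that just finished writing bytes
 * [4096, 8192) calls pnfs_set_layoutcommit(inode, lseg, 8192); plh_lwb
 * then records 8192 as the last write boundary, and args.lastbytewritten
 * in pnfs_layoutcommit_inode() below becomes end_pos - 1 = 8191.
 */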
2362
2363void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
2364{
2365        struct nfs_server *nfss = NFS_SERVER(data->args.inode);
2366
2367        if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
2368                nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
2369        pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
2370}
2371
2372/*
2373 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
2374 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
2375 * data to disk to allow the server to recover the data if it crashes.
2376 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
2377 * is off, and a COMMIT is sent to a data server, or
2378 * if WRITEs to a data server return NFS_DATA_SYNC.
2379 */
2380int
2381pnfs_layoutcommit_inode(struct inode *inode, bool sync)
2382{
2383        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2384        struct nfs4_layoutcommit_data *data;
2385        struct nfs_inode *nfsi = NFS_I(inode);
2386        loff_t end_pos;
2387        int status;
2388
2389        if (!pnfs_layoutcommit_outstanding(inode))
2390                return 0;
2391
2392        dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
2393
2394        status = -EAGAIN;
2395        if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
2396                if (!sync)
2397                        goto out;
2398                status = wait_on_bit_lock_action(&nfsi->flags,
2399                                NFS_INO_LAYOUTCOMMITTING,
2400                                nfs_wait_bit_killable,
2401                                TASK_KILLABLE);
2402                if (status)
2403                        goto out;
2404        }
2405
2406        status = -ENOMEM;
2407        /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
2408        data = kzalloc(sizeof(*data), GFP_NOFS);
2409        if (!data)
2410                goto clear_layoutcommitting;
2411
2412        status = 0;
2413        spin_lock(&inode->i_lock);
2414        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
2415                goto out_unlock;
2416
2417        INIT_LIST_HEAD(&data->lseg_list);
2418        pnfs_list_write_lseg(inode, &data->lseg_list);
2419
2420        end_pos = nfsi->layout->plh_lwb;
2421
2422        nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
2423        spin_unlock(&inode->i_lock);
2424
2425        data->args.inode = inode;
2426        data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
2427        nfs_fattr_init(&data->fattr);
2428        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
2429        data->res.fattr = &data->fattr;
2430        if (end_pos != 0)
2431                data->args.lastbytewritten = end_pos - 1;
2432        else
2433                data->args.lastbytewritten = U64_MAX;
2434        data->res.server = NFS_SERVER(inode);
2435
2436        if (ld->prepare_layoutcommit) {
2437                status = ld->prepare_layoutcommit(&data->args);
2438                if (status) {
2439                        put_rpccred(data->cred);
2440                        spin_lock(&inode->i_lock);
2441                        set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
2442                        if (end_pos > nfsi->layout->plh_lwb)
2443                                nfsi->layout->plh_lwb = end_pos;
2444                        goto out_unlock;
2445                }
2446        }
2447
2449        status = nfs4_proc_layoutcommit(data, sync);
2450out:
2451        if (status)
2452                mark_inode_dirty_sync(inode);
2453        dprintk("<-- %s status %d\n", __func__, status);
2454        return status;
2455out_unlock:
2456        spin_unlock(&inode->i_lock);
2457        kfree(data);
2458clear_layoutcommitting:
2459        pnfs_clear_layoutcommitting(inode);
2460        goto out;
2461}
2462EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
2463
2464int
2465pnfs_generic_sync(struct inode *inode, bool datasync)
2466{
2467        return pnfs_layoutcommit_inode(inode, true);
2468}
2469EXPORT_SYMBOL_GPL(pnfs_generic_sync);
2470
2471struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
2472{
2473        struct nfs4_threshold *thp;
2474
2475        thp = kzalloc(sizeof(*thp), GFP_NOFS);
2476        if (!thp) {
2477                dprintk("%s mdsthreshold allocation failed\n", __func__);
2478                return NULL;
2479        }
2480        return thp;
2481}
2482
2483#if IS_ENABLED(CONFIG_NFS_V4_2)
2484int
2485pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2486{
2487        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2488        struct nfs_server *server = NFS_SERVER(inode);
2489        struct nfs_inode *nfsi = NFS_I(inode);
2490        struct nfs42_layoutstat_data *data;
2491        struct pnfs_layout_hdr *hdr;
2492        int status = 0;
2493
2494        if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
2495                goto out;
2496
2497        if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
2498                goto out;
2499
2500        if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
2501                goto out;
2502
2503        spin_lock(&inode->i_lock);
2504        if (!NFS_I(inode)->layout) {
2505                spin_unlock(&inode->i_lock);
2506                goto out_clear_layoutstats;
2507        }
2508        hdr = NFS_I(inode)->layout;
2509        pnfs_get_layout_hdr(hdr);
2510        spin_unlock(&inode->i_lock);
2511
2512        data = kzalloc(sizeof(*data), gfp_flags);
2513        if (!data) {
2514                status = -ENOMEM;
2515                goto out_put;
2516        }
2517
2518        data->args.fh = NFS_FH(inode);
2519        data->args.inode = inode;
2520        status = ld->prepare_layoutstats(&data->args);
2521        if (status)
2522                goto out_free;
2523
2524        status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
2525
2526out:
2527        dprintk("%s returns %d\n", __func__, status);
2528        return status;
2529
2530out_free:
2531        kfree(data);
2532out_put:
2533        pnfs_put_layout_hdr(hdr);
2534out_clear_layoutstats:
2535        smp_mb__before_atomic();
2536        clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
2537        smp_mb__after_atomic();
2538        goto out;
2539}
2540EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
2541#endif
2542
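/*
 * Interval between LAYOUTSTATS reports, consumed by layout drivers;
 * 0 selects the driver's default interval.
 */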
2543unsigned int layoutstats_timer;
2544module_param(layoutstats_timer, uint, 0644);
2545EXPORT_SYMBOL_GPL(layoutstats_timer);
2546