linux/fs/nfs/pnfs.c
/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
#include "nfs42.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        if (local != NULL && !try_module_get(local->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                local = NULL;
        }
        spin_unlock(&pnfs_spinlock);
        return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                if (nfss->pnfs_curr_ld->clear_layoutdriver)
                        nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                /* Decrement the MDS count. Purge the deviceid cache if zero */
                if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
                        nfs4_deviceid_purge_client(nfss->nfs_client);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
                      u32 id)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;

        if (id == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
                        __func__, id, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }
        ld_type = find_pnfs_driver(id);
        if (!ld_type) {
                request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        dprintk("%s: No pNFS module found for %u.\n",
                                __func__, id);
                        goto out_no_driver;
                }
        }
        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver
            && ld_type->set_layoutdriver(server, mntfh)) {
                printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
                        "driver %u.\n", __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        /* Bump the MDS count */
        atomic_inc(&server->nfs_client->cl_mds_count);

        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "NFS: %s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
        return ld->alloc_layout_hdr(ino, gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_server *server = NFS_SERVER(lo->plh_inode);
        struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

        if (!list_empty(&lo->plh_layouts)) {
                struct nfs_client *clp = server->nfs_client;

                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
                spin_unlock(&clp->cl_lock);
        }
        put_rpccred(lo->plh_lc_cred);
        return ld->free_layout_hdr(lo);
}

static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        nfsi->layout = NULL;
        /* Reset MDS Threshold I/O counters */
        nfsi->write_io = 0;
        nfsi->read_io = 0;
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        pnfs_layoutreturn_before_put_layout_hdr(lo);

        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                if (!list_empty(&lo->plh_segs))
                        WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
                pnfs_free_layout_hdr(lo);
        }
}
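
/*
 * Worked example: with plh_refcount at 1, pnfs_put_layout_hdr() drops the
 * count to zero under inode->i_lock (atomic_dec_and_lock() only takes the
 * lock when the count would reach zero), detaches the header from the
 * inode, and frees it; at any higher count the put is lock-free.
 */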

/*
 * Mark a pnfs_layout_hdr and all associated layout segments as invalid
 *
 * In order to continue using the pnfs_layout_hdr, a full recovery
 * is required.
 * Note that caller must hold inode->i_lock.
 */
static int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
                struct list_head *lseg_list)
{
        struct pnfs_layout_range range = {
                .iomode = IOMODE_ANY,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };

        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
        return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
        return iomode == IOMODE_RW ?
                NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        lo->plh_retry_timestamp = jiffies;
        if (!test_and_set_bit(fail_bit, &lo->plh_flags))
                atomic_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
        if (test_and_clear_bit(fail_bit, &lo->plh_flags))
                atomic_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layout_range range = {
                .iomode = iomode,
                .offset = 0,
                .length = NFS4_MAX_UINT64,
        };
        LIST_HEAD(head);

        spin_lock(&inode->i_lock);
        pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
        pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
        spin_unlock(&inode->i_lock);
        pnfs_free_lseg_list(&head);
        dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
                        iomode == IOMODE_RW ?  "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
        unsigned long start, end;
        int fail_bit = pnfs_iomode_to_fail_bit(iomode);

        if (test_bit(fail_bit, &lo->plh_flags) == 0)
                return false;
        end = jiffies;
        start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
        if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
                /* It is time to retry the failed layoutgets */
                pnfs_layout_clear_fail_bit(lo, fail_bit);
                return false;
        }
        return true;
}
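
/*
 * Worked example: once a failed LAYOUTGET sets the fail bit, every call
 * within PNFS_LAYOUTGET_RETRY_TIMEOUT (120 * HZ, i.e. two minutes) of
 * plh_retry_timestamp returns true and I/O falls back to the MDS; the
 * first call after that window clears the bit so LAYOUTGET can be retried.
 */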

static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        INIT_LIST_HEAD(&lseg->pls_lc_list);
        atomic_set(&lseg->pls_refcount, 1);
        smp_mb();
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
}

static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
        struct inode *ino = lseg->pls_layout->plh_inode;

        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
}

static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_segment *lseg)
{
        struct inode *inode = lo->plh_inode;

        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
        atomic_dec(&lo->plh_refcount);
        if (list_empty(&lo->plh_segs))
                clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
        rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;

        if (!lseg)
                return;

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));

        lo = lseg->pls_layout;
        inode = lo->plh_inode;

        if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                        spin_unlock(&inode->i_lock);
                        return;
                }
                pnfs_get_layout_hdr(lo);
                pnfs_layout_remove_lseg(lo, lseg);
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg(lseg);
                pnfs_put_layout_hdr(lo);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

static void pnfs_free_lseg_async_work(struct work_struct *work)
{
        struct pnfs_layout_segment *lseg;
        struct pnfs_layout_hdr *lo;

        lseg = container_of(work, struct pnfs_layout_segment, pls_work);
        lo = lseg->pls_layout;

        pnfs_free_lseg(lseg);
        pnfs_put_layout_hdr(lo);
}

static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
{
        INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
        schedule_work(&lseg->pls_work);
}

void
pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
{
        if (!lseg)
                return;

        assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);

        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        if (atomic_dec_and_test(&lseg->pls_refcount)) {
                struct pnfs_layout_hdr *lo = lseg->pls_layout;
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
                        return;
                pnfs_get_layout_hdr(lo);
                pnfs_layout_remove_lseg(lo, lseg);
                pnfs_free_lseg_async(lseg);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);

static u64
end_offset(u64 start, u64 len)
{
        u64 end;

        end = start + len;
        return end >= start ? end : NFS4_MAX_UINT64;
}
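
/*
 * Worked example: end_offset(4096, NFS4_MAX_UINT64) wraps past 2^64, so
 * the sum is smaller than start and the result is clamped to
 * NFS4_MAX_UINT64, i.e. the range is treated as extending to end of file.
 */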

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
                 const struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (start1 <= start2) && (end1 >= end2);
}

/*
 * do l1 and l2 intersect?
 *   start1                             end1
 *   [----------------------------------)
 *                              start2           end2
 *                              [----------------)
 */
static bool
pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
                    const struct pnfs_layout_range *l2)
{
        u64 start1 = l1->offset;
        u64 end1 = end_offset(start1, l1->length);
        u64 start2 = l2->offset;
        u64 end2 = end_offset(start2, l2->length);

        return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
               (end2 == NFS4_MAX_UINT64 || end2 > start1);
}

static bool
should_free_lseg(const struct pnfs_layout_range *lseg_range,
                 const struct pnfs_layout_range *recall_range)
{
        return (recall_range->iomode == IOMODE_ANY ||
                lseg_range->iomode == recall_range->iomode) &&
               pnfs_lseg_range_intersecting(lseg_range, recall_range);
}

static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
                struct list_head *tmp_list)
{
        if (!atomic_dec_and_test(&lseg->pls_refcount))
                return false;
        pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
        list_add(&lseg->pls_list, tmp_list);
        return true;
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding io is finished.
                 */
                dprintk("%s: lseg %p ref %d\n", __func__, lseg,
                        atomic_read(&lseg->pls_refcount));
                if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
                        rv = 1;
        }
        return rv;
}

/* Returns the number of matching invalid lsegs remaining in the list
 * after the call.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            const struct pnfs_layout_range *recall_range)
{
        struct pnfs_layout_segment *lseg, *next;
        int remaining = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        if (list_empty(&lo->plh_segs))
                return 0;
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (!recall_range ||
                    should_free_lseg(&lseg->pls_range, recall_range)) {
                        dprintk("%s: freeing lseg %p iomode %d "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
                                lseg->pls_range.length);
                        if (!mark_lseg_invalid(lseg, tmp_list))
                                remaining++;
                }
        dprintk("%s:Return %i\n", __func__, remaining);
        return remaining;
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;

        if (list_empty(free_me))
                return;

        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                pnfs_free_lseg(lseg);
        }
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                pnfs_get_layout_hdr(lo);
                pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
                pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
                spin_unlock(&nfsi->vfs_inode.i_lock);
                pnfs_free_lseg_list(&tmp_list);
                pnfs_put_layout_hdr(lo);
        } else
                spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
                struct list_head *layout_list)
{
        struct pnfs_layout_hdr *lo;
        bool ret = false;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
                pnfs_get_layout_hdr(lo);
                list_add(&lo->plh_bulk_destroy, layout_list);
                ret = true;
        }
        spin_unlock(&inode->i_lock);
        return ret;
}

/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
                struct nfs_server *server,
                struct list_head *layout_list)
{
        struct pnfs_layout_hdr *lo, *next;
        struct inode *inode;

        list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
                inode = igrab(lo->plh_inode);
                if (inode == NULL)
                        continue;
                list_del_init(&lo->plh_layouts);
                if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
                        continue;
                rcu_read_unlock();
                spin_unlock(&clp->cl_lock);
                iput(inode);
                spin_lock(&clp->cl_lock);
                rcu_read_lock();
                return -EAGAIN;
        }
        return 0;
}

static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
                bool is_bulk_recall)
{
        struct pnfs_layout_hdr *lo;
        struct inode *inode;
        LIST_HEAD(lseg_list);
        int ret = 0;

        while (!list_empty(layout_list)) {
                lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
                                plh_bulk_destroy);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                inode = lo->plh_inode;

                pnfs_layoutcommit_inode(inode, false);

                spin_lock(&inode->i_lock);
                list_del_init(&lo->plh_bulk_destroy);
                if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
                        if (is_bulk_recall)
                                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
                        ret = -EAGAIN;
                }
                spin_unlock(&inode->i_lock);
                pnfs_free_lseg_list(&lseg_list);
                /* Free all lsegs that are attached to commit buckets */
                nfs_commit_inode(inode, 0);
                pnfs_put_layout_hdr(lo);
                iput(inode);
        }
        return ret;
}

int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
                struct nfs_fsid *fsid,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
                        continue;
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                server,
                                &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
                bool is_recall)
{
        struct nfs_server *server;
        LIST_HEAD(layout_list);

        spin_lock(&clp->cl_lock);
        rcu_read_lock();
restart:
        list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
                if (pnfs_layout_bulk_destroy_byserver_locked(clp,
                                        server,
                                        &layout_list) != 0)
                        goto restart;
        }
        rcu_read_unlock();
        spin_unlock(&clp->cl_lock);

        if (list_empty(&layout_list))
                return 0;
        return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        nfs4_deviceid_mark_client_invalid(clp);
        nfs4_deviceid_purge_client(clp);

        pnfs_destroy_layouts_byclid(clp, false);
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
 */
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
        return (s32)(s1 - s2) > 0;
}
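
/*
 * Worked example of the wraparound handling: s1 = 2 and s2 = 0xfffffffe
 * give (s32)(s1 - s2) = 4 > 0, so s1 is treated as the newer seqid even
 * though it is numerically smaller.
 */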

/* update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
{
        u32 oldseq, newseq, new_barrier;
        int empty = list_empty(&lo->plh_segs);

        oldseq = be32_to_cpu(lo->plh_stateid.seqid);
        newseq = be32_to_cpu(new->seqid);
        if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
                nfs4_stateid_copy(&lo->plh_stateid, new);
                if (update_barrier) {
                        new_barrier = be32_to_cpu(new->seqid);
                } else {
                        /* Because of wraparound, we want to keep the barrier
                         * "close" to the current seqids.
                         */
                        new_barrier = newseq - atomic_read(&lo->plh_outstanding);
                }
                if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
                        lo->plh_barrier = new_barrier;
        }
}
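
/*
 * Illustrative example: if the reply carries seqid 10 while three
 * LAYOUTGETs are still outstanding, the barrier becomes 10 - 3 = 7, so
 * pnfs_layout_stateid_blocked() below in effect treats stateids with
 * seqid <= 7 as stale.
 */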

static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
                const nfs4_stateid *stateid)
{
        u32 seqid = be32_to_cpu(stateid->seqid);

        return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}

/* Return true if new LAYOUTGETs are currently blocked for this layout */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}

int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
                              const struct pnfs_layout_range *range,
                              struct nfs4_state *open_state)
{
        int status = 0;

        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
        if (pnfs_layoutgets_blocked(lo)) {
                status = -EAGAIN;
        } else if (!nfs4_valid_open_stateid(open_state)) {
                status = -EBADF;
        } else if (list_empty(&lo->plh_segs) ||
                   test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
                int seq;

                do {
                        seq = read_seqbegin(&open_state->seqlock);
                        nfs4_stateid_copy(dst, &open_state->stateid);
                } while (read_seqretry(&open_state->seqlock, seq));
        } else
                nfs4_stateid_copy(dst, &lo->plh_stateid);
        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
        return status;
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           const struct pnfs_layout_range *range,
           gfp_t gfp_flags)
{
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg;
        loff_t i_size;

        dprintk("--> %s\n", __func__);

        /*
         * Synchronously retrieve layout information from server and
         * store in lseg. If we race with a concurrent seqid morphing
         * op, then re-send the LAYOUTGET.
         */
        do {
                lgp = kzalloc(sizeof(*lgp), gfp_flags);
                if (lgp == NULL)
                        return NULL;

                i_size = i_size_read(ino);

                lgp->args.minlength = PAGE_SIZE;
                if (lgp->args.minlength > range->length)
                        lgp->args.minlength = range->length;
                if (range->iomode == IOMODE_READ) {
                        if (range->offset >= i_size)
                                lgp->args.minlength = 0;
                        else if (i_size - range->offset < lgp->args.minlength)
                                lgp->args.minlength = i_size - range->offset;
                }
                lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
                pnfs_copy_range(&lgp->args.range, range);
                lgp->args.type = server->pnfs_curr_ld->id;
                lgp->args.inode = ino;
                lgp->args.ctx = get_nfs_open_context(ctx);
                lgp->gfp_flags = gfp_flags;
                lgp->cred = lo->plh_lc_cred;

                lseg = nfs4_proc_layoutget(lgp, gfp_flags);
        } while (lseg == ERR_PTR(-EAGAIN));

        if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
                lseg = NULL;
        else
                pnfs_layout_clear_fail_bit(lo,
                                pnfs_iomode_to_fail_bit(range->iomode));

        return lseg;
}

static void pnfs_clear_layoutcommit(struct inode *inode,
                struct list_head *head)
{
        struct nfs_inode *nfsi = NFS_I(inode);
        struct pnfs_layout_segment *lseg, *tmp;

        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
                return;
        list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
                if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
                        continue;
                pnfs_lseg_dec_and_remove_zero(lseg, head);
        }
}

void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
{
        clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
        smp_mb__after_atomic();
        wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
        rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}

static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
{
        if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                return false;
        lo->plh_return_iomode = 0;
        pnfs_get_layout_hdr(lo);
        clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
        return true;
}

static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
                       enum pnfs_iomode iomode, bool sync)
{
        struct inode *ino = lo->plh_inode;
        struct nfs4_layoutreturn *lrp;
        int status = 0;

        lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
        if (unlikely(lrp == NULL)) {
                status = -ENOMEM;
                spin_lock(&ino->i_lock);
                pnfs_clear_layoutreturn_waitbit(lo);
                spin_unlock(&ino->i_lock);
                pnfs_put_layout_hdr(lo);
                goto out;
        }

        nfs4_stateid_copy(&lrp->args.stateid, stateid);
        lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
        lrp->args.inode = ino;
        lrp->args.range.iomode = iomode;
        lrp->args.range.offset = 0;
        lrp->args.range.length = NFS4_MAX_UINT64;
        lrp->args.layout = lo;
        lrp->clp = NFS_SERVER(ino)->nfs_client;
        lrp->cred = lo->plh_lc_cred;

        status = nfs4_proc_layoutreturn(lrp, sync);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}

/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
        struct pnfs_layout_segment *s;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return false;

        /* Defer layoutreturn until all lsegs are done */
        list_for_each_entry(s, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
                        return false;
        }

        return true;
}

static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
                return;
        spin_lock(&inode->i_lock);
        if (pnfs_layout_need_return(lo)) {
                nfs4_stateid stateid;
                enum pnfs_iomode iomode;
                bool send;

                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
                iomode = lo->plh_return_iomode;
                send = pnfs_prepare_layoutreturn(lo);
                spin_unlock(&inode->i_lock);
                if (send) {
                        /* Send an async layoutreturn so we don't deadlock */
                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
                }
        } else
                spin_unlock(&inode->i_lock);
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
        struct pnfs_layout_hdr *lo = NULL;
        struct nfs_inode *nfsi = NFS_I(ino);
        LIST_HEAD(tmp_list);
        nfs4_stateid stateid;
        int status = 0, empty;
        bool send;

        dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout to return\n", __func__);
                goto out;
        }
        nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
        /* Reference matched in nfs4_layoutreturn_release */
        pnfs_get_layout_hdr(lo);
        empty = list_empty(&lo->plh_segs);
        pnfs_clear_layoutcommit(ino, &tmp_list);
        pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);

        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
                struct pnfs_layout_range range = {
                        .iomode         = IOMODE_ANY,
                        .offset         = 0,
                        .length         = NFS4_MAX_UINT64,
                };
                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
        }

        /* Don't send a LAYOUTRETURN if list was initially empty */
        if (empty) {
                spin_unlock(&ino->i_lock);
                dprintk("NFS: %s no layout segments to return\n", __func__);
                goto out_put_layout_hdr;
        }

        set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
        send = pnfs_prepare_layoutreturn(lo);
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        if (send)
                status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
out_put_layout_hdr:
        pnfs_put_layout_hdr(lo);
out:
        dprintk("<-- %s status: %d\n", __func__, status);
        return status;
}
EXPORT_SYMBOL_GPL(_pnfs_return_layout);

int
pnfs_commit_and_return_layout(struct inode *inode)
{
        struct pnfs_layout_hdr *lo;
        int ret;

        spin_lock(&inode->i_lock);
        lo = NFS_I(inode)->layout;
        if (lo == NULL) {
                spin_unlock(&inode->i_lock);
                return 0;
        }
        pnfs_get_layout_hdr(lo);
        /* Block new layoutgets and read/write to ds */
        lo->plh_block_lgets++;
        spin_unlock(&inode->i_lock);
        filemap_fdatawait(inode->i_mapping);
        ret = pnfs_layoutcommit_inode(inode, true);
        if (ret == 0)
                ret = _pnfs_return_layout(inode);
        spin_lock(&inode->i_lock);
        lo->plh_block_lgets--;
        spin_unlock(&inode->i_lock);
        pnfs_put_layout_hdr(lo);
        return ret;
}

bool pnfs_roc(struct inode *ino)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_open_context *ctx;
        struct nfs4_state *state;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *tmp;
        nfs4_stateid stateid;
        LIST_HEAD(tmp_list);
        bool found = false, layoutreturn = false, roc = false;

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
                goto out_noroc;

        /* no roc if we hold a delegation */
        if (nfs4_check_delegation(ino, FMODE_READ))
                goto out_noroc;

        list_for_each_entry(ctx, &nfsi->open_files, list) {
                state = ctx->state;
                /* Don't return layout if there is open file state */
                if (state != NULL && state->state != 0)
                        goto out_noroc;
        }

        nfs4_stateid_copy(&stateid, &lo->plh_stateid);
        /* always send layoutreturn if so marked */
        if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
                                   &lo->plh_flags))
                layoutreturn = pnfs_prepare_layoutreturn(lo);

        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
                /* If we are sending layoutreturn, invalidate all valid lsegs */
                if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        mark_lseg_invalid(lseg, &tmp_list);
                        found = true;
                }
        /* ROC in two conditions:
         * 1. there are ROC lsegs
         * 2. we don't send layoutreturn
         */
        if (found && !layoutreturn) {
                /* lo ref dropped in pnfs_roc_release() */
                pnfs_get_layout_hdr(lo);
                roc = true;
        }

out_noroc:
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        pnfs_layoutcommit_inode(ino, true);
        if (layoutreturn)
                pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
        return roc;
}

void pnfs_roc_release(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        pnfs_clear_layoutreturn_waitbit(lo);
        if (atomic_dec_and_test(&lo->plh_refcount)) {
                pnfs_detach_layout_hdr(lo);
                spin_unlock(&ino->i_lock);
                pnfs_free_layout_hdr(lo);
        } else
                spin_unlock(&ino->i_lock);
}

void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        pnfs_mark_layout_returned_if_empty(lo);
        if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
        trace_nfs4_layoutreturn_on_close(ino, 0);
}

void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *lo;
        u32 current_seqid;

        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        current_seqid = be32_to_cpu(lo->plh_stateid.seqid);

        /* Since close does not return a layout stateid for use as
         * a barrier, we choose the worst-case barrier.
         */
        *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
        spin_unlock(&ino->i_lock);
}

bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *lo;
        bool sleep = false;

        /* We might not hold a reference on lo, so check it under i_lock */
        spin_lock(&ino->i_lock);
        lo = nfsi->layout;
        if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
                sleep = true;
        spin_unlock(&ino->i_lock);

        if (sleep)
                rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);

        return sleep;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
 */
static s64
pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
           const struct pnfs_layout_range *l2)
{
        s64 d;

        /* high offset > low offset */
        d = l1->offset - l2->offset;
        if (d)
                return d;

        /* short length > long length */
        d = l2->length - l1->length;
        if (d)
                return d;

        /* read > read/write */
        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
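
/*
 * Worked example: for two lsegs at offset 0 with equal length, the iomode
 * tie-breaker sorts the IOMODE_READ lseg after the IOMODE_RW one, so a
 * lookup walking plh_segs in order finds the RW layout first.
 */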

static bool
pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
                const struct pnfs_layout_range *l2)
{
        return pnfs_lseg_range_cmp(l1, l2) > 0;
}

static bool
pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
                struct pnfs_layout_segment *old)
{
        return false;
}

void
pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg,
                   bool (*is_after)(const struct pnfs_layout_range *,
                           const struct pnfs_layout_range *),
                   bool (*do_merge)(struct pnfs_layout_segment *,
                           struct pnfs_layout_segment *),
                   struct list_head *free_me)
{
        struct pnfs_layout_segment *lp, *tmp;

        dprintk("%s:Begin\n", __func__);

        list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
                        continue;
                if (do_merge(lseg, lp)) {
                        mark_lseg_invalid(lp, free_me);
                        continue;
                }
                if (is_after(&lseg->pls_range, &lp->pls_range))
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu before "
                        "lp %p iomode %d offset %llu length %llu\n",
                        __func__, lseg, lseg->pls_range.iomode,
                        lseg->pls_range.offset, lseg->pls_range.length,
                        lp, lp->pls_range.iomode, lp->pls_range.offset,
                        lp->pls_range.length);
                goto out;
        }
        list_add_tail(&lseg->pls_list, &lo->plh_segs);
        dprintk("%s: inserted lseg %p "
                "iomode %d offset %llu length %llu at tail\n",
                __func__, lseg, lseg->pls_range.iomode,
                lseg->pls_range.offset, lseg->pls_range.length);
out:
        pnfs_get_layout_hdr(lo);

        dprintk("%s:Return\n", __func__);
}
EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);

static void
pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg,
                   struct list_head *free_me)
{
        struct inode *inode = lo->plh_inode;
        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;

        if (ld->add_lseg != NULL)
                ld->add_lseg(lo, lseg, free_me);
        else
                pnfs_generic_layout_insert_lseg(lo, lseg,
                                pnfs_lseg_range_is_after,
                                pnfs_lseg_no_merge,
                                free_me);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
                      struct nfs_open_context *ctx,
                      gfp_t gfp_flags)
{
        struct pnfs_layout_hdr *lo;

        lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
        if (!lo)
                return NULL;
        atomic_set(&lo->plh_refcount, 1);
        INIT_LIST_HEAD(&lo->plh_layouts);
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_destroy);
        lo->plh_inode = ino;
        lo->plh_lc_cred = get_rpccred(ctx->cred);
        return lo;
}

static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
                       struct nfs_open_context *ctx,
                       gfp_t gfp_flags)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *new = NULL;

        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

        if (nfsi->layout != NULL)
                goto out_existing;
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
        spin_lock(&ino->i_lock);

        if (likely(nfsi->layout == NULL)) {     /* Won the race? */
                nfsi->layout = new;
                return new;
        } else if (new != NULL)
                pnfs_free_layout_hdr(new);
out_existing:
        pnfs_get_layout_hdr(nfsi->layout);
        return nfsi->layout;
}

/*
 * iomode matching rules:
 * iomode       lseg    match
 * -----        -----   -----
 * ANY          READ    true
 * ANY          RW      true
 * RW           READ    false
 * RW           RW      true
 * READ         READ    true
 * READ         RW      true
 */
static bool
pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
                 const struct pnfs_layout_range *range)
{
        struct pnfs_layout_range range1;

        if ((range->iomode == IOMODE_RW &&
             ls_range->iomode != IOMODE_RW) ||
            !pnfs_lseg_range_intersecting(ls_range, range))
                return 0;

        /* range1 covers only the first byte in the range */
        range1 = *range;
        range1.length = 1;
        return pnfs_lseg_range_contained(ls_range, &range1);
}
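
/*
 * Illustrative example: because range1 is trimmed to the request's first
 * byte, a READ request for the whole file starting at offset 0 is
 * satisfied by a cached lseg covering only [0, 4096), provided the two
 * ranges intersect and the iomodes are compatible.
 */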

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
                struct pnfs_layout_range *range)
{
        struct pnfs_layout_segment *lseg, *ret = NULL;

        dprintk("%s:Begin\n", __func__);

        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
                    pnfs_lseg_range_match(&lseg->pls_range, range)) {
                        ret = pnfs_get_lseg(lseg);
                        break;
                }
        }

        dprintk("%s:Return lseg %p ref %d\n",
                __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
        return ret;
}

/*
 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
 * to the MDS or over pNFS
 *
 * The nfs_inode read_io and write_io fields are cumulative counters reset
 * when there are no layout segments. Note that in pnfs_update_layout iomode
 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
 * WRITE request.
 *
 * A return of true means use MDS I/O.
 *
 * From rfc 5661:
 * If a file's size is smaller than the file size threshold, data accesses
 * SHOULD be sent to the metadata server.  If an I/O request has a length that
 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
 * server.  If both file size and I/O size are provided, the client SHOULD
 * reach or exceed  both thresholds before sending its read or write
 * requests to the data server.
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
                                     struct inode *ino, int iomode)
{
        struct nfs4_threshold *t = ctx->mdsthreshold;
        struct nfs_inode *nfsi = NFS_I(ino);
        loff_t fsize = i_size_read(ino);
        bool size = false, size_set = false, io = false, io_set = false, ret = false;

        if (t == NULL)
                return ret;

        dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
                __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

        switch (iomode) {
        case IOMODE_READ:
                if (t->bm & THRESHOLD_RD) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->rd_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_RD_IO) {
                        dprintk("%s nfsi->read_io %llu\n", __func__,
                                nfsi->read_io);
                        io_set = true;
                        if (nfsi->read_io < t->rd_io_sz)
                                io = true;
                }
                break;
        case IOMODE_RW:
                if (t->bm & THRESHOLD_WR) {
                        dprintk("%s fsize %llu\n", __func__, fsize);
                        size_set = true;
                        if (fsize < t->wr_sz)
                                size = true;
                }
                if (t->bm & THRESHOLD_WR_IO) {
                        dprintk("%s nfsi->write_io %llu\n", __func__,
                                nfsi->write_io);
                        io_set = true;
                        if (nfsi->write_io < t->wr_io_sz)
                                io = true;
                }
                break;
        }
        if (size_set && io_set) {
                if (size && io)
                        ret = true;
        } else if (size || io)
                ret = true;

        dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
        return ret;
}
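
/*
 * Illustrative example: with THRESHOLD_RD | THRESHOLD_RD_IO set,
 * rd_sz = 1 MB and rd_io_sz = 64 KB, a READ of a 512 KB file whose
 * cumulative read_io is 10 KB is under both thresholds, so this returns
 * true and the I/O goes to the MDS rather than the data servers.
 */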

static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
        /*
         * send layoutcommit as it can hold up layoutreturn due to lseg
         * reference
         */
        pnfs_layoutcommit_inode(lo->plh_inode, false);
        return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
                                   nfs_wait_bit_killable,
                                   TASK_UNINTERRUPTIBLE);
}

static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
{
        unsigned long *bitlock = &lo->plh_flags;

        clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
        smp_mb__after_atomic();
        wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
}

/*
 * The layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
1495struct pnfs_layout_segment *
1496pnfs_update_layout(struct inode *ino,
1497                   struct nfs_open_context *ctx,
1498                   loff_t pos,
1499                   u64 count,
1500                   enum pnfs_iomode iomode,
1501                   gfp_t gfp_flags)
1502{
1503        struct pnfs_layout_range arg = {
1504                .iomode = iomode,
1505                .offset = pos,
1506                .length = count,
1507        };
1508        unsigned pg_offset;
1509        struct nfs_server *server = NFS_SERVER(ino);
1510        struct nfs_client *clp = server->nfs_client;
1511        struct pnfs_layout_hdr *lo;
1512        struct pnfs_layout_segment *lseg = NULL;
1513        bool first;
1514
1515        if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
1516                trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
1517                                 PNFS_UPDATE_LAYOUT_NO_PNFS);
1518                goto out;
1519        }
1520
1521        if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
1522                trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
1523                                 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
1524                goto out;
1525        }
1526
1527        if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
1528                trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
1529                                 PNFS_UPDATE_LAYOUT_MDSTHRESH);
1530                goto out;
1531        }
1532
1533lookup_again:
1534        first = false;
1535        spin_lock(&ino->i_lock);
1536        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1537        if (lo == NULL) {
1538                spin_unlock(&ino->i_lock);
1539                trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
1540                                 PNFS_UPDATE_LAYOUT_NOMEM);
1541                goto out;
1542        }
1543
1544        /* Do we even need to bother with this? */
1545        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1546                trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1547                                 PNFS_UPDATE_LAYOUT_BULK_RECALL);
1548                dprintk("%s matches recall, use MDS\n", __func__);
1549                goto out_unlock;
1550        }
1551
1552        /* If LAYOUTGET already failed once, we don't try again */
1553        if (pnfs_layout_io_test_failed(lo, iomode)) {
1554                trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1555                                 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
1556                goto out_unlock;
1557        }
1558
1559        first = list_empty(&lo->plh_segs);
1560        if (first) {
1561                /* The first layoutget for the file. Need to serialize per
1562                 * RFC 5661 Errata 3208.
1563                 */
1564                if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1565                                     &lo->plh_flags)) {
1566                        spin_unlock(&ino->i_lock);
1567                        wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1568                                    TASK_UNINTERRUPTIBLE);
1569                        pnfs_put_layout_hdr(lo);
1570                        goto lookup_again;
1571                }
1572        } else {
1573        /* Check to see if a layout segment for the given
1574         * range is already cached
1575                 */
1576                lseg = pnfs_find_lseg(lo, &arg);
1577                if (lseg) {
1578                        trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1579                                        PNFS_UPDATE_LAYOUT_FOUND_CACHED);
1580                        goto out_unlock;
1581                }
1582        }
1583
1584        /*
1585         * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1586         * for LAYOUTRETURN even if first is true.
1587         */
1588        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1589                spin_unlock(&ino->i_lock);
1590                dprintk("%s wait for layoutreturn\n", __func__);
1591                if (pnfs_prepare_to_retry_layoutget(lo)) {
1592                        if (first)
1593                                pnfs_clear_first_layoutget(lo);
1594                        pnfs_put_layout_hdr(lo);
1595                        dprintk("%s retrying\n", __func__);
1596                        goto lookup_again;
1597                }
1598                trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1599                                PNFS_UPDATE_LAYOUT_RETURN);
1600                goto out_put_layout_hdr;
1601        }
1602
1603        if (pnfs_layoutgets_blocked(lo)) {
1604                trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1605                                PNFS_UPDATE_LAYOUT_BLOCKED);
1606                goto out_unlock;
1607        }
1608        atomic_inc(&lo->plh_outstanding);
1609        spin_unlock(&ino->i_lock);
1610
1611        if (list_empty(&lo->plh_layouts)) {
1612                /* The lo must be on the clp list if there is any
1613                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1614                 */
1615                spin_lock(&clp->cl_lock);
1616                if (list_empty(&lo->plh_layouts))
1617                        list_add_tail(&lo->plh_layouts, &server->layouts);
1618                spin_unlock(&clp->cl_lock);
1619        }
1620
1621        pg_offset = arg.offset & ~PAGE_MASK;
1622        if (pg_offset) {
1623                arg.offset -= pg_offset;
1624                arg.length += pg_offset;
1625        }
1626        if (arg.length != NFS4_MAX_UINT64)
1627                arg.length = PAGE_ALIGN(arg.length);
1628
1629        lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1630        atomic_dec(&lo->plh_outstanding);
1631        trace_pnfs_update_layout(ino, pos, count, iomode, lo,
1632                                 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
1633out_put_layout_hdr:
1634        if (first)
1635                pnfs_clear_first_layoutget(lo);
1636        pnfs_put_layout_hdr(lo);
1637out:
1638        dprintk("%s: inode %s/%llu pNFS layout segment %s for "
1639                        "(%s, offset: %llu, length: %llu)\n",
1640                        __func__, ino->i_sb->s_id,
1641                        (unsigned long long)NFS_FILEID(ino),
1642                        IS_ERR_OR_NULL(lseg) ? "not found" : "found",
1643                        iomode == IOMODE_RW ? "read/write" : "read-only",
1644                        (unsigned long long)pos,
1645                        (unsigned long long)count);
1646        return lseg;
1647out_unlock:
1648        spin_unlock(&ino->i_lock);
1649        goto out_put_layout_hdr;
1650}
1651EXPORT_SYMBOL_GPL(pnfs_update_layout);
1652
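/*
 * Sanity check a layout range returned by the server: the iomode must
 * be known, the offset must not be the special EOF value, the length
 * must be non-zero, and offset + length must not overflow.
 */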
1653static bool
1654pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
1655{
1656        switch (range->iomode) {
1657        case IOMODE_READ:
1658        case IOMODE_RW:
1659                break;
1660        default:
1661                return false;
1662        }
1663        if (range->offset == NFS4_MAX_UINT64)
1664                return false;
1665        if (range->length == 0)
1666                return false;
1667        if (range->length != NFS4_MAX_UINT64 &&
1668            range->length > NFS4_MAX_UINT64 - range->offset)
1669                return false;
1670        return true;
1671}
1672
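/*
 * Process a LAYOUTGET reply: have the layout driver allocate a layout
 * segment from the opaque layout body, validate the returned stateid,
 * and insert the new segment into the layout header.
 */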
1673struct pnfs_layout_segment *
1674pnfs_layout_process(struct nfs4_layoutget *lgp)
1675{
1676        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
1677        struct nfs4_layoutget_res *res = &lgp->res;
1678        struct pnfs_layout_segment *lseg;
1679        struct inode *ino = lo->plh_inode;
1680        LIST_HEAD(free_me);
1681        int status = -EINVAL;
1682
1683        if (!pnfs_sanity_check_layout_range(&res->range))
1684                goto out;
1685
1686        /* Inject layout blob into I/O device driver */
1687        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
1688        if (!lseg || IS_ERR(lseg)) {
1689                if (!lseg)
1690                        status = -ENOMEM;
1691                else
1692                        status = PTR_ERR(lseg);
1693                dprintk("%s: Could not allocate layout: error %d\n",
1694                       __func__, status);
1695                goto out;
1696        }
1697
1698        init_lseg(lo, lseg);
1699        lseg->pls_range = res->range;
1700
1701        spin_lock(&ino->i_lock);
1702        if (pnfs_layoutgets_blocked(lo)) {
1703                dprintk("%s forget reply due to state\n", __func__);
1704                goto out_forget_reply;
1705        }
1706
1707        if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1708                /* existing state ID, make sure the sequence number matches. */
1709                if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1710                        dprintk("%s forget reply due to sequence\n", __func__);
1711                        status = -EAGAIN;
1712                        goto out_forget_reply;
1713                }
1714                pnfs_set_layout_stateid(lo, &res->stateid, false);
1715        } else {
1716                /*
1717                 * We got an entirely new state ID.  Mark all segments for the
1718                 * inode invalid, and don't bother validating the stateid
1719                 * sequence number.
1720                 */
1721                pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
1722
1723                nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1724                lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1725        }
1726
1727        clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1728
1729        pnfs_get_lseg(lseg);
1730        pnfs_layout_insert_lseg(lo, lseg, &free_me);
1731
1732        if (res->return_on_close)
1733                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
1734
1735        spin_unlock(&ino->i_lock);
1736        pnfs_free_lseg_list(&free_me);
1737        return lseg;
1738out:
1739        return ERR_PTR(status);
1740
1741out_forget_reply:
1742        spin_unlock(&ino->i_lock);
1743        lseg->pls_layout = lo;
1744        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
1745        goto out;
1746}
1747
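/*
 * Record the iomode for a future layoutreturn. If a different iomode
 * is already pending, widen the request to IOMODE_ANY.
 */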
1748static void
1749pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
1750{
1751        if (lo->plh_return_iomode == iomode)
1752                return;
1753        if (lo->plh_return_iomode != 0)
1754                iomode = IOMODE_ANY;
1755        lo->plh_return_iomode = iomode;
1756        set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
1757}
1758
1759/**
1760 * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
1761 * @lo: pointer to layout header
1762 * @tmp_list: list header to be used with pnfs_free_lseg_list()
1763 * @return_range: describe layout segment ranges to be returned
1764 *
1765 * This function is mainly intended for use by layoutrecall. It attempts
1766 * to free the layout segment immediately, or else to mark it for return
1767 * as soon as its reference count drops to zero.
1768 */
1769int
1770pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1771                                struct list_head *tmp_list,
1772                                const struct pnfs_layout_range *return_range)
1773{
1774        struct pnfs_layout_segment *lseg, *next;
1775        int remaining = 0;
1776
1777        dprintk("%s:Begin lo %p\n", __func__, lo);
1778
1779        if (list_empty(&lo->plh_segs))
1780                return 0;
1781
1782        assert_spin_locked(&lo->plh_inode->i_lock);
1783
1784        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1785                if (should_free_lseg(&lseg->pls_range, return_range)) {
1786                        dprintk("%s: marking lseg %p iomode %d "
1787                                "offset %llu length %llu\n", __func__,
1788                                lseg, lseg->pls_range.iomode,
1789                                lseg->pls_range.offset,
1790                                lseg->pls_range.length);
1791                        if (mark_lseg_invalid(lseg, tmp_list))
1792                                continue;
1793                        remaining++;
1794                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1795                        pnfs_set_plh_return_iomode(lo, return_range->iomode);
1796                }
1797        return remaining;
1798}
1799
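/*
 * Mark all layout segments matching the iomode of @lseg for return
 * after an I/O error, and send the layoutreturn immediately if none of
 * the matching segments is still in use.
 */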
1800void pnfs_error_mark_layout_for_return(struct inode *inode,
1801                                       struct pnfs_layout_segment *lseg)
1802{
1803        struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1804        struct pnfs_layout_range range = {
1805                .iomode = lseg->pls_range.iomode,
1806                .offset = 0,
1807                .length = NFS4_MAX_UINT64,
1808        };
1809        LIST_HEAD(free_me);
1810        bool return_now = false;
1811
1812        spin_lock(&inode->i_lock);
1813        pnfs_set_plh_return_iomode(lo, range.iomode);
1814        /*
1815         * Mark all matching lsegs so that we are sure no live segments
1816         * remain when the layoutreturn is sent. See pnfs_put_lseg()
1817         * for how this works.
1818         */
1819        if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
1820                nfs4_stateid stateid;
1821                enum pnfs_iomode iomode = lo->plh_return_iomode;
1822
1823                nfs4_stateid_copy(&stateid, &lo->plh_stateid);
1824                return_now = pnfs_prepare_layoutreturn(lo);
1825                spin_unlock(&inode->i_lock);
1826                if (return_now)
1827                        pnfs_send_layoutreturn(lo, &stateid, iomode, false);
1828        } else {
1829                spin_unlock(&inode->i_lock);
1830                nfs_commit_inode(inode, 0);
1831        }
1832        pnfs_free_lseg_list(&free_me);
1833}
1834EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1835
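/*
 * Attach a layout segment covering the read request to the pageio
 * descriptor; if none can be obtained, the read falls back to the MDS.
 */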
1836void
1837pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1838{
1839        u64 rd_size = req->wb_bytes;
1840
1841        if (pgio->pg_lseg == NULL) {
1842                if (pgio->pg_dreq == NULL)
1843                        rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1844                else
1845                        rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1846
1847                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1848                                                   req->wb_context,
1849                                                   req_offset(req),
1850                                                   rd_size,
1851                                                   IOMODE_READ,
1852                                                   GFP_KERNEL);
1853                if (IS_ERR(pgio->pg_lseg)) {
1854                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
1855                        pgio->pg_lseg = NULL;
1856                        return;
1857                }
1858        }
1859        /* If no lseg, fall back to read through the MDS */
1860        if (pgio->pg_lseg == NULL)
1861                nfs_pageio_reset_read_mds(pgio);
1863}
1864EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1865
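/*
 * Attach a layout segment covering the write request to the pageio
 * descriptor; if none can be obtained, the write falls back to the MDS.
 */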
1866void
1867pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1868                           struct nfs_page *req, u64 wb_size)
1869{
1870        if (pgio->pg_lseg == NULL) {
1871                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1872                                                   req->wb_context,
1873                                                   req_offset(req),
1874                                                   wb_size,
1875                                                   IOMODE_RW,
1876                                                   GFP_NOFS);
1877                if (IS_ERR(pgio->pg_lseg)) {
1878                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
1879                        pgio->pg_lseg = NULL;
1880                        return;
1881                }
1882        }
1883        /* If no lseg, fall back to write through the MDS */
1884        if (pgio->pg_lseg == NULL)
1885                nfs_pageio_reset_write_mds(pgio);
1886}
1887EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1888
1889void
1890pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1891{
1892        if (desc->pg_lseg) {
1893                pnfs_put_lseg(desc->pg_lseg);
1894                desc->pg_lseg = NULL;
1895        }
1896}
1897EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1898
1899/*
1900 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1901 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1902 */
1903size_t
1904pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
1905                     struct nfs_page *prev, struct nfs_page *req)
1906{
1907        unsigned int size;
1908        u64 seg_end, req_start, seg_left;
1909
1910        size = nfs_generic_pg_test(pgio, prev, req);
1911        if (!size)
1912                return 0;
1913
1914        /*
1915         * 'size' contains the number of bytes left in the current page (up
1916         * to the original size asked for in @req->wb_bytes).
1917         *
1918         * Calculate how many bytes are left in the layout segment and,
1919         * if there are fewer bytes than 'size', return that instead.
1920         *
1921         * Please also note that 'end_offset' is actually the offset of the
1922         * first byte that lies outside the pnfs_layout_range. FIXME?
1923         */
1925        if (pgio->pg_lseg) {
1926                seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1927                                     pgio->pg_lseg->pls_range.length);
1928                req_start = req_offset(req);
1929                WARN_ON_ONCE(req_start >= seg_end);
1930                /* start of request is past the last byte of this segment */
1931                if (req_start >= seg_end) {
1932                        /* reference the new lseg */
1933                        if (pgio->pg_ops->pg_cleanup)
1934                                pgio->pg_ops->pg_cleanup(pgio);
1935                        if (pgio->pg_ops->pg_init)
1936                                pgio->pg_ops->pg_init(pgio, req);
1937                        return 0;
1938                }
1939
1940                /* adjust 'size' iff there are fewer bytes left in the
1941                 * segment than what nfs_generic_pg_test returned */
1942                seg_left = seg_end - req_start;
1943                if (seg_left < size)
1944                        size = (unsigned int)seg_left;
1945        }
1946
1947        return size;
1948}
1949EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1950
1951int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
1952{
1953        struct nfs_pageio_descriptor pgio;
1954
1955        /* Resend all requests through the MDS */
1956        nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
1957                              hdr->completion_ops);
1958        set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
1959        return nfs_pageio_resend(&pgio, hdr);
1960}
1961EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
1962
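/*
 * Handle a fatal write error from the layout driver: return the layout
 * if the driver asks for that on error, then resend the write through
 * the MDS.
 */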
1963static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
1964{
1966        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
1967        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1968            PNFS_LAYOUTRET_ON_ERROR) {
1969                pnfs_return_layout(hdr->inode);
1970        }
1971        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1972                hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
1973}
1974
1975/*
1976 * Called by non-RPC-based layout drivers
1977 */
1978void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
1979{
1980        if (likely(!hdr->pnfs_error)) {
1981                pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
1982                                hdr->mds_offset + hdr->res.count);
1983                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1984        }
1985        trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
1986        if (unlikely(hdr->pnfs_error))
1987                pnfs_ld_handle_write_error(hdr);
1988        hdr->mds_ops->rpc_release(hdr);
1989}
1990EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1991
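/*
 * Requeue the pages for transmission through the MDS after the layout
 * driver declined to handle the write, then release the pNFS header.
 */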
1992static void
1993pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1994                struct nfs_pgio_header *hdr)
1995{
1996        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1997
1998        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1999                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2000                nfs_pageio_reset_write_mds(desc);
2001                mirror->pg_recoalesce = 1;
2002        }
2003        nfs_pgio_data_destroy(hdr);
2004        hdr->release(hdr);
2005}
2006
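/*
 * Call the appropriate parallel I/O subsystem write function.
 */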
2007static enum pnfs_try_status
2008pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
2009                        const struct rpc_call_ops *call_ops,
2010                        struct pnfs_layout_segment *lseg,
2011                        int how)
2012{
2013        struct inode *inode = hdr->inode;
2014        enum pnfs_try_status trypnfs;
2015        struct nfs_server *nfss = NFS_SERVER(inode);
2016
2017        hdr->mds_ops = call_ops;
2018
2019        dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
2020                inode->i_ino, hdr->args.count, hdr->args.offset, how);
2021        trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
2022        if (trypnfs != PNFS_NOT_ATTEMPTED)
2023                nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
2024        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2025        return trypnfs;
2026}
2027
2028static void
2029pnfs_do_write(struct nfs_pageio_descriptor *desc,
2030              struct nfs_pgio_header *hdr, int how)
2031{
2032        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2033        struct pnfs_layout_segment *lseg = desc->pg_lseg;
2034        enum pnfs_try_status trypnfs;
2035
2036        trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
2037        if (trypnfs == PNFS_NOT_ATTEMPTED)
2038                pnfs_write_through_mds(desc, hdr);
2039}
2040
2041static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
2042{
2043        pnfs_put_lseg(hdr->lseg);
2044        nfs_pgio_header_free(hdr);
2045}
2046
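/*
 * Coalesce the queued requests into a pageio header and attempt the
 * write through the layout driver, falling back to the MDS if the
 * driver does not attempt the I/O.
 */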
2047int
2048pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
2049{
2050        struct nfs_pgio_header *hdr;
2051        int ret;
2052
2053        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2054        if (!hdr) {
2055                desc->pg_error = -ENOMEM;
2056                return desc->pg_error;
2057        }
2058        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
2059
2060        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2061        ret = nfs_generic_pgio(desc, hdr);
2062        if (!ret)
2063                pnfs_do_write(desc, hdr, desc->pg_ioflags);
2064
2065        return ret;
2066}
2067EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2068
2069int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
2070{
2071        struct nfs_pageio_descriptor pgio;
2072
2073        /* Resend all requests through the MDS */
2074        nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
2075        return nfs_pageio_resend(&pgio, hdr);
2076}
2077EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2078
2079static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
2080{
2081        dprintk("pnfs read error = %d\n", hdr->pnfs_error);
2082        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2083            PNFS_LAYOUTRET_ON_ERROR) {
2084                pnfs_return_layout(hdr->inode);
2085        }
2086        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2087                hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
2088}
2089
2090/*
2091 * Called by non-RPC-based layout drivers
2092 */
2093void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
2094{
2095        if (likely(!hdr->pnfs_error)) {
2096                __nfs4_read_done_cb(hdr);
2097                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2098        }
2099        trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
2100        if (unlikely(hdr->pnfs_error))
2101                pnfs_ld_handle_read_error(hdr);
2102        hdr->mds_ops->rpc_release(hdr);
2103}
2104EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
2105
2106static void
2107pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
2108                struct nfs_pgio_header *hdr)
2109{
2110        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2111
2112        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2113                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2114                nfs_pageio_reset_read_mds(desc);
2115                mirror->pg_recoalesce = 1;
2116        }
2117        nfs_pgio_data_destroy(hdr);
2118        hdr->release(hdr);
2119}
2120
2121/*
2122 * Call the appropriate parallel I/O subsystem read function.
2123 */
2124static enum pnfs_try_status
2125pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
2126                       const struct rpc_call_ops *call_ops,
2127                       struct pnfs_layout_segment *lseg)
2128{
2129        struct inode *inode = hdr->inode;
2130        struct nfs_server *nfss = NFS_SERVER(inode);
2131        enum pnfs_try_status trypnfs;
2132
2133        hdr->mds_ops = call_ops;
2134
2135        dprintk("%s: Reading ino:%lu %u@%llu\n",
2136                __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
2137
2138        trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
2139        if (trypnfs != PNFS_NOT_ATTEMPTED)
2140                nfs_inc_stats(inode, NFSIOS_PNFS_READ);
2141        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2142        return trypnfs;
2143}
2144
2145/* Resend all requests through pnfs. */
2146int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2147{
2148        struct nfs_pageio_descriptor pgio;
2149
2150        nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2151        return nfs_pageio_resend(&pgio, hdr);
2152}
2153EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2154
2155static void
2156pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
2157{
2158        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2159        struct pnfs_layout_segment *lseg = desc->pg_lseg;
2160        enum pnfs_try_status trypnfs;
2161        int err = 0;
2162
2163        trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
2164        if (trypnfs == PNFS_TRY_AGAIN)
2165                err = pnfs_read_resend_pnfs(hdr);
2166        if (trypnfs == PNFS_NOT_ATTEMPTED || err)
2167                pnfs_read_through_mds(desc, hdr);
2168}
2169
2170static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
2171{
2172        pnfs_put_lseg(hdr->lseg);
2173        nfs_pgio_header_free(hdr);
2174}
2175
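/*
 * Coalesce the queued requests into a pageio header and attempt the
 * read through the layout driver, falling back to the MDS if the
 * driver does not attempt the I/O.
 */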
2176int
2177pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
2178{
2179        struct nfs_pgio_header *hdr;
2180        int ret;
2181
2182        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2183        if (!hdr) {
2184                desc->pg_error = -ENOMEM;
2185                return desc->pg_error;
2186        }
2187        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
2188        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2189        ret = nfs_generic_pgio(desc, hdr);
2190        if (!ret)
2191                pnfs_do_read(desc, hdr);
2192        return ret;
2193}
2194EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
2195
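/*
 * Clear the LAYOUTCOMMITTING flag and wake up any tasks waiting to
 * start a new layoutcommit.
 */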
2196static void pnfs_clear_layoutcommitting(struct inode *inode)
2197{
2198        unsigned long *bitlock = &NFS_I(inode)->flags;
2199
2200        clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
2201        smp_mb__after_atomic();
2202        wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
2203}
2204
2205/*
2206 * There can be multiple RW segments.
2207 */
2208static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
2209{
2210        struct pnfs_layout_segment *lseg;
2211
2212        list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
2213                if (lseg->pls_range.iomode == IOMODE_RW &&
2214                    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
2215                        list_add(&lseg->pls_lc_list, listp);
2216        }
2217}
2218
2219static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
2220{
2221        struct pnfs_layout_segment *lseg, *tmp;
2222
2223        /* Matched by references in pnfs_set_layoutcommit */
2224        list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
2225                list_del_init(&lseg->pls_lc_list);
2226                pnfs_put_lseg(lseg);
2227        }
2228
2229        pnfs_clear_layoutcommitting(inode);
2230}
2231
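/*
 * Mark the lseg's iomode as failed in its layout header, so that
 * subsequent I/O for that range goes through the MDS.
 */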
2232void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
2233{
2234        pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
2235}
2236EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
2237
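/*
 * Record that @lseg needs a layoutcommit, update the last write byte
 * for the layout, and mark the inode dirty so that a layoutcommit
 * will be sent.
 */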
2238void
2239pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
2240                loff_t end_pos)
2241{
2242        struct nfs_inode *nfsi = NFS_I(inode);
2243        bool mark_as_dirty = false;
2244
2245        spin_lock(&inode->i_lock);
2246        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
2247                nfsi->layout->plh_lwb = end_pos;
2248                mark_as_dirty = true;
2249                dprintk("%s: Set layoutcommit for inode %lu\n",
2250                        __func__, inode->i_ino);
2251        } else if (end_pos > nfsi->layout->plh_lwb)
2252                nfsi->layout->plh_lwb = end_pos;
2253        if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
2254                /* references matched in nfs4_layoutcommit_release */
2255                pnfs_get_lseg(lseg);
2256        }
2257        spin_unlock(&inode->i_lock);
2258        dprintk("%s: lseg %p end_pos %llu\n",
2259                __func__, lseg, nfsi->layout->plh_lwb);
2260
2261        /* If pnfs_layoutcommit_inode() runs between the inode locks, the next
2262         * call will be a no-op because NFS_INO_LAYOUTCOMMIT will not be set */
2263        if (mark_as_dirty)
2264                mark_inode_dirty_sync(inode);
2265}
2266EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
2267
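/*
 * Let the layout driver clean up after a layoutcommit, then drop the
 * lseg references taken in pnfs_set_layoutcommit().
 */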
2268void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
2269{
2270        struct nfs_server *nfss = NFS_SERVER(data->args.inode);
2271
2272        if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
2273                nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
2274        pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
2275}
2276
2277/*
2278 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
2279 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
2280 * data to disk to allow the server to recover the data if it crashes.
2281 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
2282 * is off, and a COMMIT is sent to a data server, or
2283 * if WRITEs to a data server return NFS_DATA_SYNC.
2284 */
2285int
2286pnfs_layoutcommit_inode(struct inode *inode, bool sync)
2287{
2288        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2289        struct nfs4_layoutcommit_data *data;
2290        struct nfs_inode *nfsi = NFS_I(inode);
2291        loff_t end_pos;
2292        int status;
2293
2294        if (!pnfs_layoutcommit_outstanding(inode))
2295                return 0;
2296
2297        dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
2298
2299        status = -EAGAIN;
2300        if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
2301                if (!sync)
2302                        goto out;
2303                status = wait_on_bit_lock_action(&nfsi->flags,
2304                                NFS_INO_LAYOUTCOMMITTING,
2305                                nfs_wait_bit_killable,
2306                                TASK_KILLABLE);
2307                if (status)
2308                        goto out;
2309        }
2310
2311        status = -ENOMEM;
2312        /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
2313        data = kzalloc(sizeof(*data), GFP_NOFS);
2314        if (!data)
2315                goto clear_layoutcommitting;
2316
2317        status = 0;
2318        spin_lock(&inode->i_lock);
2319        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
2320                goto out_unlock;
2321
2322        INIT_LIST_HEAD(&data->lseg_list);
2323        pnfs_list_write_lseg(inode, &data->lseg_list);
2324
2325        end_pos = nfsi->layout->plh_lwb;
2326
2327        nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
2328        spin_unlock(&inode->i_lock);
2329
2330        data->args.inode = inode;
2331        data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
2332        nfs_fattr_init(&data->fattr);
2333        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
2334        data->res.fattr = &data->fattr;
2335        data->args.lastbytewritten = end_pos - 1;
2336        data->res.server = NFS_SERVER(inode);
2337
2338        if (ld->prepare_layoutcommit) {
2339                status = ld->prepare_layoutcommit(&data->args);
2340                if (status) {
2341                        put_rpccred(data->cred);
2342                        spin_lock(&inode->i_lock);
2343                        set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
2344                        if (end_pos > nfsi->layout->plh_lwb)
2345                                nfsi->layout->plh_lwb = end_pos;
2346                        goto out_unlock;
2347                }
2348        }
2349
2351        status = nfs4_proc_layoutcommit(data, sync);
2352out:
2353        if (status)
2354                mark_inode_dirty_sync(inode);
2355        dprintk("<-- %s status %d\n", __func__, status);
2356        return status;
2357out_unlock:
2358        spin_unlock(&inode->i_lock);
2359        kfree(data);
2360clear_layoutcommitting:
2361        pnfs_clear_layoutcommitting(inode);
2362        goto out;
2363}
2364EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
2365
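/*
 * Generic ->sync implementation for layout drivers: flush any pending
 * layoutcommit to the server.
 */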
2366int
2367pnfs_generic_sync(struct inode *inode, bool datasync)
2368{
2369        return pnfs_layoutcommit_inode(inode, true);
2370}
2371EXPORT_SYMBOL_GPL(pnfs_generic_sync);
2372
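/*
 * Allocate a structure to hold the mdsthreshold hints that the server
 * may return at open time.
 */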
2373struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
2374{
2375        struct nfs4_threshold *thp;
2376
2377        thp = kzalloc(sizeof(*thp), GFP_NOFS);
2378        if (!thp) {
2379                dprintk("%s mdsthreshold allocation failed\n", __func__);
2380                return NULL;
2381        }
2382        return thp;
2383}
2384
2385#if IS_ENABLED(CONFIG_NFS_V4_2)
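/*
 * Collect layout statistics from the layout driver and report them to
 * the server in a LAYOUTSTATS call. At most one LAYOUTSTATS call per
 * inode is kept in flight at a time.
 */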
2386int
2387pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2388{
2389        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2390        struct nfs_server *server = NFS_SERVER(inode);
2391        struct nfs_inode *nfsi = NFS_I(inode);
2392        struct nfs42_layoutstat_data *data;
2393        struct pnfs_layout_hdr *hdr;
2394        int status = 0;
2395
2396        if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
2397                goto out;
2398
2399        if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
2400                goto out;
2401
2402        if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
2403                goto out;
2404
2405        spin_lock(&inode->i_lock);
2406        if (!NFS_I(inode)->layout) {
2407                spin_unlock(&inode->i_lock);
2408                goto out;
2409        }
2410        hdr = NFS_I(inode)->layout;
2411        pnfs_get_layout_hdr(hdr);
2412        spin_unlock(&inode->i_lock);
2413
2414        data = kzalloc(sizeof(*data), gfp_flags);
2415        if (!data) {
2416                status = -ENOMEM;
2417                goto out_put;
2418        }
2419
2420        data->args.fh = NFS_FH(inode);
2421        data->args.inode = inode;
2422        nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
2423        status = ld->prepare_layoutstats(&data->args);
2424        if (status)
2425                goto out_free;
2426
2427        status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
2428
2429out:
2430        dprintk("%s returns %d\n", __func__, status);
2431        return status;
2432
2433out_free:
2434        kfree(data);
2435out_put:
2436        pnfs_put_layout_hdr(hdr);
2437        smp_mb__before_atomic();
2438        clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
2439        smp_mb__after_atomic();
2440        goto out;
2441}
2442EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
2443#endif
2444
2445unsigned int layoutstats_timer;
2446module_param(layoutstats_timer, uint, 0644);
2447EXPORT_SYMBOL_GPL(layoutstats_timer);
2448