linux/fs/nfs/pnfs.c
/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>
#include "internal.h"
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
#include "nfs42.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
		       enum pnfs_iomode iomode, bool sync);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
	struct pnfs_layoutdriver_type *local;

	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
		if (local->id == id)
			goto out;
	local = NULL;
out:
	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
	return local;
}

static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
	struct pnfs_layoutdriver_type *local;

	spin_lock(&pnfs_spinlock);
	local = find_pnfs_driver_locked(id);
	if (local != NULL && !try_module_get(local->owner)) {
		dprintk("%s: Could not grab reference on module\n", __func__);
		local = NULL;
	}
	spin_unlock(&pnfs_spinlock);
	return local;
}

void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
	if (nfss->pnfs_curr_ld) {
		if (nfss->pnfs_curr_ld->clear_layoutdriver)
			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
		/* Decrement the MDS count. Purge the deviceid cache if zero */
		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
			nfs4_deviceid_purge_client(nfss->nfs_client);
		module_put(nfss->pnfs_curr_ld->owner);
	}
	nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
		      u32 id)
{
	struct pnfs_layoutdriver_type *ld_type = NULL;

	if (id == 0)
		goto out_no_driver;
	if (!(server->nfs_client->cl_exchange_flags &
		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
			__func__, id, server->nfs_client->cl_exchange_flags);
		goto out_no_driver;
	}
	ld_type = find_pnfs_driver(id);
	if (!ld_type) {
		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
		ld_type = find_pnfs_driver(id);
		if (!ld_type) {
			dprintk("%s: No pNFS module found for %u.\n",
				__func__, id);
			goto out_no_driver;
		}
	}
	server->pnfs_curr_ld = ld_type;
	if (ld_type->set_layoutdriver
	    && ld_type->set_layoutdriver(server, mntfh)) {
		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
			"driver %u.\n", __func__, id);
		module_put(ld_type->owner);
		goto out_no_driver;
	}
	/* Bump the MDS count */
	atomic_inc(&server->nfs_client->cl_mds_count);

	dprintk("%s: pNFS module for %u set\n", __func__, id);
	return;

out_no_driver:
	dprintk("%s: Using NFSv4 I/O\n", __func__);
	server->pnfs_curr_ld = NULL;
}

int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
	int status = -EINVAL;
	struct pnfs_layoutdriver_type *tmp;

	if (ld_type->id == 0) {
		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
		return status;
	}
	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
		printk(KERN_ERR "NFS: %s Layout driver must provide "
		       "alloc_lseg and free_lseg.\n", __func__);
		return status;
	}

	spin_lock(&pnfs_spinlock);
	tmp = find_pnfs_driver_locked(ld_type->id);
	if (!tmp) {
		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
		status = 0;
		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
			ld_type->name);
	} else {
		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
			__func__, ld_type->id);
	}
	spin_unlock(&pnfs_spinlock);

	return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
	spin_lock(&pnfs_spinlock);
	list_del(&ld_type->pnfs_tblid);
	spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
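
/*
 * Illustrative sketch (added note, not part of the original file): how a
 * layout driver module would typically plug into the registration API
 * above. The "example_*" names are hypothetical, and a real driver would
 * claim its own layout type id; alloc_lseg/free_lseg are mandatory, as
 * pnfs_register_layoutdriver() enforces.
 */
#if 0	/* example only */
static struct pnfs_layoutdriver_type example_layoutdriver = {
	.id		= LAYOUT_NFSV4_1_FILES,	/* layout type to claim */
	.name		= "example",
	.owner		= THIS_MODULE,	/* pinned via try_module_get() */
	.alloc_lseg	= example_alloc_lseg,	/* mandatory */
	.free_lseg	= example_free_lseg,	/* mandatory */
};

static int __init example_init(void)
{
	return pnfs_register_layoutdriver(&example_layoutdriver);
}

static void __exit example_exit(void)
{
	pnfs_unregister_layoutdriver(&example_layoutdriver);
}

module_init(example_init);
module_exit(example_exit);
#endif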

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
	atomic_inc(&lo->plh_refcount);
}

static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
	return ld->alloc_layout_hdr(ino, gfp_flags);
}

static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

	if (!list_empty(&lo->plh_layouts)) {
		struct nfs_client *clp = server->nfs_client;

		spin_lock(&clp->cl_lock);
		list_del_init(&lo->plh_layouts);
		spin_unlock(&clp->cl_lock);
	}
	put_rpccred(lo->plh_lc_cred);
	return ld->free_layout_hdr(lo);
}

static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
	dprintk("%s: freeing layout cache %p\n", __func__, lo);
	nfsi->layout = NULL;
	/* Reset MDS Threshold I/O counters */
	nfsi->write_io = 0;
	nfsi->read_io = 0;
}

void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct inode *inode = lo->plh_inode;

	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
		if (!list_empty(&lo->plh_segs))
			WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
		pnfs_detach_layout_hdr(lo);
		spin_unlock(&inode->i_lock);
		pnfs_free_layout_hdr(lo);
	}
}

static int
pnfs_iomode_to_fail_bit(u32 iomode)
{
	return iomode == IOMODE_RW ?
		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
}

static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
	lo->plh_retry_timestamp = jiffies;
	if (!test_and_set_bit(fail_bit, &lo->plh_flags))
		atomic_inc(&lo->plh_refcount);
}

static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
		atomic_dec(&lo->plh_refcount);
}

static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
	struct inode *inode = lo->plh_inode;
	struct pnfs_layout_range range = {
		.iomode = iomode,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	LIST_HEAD(head);

	spin_lock(&inode->i_lock);
	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
	pnfs_mark_matching_lsegs_invalid(lo, &head, &range);
	spin_unlock(&inode->i_lock);
	pnfs_free_lseg_list(&head);
	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
			iomode == IOMODE_RW ?  "RW" : "READ");
}

static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
	unsigned long start, end;
	int fail_bit = pnfs_iomode_to_fail_bit(iomode);

	if (test_bit(fail_bit, &lo->plh_flags) == 0)
		return false;
	end = jiffies;
	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
		/* It is time to retry the failed layoutgets */
		pnfs_layout_clear_fail_bit(lo, fail_bit);
		return false;
	}
	return true;
}
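
/*
 * Worked example (added note, not in the original file): with HZ=1000,
 * PNFS_LAYOUTGET_RETRY_TIMEOUT is 120000 jiffies. If a LAYOUTGET failed
 * at jiffies=500000, plh_retry_timestamp is 500000; a check at
 * jiffies=620000 finds the timestamp inside [end - 120000, end] and
 * keeps the fail bit set, while a check at jiffies=620001 falls outside
 * the window, clears the bit, and lets LAYOUTGET be retried.
 */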

static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
	INIT_LIST_HEAD(&lseg->pls_list);
	INIT_LIST_HEAD(&lseg->pls_lc_list);
	atomic_set(&lseg->pls_refcount, 1);
	smp_mb();
	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
	lseg->pls_layout = lo;
}

static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
{
	struct inode *ino = lseg->pls_layout->plh_inode;

	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
}

static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_segment *lseg)
{
	struct inode *inode = lo->plh_inode;

	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
	list_del_init(&lseg->pls_list);
	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
	atomic_dec(&lo->plh_refcount);
	if (list_empty(&lo->plh_segs))
		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}

/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
			struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_segment *s;

	if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
		return false;

	list_for_each_entry(s, &lo->plh_segs, pls_list)
		if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
			return false;

	return true;
}

static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
{
	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
		return false;
	lo->plh_return_iomode = 0;
	pnfs_get_layout_hdr(lo);
	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
	return true;
}

static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
		struct pnfs_layout_hdr *lo, struct inode *inode)
{
	lo = lseg->pls_layout;
	inode = lo->plh_inode;

	spin_lock(&inode->i_lock);
	if (pnfs_layout_need_return(lo, lseg)) {
		nfs4_stateid stateid;
		enum pnfs_iomode iomode;
		bool send;

		stateid = lo->plh_stateid;
		iomode = lo->plh_return_iomode;
		send = pnfs_prepare_layoutreturn(lo);
		spin_unlock(&inode->i_lock);
		if (send) {
			/* Send an async layoutreturn so we don't deadlock */
			pnfs_send_layoutreturn(lo, stateid, iomode, false);
		}
	} else
		spin_unlock(&inode->i_lock);
}

void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_hdr *lo;
	struct inode *inode;

	if (!lseg)
		return;

	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
		atomic_read(&lseg->pls_refcount),
		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));

	/* Handle the case where refcount != 1 */
	if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
		return;

	lo = lseg->pls_layout;
	inode = lo->plh_inode;
	/* Do we need a layoutreturn? */
	if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
		pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);

	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
			spin_unlock(&inode->i_lock);
			return;
		}
		pnfs_get_layout_hdr(lo);
		pnfs_layout_remove_lseg(lo, lseg);
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg(lseg);
		pnfs_put_layout_hdr(lo);
	}
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);

static void pnfs_free_lseg_async_work(struct work_struct *work)
{
	struct pnfs_layout_segment *lseg;
	struct pnfs_layout_hdr *lo;

	lseg = container_of(work, struct pnfs_layout_segment, pls_work);
	lo = lseg->pls_layout;

	pnfs_free_lseg(lseg);
	pnfs_put_layout_hdr(lo);
}

static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
{
	INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
	schedule_work(&lseg->pls_work);
}

void
pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
{
	if (!lseg)
		return;

	assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);

	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
		atomic_read(&lseg->pls_refcount),
		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
	if (atomic_dec_and_test(&lseg->pls_refcount)) {
		struct pnfs_layout_hdr *lo = lseg->pls_layout;
		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
			return;
		pnfs_get_layout_hdr(lo);
		pnfs_layout_remove_lseg(lo, lseg);
		pnfs_free_lseg_async(lseg);
	}
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);

static u64
end_offset(u64 start, u64 len)
{
	u64 end;

	end = start + len;
	return end >= start ? end : NFS4_MAX_UINT64;
}
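
/*
 * Worked example (added note, not in the original file): end_offset()
 * clamps on u64 overflow. For start=0xfffffffffffffff0 and len=0x100,
 * start + len wraps to 0xf0, which is < start, so NFS4_MAX_UINT64 is
 * returned and the range is treated as extending to end of file.
 */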

/*
 * is l2 fully contained in l1?
 *   start1                             end1
 *   [----------------------------------)
 *           start2           end2
 *           [----------------)
 */
static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
		 const struct pnfs_layout_range *l2)
{
	u64 start1 = l1->offset;
	u64 end1 = end_offset(start1, l1->length);
	u64 start2 = l2->offset;
	u64 end2 = end_offset(start2, l2->length);

	return (start1 <= start2) && (end1 >= end2);
}

/*
 * do l1 and l2 intersect?
 *   start1                             end1
 *   [----------------------------------)
 *                              start2           end2
 *                              [----------------)
 */
static bool
pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
		    const struct pnfs_layout_range *l2)
{
	u64 start1 = l1->offset;
	u64 end1 = end_offset(start1, l1->length);
	u64 start2 = l2->offset;
	u64 end2 = end_offset(start2, l2->length);

	return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
	       (end2 == NFS4_MAX_UINT64 || end2 > start1);
}

static bool
should_free_lseg(const struct pnfs_layout_range *lseg_range,
		 const struct pnfs_layout_range *recall_range)
{
	return (recall_range->iomode == IOMODE_ANY ||
		lseg_range->iomode == recall_range->iomode) &&
	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
}

static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
		struct list_head *tmp_list)
{
	if (!atomic_dec_and_test(&lseg->pls_refcount))
		return false;
	pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
	list_add(&lseg->pls_list, tmp_list);
	return true;
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
			     struct list_head *tmp_list)
{
	int rv = 0;

	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
		/* Remove the reference keeping the lseg in the
		 * list.  It will now be removed when all
		 * outstanding io is finished.
		 */
		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
			atomic_read(&lseg->pls_refcount));
		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
			rv = 1;
	}
	return rv;
}

/* Returns the number of matching invalid lsegs remaining in the list
 * after the call.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
			    struct list_head *tmp_list,
			    struct pnfs_layout_range *recall_range)
{
	struct pnfs_layout_segment *lseg, *next;
	int invalid = 0, removed = 0;

	dprintk("%s:Begin lo %p\n", __func__, lo);

	if (list_empty(&lo->plh_segs))
		return 0;
	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
		if (!recall_range ||
		    should_free_lseg(&lseg->pls_range, recall_range)) {
			dprintk("%s: freeing lseg %p iomode %d "
				"offset %llu length %llu\n", __func__,
				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
				lseg->pls_range.length);
			invalid++;
			removed += mark_lseg_invalid(lseg, tmp_list);
		}
	dprintk("%s:Return %i\n", __func__, invalid - removed);
	return invalid - removed;
}

/* note free_me must contain lsegs from a single layout_hdr */
void
pnfs_free_lseg_list(struct list_head *free_me)
{
	struct pnfs_layout_segment *lseg, *tmp;

	if (list_empty(free_me))
		return;

	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
		list_del(&lseg->pls_list);
		pnfs_free_lseg(lseg);
	}
}

void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
	struct pnfs_layout_hdr *lo;
	LIST_HEAD(tmp_list);

	spin_lock(&nfsi->vfs_inode.i_lock);
	lo = nfsi->layout;
	if (lo) {
		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
		pnfs_get_layout_hdr(lo);
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
		pnfs_clear_retry_layoutget(lo);
		spin_unlock(&nfsi->vfs_inode.i_lock);
		pnfs_free_lseg_list(&tmp_list);
		pnfs_put_layout_hdr(lo);
	} else
		spin_unlock(&nfsi->vfs_inode.i_lock);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);

static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
		struct list_head *layout_list)
{
	struct pnfs_layout_hdr *lo;
	bool ret = false;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
		pnfs_get_layout_hdr(lo);
		list_add(&lo->plh_bulk_destroy, layout_list);
		ret = true;
	}
	spin_unlock(&inode->i_lock);
	return ret;
}

/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
		struct nfs_server *server,
		struct list_head *layout_list)
{
	struct pnfs_layout_hdr *lo, *next;
	struct inode *inode;

	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
		inode = igrab(lo->plh_inode);
		if (inode == NULL)
			continue;
		list_del_init(&lo->plh_layouts);
		if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
			continue;
		rcu_read_unlock();
		spin_unlock(&clp->cl_lock);
		iput(inode);
		spin_lock(&clp->cl_lock);
		rcu_read_lock();
		return -EAGAIN;
	}
	return 0;
}

static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
		bool is_bulk_recall)
{
	struct pnfs_layout_hdr *lo;
	struct inode *inode;
	struct pnfs_layout_range range = {
		.iomode = IOMODE_ANY,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	LIST_HEAD(lseg_list);
	int ret = 0;

	while (!list_empty(layout_list)) {
		lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
				plh_bulk_destroy);
		dprintk("%s freeing layout for inode %lu\n", __func__,
			lo->plh_inode->i_ino);
		inode = lo->plh_inode;

		pnfs_layoutcommit_inode(inode, false);

		spin_lock(&inode->i_lock);
		list_del_init(&lo->plh_bulk_destroy);
		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
		if (is_bulk_recall)
			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
		if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
			ret = -EAGAIN;
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg_list(&lseg_list);
		pnfs_put_layout_hdr(lo);
		iput(inode);
	}
	return ret;
}

int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
		struct nfs_fsid *fsid,
		bool is_recall)
{
	struct nfs_server *server;
	LIST_HEAD(layout_list);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
restart:
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
			continue;
		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
				server,
				&layout_list) != 0)
			goto restart;
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	if (list_empty(&layout_list))
		return 0;
	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
		bool is_recall)
{
	struct nfs_server *server;
	LIST_HEAD(layout_list);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
restart:
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
					server,
					&layout_list) != 0)
			goto restart;
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	if (list_empty(&layout_list))
		return 0;
	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
	nfs4_deviceid_mark_client_invalid(clp);
	nfs4_deviceid_purge_client(clp);

	pnfs_destroy_layouts_byclid(clp, false);
}

/*
 * Compare 2 layout stateid sequence ids, to see which is newer,
 * taking into account wraparound issues.
 */
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
	return (s32)(s1 - s2) > 0;
}
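
/*
 * Worked example (added note, not in the original file): the modular
 * comparison handles seqid wraparound. For s1=1 and s2=0xffffffff,
 * (s32)(s1 - s2) == (s32)0x00000002 == 2 > 0, so seqid 1 is treated
 * as newer than 0xffffffff even though it is numerically smaller.
 */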

/* update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
			bool update_barrier)
{
	u32 oldseq, newseq, new_barrier;
	int empty = list_empty(&lo->plh_segs);

	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
	newseq = be32_to_cpu(new->seqid);
	if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
		nfs4_stateid_copy(&lo->plh_stateid, new);
		if (update_barrier) {
			new_barrier = be32_to_cpu(new->seqid);
		} else {
			/* Because of wraparound, we want to keep the barrier
			 * "close" to the current seqids.
			 */
			new_barrier = newseq - atomic_read(&lo->plh_outstanding);
		}
		if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
			lo->plh_barrier = new_barrier;
	}
}
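
/*
 * Worked example (added note, not in the original file): if a reply
 * carries newseq=12 while plh_outstanding=2, the barrier becomes
 * 12 - 2 = 10, so pnfs_layout_stateid_blocked() below reports seqids
 * 10 and older as blocked while the remaining in-flight replies
 * (seqids 11 and 12) still pass the barrier check.
 */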

static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
		const nfs4_stateid *stateid)
{
	u32 seqid = be32_to_cpu(stateid->seqid);

	return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}

/* Return true if new LAYOUTGETs are currently blocked for this layout */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
	return lo->plh_block_lgets ||
		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}

int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
			      struct pnfs_layout_range *range,
			      struct nfs4_state *open_state)
{
	int status = 0;

	dprintk("--> %s\n", __func__);
	spin_lock(&lo->plh_inode->i_lock);
	if (pnfs_layoutgets_blocked(lo)) {
		status = -EAGAIN;
	} else if (!nfs4_valid_open_stateid(open_state)) {
		status = -EBADF;
	} else if (list_empty(&lo->plh_segs) ||
		   test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
		int seq;

		do {
			seq = read_seqbegin(&open_state->seqlock);
			nfs4_stateid_copy(dst, &open_state->stateid);
		} while (read_seqretry(&open_state->seqlock, seq));
	} else
		nfs4_stateid_copy(dst, &lo->plh_stateid);
	spin_unlock(&lo->plh_inode->i_lock);
	dprintk("<-- %s\n", __func__);
	return status;
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
	   struct nfs_open_context *ctx,
	   struct pnfs_layout_range *range,
	   gfp_t gfp_flags)
{
	struct inode *ino = lo->plh_inode;
	struct nfs_server *server = NFS_SERVER(ino);
	struct nfs4_layoutget *lgp;
	struct pnfs_layout_segment *lseg;
	loff_t i_size;

	dprintk("--> %s\n", __func__);

	lgp = kzalloc(sizeof(*lgp), gfp_flags);
	if (lgp == NULL)
		return NULL;

	i_size = i_size_read(ino);

	lgp->args.minlength = PAGE_CACHE_SIZE;
	if (lgp->args.minlength > range->length)
		lgp->args.minlength = range->length;
	if (range->iomode == IOMODE_READ) {
		if (range->offset >= i_size)
			lgp->args.minlength = 0;
		else if (i_size - range->offset < lgp->args.minlength)
			lgp->args.minlength = i_size - range->offset;
	}
	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
	lgp->args.range = *range;
	lgp->args.type = server->pnfs_curr_ld->id;
	lgp->args.inode = ino;
	lgp->args.ctx = get_nfs_open_context(ctx);
	lgp->gfp_flags = gfp_flags;
	lgp->cred = lo->plh_lc_cred;

	/* Synchronously retrieve layout information from server and
	 * store in lseg.
	 */
	lseg = nfs4_proc_layoutget(lgp, gfp_flags);
	if (IS_ERR(lseg)) {
		switch (PTR_ERR(lseg)) {
		case -ENOMEM:
		case -ERESTARTSYS:
			break;
		default:
			/* remember that LAYOUTGET failed and suspend trying */
			pnfs_layout_io_set_failed(lo, range->iomode);
		}
		return NULL;
	} else
		pnfs_layout_clear_fail_bit(lo,
				pnfs_iomode_to_fail_bit(range->iomode));

	return lseg;
}
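
/*
 * Worked example (added note, not in the original file) of the
 * minlength clamping above, assuming a 4096-byte PAGE_CACHE_SIZE: a
 * READ request at range->offset=8192 against a 10000-byte file starts
 * from minlength=4096, and since i_size - offset = 1808 < 4096 it is
 * clamped to 1808; a READ entirely beyond EOF gets minlength=0.
 */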

static void pnfs_clear_layoutcommit(struct inode *inode,
		struct list_head *head)
{
	struct nfs_inode *nfsi = NFS_I(inode);
	struct pnfs_layout_segment *lseg, *tmp;

	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		return;
	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
			continue;
		pnfs_lseg_dec_and_remove_zero(lseg, head);
	}
}

void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
{
	clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
	smp_mb__after_atomic();
	wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}

static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
		       enum pnfs_iomode iomode, bool sync)
{
	struct inode *ino = lo->plh_inode;
	struct nfs4_layoutreturn *lrp;
	int status = 0;

	lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
	if (unlikely(lrp == NULL)) {
		status = -ENOMEM;
		spin_lock(&ino->i_lock);
		pnfs_clear_layoutreturn_waitbit(lo);
		spin_unlock(&ino->i_lock);
		pnfs_put_layout_hdr(lo);
		goto out;
	}

	lrp->args.stateid = stateid;
	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
	lrp->args.inode = ino;
	lrp->args.range.iomode = iomode;
	lrp->args.range.offset = 0;
	lrp->args.range.length = NFS4_MAX_UINT64;
	lrp->args.layout = lo;
	lrp->clp = NFS_SERVER(ino)->nfs_client;
	lrp->cred = lo->plh_lc_cred;

	status = nfs4_proc_layoutreturn(lrp, sync);
out:
	dprintk("<-- %s status: %d\n", __func__, status);
	return status;
}

/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
	struct pnfs_layout_hdr *lo = NULL;
	struct nfs_inode *nfsi = NFS_I(ino);
	LIST_HEAD(tmp_list);
	nfs4_stateid stateid;
	int status = 0, empty;
	bool send;

	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (!lo) {
		spin_unlock(&ino->i_lock);
		dprintk("NFS: %s no layout to return\n", __func__);
		goto out;
	}
	stateid = nfsi->layout->plh_stateid;
	/* Reference matched in nfs4_layoutreturn_release */
	pnfs_get_layout_hdr(lo);
	empty = list_empty(&lo->plh_segs);
	pnfs_clear_layoutcommit(ino, &tmp_list);
	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);

	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
		struct pnfs_layout_range range = {
			.iomode		= IOMODE_ANY,
			.offset		= 0,
			.length		= NFS4_MAX_UINT64,
		};
		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
	}

	/* Don't send a LAYOUTRETURN if list was initially empty */
	if (empty) {
		spin_unlock(&ino->i_lock);
		dprintk("NFS: %s no layout segments to return\n", __func__);
		goto out_put_layout_hdr;
	}

	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
	send = pnfs_prepare_layoutreturn(lo);
	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&tmp_list);
	if (send)
		status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
out_put_layout_hdr:
	pnfs_put_layout_hdr(lo);
out:
	dprintk("<-- %s status: %d\n", __func__, status);
	return status;
}
EXPORT_SYMBOL_GPL(_pnfs_return_layout);

int
pnfs_commit_and_return_layout(struct inode *inode)
{
	struct pnfs_layout_hdr *lo;
	int ret;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo == NULL) {
		spin_unlock(&inode->i_lock);
		return 0;
	}
	pnfs_get_layout_hdr(lo);
	/* Block new layoutgets and read/write to ds */
	lo->plh_block_lgets++;
	spin_unlock(&inode->i_lock);
	filemap_fdatawait(inode->i_mapping);
	ret = pnfs_layoutcommit_inode(inode, true);
	if (ret == 0)
		ret = _pnfs_return_layout(inode);
	spin_lock(&inode->i_lock);
	lo->plh_block_lgets--;
	spin_unlock(&inode->i_lock);
	pnfs_put_layout_hdr(lo);
	return ret;
}

bool pnfs_roc(struct inode *ino)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct nfs_open_context *ctx;
	struct nfs4_state *state;
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg, *tmp;
	nfs4_stateid stateid;
	LIST_HEAD(tmp_list);
	bool found = false, layoutreturn = false, roc = false;

	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
		goto out_noroc;

	/* no roc if we hold a delegation */
	if (nfs4_check_delegation(ino, FMODE_READ))
		goto out_noroc;

	list_for_each_entry(ctx, &nfsi->open_files, list) {
		state = ctx->state;
		/* Don't return layout if there is open file state */
		if (state != NULL && state->state != 0)
			goto out_noroc;
	}

	stateid = lo->plh_stateid;
	/* always send layoutreturn if being marked so */
	if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
				   &lo->plh_flags))
		layoutreturn = pnfs_prepare_layoutreturn(lo);

	pnfs_clear_retry_layoutget(lo);
	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
		/* If we are sending layoutreturn, invalidate all valid lsegs */
		if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
			mark_lseg_invalid(lseg, &tmp_list);
			found = true;
		}
	/* ROC in two conditions:
	 * 1. there are ROC lsegs
	 * 2. we don't send layoutreturn
	 */
	if (found && !layoutreturn) {
		/* lo ref dropped in pnfs_roc_release() */
		pnfs_get_layout_hdr(lo);
		roc = true;
	}

out_noroc:
	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&tmp_list);
	pnfs_layoutcommit_inode(ino, true);
	if (layoutreturn)
		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
	return roc;
}

void pnfs_roc_release(struct inode *ino)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	pnfs_clear_layoutreturn_waitbit(lo);
	if (atomic_dec_and_test(&lo->plh_refcount)) {
		pnfs_detach_layout_hdr(lo);
		spin_unlock(&ino->i_lock);
		pnfs_free_layout_hdr(lo);
	} else
		spin_unlock(&ino->i_lock);
}

void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&ino->i_lock);
	lo = NFS_I(ino)->layout;
	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
		lo->plh_barrier = barrier;
	spin_unlock(&ino->i_lock);
	trace_nfs4_layoutreturn_on_close(ino, 0);
}

void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_hdr *lo;
	u32 current_seqid;

	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	current_seqid = be32_to_cpu(lo->plh_stateid.seqid);

	/* Since close does not return a layout stateid for use as
	 * a barrier, we choose the worst-case barrier.
	 */
	*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
	spin_unlock(&ino->i_lock);
}

bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_hdr *lo;
	bool sleep = false;

	/* We might not have grabbed the lo reference, so we need to
	 * check under the i_lock. */
	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
		sleep = true;
	spin_unlock(&ino->i_lock);

	if (sleep)
		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);

	return sleep;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
 */
static s64
pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
	   const struct pnfs_layout_range *l2)
{
	s64 d;

	/* high offset > low offset */
	d = l1->offset - l2->offset;
	if (d)
		return d;

	/* short length > long length */
	d = l2->length - l1->length;
	if (d)
		return d;

	/* read > read/write */
	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
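
/*
 * Worked example (added note, not in the original file): for two
 * whole-file ranges that differ only in iomode, the offset and length
 * terms tie at 0 and the final term decides: an IOMODE_RW range
 * compares 0 - 1 = -1 against an IOMODE_READ range, so the RW lseg
 * sorts ahead of the READ lseg and is found first by lookups.
 */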

static bool
pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
		const struct pnfs_layout_range *l2)
{
	return pnfs_lseg_range_cmp(l1, l2) > 0;
}

static bool
pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
		struct pnfs_layout_segment *old)
{
	return false;
}

void
pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
		   struct pnfs_layout_segment *lseg,
		   bool (*is_after)(const struct pnfs_layout_range *,
			   const struct pnfs_layout_range *),
		   bool (*do_merge)(struct pnfs_layout_segment *,
			   struct pnfs_layout_segment *),
		   struct list_head *free_me)
{
	struct pnfs_layout_segment *lp, *tmp;

	dprintk("%s:Begin\n", __func__);

	list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
		if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
			continue;
		if (do_merge(lseg, lp)) {
			mark_lseg_invalid(lp, free_me);
			continue;
		}
		if (is_after(&lseg->pls_range, &lp->pls_range))
			continue;
		list_add_tail(&lseg->pls_list, &lp->pls_list);
		dprintk("%s: inserted lseg %p "
			"iomode %d offset %llu length %llu before "
			"lp %p iomode %d offset %llu length %llu\n",
			__func__, lseg, lseg->pls_range.iomode,
			lseg->pls_range.offset, lseg->pls_range.length,
			lp, lp->pls_range.iomode, lp->pls_range.offset,
			lp->pls_range.length);
		goto out;
	}
	list_add_tail(&lseg->pls_list, &lo->plh_segs);
	dprintk("%s: inserted lseg %p "
		"iomode %d offset %llu length %llu at tail\n",
		__func__, lseg, lseg->pls_range.iomode,
		lseg->pls_range.offset, lseg->pls_range.length);
out:
	pnfs_get_layout_hdr(lo);

	dprintk("%s:Return\n", __func__);
}
EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);

static void
pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
		   struct pnfs_layout_segment *lseg,
		   struct list_head *free_me)
{
	struct inode *inode = lo->plh_inode;
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;

	if (ld->add_lseg != NULL)
		ld->add_lseg(lo, lseg, free_me);
	else
		pnfs_generic_layout_insert_lseg(lo, lseg,
				pnfs_lseg_range_is_after,
				pnfs_lseg_no_merge,
				free_me);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
		      struct nfs_open_context *ctx,
		      gfp_t gfp_flags)
{
	struct pnfs_layout_hdr *lo;

	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
	if (!lo)
		return NULL;
	atomic_set(&lo->plh_refcount, 1);
	INIT_LIST_HEAD(&lo->plh_layouts);
	INIT_LIST_HEAD(&lo->plh_segs);
	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
	lo->plh_inode = ino;
	lo->plh_lc_cred = get_rpccred(ctx->cred);
	return lo;
}

static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
		       struct nfs_open_context *ctx,
		       gfp_t gfp_flags)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_hdr *new = NULL;

	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

	if (nfsi->layout != NULL)
		goto out_existing;
	spin_unlock(&ino->i_lock);
	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
	spin_lock(&ino->i_lock);

	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
		nfsi->layout = new;
		return new;
	} else if (new != NULL)
		pnfs_free_layout_hdr(new);
out_existing:
	pnfs_get_layout_hdr(nfsi->layout);
	return nfsi->layout;
}

/*
 * iomode matching rules:
 * iomode	lseg	match
 * -----	-----	-----
 * ANY		READ	true
 * ANY		RW	true
 * RW		READ	false
 * RW		RW	true
 * READ		READ	true
 * READ		RW	true
 */
static bool
pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
		 const struct pnfs_layout_range *range)
{
	struct pnfs_layout_range range1;

	if ((range->iomode == IOMODE_RW &&
	     ls_range->iomode != IOMODE_RW) ||
	    !pnfs_lseg_range_intersecting(ls_range, range))
		return false;

	/* range1 covers only the first byte in the range */
	range1 = *range;
	range1.length = 1;
	return pnfs_lseg_range_contained(ls_range, &range1);
}
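
/*
 * Worked example (added note, not in the original file): a cached
 * IOMODE_RW lseg covering [0, 16384) matches a request for
 * {IOMODE_READ, offset=4096, length=NFS4_MAX_UINT64}: the iomodes are
 * compatible per the table above, the ranges intersect, and the
 * single-byte range [4096, 4097) is contained in the lseg. Only the
 * first byte of the request needs to be covered for a match.
 */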

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_range *range)
{
	struct pnfs_layout_segment *lseg, *ret = NULL;

	dprintk("%s:Begin\n", __func__);

	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
		    !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
		    pnfs_lseg_range_match(&lseg->pls_range, range)) {
			ret = pnfs_get_lseg(lseg);
			break;
		}
	}

	dprintk("%s:Return lseg %p ref %d\n",
		__func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
	return ret;
}

/*
 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
 * to the MDS or over pNFS
 *
 * The nfs_inode read_io and write_io fields are cumulative counters reset
 * when there are no layout segments. Note that in pnfs_update_layout iomode
 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
 * WRITE request.
 *
 * A return of true means use MDS I/O.
 *
 * From rfc 5661:
 * If a file's size is smaller than the file size threshold, data accesses
 * SHOULD be sent to the metadata server.  If an I/O request has a length that
 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
 * server.  If both file size and I/O size are provided, the client SHOULD
 * reach or exceed both thresholds before sending its read or write
 * requests to the data server.
 */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
				     struct inode *ino, int iomode)
{
	struct nfs4_threshold *t = ctx->mdsthreshold;
	struct nfs_inode *nfsi = NFS_I(ino);
	loff_t fsize = i_size_read(ino);
	bool size = false, size_set = false, io = false, io_set = false, ret = false;

	if (t == NULL)
		return ret;

	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

	switch (iomode) {
	case IOMODE_READ:
		if (t->bm & THRESHOLD_RD) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->rd_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_RD_IO) {
			dprintk("%s nfsi->read_io %llu\n", __func__,
				nfsi->read_io);
			io_set = true;
			if (nfsi->read_io < t->rd_io_sz)
				io = true;
		}
		break;
	case IOMODE_RW:
		if (t->bm & THRESHOLD_WR) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->wr_sz)
				size = true;
		}
		if (t->bm & THRESHOLD_WR_IO) {
			dprintk("%s nfsi->write_io %llu\n", __func__,
				nfsi->write_io);
			io_set = true;
			if (nfsi->write_io < t->wr_io_sz)
				io = true;
		}
		break;
	}
	if (size_set && io_set) {
		if (size && io)
			ret = true;
	} else if (size || io)
		ret = true;

	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
	return ret;
}
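
/*
 * Worked example (added note, not in the original file): suppose the
 * server's mdsthreshold hint for reads sets rd_sz=2097152 (2MB) and
 * rd_io_sz=32768. For a READ of a 1MB file with cumulative
 * nfsi->read_io=4096, both size (1MB < 2MB) and io (4096 < 32768) are
 * true, so the function returns true and the I/O goes to the MDS. If
 * only one of the two thresholds were set in t->bm, satisfying that
 * one alone would suffice.
 */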
1462
1463/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
1464static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1465{
1466        if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
1467                return 1;
1468        return nfs_wait_bit_killable(key);
1469}
1470
1471static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1472{
1473        if (!pnfs_should_retry_layoutget(lo))
1474                return false;
1475        /*
1476         * send layoutcommit as it can hold up layoutreturn due to lseg
1477         * reference
1478         */
1479        pnfs_layoutcommit_inode(lo->plh_inode, false);
1480        return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1481                                   pnfs_layoutget_retry_bit_wait,
1482                                   TASK_UNINTERRUPTIBLE);
1483}
1484
1485static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1486{
1487        unsigned long *bitlock = &lo->plh_flags;
1488
1489        clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1490        smp_mb__after_atomic();
1491        wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1492}
1493
1494/*
1495 * Layout segment is retreived from the server if not cached.
1496 * The appropriate layout segment is referenced and returned to the caller.
1497 */
1498struct pnfs_layout_segment *
1499pnfs_update_layout(struct inode *ino,
1500                   struct nfs_open_context *ctx,
1501                   loff_t pos,
1502                   u64 count,
1503                   enum pnfs_iomode iomode,
1504                   gfp_t gfp_flags)
1505{
1506        struct pnfs_layout_range arg = {
1507                .iomode = iomode,
1508                .offset = pos,
1509                .length = count,
1510        };
1511        unsigned pg_offset;
1512        struct nfs_server *server = NFS_SERVER(ino);
1513        struct nfs_client *clp = server->nfs_client;
1514        struct pnfs_layout_hdr *lo;
1515        struct pnfs_layout_segment *lseg = NULL;
1516        bool first;
1517
1518        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
1519                goto out;
1520
1521        if (iomode == IOMODE_READ && i_size_read(ino) == 0)
1522                goto out;
1523
1524        if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1525                goto out;
1526
1527lookup_again:
1528        first = false;
1529        spin_lock(&ino->i_lock);
1530        lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1531        if (lo == NULL) {
1532                spin_unlock(&ino->i_lock);
1533                goto out;
1534        }
1535
1536        /* Do we even need to bother with this? */
1537        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1538                dprintk("%s matches recall, use MDS\n", __func__);
1539                goto out_unlock;
1540        }
1541
1542        /* If LAYOUTGET already failed once, don't try again */
1543        if (pnfs_layout_io_test_failed(lo, iomode) &&
1544            !pnfs_should_retry_layoutget(lo))
1545                goto out_unlock;
1546
1547        first = list_empty(&lo->plh_segs);
1548        if (first) {
1549                /* The first layoutget for the file. Need to serialize per
1550                 * RFC 5661 Errata 3208.
1551                 */
1552                if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1553                                     &lo->plh_flags)) {
1554                        spin_unlock(&ino->i_lock);
1555                        wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1556                                    TASK_UNINTERRUPTIBLE);
1557                        pnfs_put_layout_hdr(lo);
1558                        goto lookup_again;
1559                }
1560        } else {
1561                /* Check to see if the layout for the given range
1562                 * already exists
1563                 */
1564                lseg = pnfs_find_lseg(lo, &arg);
1565                if (lseg)
1566                        goto out_unlock;
1567        }
1568
1569        /*
1570         * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1571         * for LAYOUTRETURN even if first is true.
1572         */
1573        if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1574                spin_unlock(&ino->i_lock);
1575                dprintk("%s wait for layoutreturn\n", __func__);
1576                if (pnfs_prepare_to_retry_layoutget(lo)) {
1577                        if (first)
1578                                pnfs_clear_first_layoutget(lo);
1579                        pnfs_put_layout_hdr(lo);
1580                        dprintk("%s retrying\n", __func__);
1581                        goto lookup_again;
1582                }
1583                goto out_put_layout_hdr;
1584        }
1585
1586        if (pnfs_layoutgets_blocked(lo))
1587                goto out_unlock;
1588        atomic_inc(&lo->plh_outstanding);
1589        spin_unlock(&ino->i_lock);
1590
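            /*
             * The unlocked list_empty() test below is only an optimization;
             * it is rechecked under cl_lock before the layout is added.
             */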
1591        if (list_empty(&lo->plh_layouts)) {
1592                /* The lo must be on the clp list if there is any
1593                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1594                 */
1595                spin_lock(&clp->cl_lock);
1596                if (list_empty(&lo->plh_layouts))
1597                        list_add_tail(&lo->plh_layouts, &server->layouts);
1598                spin_unlock(&clp->cl_lock);
1599        }
1600
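            /*
             * Round the LAYOUTGET range out to page boundaries: align the
             * start downwards and the length upwards, except when the
             * length is NFS4_MAX_UINT64 ("to EOF").
             */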
1601        pg_offset = arg.offset & ~PAGE_CACHE_MASK;
1602        if (pg_offset) {
1603                arg.offset -= pg_offset;
1604                arg.length += pg_offset;
1605        }
1606        if (arg.length != NFS4_MAX_UINT64)
1607                arg.length = PAGE_CACHE_ALIGN(arg.length);
1608
1609        lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1610        pnfs_clear_retry_layoutget(lo);
1611        atomic_dec(&lo->plh_outstanding);
1612out_put_layout_hdr:
1613        if (first)
1614                pnfs_clear_first_layoutget(lo);
1615        pnfs_put_layout_hdr(lo);
1616out:
1617        dprintk("%s: inode %s/%llu pNFS layout segment %s for "
1618                        "(%s, offset: %llu, length: %llu)\n",
1619                        __func__, ino->i_sb->s_id,
1620                        (unsigned long long)NFS_FILEID(ino),
1621                        lseg == NULL ? "not found" : "found",
1622                        iomode == IOMODE_RW ? "read/write" : "read-only",
1623                        (unsigned long long)pos,
1624                        (unsigned long long)count);
1625        return lseg;
1626out_unlock:
1627        spin_unlock(&ino->i_lock);
1628        goto out_put_layout_hdr;
1629}
1630EXPORT_SYMBOL_GPL(pnfs_update_layout);
1631
1632static bool
1633pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
1634{
1635        switch (range->iomode) {
1636        case IOMODE_READ:
1637        case IOMODE_RW:
1638                break;
1639        default:
1640                return false;
1641        }
1642        if (range->offset == NFS4_MAX_UINT64)
1643                return false;
1644        if (range->length == 0)
1645                return false;
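            /*
             * A length of NFS4_MAX_UINT64 means "to EOF"; any other length
             * must not make offset + length wrap past NFS4_MAX_UINT64.
             */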
1646        if (range->length != NFS4_MAX_UINT64 &&
1647            range->length > NFS4_MAX_UINT64 - range->offset)
1648                return false;
1649        return true;
1650}
1651
1652struct pnfs_layout_segment *
1653pnfs_layout_process(struct nfs4_layoutget *lgp)
1654{
1655        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
1656        struct nfs4_layoutget_res *res = &lgp->res;
1657        struct pnfs_layout_segment *lseg;
1658        struct inode *ino = lo->plh_inode;
1659        LIST_HEAD(free_me);
1660        int status = -EINVAL;
1661
1662        if (!pnfs_sanity_check_layout_range(&res->range))
1663                goto out;
1664
1665        /* Inject layout blob into I/O device driver */
1666        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
1667        if (!lseg || IS_ERR(lseg)) {
1668                if (!lseg)
1669                        status = -ENOMEM;
1670                else
1671                        status = PTR_ERR(lseg);
1672                dprintk("%s: Could not allocate layout: error %d\n",
1673                       __func__, status);
1674                goto out;
1675        }
1676
1677        init_lseg(lo, lseg);
1678        lseg->pls_range = res->range;
1679
1680        spin_lock(&ino->i_lock);
1681        if (pnfs_layoutgets_blocked(lo)) {
1682                dprintk("%s forget reply due to state\n", __func__);
1683                goto out_forget_reply;
1684        }
1685
1686        if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1687                /* existing state ID, make sure the sequence number matches. */
1688                if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1689                        dprintk("%s forget reply due to sequence\n", __func__);
1690                        goto out_forget_reply;
1691                }
1692                pnfs_set_layout_stateid(lo, &res->stateid, false);
1693        } else {
1694                /*
1695                 * We got an entirely new state ID.  Mark all segments for the
1696                 * inode invalid, and don't bother validating the stateid
1697                 * sequence number.
1698                 */
1699                pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
1700
1701                nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1702                lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1703        }
1704
1705        clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1706
1707        pnfs_get_lseg(lseg);
1708        pnfs_layout_insert_lseg(lo, lseg, &free_me);
1709
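            /* The server wants this layout returned on last close (ROC) */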
1710        if (res->return_on_close)
1711                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
1712
1713        spin_unlock(&ino->i_lock);
1714        pnfs_free_lseg_list(&free_me);
1715        return lseg;
1716out:
1717        return ERR_PTR(status);
1718
1719out_forget_reply:
1720        spin_unlock(&ino->i_lock);
1721        lseg->pls_layout = lo;
1722        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
1723        goto out;
1724}
1725
1726static void
1727pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1728                                struct list_head *tmp_list,
1729                                struct pnfs_layout_range *return_range)
1730{
1731        struct pnfs_layout_segment *lseg, *next;
1732
1733        dprintk("%s:Begin lo %p\n", __func__, lo);
1734
1735        if (list_empty(&lo->plh_segs))
1736                return;
1737
1738        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1739                if (should_free_lseg(&lseg->pls_range, return_range)) {
1740                        dprintk("%s: marking lseg %p iomode %d "
1741                                "offset %llu length %llu\n", __func__,
1742                                lseg, lseg->pls_range.iomode,
1743                                lseg->pls_range.offset,
1744                                lseg->pls_range.length);
1745                        set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1746                        mark_lseg_invalid(lseg, tmp_list);
1747                        set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1748                                        &lo->plh_flags);
1749                }
1750}
1751
1752void pnfs_error_mark_layout_for_return(struct inode *inode,
1753                                       struct pnfs_layout_segment *lseg)
1754{
1755        struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1756        int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
1757        struct pnfs_layout_range range = {
1758                .iomode = lseg->pls_range.iomode,
1759                .offset = 0,
1760                .length = NFS4_MAX_UINT64,
1761        };
1762        LIST_HEAD(free_me);
1763
1764        spin_lock(&inode->i_lock);
1765        /* set failure bit so that pnfs path will be retried later */
1766        pnfs_layout_set_fail_bit(lo, iomode);
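            /*
             * Widen the iomode to be returned: the first failure records
             * its own iomode, and a second failure with a different iomode
             * escalates to IOMODE_ANY.
             */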
1767        if (lo->plh_return_iomode == 0)
1768                lo->plh_return_iomode = range.iomode;
1769        else if (lo->plh_return_iomode != range.iomode)
1770                lo->plh_return_iomode = IOMODE_ANY;
1771        /*
1772         * mark all matching lsegs so that we are sure to have no live
1773         * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1774         * for how it works.
1775         */
1776        pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
1777        spin_unlock(&inode->i_lock);
1778        pnfs_free_lseg_list(&free_me);
1779}
1780EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1781
1782void
1783pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1784{
1785        u64 rd_size = req->wb_bytes;
1786
1787        if (pgio->pg_lseg == NULL) {
1788                if (pgio->pg_dreq == NULL)
1789                        rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1790                else
1791                        rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1792
1793                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1794                                                   req->wb_context,
1795                                                   req_offset(req),
1796                                                   rd_size,
1797                                                   IOMODE_READ,
1798                                                   GFP_KERNEL);
1799        }
1800        /* If no lseg, fall back to read through mds */
1801        if (pgio->pg_lseg == NULL)
1802                nfs_pageio_reset_read_mds(pgio);
1803
1804}
1805EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
1806
1807void
1808pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1809                           struct nfs_page *req, u64 wb_size)
1810{
1811        if (pgio->pg_lseg == NULL)
1812                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1813                                                   req->wb_context,
1814                                                   req_offset(req),
1815                                                   wb_size,
1816                                                   IOMODE_RW,
1817                                                   GFP_NOFS);
1818        /* If no lseg, fall back to write through mds */
1819        if (pgio->pg_lseg == NULL)
1820                nfs_pageio_reset_write_mds(pgio);
1821}
1822EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1823
1824void
1825pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1826{
1827        if (desc->pg_lseg) {
1828                pnfs_put_lseg(desc->pg_lseg);
1829                desc->pg_lseg = NULL;
1830        }
1831}
1832EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1833
1834/*
1835 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1836 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1837 */
1838size_t
1839pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
1840                     struct nfs_page *prev, struct nfs_page *req)
1841{
1842        unsigned int size;
1843        u64 seg_end, req_start, seg_left;
1844
1845        size = nfs_generic_pg_test(pgio, prev, req);
1846        if (!size)
1847                return 0;
1848
1849        /*
1850         * 'size' contains the number of bytes left in the current page (up
1851         * to the original size asked for in @req->wb_bytes).
1852         *
1853         * Calculate how many bytes are left in the layout segment
1854         * and if there are fewer bytes than 'size', return that instead.
1855         *
1856         * Note that end_offset() returns the offset of the first byte
1857         * that lies outside the pnfs_layout_range, i.e. seg_end is exclusive.
1858         *
1859         */
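            /*
             * For example, a segment covering [0, 1 MiB) with a request
             * starting at 1 MiB - 512 leaves seg_left = 512, so at most
             * 512 bytes are coalesced even if the page had more room.
             */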
1860        if (pgio->pg_lseg) {
1861                seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1862                                     pgio->pg_lseg->pls_range.length);
1863                req_start = req_offset(req);
1864                WARN_ON_ONCE(req_start >= seg_end);
1865                /* start of request is past the last byte of this segment */
1866                if (req_start >= seg_end) {
1867                        /* reference the new lseg */
1868                        if (pgio->pg_ops->pg_cleanup)
1869                                pgio->pg_ops->pg_cleanup(pgio);
1870                        if (pgio->pg_ops->pg_init)
1871                                pgio->pg_ops->pg_init(pgio, req);
1872                        return 0;
1873                }
1874
1875                /* adjust 'size' iff there are fewer bytes left in the
1876                 * segment than what nfs_generic_pg_test returned */
1877                seg_left = seg_end - req_start;
1878                if (seg_left < size)
1879                        size = (unsigned int)seg_left;
1880        }
1881
1882        return size;
1883}
1884EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1885
1886int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
1887{
1888        struct nfs_pageio_descriptor pgio;
1889
1890        /* Resend all requests through the MDS */
1891        nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
1892                              hdr->completion_ops);
1893        set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
1894        return nfs_pageio_resend(&pgio, hdr);
1895}
1896EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
1897
1898static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
1899{
1900
1901        dprintk("pnfs write error = %d\n", hdr->pnfs_error);
1902        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1903            PNFS_LAYOUTRET_ON_ERROR) {
1904                pnfs_return_layout(hdr->inode);
1905        }
1906        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1907                hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
1908}
1909
1910/*
1911 * Called by non-RPC-based layout drivers
1912 */
1913void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
1914{
1915        trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
1916        if (!hdr->pnfs_error) {
1917                pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
1918                                hdr->mds_offset + hdr->res.count);
1919                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1920        } else
1921                pnfs_ld_handle_write_error(hdr);
1922        hdr->mds_ops->rpc_release(hdr);
1923}
1924EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1925
1926static void
1927pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1928                struct nfs_pgio_header *hdr)
1929{
1930        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1931
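            /*
             * Requeue the pages onto the MDS path; NFS_IOHDR_REDO ensures
             * this happens only once per header.
             */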
1932        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1933                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1934                nfs_pageio_reset_write_mds(desc);
1935                mirror->pg_recoalesce = 1;
1936        }
1937        nfs_pgio_data_destroy(hdr);
1938        hdr->release(hdr);
1939}
1940
1941static enum pnfs_try_status
1942pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
1943                        const struct rpc_call_ops *call_ops,
1944                        struct pnfs_layout_segment *lseg,
1945                        int how)
1946{
1947        struct inode *inode = hdr->inode;
1948        enum pnfs_try_status trypnfs;
1949        struct nfs_server *nfss = NFS_SERVER(inode);
1950
1951        hdr->mds_ops = call_ops;
1952
1953        dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1954                inode->i_ino, hdr->args.count, hdr->args.offset, how);
1955        trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
1956        if (trypnfs != PNFS_NOT_ATTEMPTED)
1957                nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
1958        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
1959        return trypnfs;
1960}
1961
1962static void
1963pnfs_do_write(struct nfs_pageio_descriptor *desc,
1964              struct nfs_pgio_header *hdr, int how)
1965{
1966        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1967        struct pnfs_layout_segment *lseg = desc->pg_lseg;
1968        enum pnfs_try_status trypnfs;
1969
1970        trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1971        if (trypnfs == PNFS_NOT_ATTEMPTED)
1972                pnfs_write_through_mds(desc, hdr);
1973}
1974
1975static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1976{
1977        pnfs_put_lseg(hdr->lseg);
1978        nfs_pgio_header_free(hdr);
1979}
1980
1981int
1982pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1983{
1984        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1985
1986        struct nfs_pgio_header *hdr;
1987        int ret;
1988
1989        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1990        if (!hdr) {
1991                desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1992                return -ENOMEM;
1993        }
1994        nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1995
1996        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1997        ret = nfs_generic_pgio(desc, hdr);
1998        if (!ret)
1999                pnfs_do_write(desc, hdr, desc->pg_ioflags);
2000
2001        return ret;
2002}
2003EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2004
2005int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
2006{
2007        struct nfs_pageio_descriptor pgio;
2008
2009        /* Resend all requests through the MDS */
2010        nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
2011        return nfs_pageio_resend(&pgio, hdr);
2012}
2013EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2014
2015static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
2016{
2017        dprintk("pnfs read error = %d\n", hdr->pnfs_error);
2018        if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
2019            PNFS_LAYOUTRET_ON_ERROR) {
2020                pnfs_return_layout(hdr->inode);
2021        }
2022        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
2023                hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
2024}
2025
2026/*
2027 * Called by non-RPC-based layout drivers
2028 */
2029void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
2030{
2031        trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
2032        if (likely(!hdr->pnfs_error)) {
2033                __nfs4_read_done_cb(hdr);
2034                hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
2035        } else
2036                pnfs_ld_handle_read_error(hdr);
2037        hdr->mds_ops->rpc_release(hdr);
2038}
2039EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
2040
2041static void
2042pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
2043                struct nfs_pgio_header *hdr)
2044{
2045        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2046
2047        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2048                list_splice_tail_init(&hdr->pages, &mirror->pg_list);
2049                nfs_pageio_reset_read_mds(desc);
2050                mirror->pg_recoalesce = 1;
2051        }
2052        nfs_pgio_data_destroy(hdr);
2053        hdr->release(hdr);
2054}
2055
2056/*
2057 * Call the appropriate parallel I/O subsystem read function.
2058 */
2059static enum pnfs_try_status
2060pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
2061                       const struct rpc_call_ops *call_ops,
2062                       struct pnfs_layout_segment *lseg)
2063{
2064        struct inode *inode = hdr->inode;
2065        struct nfs_server *nfss = NFS_SERVER(inode);
2066        enum pnfs_try_status trypnfs;
2067
2068        hdr->mds_ops = call_ops;
2069
2070        dprintk("%s: Reading ino:%lu %u@%llu\n",
2071                __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
2072
2073        trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
2074        if (trypnfs != PNFS_NOT_ATTEMPTED)
2075                nfs_inc_stats(inode, NFSIOS_PNFS_READ);
2076        dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2077        return trypnfs;
2078}
2079
2080/* Resend all requests through pnfs. */
2081int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2082{
2083        struct nfs_pageio_descriptor pgio;
2084
2085        nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2086        return nfs_pageio_resend(&pgio, hdr);
2087}
2088EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2089
2090static void
2091pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
2092{
2093        const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2094        struct pnfs_layout_segment *lseg = desc->pg_lseg;
2095        enum pnfs_try_status trypnfs;
2096        int err = 0;
2097
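            /*
             * Try the pNFS path first; on PNFS_TRY_AGAIN resend through
             * pNFS once, and fall back to the MDS if the driver did not
             * attempt the I/O or the resend failed.
             */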
2098        trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
2099        if (trypnfs == PNFS_TRY_AGAIN)
2100                err = pnfs_read_resend_pnfs(hdr);
2101        if (trypnfs == PNFS_NOT_ATTEMPTED || err)
2102                pnfs_read_through_mds(desc, hdr);
2103}
2104
2105static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
2106{
2107        pnfs_put_lseg(hdr->lseg);
2108        nfs_pgio_header_free(hdr);
2109}
2110
2111int
2112pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
2113{
2114        struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2115
2116        struct nfs_pgio_header *hdr;
2117        int ret;
2118
2119        hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2120        if (!hdr) {
2121                desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
2122                return -ENOMEM;
2123        }
2124        nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
2125        hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2126        ret = nfs_generic_pgio(desc, hdr);
2127        if (!ret)
2128                pnfs_do_read(desc, hdr);
2129        return ret;
2130}
2131EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
2132
2133static void pnfs_clear_layoutcommitting(struct inode *inode)
2134{
2135        unsigned long *bitlock = &NFS_I(inode)->flags;
2136
2137        clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
2138        smp_mb__after_atomic();
2139        wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
2140}
2141
2142/*
2143 * There can be multiple RW segments.
2144 */
2145static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
2146{
2147        struct pnfs_layout_segment *lseg;
2148
2149        list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
2150                if (lseg->pls_range.iomode == IOMODE_RW &&
2151                    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
2152                        list_add(&lseg->pls_lc_list, listp);
2153        }
2154}
2155
2156static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
2157{
2158        struct pnfs_layout_segment *lseg, *tmp;
2159
2160        /* Matched by references in pnfs_set_layoutcommit */
2161        list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
2162                list_del_init(&lseg->pls_lc_list);
2163                pnfs_put_lseg(lseg);
2164        }
2165
2166        pnfs_clear_layoutcommitting(inode);
2167}
2168
2169void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
2170{
2171        pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
2172}
2173EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
2174
2175void
2176pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
2177                loff_t end_pos)
2178{
2179        struct nfs_inode *nfsi = NFS_I(inode);
2180        bool mark_as_dirty = false;
2181
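            /*
             * plh_lwb records the end of the written range (the first byte
             * past the data); LAYOUTCOMMIT later reports lastbytewritten
             * as plh_lwb - 1.
             */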
2182        spin_lock(&inode->i_lock);
2183        if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
2184                nfsi->layout->plh_lwb = end_pos;
2185                mark_as_dirty = true;
2186                dprintk("%s: Set layoutcommit for inode %lu ",
2187                        __func__, inode->i_ino);
2188        } else if (end_pos > nfsi->layout->plh_lwb)
2189                nfsi->layout->plh_lwb = end_pos;
2190        if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
2191                /* references matched in nfs4_layoutcommit_release */
2192                pnfs_get_lseg(lseg);
2193        }
2194        spin_unlock(&inode->i_lock);
2195        dprintk("%s: lseg %p end_pos %llu\n",
2196                __func__, lseg, nfsi->layout->plh_lwb);
2197
2198        /* If pnfs_layoutcommit_inode() ran between the inode locks, the
2199         * next commit is a no-op: NFS_INO_LAYOUTCOMMIT is no longer set */
2200        if (mark_as_dirty)
2201                mark_inode_dirty_sync(inode);
2202}
2203EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
2204
2205void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
2206{
2207        struct nfs_server *nfss = NFS_SERVER(data->args.inode);
2208
2209        if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
2210                nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
2211        pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
2212}
2213
2214/*
2215 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
2216 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
2217 * data to disk to allow the server to recover the data if it crashes.
2218 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
2219 * is off and either a COMMIT is sent to a data server or WRITEs
2220 * to a data server return NFS_DATA_SYNC.
2221 */
2222int
2223pnfs_layoutcommit_inode(struct inode *inode, bool sync)
2224{
2225        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2226        struct nfs4_layoutcommit_data *data;
2227        struct nfs_inode *nfsi = NFS_I(inode);
2228        loff_t end_pos;
2229        int status;
2230
2231        if (!pnfs_layoutcommit_outstanding(inode))
2232                return 0;
2233
2234        dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
2235
2236        status = -EAGAIN;
2237        if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
2238                if (!sync)
2239                        goto out;
2240                status = wait_on_bit_lock_action(&nfsi->flags,
2241                                NFS_INO_LAYOUTCOMMITTING,
2242                                nfs_wait_bit_killable,
2243                                TASK_KILLABLE);
2244                if (status)
2245                        goto out;
2246        }
2247
2248        status = -ENOMEM;
2249        /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
2250        data = kzalloc(sizeof(*data), GFP_NOFS);
2251        if (!data)
2252                goto clear_layoutcommitting;
2253
2254        status = 0;
2255        spin_lock(&inode->i_lock);
2256        if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
2257                goto out_unlock;
2258
2259        INIT_LIST_HEAD(&data->lseg_list);
2260        pnfs_list_write_lseg(inode, &data->lseg_list);
2261
2262        end_pos = nfsi->layout->plh_lwb;
2263
2264        nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
2265        spin_unlock(&inode->i_lock);
2266
2267        data->args.inode = inode;
2268        data->cred = get_rpccred(nfsi->layout->plh_lc_cred);
2269        nfs_fattr_init(&data->fattr);
2270        data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
2271        data->res.fattr = &data->fattr;
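            /* end_pos is exclusive, lastbytewritten is inclusive */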
2272        data->args.lastbytewritten = end_pos - 1;
2273        data->res.server = NFS_SERVER(inode);
2274
2275        if (ld->prepare_layoutcommit) {
2276                status = ld->prepare_layoutcommit(&data->args);
2277                if (status) {
2278                        put_rpccred(data->cred);
2279                        spin_lock(&inode->i_lock);
2280                        set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
2281                        if (end_pos > nfsi->layout->plh_lwb)
2282                                nfsi->layout->plh_lwb = end_pos;
2283                        goto out_unlock;
2284                }
2285        }
2286
2287
2288        status = nfs4_proc_layoutcommit(data, sync);
2289out:
2290        if (status)
2291                mark_inode_dirty_sync(inode);
2292        dprintk("<-- %s status %d\n", __func__, status);
2293        return status;
2294out_unlock:
2295        spin_unlock(&inode->i_lock);
2296        kfree(data);
2297clear_layoutcommitting:
2298        pnfs_clear_layoutcommitting(inode);
2299        goto out;
2300}
2301EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
2302
2303int
2304pnfs_generic_sync(struct inode *inode, bool datasync)
2305{
2306        return pnfs_layoutcommit_inode(inode, true);
2307}
2308EXPORT_SYMBOL_GPL(pnfs_generic_sync);
2309
2310struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
2311{
2312        struct nfs4_threshold *thp;
2313
2314        thp = kzalloc(sizeof(*thp), GFP_NOFS);
2315        if (!thp) {
2316                dprintk("%s mdsthreshold allocation failed\n", __func__);
2317                return NULL;
2318        }
2319        return thp;
2320}
2321
2322#if IS_ENABLED(CONFIG_NFS_V4_2)
2323int
2324pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2325{
2326        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
2327        struct nfs_server *server = NFS_SERVER(inode);
2328        struct nfs_inode *nfsi = NFS_I(inode);
2329        struct nfs42_layoutstat_data *data;
2330        struct pnfs_layout_hdr *hdr;
2331        int status = 0;
2332
2333        if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
2334                goto out;
2335
2336        if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
2337                goto out;
2338
2339        if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
2340                goto out;
2341
2342        spin_lock(&inode->i_lock);
2343        if (!NFS_I(inode)->layout) {
2344                spin_unlock(&inode->i_lock);
2345                goto out;
2346        }
2347        hdr = NFS_I(inode)->layout;
2348        pnfs_get_layout_hdr(hdr);
2349        spin_unlock(&inode->i_lock);
2350
2351        data = kzalloc(sizeof(*data), gfp_flags);
2352        if (!data) {
2353                status = -ENOMEM;
2354                goto out_put;
2355        }
2356
2357        data->args.fh = NFS_FH(inode);
2358        data->args.inode = inode;
2359        nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
2360        status = ld->prepare_layoutstats(&data->args);
2361        if (status)
2362                goto out_free;
2363
2364        status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
2365
2366out:
2367        dprintk("%s returns %d\n", __func__, status);
2368        return status;
2369
2370out_free:
2371        kfree(data);
2372out_put:
2373        pnfs_put_layout_hdr(hdr);
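            /*
             * Clear NFS_INO_LAYOUTSTATS, with full ordering on both sides,
             * so that a later call may send a new LAYOUTSTATS RPC.
             */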
2374        smp_mb__before_atomic();
2375        clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
2376        smp_mb__after_atomic();
2377        goto out;
2378}
2379EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
2380#endif
2381
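    /*
     * Override for the layout driver's LAYOUTSTATS reporting interval;
     * 0 keeps the driver default.  (Assumed to be in seconds, as
     * consumed by the flexfiles layout driver.)
     */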
2382unsigned int layoutstats_timer;
2383module_param(layoutstats_timer, uint, 0644);
2384EXPORT_SYMBOL_GPL(layoutstats_timer);
2385