linux/fs/nfs/pnfs.c
/*
 *  pNFS functions to call and manage layout drivers.
 *
 *  Copyright (c) 2002 [year of first publication]
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include "internal.h"
#include "pnfs.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS

/* Locking:
 *
 * pnfs_spinlock:
 *      protects pnfs_modules_tbl.
 */
static DEFINE_SPINLOCK(pnfs_spinlock);

/*
 * pnfs_modules_tbl holds all pnfs modules
 */
static LIST_HEAD(pnfs_modules_tbl);

/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
                if (local->id == id)
                        goto out;
        local = NULL;
out:
        dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
        return local;
}

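/* Like find_pnfs_driver_locked(), but takes pnfs_spinlock itself */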
static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
        struct pnfs_layoutdriver_type *local;

        spin_lock(&pnfs_spinlock);
        local = find_pnfs_driver_locked(id);
        spin_unlock(&pnfs_spinlock);
        return local;
}

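/* Undo set_pnfs_layoutdriver(): let the driver tear down its per-server
 * state and drop the module reference taken when the driver was set.
 */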
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
        if (nfss->pnfs_curr_ld) {
                nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
                module_put(nfss->pnfs_curr_ld->owner);
        }
        nfss->pnfs_curr_ld = NULL;
}

/*
 * Try to set the server's pnfs module to the pnfs layout type specified by id.
 * Currently only one pNFS layout driver per filesystem is supported.
 *
 * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
 */
void
set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
{
        struct pnfs_layoutdriver_type *ld_type = NULL;

        if (id == 0)
                goto out_no_driver;
        if (!(server->nfs_client->cl_exchange_flags &
                 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
                printk(KERN_ERR "%s: id %u cl_exchange_flags 0x%x\n", __func__,
                       id, server->nfs_client->cl_exchange_flags);
                goto out_no_driver;
        }
        ld_type = find_pnfs_driver(id);
        if (!ld_type) {
                request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
                ld_type = find_pnfs_driver(id);
                if (!ld_type) {
                        dprintk("%s: No pNFS module found for %u.\n",
                                __func__, id);
                        goto out_no_driver;
                }
        }
        if (!try_module_get(ld_type->owner)) {
                dprintk("%s: Could not grab reference on module\n", __func__);
                goto out_no_driver;
        }
        server->pnfs_curr_ld = ld_type;
        if (ld_type->set_layoutdriver(server)) {
                printk(KERN_ERR
                       "%s: Error initializing mount point for layout driver %u.\n",
                       __func__, id);
                module_put(ld_type->owner);
                goto out_no_driver;
        }
        dprintk("%s: pNFS module for %u set\n", __func__, id);
        return;

out_no_driver:
        dprintk("%s: Using NFSv4 I/O\n", __func__);
        server->pnfs_curr_ld = NULL;
}

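/* Called by a layout driver module on load.  A driver must supply a
 * non-zero id and both alloc_lseg and free_lseg methods; registration
 * fails with -EINVAL otherwise, or if the id is already registered.
 */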
int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        int status = -EINVAL;
        struct pnfs_layoutdriver_type *tmp;

        if (ld_type->id == 0) {
                printk(KERN_ERR "%s id 0 is reserved\n", __func__);
                return status;
        }
        if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
                printk(KERN_ERR "%s Layout driver must provide "
                       "alloc_lseg and free_lseg.\n", __func__);
                return status;
        }

        spin_lock(&pnfs_spinlock);
        tmp = find_pnfs_driver_locked(ld_type->id);
        if (!tmp) {
                list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
                status = 0;
                dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
                        ld_type->name);
        } else {
                printk(KERN_ERR "%s Module with id %d already loaded!\n",
                        __func__, ld_type->id);
        }
        spin_unlock(&pnfs_spinlock);

        return status;
}
EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);

void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
        dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
        spin_lock(&pnfs_spinlock);
        list_del(&ld_type->pnfs_tblid);
        spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);

/*
 * pNFS client layout cache
 */

/* Need to hold i_lock if caller does not already hold reference */
void
get_layout_hdr(struct pnfs_layout_hdr *lo)
{
        atomic_inc(&lo->plh_refcount);
}

static void
destroy_layout_hdr(struct pnfs_layout_hdr *lo)
{
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        BUG_ON(!list_empty(&lo->plh_layouts));
        NFS_I(lo->plh_inode)->layout = NULL;
        kfree(lo);
}

static void
put_layout_hdr_locked(struct pnfs_layout_hdr *lo)
{
        if (atomic_dec_and_test(&lo->plh_refcount))
                destroy_layout_hdr(lo);
}

void
put_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct inode *inode = lo->plh_inode;

        if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
                destroy_layout_hdr(lo);
                spin_unlock(&inode->i_lock);
        }
}

static void
init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
{
        INIT_LIST_HEAD(&lseg->pls_list);
        atomic_set(&lseg->pls_refcount, 1);
        smp_mb();
        set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
        lseg->pls_layout = lo;
}

static void free_lseg(struct pnfs_layout_segment *lseg)
{
        struct inode *ino = lseg->pls_layout->plh_inode;

        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
        /* Matched by get_layout_hdr in pnfs_insert_layout */
        put_layout_hdr(NFS_I(ino)->layout);
}

/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg
 * could sleep, so it must be called outside of the lock.
 * Returns 1 if the object was removed, otherwise returns 0.
 */
static int
put_lseg_locked(struct pnfs_layout_segment *lseg,
                struct list_head *tmp_list)
{
        dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
                atomic_read(&lseg->pls_refcount),
                test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        if (atomic_dec_and_test(&lseg->pls_refcount)) {
                struct inode *ino = lseg->pls_layout->plh_inode;

                BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
                list_del(&lseg->pls_list);
                if (list_empty(&lseg->pls_layout->plh_segs)) {
                        struct nfs_client *clp;

                        clp = NFS_SERVER(ino)->nfs_client;
                        spin_lock(&clp->cl_lock);
                        /* List does not take a reference, so no need for put here */
                        list_del_init(&lseg->pls_layout->plh_layouts);
                        spin_unlock(&clp->cl_lock);
                        clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
                }
                rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
                list_add(&lseg->pls_list, tmp_list);
                return 1;
        }
        return 0;
}

static bool
should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
{
        return (recall_iomode == IOMODE_ANY ||
                lseg_iomode == recall_iomode);
}

/* Returns 1 if lseg is removed from list, 0 otherwise */
static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
                             struct list_head *tmp_list)
{
        int rv = 0;

        if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
                /* Remove the reference keeping the lseg in the
                 * list.  It will now be removed when all
                 * outstanding io is finished.
                 */
                rv = put_lseg_locked(lseg, tmp_list);
        }
        return rv;
}

/* Returns the number of matching invalid lsegs remaining in the list
 * after the call.
 */
int
mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
                            u32 iomode)
{
        struct pnfs_layout_segment *lseg, *next;
        int invalid = 0, removed = 0;

        dprintk("%s:Begin lo %p\n", __func__, lo);

        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
                if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
                        dprintk("%s: freeing lseg %p iomode %d "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
                                lseg->pls_range.length);
                        invalid++;
                        removed += mark_lseg_invalid(lseg, tmp_list);
                }
        dprintk("%s:Return %i\n", __func__, invalid - removed);
        return invalid - removed;
}

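/* Free the segments on @free_me.  Must be called with no spinlocks
 * held, since ->free_lseg may sleep.
 */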
void
pnfs_free_lseg_list(struct list_head *free_me)
{
        struct pnfs_layout_segment *lseg, *tmp;

        list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
                list_del(&lseg->pls_list);
                free_lseg(lseg);
        }
}

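/* Invalidate all segments on an inode's layout and drop the initial
 * header reference taken in alloc_init_layout_hdr(); the header itself
 * is freed once all outstanding users are done with it.
 */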
void
pnfs_destroy_layout(struct nfs_inode *nfsi)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&nfsi->vfs_inode.i_lock);
        lo = nfsi->layout;
        if (lo) {
                set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags);
                mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
                /* Matched by refcount set to 1 in alloc_init_layout_hdr */
                put_layout_hdr_locked(lo);
        }
        spin_unlock(&nfsi->vfs_inode.i_lock);
        pnfs_free_lseg_list(&tmp_list);
}

/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
        struct pnfs_layout_hdr *lo;
        LIST_HEAD(tmp_list);

        spin_lock(&clp->cl_lock);
        list_splice_init(&clp->cl_layouts, &tmp_list);
        spin_unlock(&clp->cl_lock);

        while (!list_empty(&tmp_list)) {
                lo = list_entry(tmp_list.next, struct pnfs_layout_hdr,
                                plh_layouts);
                dprintk("%s freeing layout for inode %lu\n", __func__,
                        lo->plh_inode->i_ino);
                pnfs_destroy_layout(NFS_I(lo->plh_inode));
        }
}

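/* Stateid seqids and barriers use serial-number arithmetic: a is newer
 * than b iff (int)(a - b) > 0, i.e. iff a is ahead of b by less than
 * 2**31.  For example, with oldseq = 0xffffffff and newseq = 2,
 * (int)(2 - 0xffffffff) == 3 > 0, so the update below is still taken
 * across the wrap.
 */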
/* update lo->plh_stateid with new if it is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
                        bool update_barrier)
{
        u32 oldseq, newseq;

        oldseq = be32_to_cpu(lo->plh_stateid.stateid.seqid);
        newseq = be32_to_cpu(new->stateid.seqid);
        if ((int)(newseq - oldseq) > 0) {
                memcpy(&lo->plh_stateid, &new->stateid, sizeof(new->stateid));
                if (update_barrier) {
                        u32 new_barrier = be32_to_cpu(new->stateid.seqid);

                        if ((int)(new_barrier - lo->plh_barrier) > 0)
                                lo->plh_barrier = new_barrier;
                } else {
                        /* Because of wraparound, we want to keep the barrier
                         * "close" to the current seqids.  It needs to be
                         * within 2**31 to count as "behind", so if it
                         * gets too near that limit, give us a little leeway
                         * and bring it to within 2**30.
                         * NOTE - and yes, this is all unsigned arithmetic.
                         */
                        if (unlikely((newseq - lo->plh_barrier) > (3 << 29)))
                                lo->plh_barrier = newseq - (1 << 30);
                }
        }
}

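/* Return true if LAYOUTGET should be blocked for this layout: the
 * presented stateid is at or behind the current barrier, layoutgets
 * are explicitly blocked (e.g. during return-on-close), a bulk recall
 * is in progress, or the layout has no segments yet while another
 * LAYOUTGET is already outstanding.
 */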
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
                        int lget)
{
        if ((stateid) &&
            (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
                return true;
        return lo->plh_block_lgets ||
                test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
                (list_empty(&lo->plh_segs) &&
                 (atomic_read(&lo->plh_outstanding) > lget));
}

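/* Pick the stateid to send in a LAYOUTGET: the open stateid for the
 * first layout requested on a file (copied under open_state->seqlock
 * so a concurrent update cannot hand us a torn stateid), the cached
 * layout stateid otherwise.  Returns -EAGAIN if layoutgets are
 * currently blocked.
 */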
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
                              struct nfs4_state *open_state)
{
        int status = 0;

        dprintk("--> %s\n", __func__);
        spin_lock(&lo->plh_inode->i_lock);
        if (pnfs_layoutgets_blocked(lo, NULL, 1)) {
                status = -EAGAIN;
        } else if (list_empty(&lo->plh_segs)) {
                int seq;

                do {
                        seq = read_seqbegin(&open_state->seqlock);
                        memcpy(dst->data, open_state->stateid.data,
                               sizeof(open_state->stateid.data));
                } while (read_seqretry(&open_state->seqlock, seq));
        } else
                memcpy(dst->data, lo->plh_stateid.data, sizeof(lo->plh_stateid.data));
        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
        return status;
}

/*
 * Get layout from server.
 *    for now, assume that whole file layouts are requested.
 *    arg->offset: 0
 *    arg->length: all ones
 */
static struct pnfs_layout_segment *
send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
           u32 iomode)
{
        struct inode *ino = lo->plh_inode;
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg = NULL;

        dprintk("--> %s\n", __func__);

        BUG_ON(ctx == NULL);
        lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
        if (lgp == NULL)
                return NULL;
        lgp->args.minlength = NFS4_MAX_UINT64;
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
        lgp->args.range.iomode = iomode;
        lgp->args.range.offset = 0;
        lgp->args.range.length = NFS4_MAX_UINT64;
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
        lgp->lsegpp = &lseg;

        /* Synchronously retrieve layout information from server and
         * store in lseg.
         */
        nfs4_proc_layoutget(lgp);
        if (!lseg) {
                /* remember that LAYOUTGET failed and suspend trying */
                set_bit(lo_fail_bit(iomode), &lo->plh_flags);
        }
        return lseg;
}

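/* Called at close time.  Returns true if the layout qualifies for
 * return-on-close: the header and at least one segment are marked ROC
 * and no bulk recall is in progress.  All ROC segments are invalidated
 * and further layoutgets are blocked until pnfs_roc_release() runs.
 */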
bool pnfs_roc(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg, *tmp;
        LIST_HEAD(tmp_list);
        bool found = false;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
                goto out_nolayout;
        list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        mark_lseg_invalid(lseg, &tmp_list);
                        found = true;
                }
        if (!found)
                goto out_nolayout;
        lo->plh_block_lgets++;
        get_layout_hdr(lo); /* matched in pnfs_roc_release */
        spin_unlock(&ino->i_lock);
        pnfs_free_lseg_list(&tmp_list);
        return true;

out_nolayout:
        spin_unlock(&ino->i_lock);
        return false;
}

void pnfs_roc_release(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        lo->plh_block_lgets--;
        put_layout_hdr_locked(lo);
        spin_unlock(&ino->i_lock);
}

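/* Raise the layoutget barrier to @barrier if it is more recent
 * (serial-number comparison), so that replies carrying older stateids
 * are ignored by pnfs_layoutgets_blocked().
 */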
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
        struct pnfs_layout_hdr *lo;

        spin_lock(&ino->i_lock);
        lo = NFS_I(ino)->layout;
        if ((int)(barrier - lo->plh_barrier) > 0)
                lo->plh_barrier = barrier;
        spin_unlock(&ino->i_lock);
}

bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_segment *lseg;
        bool found = false;

        spin_lock(&ino->i_lock);
        list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
                if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
                        found = true;
                        break;
                }
        if (!found) {
                struct pnfs_layout_hdr *lo = nfsi->layout;
                u32 current_seqid = be32_to_cpu(lo->plh_stateid.stateid.seqid);

                /* Since close does not return a layout stateid for use as
                 * a barrier, we choose the worst-case barrier.
                 */
                *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
        }
        spin_unlock(&ino->i_lock);
        return found;
}

/*
 * Compare two layout segments for sorting into layout cache.
 * We want to preferentially return RW over RO layouts, so ensure those
 * are seen first.
 */
static s64
cmp_layout(u32 iomode1, u32 iomode2)
{
        /* read > read/write */
        return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
}

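/* Insert @lseg into the layout's segment list, sorted by cmp_layout()
 * so RW segments stay ahead of READ ones.  Takes a reference on the
 * header, matched by the put in free_lseg().
 */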
static void
pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_segment *lp;
        int found = 0;

        dprintk("%s:Begin\n", __func__);

        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
                if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu before "
                        "lp %p iomode %d offset %llu length %llu\n",
                        __func__, lseg, lseg->pls_range.iomode,
                        lseg->pls_range.offset, lseg->pls_range.length,
                        lp, lp->pls_range.iomode, lp->pls_range.offset,
                        lp->pls_range.length);
                found = 1;
                break;
        }
        if (!found) {
                list_add_tail(&lseg->pls_list, &lo->plh_segs);
                dprintk("%s: inserted lseg %p "
                        "iomode %d offset %llu length %llu at tail\n",
                        __func__, lseg, lseg->pls_range.iomode,
                        lseg->pls_range.offset, lseg->pls_range.length);
        }
        get_layout_hdr(lo);

        dprintk("%s:Return\n", __func__);
}

static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino)
{
        struct pnfs_layout_hdr *lo;

        lo = kzalloc(sizeof(struct pnfs_layout_hdr), GFP_KERNEL);
        if (!lo)
                return NULL;
        atomic_set(&lo->plh_refcount, 1);
        INIT_LIST_HEAD(&lo->plh_layouts);
        INIT_LIST_HEAD(&lo->plh_segs);
        INIT_LIST_HEAD(&lo->plh_bulk_recall);
        lo->plh_inode = ino;
        return lo;
}

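/* Return the inode's layout header, allocating one if absent.  i_lock
 * is dropped and retaken around the allocation, so we must recheck for
 * a racing allocation afterwards.  Returns NULL if the layout is being
 * destroyed.
 */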
static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct pnfs_layout_hdr *new = NULL;

        dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

        assert_spin_locked(&ino->i_lock);
        if (nfsi->layout) {
                if (test_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags))
                        return NULL;
                else
                        return nfsi->layout;
        }
        spin_unlock(&ino->i_lock);
        new = alloc_init_layout_hdr(ino);
        spin_lock(&ino->i_lock);

        if (likely(nfsi->layout == NULL))       /* Won the race? */
                nfsi->layout = new;
        else
                kfree(new);
        return nfsi->layout;
}

/*
 * iomode matching rules:
 * iomode       lseg    match
 * -----        -----   -----
 * ANY          READ    true
 * ANY          RW      true
 * RW           READ    false
 * RW           RW      true
 * READ         READ    true
 * READ         RW      true
 */
static int
is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
{
        return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
}

/*
 * lookup range in layout
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
{
        struct pnfs_layout_segment *lseg, *ret = NULL;

        dprintk("%s:Begin\n", __func__);

        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
                    is_matching_lseg(lseg, iomode)) {
                        ret = lseg;
                        break;
                }
                if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
                        break;
        }

        dprintk("%s:Return lseg %p ref %d\n",
                __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0);
        return ret;
}

/*
 * Layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
                   struct nfs_open_context *ctx,
                   enum pnfs_iomode iomode)
{
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        struct pnfs_layout_hdr *lo;
        struct pnfs_layout_segment *lseg = NULL;

        if (!pnfs_enabled_sb(NFS_SERVER(ino)))
                return NULL;
        spin_lock(&ino->i_lock);
        lo = pnfs_find_alloc_layout(ino);
        if (lo == NULL) {
                dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__);
                goto out_unlock;
        }

        /* Do we even need to bother with this? */
        if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                dprintk("%s matches recall, use MDS\n", __func__);
                goto out_unlock;
        }
        /* Check to see if the layout for the given range already exists */
        lseg = pnfs_find_lseg(lo, iomode);
        if (lseg)
                goto out_unlock;

        /* if LAYOUTGET already failed once we don't try again */
        if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
                goto out_unlock;

        if (pnfs_layoutgets_blocked(lo, NULL, 0))
                goto out_unlock;
        atomic_inc(&lo->plh_outstanding);

        get_layout_hdr(lo);
        if (list_empty(&lo->plh_segs)) {
                /* The lo must be on the clp list if there is any
                 * chance of a CB_LAYOUTRECALL(FILE) coming in.
                 */
                spin_lock(&clp->cl_lock);
                BUG_ON(!list_empty(&lo->plh_layouts));
                list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
                spin_unlock(&clp->cl_lock);
        }
        spin_unlock(&ino->i_lock);

        lseg = send_layoutget(lo, ctx, iomode);
        if (!lseg) {
                spin_lock(&ino->i_lock);
                if (list_empty(&lo->plh_segs)) {
                        spin_lock(&clp->cl_lock);
                        list_del_init(&lo->plh_layouts);
                        spin_unlock(&clp->cl_lock);
                        clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
                }
                spin_unlock(&ino->i_lock);
        }
        atomic_dec(&lo->plh_outstanding);
        put_layout_hdr(lo);
out:
        dprintk("%s end, state 0x%lx lseg %p\n", __func__,
                nfsi->layout->plh_flags, lseg);
        return lseg;
out_unlock:
        spin_unlock(&ino->i_lock);
        goto out;
}

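/* Process a LAYOUTGET reply: verify the returned range, hand the
 * opaque layout to the driver's ->alloc_lseg, and insert the resulting
 * segment into the layout cache.  If a recall arrived while the RPC
 * was in flight, the reply is forgotten and the segment freed.
 */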
int
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
        struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
        struct nfs4_layoutget_res *res = &lgp->res;
        struct pnfs_layout_segment *lseg;
        struct inode *ino = lo->plh_inode;
        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        int status = 0;

        /* Verify we got what we asked for.
         * Note that because the xdr parsing only accepts a single
         * element array, this can fail even if the server is behaving
         * correctly.
         */
        if (lgp->args.range.iomode > res->range.iomode ||
            res->range.offset != 0 ||
            res->range.length != NFS4_MAX_UINT64) {
                status = -EINVAL;
                goto out;
        }
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res);
        if (!lseg || IS_ERR(lseg)) {
                if (!lseg)
                        status = -ENOMEM;
                else
                        status = PTR_ERR(lseg);
                dprintk("%s: Could not allocate layout: error %d\n",
                       __func__, status);
                goto out;
        }

        spin_lock(&ino->i_lock);
        if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) ||
            test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
                dprintk("%s forget reply due to recall\n", __func__);
                goto out_forget_reply;
        }

        if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) {
                dprintk("%s forget reply due to state\n", __func__);
                goto out_forget_reply;
        }
        init_lseg(lo, lseg);
        lseg->pls_range = res->range;
        *lgp->lsegpp = lseg;
        pnfs_insert_layout(lo, lseg);

        if (res->return_on_close) {
                set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
                set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
        }

        /* Done processing layoutget. Set the layout stateid */
        pnfs_set_layout_stateid(lo, &res->stateid, false);
        spin_unlock(&ino->i_lock);
out:
        return status;

out_forget_reply:
        spin_unlock(&ino->i_lock);
        lseg->pls_layout = lo;
        NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
        goto out;
}

/*
 * Device ID cache. Currently supports one layout type per struct nfs_client.
 * Add layout type to the lookup key to expand to support multiple types.
 */
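
/*
 * Expected calling pattern for a layout driver (a sketch, not enforced
 * here): allocate the per-client cache with
 * pnfs_alloc_init_deviceid_cache(), typically from ->set_layoutdriver;
 * look deviceids up with pnfs_find_get_deviceid(); insert newly
 * fetched ones with pnfs_add_deviceid(); drop references via
 * pnfs_put_deviceid() from ->free_lseg; and release the cache with
 * pnfs_put_deviceid_cache() when the driver is unset.
 */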
int
pnfs_alloc_init_deviceid_cache(struct nfs_client *clp,
                         void (*free_callback)(struct pnfs_deviceid_node *))
{
        struct pnfs_deviceid_cache *c;

        c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL);
        if (!c)
                return -ENOMEM;
        spin_lock(&clp->cl_lock);
        if (clp->cl_devid_cache != NULL) {
                atomic_inc(&clp->cl_devid_cache->dc_ref);
                dprintk("%s [kref [%d]]\n", __func__,
                        atomic_read(&clp->cl_devid_cache->dc_ref));
                kfree(c);
        } else {
                /* kzalloc initializes hlists */
                spin_lock_init(&c->dc_lock);
                atomic_set(&c->dc_ref, 1);
                c->dc_free_callback = free_callback;
                clp->cl_devid_cache = c;
                dprintk("%s [new]\n", __func__);
        }
        spin_unlock(&clp->cl_lock);
        return 0;
}
EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);

/*
 * Called from pnfs_layoutdriver_type->free_lseg:
 * the last layout segment reference frees the deviceid.
 */
void
pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
                  struct pnfs_deviceid_node *devid)
{
        struct nfs4_deviceid *id = &devid->de_id;
        struct pnfs_deviceid_node *d;
        struct hlist_node *n;
        long h = nfs4_deviceid_hash(id);

        dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref));
        if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock))
                return;

        hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node)
                if (!memcmp(&d->de_id, id, sizeof(*id))) {
                        hlist_del_rcu(&d->de_node);
                        spin_unlock(&c->dc_lock);
                        synchronize_rcu();
                        c->dc_free_callback(devid);
                        return;
                }
        spin_unlock(&c->dc_lock);
        /* Why wasn't it found in the list? */
        BUG();
}
EXPORT_SYMBOL_GPL(pnfs_put_deviceid);

/* Find and reference a deviceid */
struct pnfs_deviceid_node *
pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
{
        struct pnfs_deviceid_node *d;
        struct hlist_node *n;
        long hash = nfs4_deviceid_hash(id);

        dprintk("--> %s hash %ld\n", __func__, hash);
        rcu_read_lock();
        hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
                if (!memcmp(&d->de_id, id, sizeof(*id))) {
                        if (!atomic_inc_not_zero(&d->de_ref)) {
                                goto fail;
                        } else {
                                rcu_read_unlock();
                                return d;
                        }
                }
        }
fail:
        rcu_read_unlock();
        return NULL;
}
EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid);

/*
 * Add a deviceid to the cache.
 * GETDEVICEINFOs for the same deviceid can race. If the deviceid is
 * already in the cache, discard the new node and return the existing one.
 */
struct pnfs_deviceid_node *
pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new)
{
        struct pnfs_deviceid_node *d;
        long hash = nfs4_deviceid_hash(&new->de_id);

        dprintk("--> %s hash %ld\n", __func__, hash);
        spin_lock(&c->dc_lock);
        d = pnfs_find_get_deviceid(c, &new->de_id);
        if (d) {
                spin_unlock(&c->dc_lock);
                dprintk("%s [discard]\n", __func__);
                c->dc_free_callback(new);
                return d;
        }
        INIT_HLIST_NODE(&new->de_node);
        atomic_set(&new->de_ref, 1);
        hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
        spin_unlock(&c->dc_lock);
        dprintk("%s [new]\n", __func__);
        return new;
}
EXPORT_SYMBOL_GPL(pnfs_add_deviceid);

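/* Drop a reference on the per-client deviceid cache; the final put
 * verifies the hash table is empty and frees the cache.
 */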
void
pnfs_put_deviceid_cache(struct nfs_client *clp)
{
        struct pnfs_deviceid_cache *local = clp->cl_devid_cache;

        dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref));
        if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) {
                int i;
                /* Verify cache is empty */
                for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++)
                        BUG_ON(!hlist_empty(&local->dc_deviceids[i]));
                clp->cl_devid_cache = NULL;
                spin_unlock(&clp->cl_lock);
                kfree(local);
        }
}
EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);