linux/fs/nfs/flexfilelayout/flexfilelayout.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Module for pnfs flexfile layout driver.
   4 *
   5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
   6 *
   7 * Tao Peng <bergwolf@primarydata.com>
   8 */
   9
  10#include <linux/nfs_fs.h>
  11#include <linux/nfs_mount.h>
  12#include <linux/nfs_page.h>
  13#include <linux/module.h>
  14#include <linux/sched/mm.h>
  15
  16#include <linux/sunrpc/metrics.h>
  17
  18#include "flexfilelayout.h"
  19#include "../nfs4session.h"
  20#include "../nfs4idmap.h"
  21#include "../internal.h"
  22#include "../delegation.h"
  23#include "../nfs4trace.h"
  24#include "../iostat.h"
  25#include "../nfs.h"
  26#include "../nfs42.h"
  27
/*
 * Debug facility for this file.
 * NOTE(review): presumably selects which debug class the dprintk() calls
 * below belong to -- confirm against the sunrpc debug macros.
 */
#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

/*
 * NOTE(review): neither constant is referenced in the part of the file
 * reviewed here.  FF_LAYOUT_POLL_RETRY_MAX is expressed as a multiple of
 * HZ; FF_LAYOUTRETURN_MAXERR presumably caps the number of errors sent
 * in a LAYOUTRETURN -- TODO confirm against the users further down.
 */
#define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
#define FF_LAYOUTRETURN_MAXERR 20

/*
 * File-scope, zero-initialised.
 * NOTE(review): its users are outside the part of the file reviewed here.
 */
static unsigned short io_maxretrans;

/* Forward declarations of static helpers defined elsewhere in this file. */
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
                struct nfs_pgio_header *hdr);
static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
                               struct nfs42_layoutstat_devinfo *devinfo,
                               int dev_limit);
static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
                              const struct nfs42_layoutstat_devinfo *devinfo,
                              struct nfs4_ff_layout_mirror *mirror);
  43
  44static struct pnfs_layout_hdr *
  45ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
  46{
  47        struct nfs4_flexfile_layout *ffl;
  48
  49        ffl = kzalloc(sizeof(*ffl), gfp_flags);
  50        if (ffl) {
  51                INIT_LIST_HEAD(&ffl->error_list);
  52                INIT_LIST_HEAD(&ffl->mirrors);
  53                ffl->last_report_time = ktime_get();
  54                return &ffl->generic_hdr;
  55        } else
  56                return NULL;
  57}
  58
  59static void
  60ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
  61{
  62        struct nfs4_ff_layout_ds_err *err, *n;
  63
  64        list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
  65                                 list) {
  66                list_del(&err->list);
  67                kfree(err);
  68        }
  69        kfree(FF_LAYOUT_FROM_HDR(lo));
  70}
  71
  72static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
  73{
  74        __be32 *p;
  75
  76        p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
  77        if (unlikely(p == NULL))
  78                return -ENOBUFS;
  79        stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
  80        memcpy(stateid->data, p, NFS4_STATEID_SIZE);
  81        dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
  82                p[0], p[1], p[2], p[3]);
  83        return 0;
  84}
  85
  86static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
  87{
  88        __be32 *p;
  89
  90        p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
  91        if (unlikely(!p))
  92                return -ENOBUFS;
  93        memcpy(devid, p, NFS4_DEVICEID4_SIZE);
  94        nfs4_print_deviceid(devid);
  95        return 0;
  96}
  97
  98static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
  99{
 100        __be32 *p;
 101
 102        p = xdr_inline_decode(xdr, 4);
 103        if (unlikely(!p))
 104                return -ENOBUFS;
 105        fh->size = be32_to_cpup(p++);
 106        if (fh->size > sizeof(struct nfs_fh)) {
 107                printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
 108                       fh->size);
 109                return -EOVERFLOW;
 110        }
 111        /* fh.data */
 112        p = xdr_inline_decode(xdr, fh->size);
 113        if (unlikely(!p))
 114                return -ENOBUFS;
 115        memcpy(&fh->data, p, fh->size);
 116        dprintk("%s: fh len %d\n", __func__, fh->size);
 117
 118        return 0;
 119}
 120
 121/*
 122 * Currently only stringified uids and gids are accepted.
 123 * I.e., kerberos is not supported to the DSes, so no pricipals.
 124 *
 125 * That means that one common function will suffice, but when
 126 * principals are added, this should be split to accomodate
 127 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
 128 */
 129static int
 130decode_name(struct xdr_stream *xdr, u32 *id)
 131{
 132        __be32 *p;
 133        int len;
 134
 135        /* opaque_length(4)*/
 136        p = xdr_inline_decode(xdr, 4);
 137        if (unlikely(!p))
 138                return -ENOBUFS;
 139        len = be32_to_cpup(p++);
 140        if (len < 0)
 141                return -EINVAL;
 142
 143        dprintk("%s: len %u\n", __func__, len);
 144
 145        /* opaque body */
 146        p = xdr_inline_decode(xdr, len);
 147        if (unlikely(!p))
 148                return -ENOBUFS;
 149
 150        if (!nfs_map_string_to_numeric((char *)p, len, id))
 151                return -EINVAL;
 152
 153        return 0;
 154}
 155
 156static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
 157                const struct nfs4_ff_layout_mirror *m2)
 158{
 159        int i, j;
 160
 161        if (m1->fh_versions_cnt != m2->fh_versions_cnt)
 162                return false;
 163        for (i = 0; i < m1->fh_versions_cnt; i++) {
 164                bool found_fh = false;
 165                for (j = 0; j < m2->fh_versions_cnt; j++) {
 166                        if (nfs_compare_fh(&m1->fh_versions[i],
 167                                        &m2->fh_versions[j]) == 0) {
 168                                found_fh = true;
 169                                break;
 170                        }
 171                }
 172                if (!found_fh)
 173                        return false;
 174        }
 175        return true;
 176}
 177
 178static struct nfs4_ff_layout_mirror *
 179ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
 180                struct nfs4_ff_layout_mirror *mirror)
 181{
 182        struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
 183        struct nfs4_ff_layout_mirror *pos;
 184        struct inode *inode = lo->plh_inode;
 185
 186        spin_lock(&inode->i_lock);
 187        list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
 188                if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
 189                        continue;
 190                if (!ff_mirror_match_fh(mirror, pos))
 191                        continue;
 192                if (refcount_inc_not_zero(&pos->ref)) {
 193                        spin_unlock(&inode->i_lock);
 194                        return pos;
 195                }
 196        }
 197        list_add(&mirror->mirrors, &ff_layout->mirrors);
 198        mirror->layout = lo;
 199        spin_unlock(&inode->i_lock);
 200        return mirror;
 201}
 202
 203static void
 204ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
 205{
 206        struct inode *inode;
 207        if (mirror->layout == NULL)
 208                return;
 209        inode = mirror->layout->plh_inode;
 210        spin_lock(&inode->i_lock);
 211        list_del(&mirror->mirrors);
 212        spin_unlock(&inode->i_lock);
 213        mirror->layout = NULL;
 214}
 215
 216static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
 217{
 218        struct nfs4_ff_layout_mirror *mirror;
 219
 220        mirror = kzalloc(sizeof(*mirror), gfp_flags);
 221        if (mirror != NULL) {
 222                spin_lock_init(&mirror->lock);
 223                refcount_set(&mirror->ref, 1);
 224                INIT_LIST_HEAD(&mirror->mirrors);
 225        }
 226        return mirror;
 227}
 228
 229static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
 230{
 231        const struct cred       *cred;
 232
 233        ff_layout_remove_mirror(mirror);
 234        kfree(mirror->fh_versions);
 235        cred = rcu_access_pointer(mirror->ro_cred);
 236        put_cred(cred);
 237        cred = rcu_access_pointer(mirror->rw_cred);
 238        put_cred(cred);
 239        nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
 240        kfree(mirror);
 241}
 242
 243static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
 244{
 245        if (mirror != NULL && refcount_dec_and_test(&mirror->ref))
 246                ff_layout_free_mirror(mirror);
 247}
 248
 249static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 250{
 251        int i;
 252
 253        if (fls->mirror_array) {
 254                for (i = 0; i < fls->mirror_array_cnt; i++) {
 255                        /* normally mirror_ds is freed in
 256                         * .free_deviceid_node but we still do it here
 257                         * for .alloc_lseg error path */
 258                        ff_layout_put_mirror(fls->mirror_array[i]);
 259                }
 260                kfree(fls->mirror_array);
 261                fls->mirror_array = NULL;
 262        }
 263}
 264
 265static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
 266{
 267        int ret = 0;
 268
 269        dprintk("--> %s\n", __func__);
 270
 271        /* FIXME: remove this check when layout segment support is added */
 272        if (lgr->range.offset != 0 ||
 273            lgr->range.length != NFS4_MAX_UINT64) {
 274                dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
 275                        __func__);
 276                ret = -EINVAL;
 277        }
 278
 279        dprintk("--> %s returns %d\n", __func__, ret);
 280        return ret;
 281}
 282
 283static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
 284{
 285        if (fls) {
 286                ff_layout_free_mirror_array(fls);
 287                kfree(fls);
 288        }
 289}
 290
 291static bool
 292ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
 293                const struct pnfs_layout_range *l2)
 294{
 295        u64 end1, end2;
 296
 297        if (l1->iomode != l2->iomode)
 298                return l1->iomode != IOMODE_READ;
 299        end1 = pnfs_calc_offset_end(l1->offset, l1->length);
 300        end2 = pnfs_calc_offset_end(l2->offset, l2->length);
 301        if (end1 < l2->offset)
 302                return false;
 303        if (end2 < l1->offset)
 304                return true;
 305        return l2->offset <= l1->offset;
 306}
 307
 308static bool
 309ff_lseg_merge(struct pnfs_layout_segment *new,
 310                struct pnfs_layout_segment *old)
 311{
 312        u64 new_end, old_end;
 313
 314        if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
 315                return false;
 316        if (new->pls_range.iomode != old->pls_range.iomode)
 317                return false;
 318        old_end = pnfs_calc_offset_end(old->pls_range.offset,
 319                        old->pls_range.length);
 320        if (old_end < new->pls_range.offset)
 321                return false;
 322        new_end = pnfs_calc_offset_end(new->pls_range.offset,
 323                        new->pls_range.length);
 324        if (new_end < old->pls_range.offset)
 325                return false;
 326
 327        /* Mergeable: copy info from 'old' to 'new' */
 328        if (new_end < old_end)
 329                new_end = old_end;
 330        if (new->pls_range.offset < old->pls_range.offset)
 331                new->pls_range.offset = old->pls_range.offset;
 332        new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
 333                        new_end);
 334        if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
 335                set_bit(NFS_LSEG_ROC, &new->pls_flags);
 336        return true;
 337}
 338
 339static void
 340ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
 341                struct pnfs_layout_segment *lseg,
 342                struct list_head *free_me)
 343{
 344        pnfs_generic_layout_insert_lseg(lo, lseg,
 345                        ff_lseg_range_is_after,
 346                        ff_lseg_merge,
 347                        free_me);
 348}
 349
 350static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 351{
 352        int i, j;
 353
 354        for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
 355                for (j = i + 1; j < fls->mirror_array_cnt; j++)
 356                        if (fls->mirror_array[i]->efficiency <
 357                            fls->mirror_array[j]->efficiency)
 358                                swap(fls->mirror_array[i],
 359                                     fls->mirror_array[j]);
 360        }
 361}
 362
/*
 * Build a flexfile layout segment from the XDR-encoded layout body in
 * @lgr->layoutp.
 *
 * The body is decoded as a 64-bit stripe unit and a 32-bit mirror count,
 * followed by one record per mirror (data-server count, device id,
 * efficiency, stateid, file-handle list, user name, group name), and
 * optionally a 32-bit flags word and a 32-bit report interval.
 *
 * @lh:        layout header the decoded mirrors are registered with
 * @lgr:       LAYOUTGET result holding the encoded body and its range
 * @gfp_flags: allocation flags; when __GFP_FS is clear the credential is
 *             allocated inside a memalloc_nofs_save() section
 *
 * Returns the generic segment header embedded in the new segment, or an
 * ERR_PTR(): -ENOMEM on allocation failure, -EIO on a short body, a
 * mirror count of zero or above NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT, or
 * a data-server count other than one, or the error returned by a decode
 * helper or by ff_layout_check_layout().  The scratch page is freed on
 * every path.
 */
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                     struct nfs4_layoutget_res *lgr,
                     gfp_t gfp_flags)
{
        struct pnfs_layout_segment *ret;
        struct nfs4_ff_layout_segment *fls = NULL;
        struct xdr_stream stream;
        struct xdr_buf buf;
        struct page *scratch;
        u64 stripe_unit;
        u32 mirror_array_cnt;
        __be32 *p;
        int i, rc;

        dprintk("--> %s\n", __func__);
        scratch = alloc_page(gfp_flags);
        if (!scratch)
                return ERR_PTR(-ENOMEM);

        xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
                              lgr->layoutp->len);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);

        /* stripe unit and mirror_array_cnt */
        rc = -EIO;
        p = xdr_inline_decode(&stream, 8 + 4);
        if (!p)
                goto out_err_free;

        p = xdr_decode_hyper(p, &stripe_unit);
        mirror_array_cnt = be32_to_cpup(p++);
        dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
                stripe_unit, mirror_array_cnt);

        /* rc is still -EIO here: reject an empty or oversized mirror list */
        if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
            mirror_array_cnt == 0)
                goto out_err_free;

        rc = -ENOMEM;
        fls = kzalloc(sizeof(*fls), gfp_flags);
        if (!fls)
                goto out_err_free;

        fls->mirror_array_cnt = mirror_array_cnt;
        fls->stripe_unit = stripe_unit;
        fls->mirror_array = kcalloc(fls->mirror_array_cnt,
                                    sizeof(fls->mirror_array[0]), gfp_flags);
        if (fls->mirror_array == NULL)
                goto out_err_free;

        /*
         * Decode one record per mirror.  Any failure jumps to out_err_free,
         * which releases the segment and every mirror stored in the array
         * so far.
         */
        for (i = 0; i < fls->mirror_array_cnt; i++) {
                struct nfs4_ff_layout_mirror *mirror;
                struct cred *kcred;
                const struct cred __rcu *cred;
                kuid_t uid;
                kgid_t gid;
                u32 ds_count, fh_count, id;
                int j;

                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                ds_count = be32_to_cpup(p);

                /* FIXME: allow for striping? */
                if (ds_count != 1)
                        goto out_err_free;

                fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
                if (fls->mirror_array[i] == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }

                fls->mirror_array[i]->ds_count = ds_count;

                /* deviceid */
                rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
                if (rc)
                        goto out_err_free;

                /* efficiency */
                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                fls->mirror_array[i]->efficiency = be32_to_cpup(p);

                /* stateid */
                rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
                if (rc)
                        goto out_err_free;

                /* fh */
                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                fh_count = be32_to_cpup(p);

                fls->mirror_array[i]->fh_versions =
                        kcalloc(fh_count, sizeof(struct nfs_fh),
                                gfp_flags);
                if (fls->mirror_array[i]->fh_versions == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }

                for (j = 0; j < fh_count; j++) {
                        rc = decode_nfs_fh(&stream,
                                           &fls->mirror_array[i]->fh_versions[j]);
                        if (rc)
                                goto out_err_free;
                }

                /* only published once every handle decoded successfully */
                fls->mirror_array[i]->fh_versions_cnt = fh_count;

                /* user */
                rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;

                uid = make_kuid(&init_user_ns, id);

                /* group */
                rc = decode_name(&stream, &id);
                if (rc)
                        goto out_err_free;

                gid = make_kgid(&init_user_ns, id);

                /*
                 * prepare_kernel_cred() takes no gfp argument, so when the
                 * caller did not allow __GFP_FS the call is wrapped in a
                 * NOFS allocation scope instead.
                 */
                if (gfp_flags & __GFP_FS)
                        kcred = prepare_kernel_cred(NULL);
                else {
                        unsigned int nofs_flags = memalloc_nofs_save();
                        kcred = prepare_kernel_cred(NULL);
                        memalloc_nofs_restore(nofs_flags);
                }
                rc = -ENOMEM;
                if (!kcred)
                        goto out_err_free;
                kcred->fsuid = uid;
                kcred->fsgid = gid;
                cred = RCU_INITIALIZER(kcred);

                /* the layout's iomode selects which credential slot is filled */
                if (lgr->range.iomode == IOMODE_READ)
                        rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
                else
                        rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);

                /*
                 * Register the mirror with the layout header.  If an
                 * equivalent mirror is already listed, that one is returned
                 * (with a reference taken) and replaces the freshly built
                 * mirror in the array.
                 */
                mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
                if (mirror != fls->mirror_array[i]) {
                        /* swap cred ptrs so free_mirror will clean up old */
                        if (lgr->range.iomode == IOMODE_READ) {
                                cred = xchg(&mirror->ro_cred, cred);
                                rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
                        } else {
                                cred = xchg(&mirror->rw_cred, cred);
                                rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
                        }
                        ff_layout_free_mirror(fls->mirror_array[i]);
                        fls->mirror_array[i] = mirror;
                }

                dprintk("%s: iomode %s uid %u gid %u\n", __func__,
                        lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
                        from_kuid(&init_user_ns, uid),
                        from_kgid(&init_user_ns, gid));
        }

        /*
         * The trailing flags word and report interval are treated as
         * optional: if the stream ends before either, decoding stops
         * without an error and the segment is still accepted.
         */
        p = xdr_inline_decode(&stream, 4);
        if (!p)
                goto out_sort_mirrors;
        fls->flags = be32_to_cpup(p);

        p = xdr_inline_decode(&stream, 4);
        if (!p)
                goto out_sort_mirrors;
        /* one interval on the wire, applied to every mirror of the segment */
        for (i=0; i < fls->mirror_array_cnt; i++)
                fls->mirror_array[i]->report_interval = be32_to_cpup(p);

out_sort_mirrors:
        ff_layout_sort_mirrors(fls);
        rc = ff_layout_check_layout(lgr);
        if (rc)
                goto out_err_free;
        ret = &fls->generic_hdr;
        dprintk("<-- %s (success)\n", __func__);
out_free_page:
        /* shared exit: both the success and the error path end up here */
        __free_page(scratch);
        return ret;
out_err_free:
        _ff_layout_free_lseg(fls);
        ret = ERR_PTR(rc);
        dprintk("<-- %s (%d)\n", __func__, rc);
        goto out_free_page;
}
 562
 563static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
 564{
 565        struct pnfs_layout_segment *lseg;
 566
 567        list_for_each_entry(lseg, &layout->plh_segs, pls_list)
 568                if (lseg->pls_range.iomode == IOMODE_RW)
 569                        return true;
 570
 571        return false;
 572}
 573
 574static void
 575ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 576{
 577        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 578
 579        dprintk("--> %s\n", __func__);
 580
 581        if (lseg->pls_range.iomode == IOMODE_RW) {
 582                struct nfs4_flexfile_layout *ffl;
 583                struct inode *inode;
 584
 585                ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
 586                inode = ffl->generic_hdr.plh_inode;
 587                spin_lock(&inode->i_lock);
 588                if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
 589                        ffl->commit_info.nbuckets = 0;
 590                        kfree(ffl->commit_info.buckets);
 591                        ffl->commit_info.buckets = NULL;
 592                }
 593                spin_unlock(&inode->i_lock);
 594        }
 595        _ff_layout_free_lseg(fls);
 596}
 597
 598/* Return 1 until we have multiple lsegs support */
 599static int
 600ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
 601{
 602        return 1;
 603}
 604
 605static void
 606nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 607{
 608        /* first IO request? */
 609        if (atomic_inc_return(&timer->n_ops) == 1) {
 610                timer->start_time = now;
 611        }
 612}
 613
 614static ktime_t
 615nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
 616{
 617        ktime_t start;
 618
 619        if (atomic_dec_return(&timer->n_ops) < 0)
 620                WARN_ON_ONCE(1);
 621
 622        start = timer->start_time;
 623        timer->start_time = now;
 624        return ktime_sub(now, start);
 625}
 626
 627static bool
 628nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
 629                            struct nfs4_ff_layoutstat *layoutstat,
 630                            ktime_t now)
 631{
 632        s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
 633        struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
 634
 635        nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
 636        if (!mirror->start_time)
 637                mirror->start_time = now;
 638        if (mirror->report_interval != 0)
 639                report_interval = (s64)mirror->report_interval * 1000LL;
 640        else if (layoutstats_timer != 0)
 641                report_interval = (s64)layoutstats_timer * 1000LL;
 642        if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
 643                        report_interval) {
 644                ffl->last_report_time = now;
 645                return true;
 646        }
 647
 648        return false;
 649}
 650
 651static void
 652nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
 653                __u64 requested)
 654{
 655        struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
 656
 657        iostat->ops_requested++;
 658        iostat->bytes_requested += requested;
 659}
 660
 661static void
 662nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
 663                __u64 requested,
 664                __u64 completed,
 665                ktime_t time_completed,
 666                ktime_t time_started)
 667{
 668        struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
 669        ktime_t completion_time = ktime_sub(time_completed, time_started);
 670        ktime_t timer;
 671
 672        iostat->ops_completed++;
 673        iostat->bytes_completed += completed;
 674        iostat->bytes_not_delivered += requested - completed;
 675
 676        timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
 677        iostat->total_busy_time =
 678                        ktime_add(iostat->total_busy_time, timer);
 679        iostat->aggregate_completion_time =
 680                        ktime_add(iostat->aggregate_completion_time,
 681                                        completion_time);
 682}
 683
 684static void
 685nfs4_ff_layout_stat_io_start_read(struct inode *inode,
 686                struct nfs4_ff_layout_mirror *mirror,
 687                __u64 requested, ktime_t now)
 688{
 689        bool report;
 690
 691        spin_lock(&mirror->lock);
 692        report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
 693        nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
 694        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
 695        spin_unlock(&mirror->lock);
 696
 697        if (report)
 698                pnfs_report_layoutstat(inode, GFP_KERNEL);
 699}
 700
 701static void
 702nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
 703                struct nfs4_ff_layout_mirror *mirror,
 704                __u64 requested,
 705                __u64 completed)
 706{
 707        spin_lock(&mirror->lock);
 708        nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
 709                        requested, completed,
 710                        ktime_get(), task->tk_start);
 711        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
 712        spin_unlock(&mirror->lock);
 713}
 714
 715static void
 716nfs4_ff_layout_stat_io_start_write(struct inode *inode,
 717                struct nfs4_ff_layout_mirror *mirror,
 718                __u64 requested, ktime_t now)
 719{
 720        bool report;
 721
 722        spin_lock(&mirror->lock);
 723        report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
 724        nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
 725        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
 726        spin_unlock(&mirror->lock);
 727
 728        if (report)
 729                pnfs_report_layoutstat(inode, GFP_NOIO);
 730}
 731
 732static void
 733nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
 734                struct nfs4_ff_layout_mirror *mirror,
 735                __u64 requested,
 736                __u64 completed,
 737                enum nfs3_stable_how committed)
 738{
 739        if (committed == NFS_UNSTABLE)
 740                requested = completed = 0;
 741
 742        spin_lock(&mirror->lock);
 743        nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
 744                        requested, completed, ktime_get(), task->tk_start);
 745        set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
 746        spin_unlock(&mirror->lock);
 747}
 748
 749static int
 750ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 751                            struct nfs_commit_info *cinfo,
 752                            gfp_t gfp_flags)
 753{
 754        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 755        struct pnfs_commit_bucket *buckets;
 756        int size;
 757
 758        if (cinfo->ds->nbuckets != 0) {
 759                /* This assumes there is only one RW lseg per file.
 760                 * To support multiple lseg per file, we need to
 761                 * change struct pnfs_commit_bucket to allow dynamic
 762                 * increasing nbuckets.
 763                 */
 764                return 0;
 765        }
 766
 767        size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
 768
 769        buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
 770                          gfp_flags);
 771        if (!buckets)
 772                return -ENOMEM;
 773        else {
 774                int i;
 775
 776                spin_lock(&cinfo->inode->i_lock);
 777                if (cinfo->ds->nbuckets != 0)
 778                        kfree(buckets);
 779                else {
 780                        cinfo->ds->buckets = buckets;
 781                        cinfo->ds->nbuckets = size;
 782                        for (i = 0; i < size; i++) {
 783                                INIT_LIST_HEAD(&buckets[i].written);
 784                                INIT_LIST_HEAD(&buckets[i].committing);
 785                                /* mark direct verifier as unset */
 786                                buckets[i].direct_verf.committed =
 787                                        NFS_INVALID_STABLE_HOW;
 788                        }
 789                }
 790                spin_unlock(&cinfo->inode->i_lock);
 791                return 0;
 792        }
 793}
 794
 795static void
 796ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
 797{
 798        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
 799
 800        if (devid)
 801                nfs4_mark_deviceid_unavailable(devid);
 802}
 803
 804static void
 805ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
 806{
 807        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
 808
 809        if (devid)
 810                nfs4_mark_deviceid_available(devid);
 811}
 812
 813static struct nfs4_pnfs_ds *
 814ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
 815                             int start_idx, int *best_idx,
 816                             bool check_device)
 817{
 818        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 819        struct nfs4_ff_layout_mirror *mirror;
 820        struct nfs4_pnfs_ds *ds;
 821        bool fail_return = false;
 822        int idx;
 823
 824        /* mirrors are initially sorted by efficiency */
 825        for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
 826                if (idx+1 == fls->mirror_array_cnt)
 827                        fail_return = !check_device;
 828
 829                mirror = FF_LAYOUT_COMP(lseg, idx);
 830                ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return);
 831                if (!ds)
 832                        continue;
 833
 834                if (check_device &&
 835                    nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
 836                        continue;
 837
 838                *best_idx = idx;
 839                return ds;
 840        }
 841
 842        return NULL;
 843}
 844
 845static struct nfs4_pnfs_ds *
 846ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
 847                                 int start_idx, int *best_idx)
 848{
 849        return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
 850}
 851
 852static struct nfs4_pnfs_ds *
 853ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
 854                                   int start_idx, int *best_idx)
 855{
 856        return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
 857}
 858
 859static struct nfs4_pnfs_ds *
 860ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
 861                                  int start_idx, int *best_idx)
 862{
 863        struct nfs4_pnfs_ds *ds;
 864
 865        ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
 866        if (ds)
 867                return ds;
 868        return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
 869}
 870
 871static void
 872ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
 873                      struct nfs_page *req,
 874                      bool strict_iomode)
 875{
 876        pnfs_put_lseg(pgio->pg_lseg);
 877        pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 878                                           nfs_req_openctx(req),
 879                                           0,
 880                                           NFS4_MAX_UINT64,
 881                                           IOMODE_READ,
 882                                           strict_iomode,
 883                                           GFP_KERNEL);
 884        if (IS_ERR(pgio->pg_lseg)) {
 885                pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 886                pgio->pg_lseg = NULL;
 887        }
 888}
 889
 890static void
 891ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 892                        struct nfs_page *req)
 893{
 894        struct nfs_pgio_mirror *pgm;
 895        struct nfs4_ff_layout_mirror *mirror;
 896        struct nfs4_pnfs_ds *ds;
 897        int ds_idx;
 898
 899retry:
 900        pnfs_generic_pg_check_layout(pgio);
 901        /* Use full layout for now */
 902        if (!pgio->pg_lseg) {
 903                ff_layout_pg_get_read(pgio, req, false);
 904                if (!pgio->pg_lseg)
 905                        goto out_nolseg;
 906        }
 907        if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
 908                ff_layout_pg_get_read(pgio, req, true);
 909                if (!pgio->pg_lseg)
 910                        goto out_nolseg;
 911        }
 912
 913        ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
 914        if (!ds) {
 915                if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
 916                        goto out_mds;
 917                pnfs_put_lseg(pgio->pg_lseg);
 918                pgio->pg_lseg = NULL;
 919                /* Sleep for 1 second before retrying */
 920                ssleep(1);
 921                goto retry;
 922        }
 923
 924        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
 925
 926        pgio->pg_mirror_idx = ds_idx;
 927
 928        /* read always uses only one mirror - idx 0 for pgio layer */
 929        pgm = &pgio->pg_mirrors[0];
 930        pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
 931
 932        if (NFS_SERVER(pgio->pg_inode)->flags &
 933                        (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
 934                pgio->pg_maxretrans = io_maxretrans;
 935        return;
 936out_nolseg:
 937        if (pgio->pg_error < 0)
 938                return;
 939out_mds:
 940        trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
 941                        0, NFS4_MAX_UINT64, IOMODE_READ,
 942                        NFS_I(pgio->pg_inode)->layout,
 943                        pgio->pg_lseg);
 944        pnfs_put_lseg(pgio->pg_lseg);
 945        pgio->pg_lseg = NULL;
 946        pgio->pg_maxretrans = 0;
 947        nfs_pageio_reset_read_mds(pgio);
 948}
 949
 950static void
 951ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 952                        struct nfs_page *req)
 953{
 954        struct nfs4_ff_layout_mirror *mirror;
 955        struct nfs_pgio_mirror *pgm;
 956        struct nfs_commit_info cinfo;
 957        struct nfs4_pnfs_ds *ds;
 958        int i;
 959        int status;
 960
 961retry:
 962        pnfs_generic_pg_check_layout(pgio);
 963        if (!pgio->pg_lseg) {
 964                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 965                                                   nfs_req_openctx(req),
 966                                                   0,
 967                                                   NFS4_MAX_UINT64,
 968                                                   IOMODE_RW,
 969                                                   false,
 970                                                   GFP_NOFS);
 971                if (IS_ERR(pgio->pg_lseg)) {
 972                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 973                        pgio->pg_lseg = NULL;
 974                        return;
 975                }
 976        }
 977        /* If no lseg, fall back to write through mds */
 978        if (pgio->pg_lseg == NULL)
 979                goto out_mds;
 980
 981        nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
 982        status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
 983        if (status < 0)
 984                goto out_mds;
 985
 986        /* Use a direct mapping of ds_idx to pgio mirror_idx */
 987        if (WARN_ON_ONCE(pgio->pg_mirror_count !=
 988            FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
 989                goto out_mds;
 990
 991        for (i = 0; i < pgio->pg_mirror_count; i++) {
 992                mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
 993                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
 994                if (!ds) {
 995                        if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
 996                                goto out_mds;
 997                        pnfs_put_lseg(pgio->pg_lseg);
 998                        pgio->pg_lseg = NULL;
 999                        /* Sleep for 1 second before retrying */
1000                        ssleep(1);
1001                        goto retry;
1002                }
1003                pgm = &pgio->pg_mirrors[i];
1004                pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
1005        }
1006
1007        if (NFS_SERVER(pgio->pg_inode)->flags &
1008                        (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
1009                pgio->pg_maxretrans = io_maxretrans;
1010        return;
1011
1012out_mds:
1013        trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
1014                        0, NFS4_MAX_UINT64, IOMODE_RW,
1015                        NFS_I(pgio->pg_inode)->layout,
1016                        pgio->pg_lseg);
1017        pnfs_put_lseg(pgio->pg_lseg);
1018        pgio->pg_lseg = NULL;
1019        pgio->pg_maxretrans = 0;
1020        nfs_pageio_reset_write_mds(pgio);
1021}
1022
1023static unsigned int
1024ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
1025                                    struct nfs_page *req)
1026{
1027        if (!pgio->pg_lseg) {
1028                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1029                                                   nfs_req_openctx(req),
1030                                                   0,
1031                                                   NFS4_MAX_UINT64,
1032                                                   IOMODE_RW,
1033                                                   false,
1034                                                   GFP_NOFS);
1035                if (IS_ERR(pgio->pg_lseg)) {
1036                        pgio->pg_error = PTR_ERR(pgio->pg_lseg);
1037                        pgio->pg_lseg = NULL;
1038                        goto out;
1039                }
1040        }
1041        if (pgio->pg_lseg)
1042                return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
1043
1044        trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
1045                        0, NFS4_MAX_UINT64, IOMODE_RW,
1046                        NFS_I(pgio->pg_inode)->layout,
1047                        pgio->pg_lseg);
1048        /* no lseg means that pnfs is not in use, so no mirroring here */
1049        nfs_pageio_reset_write_mds(pgio);
1050out:
1051        return 1;
1052}
1053
1054static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
1055        .pg_init = ff_layout_pg_init_read,
1056        .pg_test = pnfs_generic_pg_test,
1057        .pg_doio = pnfs_generic_pg_readpages,
1058        .pg_cleanup = pnfs_generic_pg_cleanup,
1059};
1060
1061static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
1062        .pg_init = ff_layout_pg_init_write,
1063        .pg_test = pnfs_generic_pg_test,
1064        .pg_doio = pnfs_generic_pg_writepages,
1065        .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
1066        .pg_cleanup = pnfs_generic_pg_cleanup,
1067};
1068
1069static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
1070{
1071        struct rpc_task *task = &hdr->task;
1072
1073        pnfs_layoutcommit_inode(hdr->inode, false);
1074
1075        if (retry_pnfs) {
1076                dprintk("%s Reset task %5u for i/o through pNFS "
1077                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
1078                        hdr->task.tk_pid,
1079                        hdr->inode->i_sb->s_id,
1080                        (unsigned long long)NFS_FILEID(hdr->inode),
1081                        hdr->args.count,
1082                        (unsigned long long)hdr->args.offset);
1083
1084                hdr->completion_ops->reschedule_io(hdr);
1085                return;
1086        }
1087
1088        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1089                dprintk("%s Reset task %5u for i/o through MDS "
1090                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
1091                        hdr->task.tk_pid,
1092                        hdr->inode->i_sb->s_id,
1093                        (unsigned long long)NFS_FILEID(hdr->inode),
1094                        hdr->args.count,
1095                        (unsigned long long)hdr->args.offset);
1096
1097                trace_pnfs_mds_fallback_write_done(hdr->inode,
1098                                hdr->args.offset, hdr->args.count,
1099                                IOMODE_RW, NFS_I(hdr->inode)->layout,
1100                                hdr->lseg);
1101                task->tk_status = pnfs_write_done_resend_to_mds(hdr);
1102        }
1103}
1104
1105static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
1106{
1107        struct rpc_task *task = &hdr->task;
1108
1109        pnfs_layoutcommit_inode(hdr->inode, false);
1110
1111        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1112                dprintk("%s Reset task %5u for i/o through MDS "
1113                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
1114                        hdr->task.tk_pid,
1115                        hdr->inode->i_sb->s_id,
1116                        (unsigned long long)NFS_FILEID(hdr->inode),
1117                        hdr->args.count,
1118                        (unsigned long long)hdr->args.offset);
1119
1120                trace_pnfs_mds_fallback_read_done(hdr->inode,
1121                                hdr->args.offset, hdr->args.count,
1122                                IOMODE_READ, NFS_I(hdr->inode)->layout,
1123                                hdr->lseg);
1124                task->tk_status = pnfs_read_done_resend_to_mds(hdr);
1125        }
1126}
1127
/*
 * Classify a failed NFSv4 data-server RPC and tell the caller what to do.
 *
 * Returns:
 *   -EAGAIN                  after session errors (recovery scheduled),
 *                            DELAY/GRACE (task delayed) and
 *                            RETRY_UNCACHED_REP; tk_status is cleared.
 *   -NFS4ERR_RESET_TO_MDS    after layout-invalidating errors (the layout
 *                            is destroyed), and for other errors when
 *                            ff_layout_avoid_mds_available_ds() is false.
 *   -NFS4ERR_RESET_TO_PNFS   for connection and unlisted errors when
 *                            ff_layout_avoid_mds_available_ds() is true.
 *
 * Connection errors additionally delete the mirror's device id.
 * @state is not used by this function.
 */
static int ff_layout_async_handle_error_v4(struct rpc_task *task,
					   struct nfs4_state *state,
					   struct nfs_client *clp,
					   struct pnfs_layout_segment *lseg,
					   int idx)
{
	struct pnfs_layout_hdr *lo = lseg->pls_layout;
	struct inode *inode = lo->plh_inode;
	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;

	switch (task->tk_status) {
	case -NFS4ERR_BADSESSION:
	case -NFS4ERR_BADSLOT:
	case -NFS4ERR_BAD_HIGH_SLOT:
	case -NFS4ERR_DEADSESSION:
	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
	case -NFS4ERR_SEQ_FALSE_RETRY:
	case -NFS4ERR_SEQ_MISORDERED:
		dprintk("%s ERROR %d, Reset session. Exchangeid "
			"flags 0x%x\n", __func__, task->tk_status,
			clp->cl_exchange_flags);
		nfs4_schedule_session_recovery(clp->cl_session,
					       task->tk_status);
		break;
	case -NFS4ERR_DELAY:
	case -NFS4ERR_GRACE:
		rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
		break;
	case -NFS4ERR_RETRY_UNCACHED_REP:
		break;
	/* Invalidate Layout errors */
	case -NFS4ERR_PNFS_NO_LAYOUT:
	case -ESTALE:           /* mapped NFS4ERR_STALE */
	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
	case -NFS4ERR_FHEXPIRED:
	case -NFS4ERR_WRONG_TYPE:
		dprintk("%s Invalid layout error %d\n", __func__,
			task->tk_status);
		/*
		 * Destroy layout so new i/o will get a new layout.
		 * Layout will not be destroyed until all current lseg
		 * references are put. Mark layout as invalid to resend failed
		 * i/o and all i/o waiting on the slot table to the MDS until
		 * layout is destroyed and a new valid layout is obtained.
		 */
		pnfs_destroy_layout(NFS_I(inode));
		rpc_wake_up(&tbl->slot_tbl_waitq);
		goto out_reset_to_mds;
	/* RPC connection errors */
	case -ECONNREFUSED:
	case -EHOSTDOWN:
	case -EHOSTUNREACH:
	case -ENETUNREACH:
	case -EIO:
	case -ETIMEDOUT:
	case -EPIPE:
		dprintk("%s DS connection error %d\n", __func__,
			task->tk_status);
		nfs4_delete_deviceid(devid->ld, devid->nfs_client,
				&devid->deviceid);
		rpc_wake_up(&tbl->slot_tbl_waitq);
		/* fall through */
	default:
		if (ff_layout_avoid_mds_available_ds(lseg))
			return -NFS4ERR_RESET_TO_PNFS;
		goto out_reset_to_mds;
	}

	/* retryable error: clear the status and have the caller restart */
	task->tk_status = 0;
	return -EAGAIN;

out_reset_to_mds:
	dprintk("%s Retry through MDS. Error %d\n", __func__,
		task->tk_status);
	return -NFS4ERR_RESET_TO_MDS;
}
1202
1203/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
1204static int ff_layout_async_handle_error_v3(struct rpc_task *task,
1205                                           struct pnfs_layout_segment *lseg,
1206                                           int idx)
1207{
1208        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
1209
1210        switch (task->tk_status) {
1211        /* File access problems. Don't mark the device as unavailable */
1212        case -EACCES:
1213        case -ESTALE:
1214        case -EISDIR:
1215        case -EBADHANDLE:
1216        case -ELOOP:
1217        case -ENOSPC:
1218                break;
1219        case -EJUKEBOX:
1220                nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
1221                goto out_retry;
1222        default:
1223                dprintk("%s DS connection error %d\n", __func__,
1224                        task->tk_status);
1225                nfs4_delete_deviceid(devid->ld, devid->nfs_client,
1226                                &devid->deviceid);
1227        }
1228        /* FIXME: Need to prevent infinite looping here. */
1229        return -NFS4ERR_RESET_TO_PNFS;
1230out_retry:
1231        task->tk_status = 0;
1232        rpc_restart_call_prepare(task);
1233        rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
1234        return -EAGAIN;
1235}
1236
1237static int ff_layout_async_handle_error(struct rpc_task *task,
1238                                        struct nfs4_state *state,
1239                                        struct nfs_client *clp,
1240                                        struct pnfs_layout_segment *lseg,
1241                                        int idx)
1242{
1243        int vers = clp->cl_nfs_mod->rpc_vers->number;
1244
1245        if (task->tk_status >= 0) {
1246                ff_layout_mark_ds_reachable(lseg, idx);
1247                return 0;
1248        }
1249
1250        /* Handle the case of an invalid layout segment */
1251        if (!pnfs_is_valid_lseg(lseg))
1252                return -NFS4ERR_RESET_TO_PNFS;
1253
1254        switch (vers) {
1255        case 3:
1256                return ff_layout_async_handle_error_v3(task, lseg, idx);
1257        case 4:
1258                return ff_layout_async_handle_error_v4(task, state, clp,
1259                                                       lseg, idx);
1260        default:
1261                /* should never happen */
1262                WARN_ON_ONCE(1);
1263                return 0;
1264        }
1265}
1266
1267static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
1268                                        int idx, u64 offset, u64 length,
1269                                        u32 status, int opnum, int error)
1270{
1271        struct nfs4_ff_layout_mirror *mirror;
1272        int err;
1273
1274        if (status == 0) {
1275                switch (error) {
1276                case -ETIMEDOUT:
1277                case -EPFNOSUPPORT:
1278                case -EPROTONOSUPPORT:
1279                case -EOPNOTSUPP:
1280                case -ECONNREFUSED:
1281                case -ECONNRESET:
1282                case -EHOSTDOWN:
1283                case -EHOSTUNREACH:
1284                case -ENETUNREACH:
1285                case -EADDRINUSE:
1286                case -ENOBUFS:
1287                case -EPIPE:
1288                case -EPERM:
1289                        status = NFS4ERR_NXIO;
1290                        break;
1291                case -EACCES:
1292                        status = NFS4ERR_ACCESS;
1293                        break;
1294                default:
1295                        return;
1296                }
1297        }
1298
1299        switch (status) {
1300        case NFS4ERR_DELAY:
1301        case NFS4ERR_GRACE:
1302                return;
1303        default:
1304                break;
1305        }
1306
1307        mirror = FF_LAYOUT_COMP(lseg, idx);
1308        err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
1309                                       mirror, offset, length, status, opnum,
1310                                       GFP_NOIO);
1311        if (status == NFS4ERR_NXIO)
1312                ff_layout_mark_ds_unreachable(lseg, idx);
1313        pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
1314        dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
1315}
1316
1317/* NFS_PROTO call done callback routines */
1318static int ff_layout_read_done_cb(struct rpc_task *task,
1319                                struct nfs_pgio_header *hdr)
1320{
1321        int new_idx = hdr->pgio_mirror_idx;
1322        int err;
1323
1324        trace_nfs4_pnfs_read(hdr, task->tk_status);
1325        if (task->tk_status < 0)
1326                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1327                                            hdr->args.offset, hdr->args.count,
1328                                            hdr->res.op_status, OP_READ,
1329                                            task->tk_status);
1330        err = ff_layout_async_handle_error(task, hdr->args.context->state,
1331                                           hdr->ds_clp, hdr->lseg,
1332                                           hdr->pgio_mirror_idx);
1333
1334        clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
1335        clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
1336        switch (err) {
1337        case -NFS4ERR_RESET_TO_PNFS:
1338                if (ff_layout_choose_best_ds_for_read(hdr->lseg,
1339                                        hdr->pgio_mirror_idx + 1,
1340                                        &new_idx))
1341                        goto out_layouterror;
1342                set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
1343                return task->tk_status;
1344        case -NFS4ERR_RESET_TO_MDS:
1345                set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
1346                return task->tk_status;
1347        case -EAGAIN:
1348                goto out_eagain;
1349        }
1350
1351        return 0;
1352out_layouterror:
1353        ff_layout_read_record_layoutstats_done(task, hdr);
1354        ff_layout_send_layouterror(hdr->lseg);
1355        hdr->pgio_mirror_idx = new_idx;
1356out_eagain:
1357        rpc_restart_call_prepare(task);
1358        return -EAGAIN;
1359}
1360
1361static bool
1362ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
1363{
1364        return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
1365}
1366
1367/*
1368 * We reference the rpc_cred of the first WRITE that triggers the need for
1369 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
1370 * rfc5661 is not clear about which credential should be used.
1371 *
1372 * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
1373 * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
1374 * we always send layoutcommit after DS writes.
1375 */
1376static void
1377ff_layout_set_layoutcommit(struct inode *inode,
1378                struct pnfs_layout_segment *lseg,
1379                loff_t end_offset)
1380{
1381        if (!ff_layout_need_layoutcommit(lseg))
1382                return;
1383
1384        pnfs_set_layoutcommit(inode, lseg, end_offset);
1385        dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
1386                (unsigned long long) NFS_I(inode)->layout->plh_lwb);
1387}
1388
1389static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
1390                struct nfs_pgio_header *hdr)
1391{
1392        if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
1393                return;
1394        nfs4_ff_layout_stat_io_start_read(hdr->inode,
1395                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
1396                        hdr->args.count,
1397                        task->tk_start);
1398}
1399
1400static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
1401                struct nfs_pgio_header *hdr)
1402{
1403        if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
1404                return;
1405        nfs4_ff_layout_stat_io_end_read(task,
1406                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
1407                        hdr->args.count,
1408                        hdr->res.count);
1409        set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
1410}
1411
1412static int ff_layout_read_prepare_common(struct rpc_task *task,
1413                                         struct nfs_pgio_header *hdr)
1414{
1415        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1416                rpc_exit(task, -EIO);
1417                return -EIO;
1418        }
1419
1420        ff_layout_read_record_layoutstats_start(task, hdr);
1421        return 0;
1422}
1423
/*
 * Call ops for the async read/write cases
 */

/*
 * rpc_call_prepare for NFSv3 data-server reads: run the common checks and
 * start the call only if they succeed.
 */
static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
{
	struct nfs_pgio_header *hdr = data;

	if (ff_layout_read_prepare_common(task, hdr) == 0)
		rpc_call_start(task);
}
1438
1439static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
1440{
1441        struct nfs_pgio_header *hdr = data;
1442
1443        if (nfs4_setup_sequence(hdr->ds_clp,
1444                                &hdr->args.seq_args,
1445                                &hdr->res.seq_res,
1446                                task))
1447                return;
1448
1449        ff_layout_read_prepare_common(task, hdr);
1450}
1451
1452static void ff_layout_read_call_done(struct rpc_task *task, void *data)
1453{
1454        struct nfs_pgio_header *hdr = data;
1455
1456        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
1457
1458        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
1459            task->tk_status == 0) {
1460                nfs4_sequence_done(task, &hdr->res.seq_res);
1461                return;
1462        }
1463
1464        /* Note this may cause RPC to be resent */
1465        hdr->mds_ops->rpc_call_done(task, hdr);
1466}
1467
1468static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
1469{
1470        struct nfs_pgio_header *hdr = data;
1471
1472        ff_layout_read_record_layoutstats_done(task, hdr);
1473        rpc_count_iostats_metrics(task,
1474            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
1475}
1476
1477static void ff_layout_read_release(void *data)
1478{
1479        struct nfs_pgio_header *hdr = data;
1480
1481        ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
1482        if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
1483                ff_layout_send_layouterror(hdr->lseg);
1484                pnfs_read_resend_pnfs(hdr);
1485        } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
1486                ff_layout_reset_read(hdr);
1487        pnfs_generic_rw_release(data);
1488}
1489
1490
1491static int ff_layout_write_done_cb(struct rpc_task *task,
1492                                struct nfs_pgio_header *hdr)
1493{
1494        loff_t end_offs = 0;
1495        int err;
1496
1497        trace_nfs4_pnfs_write(hdr, task->tk_status);
1498        if (task->tk_status < 0)
1499                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1500                                            hdr->args.offset, hdr->args.count,
1501                                            hdr->res.op_status, OP_WRITE,
1502                                            task->tk_status);
1503        err = ff_layout_async_handle_error(task, hdr->args.context->state,
1504                                           hdr->ds_clp, hdr->lseg,
1505                                           hdr->pgio_mirror_idx);
1506
1507        clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
1508        clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
1509        switch (err) {
1510        case -NFS4ERR_RESET_TO_PNFS:
1511                set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
1512                return task->tk_status;
1513        case -NFS4ERR_RESET_TO_MDS:
1514                set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
1515                return task->tk_status;
1516        case -EAGAIN:
1517                return -EAGAIN;
1518        }
1519
1520        if (hdr->res.verf->committed == NFS_FILE_SYNC ||
1521            hdr->res.verf->committed == NFS_DATA_SYNC)
1522                end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
1523
1524        /* Note: if the write is unstable, don't set end_offs until commit */
1525        ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
1526
1527        /* zero out fattr since we don't care DS attr at all */
1528        hdr->fattr.valid = 0;
1529        if (task->tk_status >= 0)
1530                nfs_writeback_update_inode(hdr);
1531
1532        return 0;
1533}
1534
1535static int ff_layout_commit_done_cb(struct rpc_task *task,
1536                                     struct nfs_commit_data *data)
1537{
1538        int err;
1539
1540        trace_nfs4_pnfs_commit_ds(data, task->tk_status);
1541        if (task->tk_status < 0)
1542                ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
1543                                            data->args.offset, data->args.count,
1544                                            data->res.op_status, OP_COMMIT,
1545                                            task->tk_status);
1546        err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
1547                                           data->lseg, data->ds_commit_index);
1548
1549        switch (err) {
1550        case -NFS4ERR_RESET_TO_PNFS:
1551                pnfs_generic_prepare_to_resend_writes(data);
1552                return -EAGAIN;
1553        case -NFS4ERR_RESET_TO_MDS:
1554                pnfs_generic_prepare_to_resend_writes(data);
1555                return -EAGAIN;
1556        case -EAGAIN:
1557                rpc_restart_call_prepare(task);
1558                return -EAGAIN;
1559        }
1560
1561        ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
1562
1563        return 0;
1564}
1565
1566static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
1567                struct nfs_pgio_header *hdr)
1568{
1569        if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
1570                return;
1571        nfs4_ff_layout_stat_io_start_write(hdr->inode,
1572                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
1573                        hdr->args.count,
1574                        task->tk_start);
1575}
1576
1577static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
1578                struct nfs_pgio_header *hdr)
1579{
1580        if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
1581                return;
1582        nfs4_ff_layout_stat_io_end_write(task,
1583                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
1584                        hdr->args.count, hdr->res.count,
1585                        hdr->res.verf->committed);
1586        set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
1587}
1588
1589static int ff_layout_write_prepare_common(struct rpc_task *task,
1590                                          struct nfs_pgio_header *hdr)
1591{
1592        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1593                rpc_exit(task, -EIO);
1594                return -EIO;
1595        }
1596
1597        ff_layout_write_record_layoutstats_start(task, hdr);
1598        return 0;
1599}
1600
/*
 * rpc_call_prepare for NFSv3 data-server writes: run the common checks and
 * start the call only if they succeed.
 */
static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
{
	struct nfs_pgio_header *hdr = data;

	if (ff_layout_write_prepare_common(task, hdr) == 0)
		rpc_call_start(task);
}
1610
1611static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
1612{
1613        struct nfs_pgio_header *hdr = data;
1614
1615        if (nfs4_setup_sequence(hdr->ds_clp,
1616                                &hdr->args.seq_args,
1617                                &hdr->res.seq_res,
1618                                task))
1619                return;
1620
1621        ff_layout_write_prepare_common(task, hdr);
1622}
1623
1624static void ff_layout_write_call_done(struct rpc_task *task, void *data)
1625{
1626        struct nfs_pgio_header *hdr = data;
1627
1628        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
1629            task->tk_status == 0) {
1630                nfs4_sequence_done(task, &hdr->res.seq_res);
1631                return;
1632        }
1633
1634        /* Note this may cause RPC to be resent */
1635        hdr->mds_ops->rpc_call_done(task, hdr);
1636}
1637
1638static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
1639{
1640        struct nfs_pgio_header *hdr = data;
1641
1642        ff_layout_write_record_layoutstats_done(task, hdr);
1643        rpc_count_iostats_metrics(task,
1644            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
1645}
1646
1647static void ff_layout_write_release(void *data)
1648{
1649        struct nfs_pgio_header *hdr = data;
1650
1651        ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
1652        if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
1653                ff_layout_send_layouterror(hdr->lseg);
1654                ff_layout_reset_write(hdr, true);
1655        } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
1656                ff_layout_reset_write(hdr, false);
1657        pnfs_generic_rw_release(data);
1658}
1659
1660static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
1661                struct nfs_commit_data *cdata)
1662{
1663        if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
1664                return;
1665        nfs4_ff_layout_stat_io_start_write(cdata->inode,
1666                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
1667                        0, task->tk_start);
1668}
1669
1670static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
1671                struct nfs_commit_data *cdata)
1672{
1673        struct nfs_page *req;
1674        __u64 count = 0;
1675
1676        if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
1677                return;
1678
1679        if (task->tk_status == 0) {
1680                list_for_each_entry(req, &cdata->pages, wb_list)
1681                        count += req->wb_bytes;
1682        }
1683        nfs4_ff_layout_stat_io_end_write(task,
1684                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
1685                        count, count, NFS_FILE_SYNC);
1686        set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
1687}
1688
/*
 * Preparation shared by the v3 and v4 commit prepare callbacks; currently
 * this only starts the layoutstats record for the commit.
 */
static void ff_layout_commit_prepare_common(struct rpc_task *task,
                struct nfs_commit_data *cdata)
{
        ff_layout_commit_record_layoutstats_start(task, cdata);
}
1694
/*
 * rpc_call_prepare callback used by ff_layout_commit_call_ops_v3: do the
 * common commit preparation and start the call unconditionally.
 */
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
        struct nfs_commit_data *cdata = data;

        ff_layout_commit_prepare_common(task, cdata);
        rpc_call_start(task);
}
1700
1701static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
1702{
1703        struct nfs_commit_data *wdata = data;
1704
1705        if (nfs4_setup_sequence(wdata->ds_clp,
1706                                &wdata->args.seq_args,
1707                                &wdata->res.seq_res,
1708                                task))
1709                return;
1710        ff_layout_commit_prepare_common(task, data);
1711}
1712
/* rpc_call_done callback for DS commits: defer to the generic pNFS handler. */
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
        pnfs_generic_write_commit_done(task, data);
}
1717
1718static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
1719{
1720        struct nfs_commit_data *cdata = data;
1721
1722        ff_layout_commit_record_layoutstats_done(task, cdata);
1723        rpc_count_iostats_metrics(task,
1724            &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
1725}
1726
1727static void ff_layout_commit_release(void *data)
1728{
1729        struct nfs_commit_data *cdata = data;
1730
1731        ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
1732        pnfs_generic_commit_release(data);
1733}
1734
1735static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
1736        .rpc_call_prepare = ff_layout_read_prepare_v3,
1737        .rpc_call_done = ff_layout_read_call_done,
1738        .rpc_count_stats = ff_layout_read_count_stats,
1739        .rpc_release = ff_layout_read_release,
1740};
1741
1742static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
1743        .rpc_call_prepare = ff_layout_read_prepare_v4,
1744        .rpc_call_done = ff_layout_read_call_done,
1745        .rpc_count_stats = ff_layout_read_count_stats,
1746        .rpc_release = ff_layout_read_release,
1747};
1748
1749static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
1750        .rpc_call_prepare = ff_layout_write_prepare_v3,
1751        .rpc_call_done = ff_layout_write_call_done,
1752        .rpc_count_stats = ff_layout_write_count_stats,
1753        .rpc_release = ff_layout_write_release,
1754};
1755
1756static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
1757        .rpc_call_prepare = ff_layout_write_prepare_v4,
1758        .rpc_call_done = ff_layout_write_call_done,
1759        .rpc_count_stats = ff_layout_write_count_stats,
1760        .rpc_release = ff_layout_write_release,
1761};
1762
1763static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
1764        .rpc_call_prepare = ff_layout_commit_prepare_v3,
1765        .rpc_call_done = ff_layout_commit_done,
1766        .rpc_count_stats = ff_layout_commit_count_stats,
1767        .rpc_release = ff_layout_commit_release,
1768};
1769
1770static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
1771        .rpc_call_prepare = ff_layout_commit_prepare_v4,
1772        .rpc_call_done = ff_layout_commit_done,
1773        .rpc_count_stats = ff_layout_commit_count_stats,
1774        .rpc_release = ff_layout_commit_release,
1775};
1776
1777static enum pnfs_try_status
1778ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
1779{
1780        struct pnfs_layout_segment *lseg = hdr->lseg;
1781        struct nfs4_pnfs_ds *ds;
1782        struct rpc_clnt *ds_clnt;
1783        struct nfs4_ff_layout_mirror *mirror;
1784        const struct cred *ds_cred;
1785        loff_t offset = hdr->args.offset;
1786        u32 idx = hdr->pgio_mirror_idx;
1787        int vers;
1788        struct nfs_fh *fh;
1789
1790        dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
1791                __func__, hdr->inode->i_ino,
1792                hdr->args.pgbase, (size_t)hdr->args.count, offset);
1793
1794        mirror = FF_LAYOUT_COMP(lseg, idx);
1795        ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
1796        if (!ds)
1797                goto out_failed;
1798
1799        ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
1800                                                   hdr->inode);
1801        if (IS_ERR(ds_clnt))
1802                goto out_failed;
1803
1804        ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
1805        if (!ds_cred)
1806                goto out_failed;
1807
1808        vers = nfs4_ff_layout_ds_version(mirror);
1809
1810        dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
1811                ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
1812
1813        hdr->pgio_done_cb = ff_layout_read_done_cb;
1814        refcount_inc(&ds->ds_clp->cl_count);
1815        hdr->ds_clp = ds->ds_clp;
1816        fh = nfs4_ff_layout_select_ds_fh(mirror);
1817        if (fh)
1818                hdr->args.fh = fh;
1819
1820        nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
1821
1822        /*
1823         * Note that if we ever decide to split across DSes,
1824         * then we may need to handle dense-like offsets.
1825         */
1826        hdr->args.offset = offset;
1827        hdr->mds_offset = offset;
1828
1829        /* Perform an asynchronous read to ds */
1830        nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1831                          vers == 3 ? &ff_layout_read_call_ops_v3 :
1832                                      &ff_layout_read_call_ops_v4,
1833                          0, RPC_TASK_SOFTCONN);
1834        put_cred(ds_cred);
1835        return PNFS_ATTEMPTED;
1836
1837out_failed:
1838        if (ff_layout_avoid_mds_available_ds(lseg))
1839                return PNFS_TRY_AGAIN;
1840        trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
1841                        hdr->args.offset, hdr->args.count,
1842                        IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
1843        return PNFS_NOT_ATTEMPTED;
1844}
1845
1846/* Perform async writes. */
1847static enum pnfs_try_status
1848ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
1849{
1850        struct pnfs_layout_segment *lseg = hdr->lseg;
1851        struct nfs4_pnfs_ds *ds;
1852        struct rpc_clnt *ds_clnt;
1853        struct nfs4_ff_layout_mirror *mirror;
1854        const struct cred *ds_cred;
1855        loff_t offset = hdr->args.offset;
1856        int vers;
1857        struct nfs_fh *fh;
1858        int idx = hdr->pgio_mirror_idx;
1859
1860        mirror = FF_LAYOUT_COMP(lseg, idx);
1861        ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
1862        if (!ds)
1863                goto out_failed;
1864
1865        ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
1866                                                   hdr->inode);
1867        if (IS_ERR(ds_clnt))
1868                goto out_failed;
1869
1870        ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
1871        if (!ds_cred)
1872                goto out_failed;
1873
1874        vers = nfs4_ff_layout_ds_version(mirror);
1875
1876        dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
1877                __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
1878                offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count),
1879                vers);
1880
1881        hdr->pgio_done_cb = ff_layout_write_done_cb;
1882        refcount_inc(&ds->ds_clp->cl_count);
1883        hdr->ds_clp = ds->ds_clp;
1884        hdr->ds_commit_idx = idx;
1885        fh = nfs4_ff_layout_select_ds_fh(mirror);
1886        if (fh)
1887                hdr->args.fh = fh;
1888
1889        nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
1890
1891        /*
1892         * Note that if we ever decide to split across DSes,
1893         * then we may need to handle dense-like offsets.
1894         */
1895        hdr->args.offset = offset;
1896
1897        /* Perform an asynchronous write */
1898        nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1899                          vers == 3 ? &ff_layout_write_call_ops_v3 :
1900                                      &ff_layout_write_call_ops_v4,
1901                          sync, RPC_TASK_SOFTCONN);
1902        put_cred(ds_cred);
1903        return PNFS_ATTEMPTED;
1904
1905out_failed:
1906        if (ff_layout_avoid_mds_available_ds(lseg))
1907                return PNFS_TRY_AGAIN;
1908        trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
1909                        hdr->args.offset, hdr->args.count,
1910                        IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
1911        return PNFS_NOT_ATTEMPTED;
1912}
1913
1914static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1915{
1916        return i;
1917}
1918
1919static struct nfs_fh *
1920select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1921{
1922        struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
1923
1924        /* FIXME: Assume that there is only one NFS version available
1925         * for the DS.
1926         */
1927        return &flseg->mirror_array[i]->fh_versions[0];
1928}
1929
1930static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
1931{
1932        struct pnfs_layout_segment *lseg = data->lseg;
1933        struct nfs4_pnfs_ds *ds;
1934        struct rpc_clnt *ds_clnt;
1935        struct nfs4_ff_layout_mirror *mirror;
1936        const struct cred *ds_cred;
1937        u32 idx;
1938        int vers, ret;
1939        struct nfs_fh *fh;
1940
1941        if (!lseg || !(pnfs_is_valid_lseg(lseg) ||
1942            test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
1943                goto out_err;
1944
1945        idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1946        mirror = FF_LAYOUT_COMP(lseg, idx);
1947        ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
1948        if (!ds)
1949                goto out_err;
1950
1951        ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
1952                                                   data->inode);
1953        if (IS_ERR(ds_clnt))
1954                goto out_err;
1955
1956        ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
1957        if (!ds_cred)
1958                goto out_err;
1959
1960        vers = nfs4_ff_layout_ds_version(mirror);
1961
1962        dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
1963                data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
1964                vers);
1965        data->commit_done_cb = ff_layout_commit_done_cb;
1966        data->cred = ds_cred;
1967        refcount_inc(&ds->ds_clp->cl_count);
1968        data->ds_clp = ds->ds_clp;
1969        fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1970        if (fh)
1971                data->args.fh = fh;
1972
1973        ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
1974                                   vers == 3 ? &ff_layout_commit_call_ops_v3 :
1975                                               &ff_layout_commit_call_ops_v4,
1976                                   how, RPC_TASK_SOFTCONN);
1977        put_cred(ds_cred);
1978        return ret;
1979out_err:
1980        pnfs_generic_prepare_to_resend_writes(data);
1981        pnfs_generic_commit_release(data);
1982        return -EAGAIN;
1983}
1984
1985static int
1986ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1987                           int how, struct nfs_commit_info *cinfo)
1988{
1989        return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1990                                            ff_layout_initiate_commit);
1991}
1992
1993static struct pnfs_ds_commit_info *
1994ff_layout_get_ds_info(struct inode *inode)
1995{
1996        struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1997
1998        if (layout == NULL)
1999                return NULL;
2000
2001        return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
2002}
2003
2004static void
2005ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
2006{
2007        nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
2008                                                  id_node));
2009}
2010
2011static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
2012                                  const struct nfs4_layoutreturn_args *args,
2013                                  const struct nfs4_flexfile_layoutreturn_args *ff_args)
2014{
2015        __be32 *start;
2016
2017        start = xdr_reserve_space(xdr, 4);
2018        if (unlikely(!start))
2019                return -E2BIG;
2020
2021        *start = cpu_to_be32(ff_args->num_errors);
2022        /* This assume we always return _ALL_ layouts */
2023        return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors);
2024}
2025
2026static void
2027encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
2028{
2029        WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
2030}
2031
2032static void
2033ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
2034                            const nfs4_stateid *stateid,
2035                            const struct nfs42_layoutstat_devinfo *devinfo)
2036{
2037        __be32 *p;
2038
2039        p = xdr_reserve_space(xdr, 8 + 8);
2040        p = xdr_encode_hyper(p, devinfo->offset);
2041        p = xdr_encode_hyper(p, devinfo->length);
2042        encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
2043        p = xdr_reserve_space(xdr, 4*8);
2044        p = xdr_encode_hyper(p, devinfo->read_count);
2045        p = xdr_encode_hyper(p, devinfo->read_bytes);
2046        p = xdr_encode_hyper(p, devinfo->write_count);
2047        p = xdr_encode_hyper(p, devinfo->write_bytes);
2048        encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE);
2049}
2050
2051static void
2052ff_layout_encode_ff_iostat(struct xdr_stream *xdr,
2053                            const nfs4_stateid *stateid,
2054                            const struct nfs42_layoutstat_devinfo *devinfo)
2055{
2056        ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo);
2057        ff_layout_encode_ff_layoutupdate(xdr, devinfo,
2058                        devinfo->ld_private.data);
2059}
2060
2061/* report nothing for now */
2062static void ff_layout_encode_iostats_array(struct xdr_stream *xdr,
2063                const struct nfs4_layoutreturn_args *args,
2064                struct nfs4_flexfile_layoutreturn_args *ff_args)
2065{
2066        __be32 *p;
2067        int i;
2068
2069        p = xdr_reserve_space(xdr, 4);
2070        *p = cpu_to_be32(ff_args->num_dev);
2071        for (i = 0; i < ff_args->num_dev; i++)
2072                ff_layout_encode_ff_iostat(xdr,
2073                                &args->layout->plh_stateid,
2074                                &ff_args->devinfo[i]);
2075}
2076
2077static void
2078ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo,
2079                unsigned int num_entries)
2080{
2081        unsigned int i;
2082
2083        for (i = 0; i < num_entries; i++) {
2084                if (!devinfo[i].ld_private.ops)
2085                        continue;
2086                if (!devinfo[i].ld_private.ops->free)
2087                        continue;
2088                devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
2089        }
2090}
2091
2092static struct nfs4_deviceid_node *
2093ff_layout_alloc_deviceid_node(struct nfs_server *server,
2094                              struct pnfs_device *pdev, gfp_t gfp_flags)
2095{
2096        struct nfs4_ff_layout_ds *dsaddr;
2097
2098        dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
2099        if (!dsaddr)
2100                return NULL;
2101        return &dsaddr->id_node;
2102}
2103
2104static void
2105ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
2106                const void *voidargs,
2107                const struct nfs4_xdr_opaque_data *ff_opaque)
2108{
2109        const struct nfs4_layoutreturn_args *args = voidargs;
2110        struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data;
2111        struct xdr_buf tmp_buf = {
2112                .head = {
2113                        [0] = {
2114                                .iov_base = page_address(ff_args->pages[0]),
2115                        },
2116                },
2117                .buflen = PAGE_SIZE,
2118        };
2119        struct xdr_stream tmp_xdr;
2120        __be32 *start;
2121
2122        dprintk("%s: Begin\n", __func__);
2123
2124        xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
2125
2126        ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
2127        ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
2128
2129        start = xdr_reserve_space(xdr, 4);
2130        *start = cpu_to_be32(tmp_buf.len);
2131        xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len);
2132
2133        dprintk("%s: Return\n", __func__);
2134}
2135
2136static void
2137ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args)
2138{
2139        struct nfs4_flexfile_layoutreturn_args *ff_args;
2140
2141        if (!args->data)
2142                return;
2143        ff_args = args->data;
2144        args->data = NULL;
2145
2146        ff_layout_free_ds_ioerr(&ff_args->errors);
2147        ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev);
2148
2149        put_page(ff_args->pages[0]);
2150        kfree(ff_args);
2151}
2152
2153static const struct nfs4_xdr_opaque_ops layoutreturn_ops = {
2154        .encode = ff_layout_encode_layoutreturn,
2155        .free = ff_layout_free_layoutreturn,
2156};
2157
2158static int
2159ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
2160{
2161        struct nfs4_flexfile_layoutreturn_args *ff_args;
2162        struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
2163
2164        ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL);
2165        if (!ff_args)
2166                goto out_nomem;
2167        ff_args->pages[0] = alloc_page(GFP_KERNEL);
2168        if (!ff_args->pages[0])
2169                goto out_nomem_free;
2170
2171        INIT_LIST_HEAD(&ff_args->errors);
2172        ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout,
2173                        &args->range, &ff_args->errors,
2174                        FF_LAYOUTRETURN_MAXERR);
2175
2176        spin_lock(&args->inode->i_lock);
2177        ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
2178                        &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo));
2179        spin_unlock(&args->inode->i_lock);
2180
2181        args->ld_private->ops = &layoutreturn_ops;
2182        args->ld_private->data = ff_args;
2183        return 0;
2184out_nomem_free:
2185        kfree(ff_args);
2186out_nomem:
2187        return -ENOMEM;
2188}
2189
#ifdef CONFIG_NFS_V4_2
/*
 * Report the DS I/O errors recorded for @lseg's range to the server via
 * LAYOUTERROR, in batches of at most NFS42_LAYOUTERROR_MAX entries.
 *
 * Nothing is sent when the server lacks NFS_CAP_LAYOUTERROR or no errors
 * are pending. Sending stops at the first batch for which
 * nfs42_proc_layouterror() returns a negative value. The fetched error
 * list is always freed, including when the batch array cannot be
 * allocated.
 */
void
ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
{
        struct pnfs_layout_hdr *lo = lseg->pls_layout;
        const struct nfs4_ff_layout_ds_err *err;
        struct nfs42_layout_error *batch;
        size_t used = 0;
        LIST_HEAD(pending);

        if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
                return;
        ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &pending, -1);
        if (list_empty(&pending))
                return;

        batch = kmalloc_array(NFS42_LAYOUTERROR_MAX,
                        sizeof(*batch), GFP_NOFS);
        if (batch == NULL)
                goto out_free;

        list_for_each_entry(err, &pending, list) {
                batch[used].offset = err->offset;
                batch[used].length = err->length;
                nfs4_stateid_copy(&batch[used].stateid, &err->stateid);
                batch[used].errors[0].dev_id = err->deviceid;
                batch[used].errors[0].status = err->status;
                batch[used].errors[0].opnum = err->opnum;
                used++;

                /* Flush when the batch is full or this was the last entry. */
                if (list_is_last(&err->list, &pending) ||
                    used >= NFS42_LAYOUTERROR_MAX) {
                        if (nfs42_proc_layouterror(lseg, batch, used) < 0)
                                break;
                        used = 0;
                }
        }
        kfree(batch);

out_free:
        ff_layout_free_ds_ioerr(&pending);
}
#else
/* Without CONFIG_NFS_V4_2 there is no LAYOUTERROR to send; do nothing. */
void
ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
{
}
#endif
2235
2236static int
2237ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
2238{
2239        const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
2240
2241        return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
2242}
2243
2244static size_t
2245ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
2246                          const int buflen)
2247{
2248        const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
2249        const struct in6_addr *addr = &sin6->sin6_addr;
2250
2251        /*
2252         * RFC 4291, Section 2.2.2
2253         *
2254         * Shorthanded ANY address
2255         */
2256        if (ipv6_addr_any(addr))
2257                return snprintf(buf, buflen, "::");
2258
2259        /*
2260         * RFC 4291, Section 2.2.2
2261         *
2262         * Shorthanded loopback address
2263         */
2264        if (ipv6_addr_loopback(addr))
2265                return snprintf(buf, buflen, "::1");
2266
2267        /*
2268         * RFC 4291, Section 2.2.3
2269         *
2270         * Special presentation address format for mapped v4
2271         * addresses.
2272         */
2273        if (ipv6_addr_v4mapped(addr))
2274                return snprintf(buf, buflen, "::ffff:%pI4",
2275                                        &addr->s6_addr32[3]);
2276
2277        /*
2278         * RFC 4291, Section 2.2.1
2279         */
2280        return snprintf(buf, buflen, "%pI6c", addr);
2281}
2282
2283/* Derived from rpc_sockaddr2uaddr */
2284static void
2285ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
2286{
2287        struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
2288        char portbuf[RPCBIND_MAXUADDRPLEN];
2289        char addrbuf[RPCBIND_MAXUADDRLEN];
2290        char *netid;
2291        unsigned short port;
2292        int len, netid_len;
2293        __be32 *p;
2294
2295        switch (sap->sa_family) {
2296        case AF_INET:
2297                if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
2298                        return;
2299                port = ntohs(((struct sockaddr_in *)sap)->sin_port);
2300                netid = "tcp";
2301                netid_len = 3;
2302                break;
2303        case AF_INET6:
2304                if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
2305                        return;
2306                port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
2307                netid = "tcp6";
2308                netid_len = 4;
2309                break;
2310        default:
2311                /* we only support tcp and tcp6 */
2312                WARN_ON_ONCE(1);
2313                return;
2314        }
2315
2316        snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
2317        len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
2318
2319        p = xdr_reserve_space(xdr, 4 + netid_len);
2320        xdr_encode_opaque(p, netid, netid_len);
2321
2322        p = xdr_reserve_space(xdr, 4 + len);
2323        xdr_encode_opaque(p, addrbuf, len);
2324}
2325
2326static void
2327ff_layout_encode_nfstime(struct xdr_stream *xdr,
2328                         ktime_t t)
2329{
2330        struct timespec64 ts;
2331        __be32 *p;
2332
2333        p = xdr_reserve_space(xdr, 12);
2334        ts = ktime_to_timespec64(t);
2335        p = xdr_encode_hyper(p, ts.tv_sec);
2336        *p++ = cpu_to_be32(ts.tv_nsec);
2337}
2338
2339static void
2340ff_layout_encode_io_latency(struct xdr_stream *xdr,
2341                            struct nfs4_ff_io_stat *stat)
2342{
2343        __be32 *p;
2344
2345        p = xdr_reserve_space(xdr, 5 * 8);
2346        p = xdr_encode_hyper(p, stat->ops_requested);
2347        p = xdr_encode_hyper(p, stat->bytes_requested);
2348        p = xdr_encode_hyper(p, stat->ops_completed);
2349        p = xdr_encode_hyper(p, stat->bytes_completed);
2350        p = xdr_encode_hyper(p, stat->bytes_not_delivered);
2351        ff_layout_encode_nfstime(xdr, stat->total_busy_time);
2352        ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
2353}
2354
2355static void
2356ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
2357                              const struct nfs42_layoutstat_devinfo *devinfo,
2358                              struct nfs4_ff_layout_mirror *mirror)
2359{
2360        struct nfs4_pnfs_ds_addr *da;
2361        struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
2362        struct nfs_fh *fh = &mirror->fh_versions[0];
2363        __be32 *p;
2364
2365        da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
2366        dprintk("%s: DS %s: encoding address %s\n",
2367                __func__, ds->ds_remotestr, da->da_remotestr);
2368        /* netaddr4 */
2369        ff_layout_encode_netaddr(xdr, da);
2370        /* nfs_fh4 */
2371        p = xdr_reserve_space(xdr, 4 + fh->size);
2372        xdr_encode_opaque(p, fh->data, fh->size);
2373        /* ff_io_latency4 read */
2374        spin_lock(&mirror->lock);
2375        ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
2376        /* ff_io_latency4 write */
2377        ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
2378        spin_unlock(&mirror->lock);
2379        /* nfstime4 */
2380        ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
2381        /* bool */
2382        p = xdr_reserve_space(xdr, 4);
2383        *p = cpu_to_be32(false);
2384}
2385
2386static void
2387ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
2388                             const struct nfs4_xdr_opaque_data *opaque)
2389{
2390        struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque,
2391                        struct nfs42_layoutstat_devinfo, ld_private);
2392        __be32 *start;
2393
2394        /* layoutupdate length */
2395        start = xdr_reserve_space(xdr, 4);
2396        ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data);
2397
2398        *start = cpu_to_be32((xdr->p - start - 1) * 4);
2399}
2400
2401static void
2402ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
2403{
2404        struct nfs4_ff_layout_mirror *mirror = opaque->data;
2405
2406        ff_layout_put_mirror(mirror);
2407}
2408
2409static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
2410        .encode = ff_layout_encode_layoutstats,
2411        .free   = ff_layout_free_layoutstats,
2412};
2413
2414static int
2415ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
2416                               struct nfs42_layoutstat_devinfo *devinfo,
2417                               int dev_limit)
2418{
2419        struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
2420        struct nfs4_ff_layout_mirror *mirror;
2421        struct nfs4_deviceid_node *dev;
2422        int i = 0;
2423
2424        list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
2425                if (i >= dev_limit)
2426                        break;
2427                if (IS_ERR_OR_NULL(mirror->mirror_ds))
2428                        continue;
2429                if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
2430                        continue;
2431                /* mirror refcount put in cleanup_layoutstats */
2432                if (!refcount_inc_not_zero(&mirror->ref))
2433                        continue;
2434                dev = &mirror->mirror_ds->id_node; 
2435                memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
2436                devinfo->offset = 0;
2437                devinfo->length = NFS4_MAX_UINT64;
2438                spin_lock(&mirror->lock);
2439                devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
2440                devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
2441                devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
2442                devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
2443                spin_unlock(&mirror->lock);
2444                devinfo->layout_type = LAYOUT_FLEX_FILES;
2445                devinfo->ld_private.ops = &layoutstat_ops;
2446                devinfo->ld_private.data = mirror;
2447
2448                devinfo++;
2449                i++;
2450        }
2451        return i;
2452}
2453
2454static int
2455ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
2456{
2457        struct nfs4_flexfile_layout *ff_layout;
2458        const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
2459
2460        /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
2461        args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
2462        if (!args->devinfo)
2463                return -ENOMEM;
2464
2465        spin_lock(&args->inode->i_lock);
2466        ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
2467        args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
2468                        &args->devinfo[0], dev_count);
2469        spin_unlock(&args->inode->i_lock);
2470        if (!args->num_dev) {
2471                kfree(args->devinfo);
2472                args->devinfo = NULL;
2473                return -ENOENT;
2474        }
2475
2476        return 0;
2477}
2478
2479static int
2480ff_layout_set_layoutdriver(struct nfs_server *server,
2481                const struct nfs_fh *dummy)
2482{
2483#if IS_ENABLED(CONFIG_NFS_V4_2)
2484        server->caps |= NFS_CAP_LAYOUTSTATS;
2485#endif
2486        return 0;
2487}
2488
2489static struct pnfs_layoutdriver_type flexfilelayout_type = {
2490        .id                     = LAYOUT_FLEX_FILES,
2491        .name                   = "LAYOUT_FLEX_FILES",
2492        .owner                  = THIS_MODULE,
2493        .flags                  = PNFS_LAYOUTGET_ON_OPEN,
2494        .max_layoutget_response = 4096, /* 1 page or so... */
2495        .set_layoutdriver       = ff_layout_set_layoutdriver,
2496        .alloc_layout_hdr       = ff_layout_alloc_layout_hdr,
2497        .free_layout_hdr        = ff_layout_free_layout_hdr,
2498        .alloc_lseg             = ff_layout_alloc_lseg,
2499        .free_lseg              = ff_layout_free_lseg,
2500        .add_lseg               = ff_layout_add_lseg,
2501        .pg_read_ops            = &ff_layout_pg_read_ops,
2502        .pg_write_ops           = &ff_layout_pg_write_ops,
2503        .get_ds_info            = ff_layout_get_ds_info,
2504        .free_deviceid_node     = ff_layout_free_deviceid_node,
2505        .mark_request_commit    = pnfs_layout_mark_request_commit,
2506        .clear_request_commit   = pnfs_generic_clear_request_commit,
2507        .scan_commit_lists      = pnfs_generic_scan_commit_lists,
2508        .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
2509        .commit_pagelist        = ff_layout_commit_pagelist,
2510        .read_pagelist          = ff_layout_read_pagelist,
2511        .write_pagelist         = ff_layout_write_pagelist,
2512        .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
2513        .prepare_layoutreturn   = ff_layout_prepare_layoutreturn,
2514        .sync                   = pnfs_nfs_generic_sync,
2515        .prepare_layoutstats    = ff_layout_prepare_layoutstats,
2516};
2517
2518static int __init nfs4flexfilelayout_init(void)
2519{
2520        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
2521               __func__);
2522        return pnfs_register_layoutdriver(&flexfilelayout_type);
2523}
2524
2525static void __exit nfs4flexfilelayout_exit(void)
2526{
2527        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
2528               __func__);
2529        pnfs_unregister_layoutdriver(&flexfilelayout_type);
2530}
2531
2532MODULE_ALIAS("nfs-layouttype4-4");
2533
2534MODULE_LICENSE("GPL");
2535MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
2536
2537module_init(nfs4flexfilelayout_init);
2538module_exit(nfs4flexfilelayout_exit);
2539
2540module_param(io_maxretrans, ushort, 0644);
2541MODULE_PARM_DESC(io_maxretrans, "The  number of times the NFSv4.1 client "
2542                        "retries an I/O request before returning an error. ");
2543