/*
 * Module for pnfs flexfile layout driver.
 *
 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
 *
 * Tao Peng <bergwolf@primarydata.com>
 */

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/module.h>

#include <linux/sunrpc/metrics.h>

#include "flexfilelayout.h"
#include "../nfs4session.h"
#include "../nfs4idmap.h"
#include "../internal.h"
#include "../delegation.h"
#include "../nfs4trace.h"
#include "../iostat.h"
#include "../nfs.h"
#include "../nfs42.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

#define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)

static struct pnfs_layout_hdr *
ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
{
        struct nfs4_flexfile_layout *ffl;

        ffl = kzalloc(sizeof(*ffl), gfp_flags);
        if (!ffl)
                return NULL;
        INIT_LIST_HEAD(&ffl->error_list);
        return &ffl->generic_hdr;
}

static void
ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
        struct nfs4_ff_layout_ds_err *err, *n;

        list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
                                 list) {
                list_del(&err->list);
                kfree(err);
        }
        kfree(FF_LAYOUT_FROM_HDR(lo));
}

static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
{
        __be32 *p;

        p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
        if (unlikely(p == NULL))
                return -ENOBUFS;
        memcpy(stateid, p, NFS4_STATEID_SIZE);
        dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
                p[0], p[1], p[2], p[3]);
        return 0;
}

static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
{
        __be32 *p;

        p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
        if (unlikely(!p))
                return -ENOBUFS;
        memcpy(devid, p, NFS4_DEVICEID4_SIZE);
        nfs4_print_deviceid(devid);
        return 0;
}

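/*
 * Decode an NFS filehandle as an XDR variable-length opaque: a 4-byte
 * length followed by that many bytes of data, rejecting anything larger
 * than the client's nfs_fh can hold.
 */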
static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
{
        __be32 *p;

        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                return -ENOBUFS;
        fh->size = be32_to_cpup(p++);
        if (fh->size > sizeof(struct nfs_fh)) {
                printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
                       fh->size);
                return -EOVERFLOW;
        }
        /* fh.data */
        p = xdr_inline_decode(xdr, fh->size);
        if (unlikely(!p))
                return -ENOBUFS;
        memcpy(&fh->data, p, fh->size);
        dprintk("%s: fh len %d\n", __func__, fh->size);

        return 0;
}

/*
 * Currently only stringified uids and gids are accepted.
 * I.e., Kerberos is not supported to the DSes, so no principals.
 *
 * That means that one common function will suffice, but when
 * principals are added, this should be split to accommodate
 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
 */
static int
decode_name(struct xdr_stream *xdr, u32 *id)
{
        __be32 *p;
        int len;

        /* opaque_length(4) */
        p = xdr_inline_decode(xdr, 4);
        if (unlikely(!p))
                return -ENOBUFS;
        len = be32_to_cpup(p++);
        if (len < 0)
                return -EINVAL;

        dprintk("%s: len %u\n", __func__, len);

        /* opaque body */
        p = xdr_inline_decode(xdr, len);
        if (unlikely(!p))
                return -ENOBUFS;

        if (!nfs_map_string_to_numeric((char *)p, len, id))
                return -EINVAL;

        return 0;
}

static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
        int i;

        if (fls->mirror_array) {
                for (i = 0; i < fls->mirror_array_cnt; i++) {
                        /* normally mirror_ds is freed in
                         * .free_deviceid_node but we still do it here
                         * for the .alloc_lseg error path */
                        if (fls->mirror_array[i]) {
                                kfree(fls->mirror_array[i]->fh_versions);
                                nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
                                kfree(fls->mirror_array[i]);
                        }
                }
                kfree(fls->mirror_array);
                fls->mirror_array = NULL;
        }
}

static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
{
        int ret = 0;

        dprintk("--> %s\n", __func__);

        /* FIXME: remove this check when layout segment support is added */
        if (lgr->range.offset != 0 ||
            lgr->range.length != NFS4_MAX_UINT64) {
                dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
                        __func__);
                ret = -EINVAL;
        }

        dprintk("--> %s returns %d\n", __func__, ret);
        return ret;
}

static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
{
        if (fls) {
                ff_layout_free_mirror_array(fls);
                kfree(fls);
        }
}

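/*
 * Order the mirror array by descending efficiency with a simple
 * O(n^2) selection-style sort; readers then probe mirrors in array
 * order, so the most efficient available mirror is tried first.
 */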
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
        int i, j;

        for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
                for (j = i + 1; j < fls->mirror_array_cnt; j++)
                        if (fls->mirror_array[i]->efficiency <
                            fls->mirror_array[j]->efficiency)
                                swap(fls->mirror_array[i],
                                     fls->mirror_array[j]);
        }
}

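/*
 * Decode a flexfiles layout from the LAYOUTGET reply.  The XDR stream
 * carries, in order: stripe_unit (8 bytes), the mirror count (4 bytes),
 * and then per mirror: a data-server count, a deviceid, an efficiency
 * value, a stateid, an array of filehandles (one per DS protocol
 * version), and stringified uid and gid.  An optional trailing flags
 * word may follow the mirror array.
 */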
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                     struct nfs4_layoutget_res *lgr,
                     gfp_t gfp_flags)
{
        struct pnfs_layout_segment *ret;
        struct nfs4_ff_layout_segment *fls = NULL;
        struct xdr_stream stream;
        struct xdr_buf buf;
        struct page *scratch;
        u64 stripe_unit;
        u32 mirror_array_cnt;
        __be32 *p;
        int i, rc;

        dprintk("--> %s\n", __func__);
        scratch = alloc_page(gfp_flags);
        if (!scratch)
                return ERR_PTR(-ENOMEM);

        xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
                              lgr->layoutp->len);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);

        /* stripe unit and mirror_array_cnt */
        rc = -EIO;
        p = xdr_inline_decode(&stream, 8 + 4);
        if (!p)
                goto out_err_free;

        p = xdr_decode_hyper(p, &stripe_unit);
        mirror_array_cnt = be32_to_cpup(p++);
        dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
                stripe_unit, mirror_array_cnt);

        if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
            mirror_array_cnt == 0)
                goto out_err_free;

        rc = -ENOMEM;
        fls = kzalloc(sizeof(*fls), gfp_flags);
        if (!fls)
                goto out_err_free;

        fls->mirror_array_cnt = mirror_array_cnt;
        fls->stripe_unit = stripe_unit;
        fls->mirror_array = kcalloc(fls->mirror_array_cnt,
                                    sizeof(fls->mirror_array[0]), gfp_flags);
        if (fls->mirror_array == NULL)
                goto out_err_free;

        for (i = 0; i < fls->mirror_array_cnt; i++) {
                struct nfs4_deviceid devid;
                struct nfs4_deviceid_node *idnode;
                u32 ds_count;
                u32 fh_count;
                int j;

                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                ds_count = be32_to_cpup(p);

                /* FIXME: allow for striping? */
                if (ds_count != 1)
                        goto out_err_free;

                fls->mirror_array[i] =
                        kzalloc(sizeof(struct nfs4_ff_layout_mirror),
                                gfp_flags);
                if (fls->mirror_array[i] == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }

                spin_lock_init(&fls->mirror_array[i]->lock);
                fls->mirror_array[i]->ds_count = ds_count;
                fls->mirror_array[i]->lseg = &fls->generic_hdr;

                /* deviceid */
                rc = decode_deviceid(&stream, &devid);
                if (rc)
                        goto out_err_free;

                idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
                                                &devid, lh->plh_lc_cred,
                                                gfp_flags);
                /*
                 * Upon success, mirror_ds was allocated by a previous
                 * GETDEVICEINFO, or newly by .alloc_deviceid_node;
                 * nfs4_find_get_deviceid failure is in fact a
                 * GETDEVICEINFO failure.
                 */
                if (idnode)
                        fls->mirror_array[i]->mirror_ds =
                                FF_LAYOUT_MIRROR_DS(idnode);
                else
                        goto out_err_free;

                /* efficiency */
                rc = -EIO;
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                fls->mirror_array[i]->efficiency = be32_to_cpup(p);

                /* stateid */
                rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
                if (rc)
                        goto out_err_free;

                /* fh */
                p = xdr_inline_decode(&stream, 4);
                if (!p)
                        goto out_err_free;
                fh_count = be32_to_cpup(p);

                /* kcalloc checks for multiplication overflow on the
                 * server-supplied fh_count */
                fls->mirror_array[i]->fh_versions =
                        kcalloc(fh_count, sizeof(struct nfs_fh),
                                gfp_flags);
                if (fls->mirror_array[i]->fh_versions == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }

                for (j = 0; j < fh_count; j++) {
                        rc = decode_nfs_fh(&stream,
                                           &fls->mirror_array[i]->fh_versions[j]);
                        if (rc)
                                goto out_err_free;
                }

                fls->mirror_array[i]->fh_versions_cnt = fh_count;

                /* user */
                rc = decode_name(&stream, &fls->mirror_array[i]->uid);
                if (rc)
                        goto out_err_free;

                /* group */
                rc = decode_name(&stream, &fls->mirror_array[i]->gid);
                if (rc)
                        goto out_err_free;

                dprintk("%s: uid %d gid %d\n", __func__,
                        fls->mirror_array[i]->uid,
                        fls->mirror_array[i]->gid);
        }

        p = xdr_inline_decode(&stream, 4);
        if (p)
                fls->flags = be32_to_cpup(p);

        ff_layout_sort_mirrors(fls);
        rc = ff_layout_check_layout(lgr);
        if (rc)
                goto out_err_free;

        ret = &fls->generic_hdr;
        dprintk("<-- %s (success)\n", __func__);
out_free_page:
        __free_page(scratch);
        return ret;
out_err_free:
        _ff_layout_free_lseg(fls);
        ret = ERR_PTR(rc);
        dprintk("<-- %s (%d)\n", __func__, rc);
        goto out_free_page;
}

static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
{
        struct pnfs_layout_segment *lseg;

        list_for_each_entry(lseg, &layout->plh_segs, pls_list)
                if (lseg->pls_range.iomode == IOMODE_RW)
                        return true;

        return false;
}

static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
        int i;

        dprintk("--> %s\n", __func__);

        for (i = 0; i < fls->mirror_array_cnt; i++) {
                if (fls->mirror_array[i]) {
                        nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
                        fls->mirror_array[i]->mirror_ds = NULL;
                        if (fls->mirror_array[i]->cred) {
                                put_rpccred(fls->mirror_array[i]->cred);
                                fls->mirror_array[i]->cred = NULL;
                        }
                }
        }

        if (lseg->pls_range.iomode == IOMODE_RW) {
                struct nfs4_flexfile_layout *ffl;
                struct inode *inode;

                ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
                inode = ffl->generic_hdr.plh_inode;
                spin_lock(&inode->i_lock);
                if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
                        ffl->commit_info.nbuckets = 0;
                        kfree(ffl->commit_info.buckets);
                        ffl->commit_info.buckets = NULL;
                }
                spin_unlock(&inode->i_lock);
        }
        _ff_layout_free_lseg(fls);
}

/* Return 1 until we support multiple lsegs */
static int
ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
{
        return 1;
}

static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
{
        /* first IO request? */
        if (atomic_inc_return(&timer->n_ops) == 1)
                timer->start_time = ktime_get();
}

static ktime_t
nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
{
        ktime_t start, now;

        if (atomic_dec_return(&timer->n_ops) < 0)
                WARN_ON_ONCE(1);

        now = ktime_get();
        start = timer->start_time;
        timer->start_time = now;
        return ktime_sub(now, start);
}

static ktime_t
nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
{
        return ktime_sub(ktime_get(), task->tk_start);
}

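/*
 * Start the per-mirror busy timer and decide whether it is time to send
 * a LAYOUTSTATS report: the first I/O against a mirror records its start
 * time, and a report is triggered once FF_LAYOUTSTATS_REPORT_INTERVAL
 * has elapsed since the last one.
 */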
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
                            struct nfs4_ff_layoutstat *layoutstat)
{
        static const ktime_t notime = {0};
        ktime_t now = ktime_get();

        nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
        if (ktime_equal(mirror->start_time, notime))
                mirror->start_time = now;
        if (ktime_equal(mirror->last_report_time, notime))
                mirror->last_report_time = now;
        if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
                        FF_LAYOUTSTATS_REPORT_INTERVAL) {
                mirror->last_report_time = now;
                return true;
        }

        return false;
}

static void
nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
                __u64 requested)
{
        struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;

        iostat->ops_requested++;
        iostat->bytes_requested += requested;
}

static void
nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
                __u64 requested,
                __u64 completed,
                ktime_t time_completed)
{
        struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
        ktime_t timer;

        iostat->ops_completed++;
        iostat->bytes_completed += completed;
        iostat->bytes_not_delivered += requested - completed;

        timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
        iostat->total_busy_time =
                        ktime_add(iostat->total_busy_time, timer);
        iostat->aggregate_completion_time =
                        ktime_add(iostat->aggregate_completion_time, time_completed);
}

static void
nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
                __u64 requested)
{
        bool report;

        spin_lock(&mirror->lock);
        report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
        nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
        spin_unlock(&mirror->lock);

        if (report)
                pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
}

static void
nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
                struct nfs4_ff_layout_mirror *mirror,
                __u64 requested,
                __u64 completed)
{
        spin_lock(&mirror->lock);
        nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
                        requested, completed,
                        nfs4_ff_layout_calc_completion_time(task));
        spin_unlock(&mirror->lock);
}

static void
nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
                __u64 requested)
{
        bool report;

        spin_lock(&mirror->lock);
        report = nfs4_ff_layoutstat_start_io(mirror, &mirror->write_stat);
        nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
        spin_unlock(&mirror->lock);

        if (report)
                pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
}

static void
nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
                struct nfs4_ff_layout_mirror *mirror,
                __u64 requested,
                __u64 completed,
                enum nfs3_stable_how committed)
{
        if (committed == NFS_UNSTABLE)
                requested = completed = 0;

        spin_lock(&mirror->lock);
        nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
                        requested, completed,
                        nfs4_ff_layout_calc_completion_time(task));
        spin_unlock(&mirror->lock);
}

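/*
 * Lazily allocate one commit bucket per mirror for this layout.  The
 * allocation is done without cinfo->lock held, so recheck nbuckets
 * under the lock and free the new array if another thread won the race.
 */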
static int
ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                            struct nfs_commit_info *cinfo,
                            gfp_t gfp_flags)
{
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
        struct pnfs_commit_bucket *buckets;
        int size;

        if (cinfo->ds->nbuckets != 0) {
                /* This assumes there is only one RW lseg per file.
                 * To support multiple lsegs per file, we need to
                 * change struct pnfs_commit_bucket to allow
                 * dynamically increasing nbuckets.
                 */
                return 0;
        }

        size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);

        buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
                          gfp_flags);
        if (!buckets)
                return -ENOMEM;
        else {
                int i;

                spin_lock(cinfo->lock);
                if (cinfo->ds->nbuckets != 0)
                        kfree(buckets);
                else {
                        cinfo->ds->buckets = buckets;
                        cinfo->ds->nbuckets = size;
                        for (i = 0; i < size; i++) {
                                INIT_LIST_HEAD(&buckets[i].written);
                                INIT_LIST_HEAD(&buckets[i].committing);
                                /* mark direct verifier as unset */
                                buckets[i].direct_verf.committed =
                                        NFS_INVALID_STABLE_HOW;
                        }
                }
                spin_unlock(cinfo->lock);
                return 0;
        }
}

static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
                                  int *best_idx)
{
        struct nfs4_ff_layout_segment *fls;
        struct nfs4_pnfs_ds *ds;
        int idx;

        fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
        /* mirrors are sorted by efficiency */
        for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
                if (ds) {
                        *best_idx = idx;
                        return ds;
                }
        }

        return NULL;
}

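/*
 * Prepare a read: get a whole-file IOMODE_READ layout if we do not
 * already hold one, pick the best available mirror, and size the pgio
 * descriptor for that DS.  Any failure falls back to reading through
 * the MDS.
 */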
static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
{
        struct nfs_pgio_mirror *pgm;
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs4_pnfs_ds *ds;
        int ds_idx;

        /* Use full layout for now */
        if (!pgio->pg_lseg)
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_READ,
                                                   GFP_KERNEL);
        /* If no lseg, fall back to read through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;

        ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
        if (!ds)
                goto out_mds;
        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);

        pgio->pg_mirror_idx = ds_idx;

        /* read always uses only one mirror - idx 0 for pgio layer */
        pgm = &pgio->pg_mirrors[0];
        pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;

        return;
out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_read_mds(pgio);
}

static void
ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
                        struct nfs_page *req)
{
        struct nfs4_ff_layout_mirror *mirror;
        struct nfs_pgio_mirror *pgm;
        struct nfs_commit_info cinfo;
        struct nfs4_pnfs_ds *ds;
        int i;
        int status;

        if (!pgio->pg_lseg)
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
                                                   GFP_NOFS);
        /* If no lseg, fall back to write through mds */
        if (pgio->pg_lseg == NULL)
                goto out_mds;

        nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
        status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
        if (status < 0)
                goto out_mds;

        /* Use a direct mapping of ds_idx to pgio mirror_idx */
        if (WARN_ON_ONCE(pgio->pg_mirror_count !=
            FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
                goto out_mds;

        for (i = 0; i < pgio->pg_mirror_count; i++) {
                ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
                if (!ds)
                        goto out_mds;
                pgm = &pgio->pg_mirrors[i];
                mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
                pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
        }

        return;

out_mds:
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_write_mds(pgio);
}

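/*
 * Tell the pgio layer how many mirrors to write through: the layout's
 * mirror count when we have an lseg, or 1 (no mirroring) when I/O is
 * going through the MDS.
 */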
static unsigned int
ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
                                    struct nfs_page *req)
{
        if (!pgio->pg_lseg)
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   req->wb_context,
                                                   0,
                                                   NFS4_MAX_UINT64,
                                                   IOMODE_RW,
                                                   GFP_NOFS);
        if (pgio->pg_lseg)
                return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);

        /* no lseg means that pnfs is not in use, so no mirroring here */
        pnfs_put_lseg(pgio->pg_lseg);
        pgio->pg_lseg = NULL;
        nfs_pageio_reset_write_mds(pgio);
        return 1;
}

static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
        .pg_init = ff_layout_pg_init_read,
        .pg_test = pnfs_generic_pg_test,
        .pg_doio = pnfs_generic_pg_readpages,
        .pg_cleanup = pnfs_generic_pg_cleanup,
};

static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
        .pg_init = ff_layout_pg_init_write,
        .pg_test = pnfs_generic_pg_test,
        .pg_doio = pnfs_generic_pg_writepages,
        .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
        .pg_cleanup = pnfs_generic_pg_cleanup,
};

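/*
 * Re-drive a failed WRITE either through pNFS (when retry_pnfs is set)
 * or through the MDS.  For the pNFS case, buffered writes are handed
 * back to the page cache for resending, while O_DIRECT writes are faked
 * up as unstable so the common NFS code resends the pages.
 */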
static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
{
        struct rpc_task *task = &hdr->task;

        pnfs_layoutcommit_inode(hdr->inode, false);

        if (retry_pnfs) {
                dprintk("%s Reset task %5u for i/o through pNFS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
                        hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
                        hdr->args.count,
                        (unsigned long long)hdr->args.offset);

                if (!hdr->dreq) {
                        struct nfs_open_context *ctx;

                        ctx = nfs_list_entry(hdr->pages.next)->wb_context;
                        set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
                        hdr->completion_ops->error_cleanup(&hdr->pages);
                } else {
                        nfs_direct_set_resched_writes(hdr->dreq);
                        /* fake unstable write to let common nfs resend pages */
                        hdr->verf.committed = NFS_UNSTABLE;
                        hdr->good_bytes = hdr->args.count;
                }
                return;
        }

        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
                        hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
                        hdr->args.count,
                        (unsigned long long)hdr->args.offset);

                task->tk_status = pnfs_write_done_resend_to_mds(hdr);
        }
}

static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
{
        struct rpc_task *task = &hdr->task;

        pnfs_layoutcommit_inode(hdr->inode, false);

        if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
                dprintk("%s Reset task %5u for i/o through MDS "
                        "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
                        hdr->task.tk_pid,
                        hdr->inode->i_sb->s_id,
                        (unsigned long long)NFS_FILEID(hdr->inode),
                        hdr->args.count,
                        (unsigned long long)hdr->args.offset);

                task->tk_status = pnfs_read_done_resend_to_mds(hdr);
        }
}

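/*
 * Map an NFSv4 DS error onto a recovery action.  Returns 0 to give up,
 * -EAGAIN to retry the RPC after any scheduled recovery, or one of
 * -NFS4ERR_RESET_TO_PNFS / -NFS4ERR_RESET_TO_MDS to tell the caller to
 * re-drive the I/O through another DS or through the MDS.
 */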
static int ff_layout_async_handle_error_v4(struct rpc_task *task,
                                           struct nfs4_state *state,
                                           struct nfs_client *clp,
                                           struct pnfs_layout_segment *lseg,
                                           int idx)
{
        struct pnfs_layout_hdr *lo = lseg->pls_layout;
        struct inode *inode = lo->plh_inode;
        struct nfs_server *mds_server = NFS_SERVER(inode);

        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
        struct nfs_client *mds_client = mds_server->nfs_client;
        struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;

        if (task->tk_status >= 0)
                return 0;

        switch (task->tk_status) {
        /* MDS state errors */
        case -NFS4ERR_DELEG_REVOKED:
        case -NFS4ERR_ADMIN_REVOKED:
        case -NFS4ERR_BAD_STATEID:
                if (state == NULL)
                        break;
                nfs_remove_bad_delegation(state->inode);
                /* fall through */
        case -NFS4ERR_OPENMODE:
                if (state == NULL)
                        break;
                if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
                        goto out_bad_stateid;
                goto wait_on_recovery;
        case -NFS4ERR_EXPIRED:
                if (state != NULL) {
                        if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
                                goto out_bad_stateid;
                }
                nfs4_schedule_lease_recovery(mds_client);
                goto wait_on_recovery;
        /* DS session errors */
        case -NFS4ERR_BADSESSION:
        case -NFS4ERR_BADSLOT:
        case -NFS4ERR_BAD_HIGH_SLOT:
        case -NFS4ERR_DEADSESSION:
        case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
        case -NFS4ERR_SEQ_FALSE_RETRY:
        case -NFS4ERR_SEQ_MISORDERED:
                dprintk("%s ERROR %d, Reset session. Exchangeid "
                        "flags 0x%x\n", __func__, task->tk_status,
                        clp->cl_exchange_flags);
                nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
                break;
        case -NFS4ERR_DELAY:
        case -NFS4ERR_GRACE:
                rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
                break;
        case -NFS4ERR_RETRY_UNCACHED_REP:
                break;
        /* Invalidate Layout errors */
        case -NFS4ERR_PNFS_NO_LAYOUT:
        case -ESTALE:           /* mapped NFS4ERR_STALE */
        case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
        case -EISDIR:           /* mapped NFS4ERR_ISDIR */
        case -NFS4ERR_FHEXPIRED:
        case -NFS4ERR_WRONG_TYPE:
                dprintk("%s Invalid layout error %d\n", __func__,
                        task->tk_status);
                /*
                 * Destroy layout so new i/o will get a new layout.
                 * Layout will not be destroyed until all current lseg
                 * references are put. Mark layout as invalid to resend failed
                 * i/o and all i/o waiting on the slot table to the MDS until
                 * layout is destroyed and a new valid layout is obtained.
                 */
                pnfs_destroy_layout(NFS_I(inode));
                rpc_wake_up(&tbl->slot_tbl_waitq);
                goto reset;
        /* RPC connection errors */
        case -ECONNREFUSED:
        case -EHOSTDOWN:
        case -EHOSTUNREACH:
        case -ENETUNREACH:
        case -EIO:
        case -ETIMEDOUT:
        case -EPIPE:
                dprintk("%s DS connection error %d\n", __func__,
                        task->tk_status);
                nfs4_mark_deviceid_unavailable(devid);
                rpc_wake_up(&tbl->slot_tbl_waitq);
                /* fall through */
        default:
                if (ff_layout_has_available_ds(lseg))
                        return -NFS4ERR_RESET_TO_PNFS;
reset:
                dprintk("%s Retry through MDS. Error %d\n", __func__,
                        task->tk_status);
                return -NFS4ERR_RESET_TO_MDS;
        }
out:
        task->tk_status = 0;
        return -EAGAIN;
out_bad_stateid:
        task->tk_status = -EIO;
        return 0;
wait_on_recovery:
        rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
        if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
                rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
        goto out;
}

/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
                                           struct pnfs_layout_segment *lseg,
                                           int idx)
{
        struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);

        if (task->tk_status >= 0)
                return 0;

        if (task->tk_status != -EJUKEBOX) {
                dprintk("%s DS connection error %d\n", __func__,
                        task->tk_status);
                nfs4_mark_deviceid_unavailable(devid);
                if (ff_layout_has_available_ds(lseg))
                        return -NFS4ERR_RESET_TO_PNFS;
                else
                        return -NFS4ERR_RESET_TO_MDS;
        }

        /* -EJUKEBOX is the only status left here: delay and retry */
        nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
        task->tk_status = 0;
        rpc_restart_call(task);
        rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
        return -EAGAIN;
}

static int ff_layout_async_handle_error(struct rpc_task *task,
                                        struct nfs4_state *state,
                                        struct nfs_client *clp,
                                        struct pnfs_layout_segment *lseg,
                                        int idx)
{
        int vers = clp->cl_nfs_mod->rpc_vers->number;

        switch (vers) {
        case 3:
                return ff_layout_async_handle_error_v3(task, lseg, idx);
        case 4:
                return ff_layout_async_handle_error_v4(task, state, clp,
                                                       lseg, idx);
        default:
                /* should never happen */
                WARN_ON_ONCE(1);
                return 0;
        }
}

static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
                                        int idx, u64 offset, u64 length,
                                        u32 status, int opnum)
{
        struct nfs4_ff_layout_mirror *mirror;
        int err;

        mirror = FF_LAYOUT_COMP(lseg, idx);
        err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
                                       mirror, offset, length, status, opnum,
                                       GFP_NOIO);
        dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}

/* NFS_PROTO call done callback routines */

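/*
 * READ done: record any DS error against the mirror, then either resend
 * through another mirror (RESET_TO_PNFS), fall back to the MDS
 * (RESET_TO_MDS), or restart the RPC (-EAGAIN).
 */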
static int ff_layout_read_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
{
        struct inode *inode;
        int err;

        trace_nfs4_pnfs_read(hdr, task->tk_status);
        if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
                hdr->res.op_status = NFS4ERR_NXIO;
        if (task->tk_status < 0 && hdr->res.op_status)
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
                                            hdr->res.op_status, OP_READ);
        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);

        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
                set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
                        &hdr->lseg->pls_layout->plh_flags);
                pnfs_read_resend_pnfs(hdr);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
                inode = hdr->lseg->pls_layout->plh_inode;
                pnfs_error_mark_layout_for_return(inode, hdr->lseg);
                ff_layout_reset_read(hdr);
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }

        return 0;
}

static bool
ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
{
        return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
}

/*
 * We reference the rpc_cred of the first WRITE that triggers the need for
 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
 * RFC 5661 is not clear about which credential should be used.
 *
 * The flexfiles client should treat a FILE_SYNC reply from a DS as
 * DATA_SYNC; so, following
 * http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751,
 * we always send a layoutcommit after DS writes.
 */
static void
ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
        if (!ff_layout_need_layoutcommit(hdr->lseg))
                return;

        pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
                        hdr->mds_offset + hdr->res.count);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
                (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}

static bool
ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
{
        /* No mirroring for now */
        struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);

        return ff_layout_test_devid_unavailable(node);
}

static int ff_layout_read_prepare_common(struct rpc_task *task,
                                         struct nfs_pgio_header *hdr)
{
        nfs4_ff_layout_stat_io_start_read(
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count);

        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
        }
        if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
                dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
                if (ff_layout_has_available_ds(hdr->lseg))
                        pnfs_read_resend_pnfs(hdr);
                else
                        ff_layout_reset_read(hdr);
                rpc_exit(task, 0);
                return -EAGAIN;
        }
        hdr->pgio_done_cb = ff_layout_read_done_cb;

        return 0;
}

/*
 * Call ops for the async read/write cases
 * In the case of dense layouts, the offset needs to be reset to its
 * original value.
 */
static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (ff_layout_read_prepare_common(task, hdr))
                return;

        rpc_call_start(task);
}

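/*
 * Set up the NFSv4 SEQUENCE op for a DS RPC: use the v4.1 session when
 * the DS client has one, otherwise fall back to v4.0 slot handling.
 */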
static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
                                    struct nfs4_sequence_args *args,
                                    struct nfs4_sequence_res *res,
                                    struct rpc_task *task)
{
        if (ds_clp->cl_session)
                return nfs41_setup_sequence(ds_clp->cl_session,
                                           args,
                                           res,
                                           task);
        return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
                                   args,
                                   res,
                                   task);
}

static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;

        if (ff_layout_read_prepare_common(task, hdr))
                return;

        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_READ) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}

static void ff_layout_read_call_done(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);

        nfs4_ff_layout_stat_io_end_read(task,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count, hdr->res.count);

        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
                return;
        }

        /* Note this may cause RPC to be resent */
        hdr->mds_ops->rpc_call_done(task, hdr);
}

static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
}

static int ff_layout_write_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
{
        struct inode *inode;
        int err;

        trace_nfs4_pnfs_write(hdr, task->tk_status);
        if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
                hdr->res.op_status = NFS4ERR_NXIO;
        if (task->tk_status < 0 && hdr->res.op_status)
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
                                            hdr->res.op_status, OP_WRITE);
        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);

        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
        case -NFS4ERR_RESET_TO_MDS:
                inode = hdr->lseg->pls_layout->plh_inode;
                pnfs_error_mark_layout_for_return(inode, hdr->lseg);
                if (err == -NFS4ERR_RESET_TO_PNFS) {
                        pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
                        ff_layout_reset_write(hdr, true);
                } else {
                        pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
                        ff_layout_reset_write(hdr, false);
                }
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }

        if (hdr->res.verf->committed == NFS_FILE_SYNC ||
            hdr->res.verf->committed == NFS_DATA_SYNC)
                ff_layout_set_layoutcommit(hdr);

        return 0;
}

static int ff_layout_commit_done_cb(struct rpc_task *task,
                                     struct nfs_commit_data *data)
{
        struct inode *inode;
        int err;

        trace_nfs4_pnfs_commit_ds(data, task->tk_status);
        if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
                data->res.op_status = NFS4ERR_NXIO;
        if (task->tk_status < 0 && data->res.op_status)
                ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
                                            data->args.offset, data->args.count,
                                            data->res.op_status, OP_COMMIT);
        err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
                                           data->lseg, data->ds_commit_index);

        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
        case -NFS4ERR_RESET_TO_MDS:
                inode = data->lseg->pls_layout->plh_inode;
                pnfs_error_mark_layout_for_return(inode, data->lseg);
                if (err == -NFS4ERR_RESET_TO_PNFS)
                        pnfs_set_retry_layoutget(data->lseg->pls_layout);
                else
                        pnfs_clear_retry_layoutget(data->lseg->pls_layout);
                pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
                return -EAGAIN;
        }

        if (data->verf.committed == NFS_UNSTABLE &&
            ff_layout_need_layoutcommit(data->lseg))
                pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);

        return 0;
}

static int ff_layout_write_prepare_common(struct rpc_task *task,
                                          struct nfs_pgio_header *hdr)
{
        nfs4_ff_layout_stat_io_start_write(
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count);

        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
        }

        if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
                bool retry_pnfs;

                retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
                dprintk("%s task %u reset io to %s\n", __func__,
                        task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
                ff_layout_reset_write(hdr, retry_pnfs);
                rpc_exit(task, 0);
                return -EAGAIN;
        }

        return 0;
}

static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (ff_layout_write_prepare_common(task, hdr))
                return;

        rpc_call_start(task);
}

static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;

        if (ff_layout_write_prepare_common(task, hdr))
                return;

        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_WRITE) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}

static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        nfs4_ff_layout_stat_io_end_write(task,
                        FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
                        hdr->args.count, hdr->res.count,
                        hdr->res.verf->committed);

        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
                return;
        }

        /* Note this may cause RPC to be resent */
        hdr->mds_ops->rpc_call_done(task, hdr);
}

static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
{
        struct nfs_pgio_header *hdr = data;

        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}

static void ff_layout_commit_prepare_common(struct rpc_task *task,
                struct nfs_commit_data *cdata)
{
        nfs4_ff_layout_stat_io_start_write(
                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
                        0);
}

static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
        ff_layout_commit_prepare_common(task, data);
        rpc_call_start(task);
}

static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
{
        struct nfs_commit_data *wdata = data;

        if (ff_layout_setup_sequence(wdata->ds_clp,
                                 &wdata->args.seq_args,
                                 &wdata->res.seq_res,
                                 task))
                return;
        ff_layout_commit_prepare_common(task, data);
}

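/*
 * COMMIT done: on success, count the bytes covered by the committed
 * requests so the layoutstats record them as stable (NFS_FILE_SYNC)
 * writes, then hand off to the generic pNFS commit-done path.
 */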
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
        struct nfs_commit_data *cdata = data;
        struct nfs_page *req;
        __u64 count = 0;

        if (task->tk_status == 0) {
                list_for_each_entry(req, &cdata->pages, wb_list)
                        count += req->wb_bytes;
        }

        nfs4_ff_layout_stat_io_end_write(task,
                        FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
                        count, count, NFS_FILE_SYNC);

        pnfs_generic_write_commit_done(task, data);
}

static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
{
        struct nfs_commit_data *cdata = data;

        rpc_count_iostats_metrics(task,
            &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
}

static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_read_prepare_v3,
        .rpc_call_done = ff_layout_read_call_done,
        .rpc_count_stats = ff_layout_read_count_stats,
        .rpc_release = pnfs_generic_rw_release,
};

static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_read_prepare_v4,
        .rpc_call_done = ff_layout_read_call_done,
        .rpc_count_stats = ff_layout_read_count_stats,
        .rpc_release = pnfs_generic_rw_release,
};

static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_write_prepare_v3,
        .rpc_call_done = ff_layout_write_call_done,
        .rpc_count_stats = ff_layout_write_count_stats,
        .rpc_release = pnfs_generic_rw_release,
};

static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_write_prepare_v4,
        .rpc_call_done = ff_layout_write_call_done,
        .rpc_count_stats = ff_layout_write_count_stats,
        .rpc_release = pnfs_generic_rw_release,
};

static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v3,
        .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
};

static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v4,
        .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
};

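/*
 * Issue an asynchronous READ to the chosen mirror's DS: look up the DS,
 * its rpc_clnt and credential, pick the filehandle for the DS protocol
 * version, and fire off the RPC with RPC_TASK_SOFTCONN so a connection
 * failure returns quickly instead of retrying indefinitely.
 */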
static enum pnfs_try_status
ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
{
        struct pnfs_layout_segment *lseg = hdr->lseg;
        struct nfs4_pnfs_ds *ds;
        struct rpc_clnt *ds_clnt;
        struct rpc_cred *ds_cred;
        loff_t offset = hdr->args.offset;
        u32 idx = hdr->pgio_mirror_idx;
        int vers;
        struct nfs_fh *fh;

        dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
                __func__, hdr->inode->i_ino,
                hdr->args.pgbase, (size_t)hdr->args.count, offset);

        ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
        if (!ds)
                goto out_failed;

        ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
                                                   hdr->inode);
        if (IS_ERR(ds_clnt))
                goto out_failed;

        ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
        if (IS_ERR(ds_cred))
                goto out_failed;

        vers = nfs4_ff_layout_ds_version(lseg, idx);

        dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
                ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);

        atomic_inc(&ds->ds_clp->cl_count);
        hdr->ds_clp = ds->ds_clp;
        fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
        if (fh)
                hdr->args.fh = fh;
        /*
         * Note that if we ever decide to split across DSes,
         * then we may need to handle dense-like offsets.
         */
        hdr->args.offset = offset;
        hdr->mds_offset = offset;

        /* Perform an asynchronous read to ds */
        nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
                          vers == 3 ? &ff_layout_read_call_ops_v3 :
                                      &ff_layout_read_call_ops_v4,
                          0, RPC_TASK_SOFTCONN);

        return PNFS_ATTEMPTED;

out_failed:
        if (ff_layout_has_available_ds(lseg))
                return PNFS_TRY_AGAIN;
        return PNFS_NOT_ATTEMPTED;
}
1478
1479/* Perform async writes. */
1480static enum pnfs_try_status
1481ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
1482{
1483        struct pnfs_layout_segment *lseg = hdr->lseg;
1484        struct nfs4_pnfs_ds *ds;
1485        struct rpc_clnt *ds_clnt;
1486        struct rpc_cred *ds_cred;
1487        loff_t offset = hdr->args.offset;
1488        int vers;
1489        struct nfs_fh *fh;
1490        int idx = hdr->pgio_mirror_idx;
1491
1492        ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1493        if (!ds)
1494                return PNFS_NOT_ATTEMPTED;
1495
1496        ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1497                                                   hdr->inode);
1498        if (IS_ERR(ds_clnt))
1499                return PNFS_NOT_ATTEMPTED;
1500
1501        ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1502        if (IS_ERR(ds_cred))
1503                return PNFS_NOT_ATTEMPTED;
1504
1505        vers = nfs4_ff_layout_ds_version(lseg, idx);
1506
1507        dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
1508                __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
1509                offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
1510                vers);
1511
1512        hdr->pgio_done_cb = ff_layout_write_done_cb;
1513        atomic_inc(&ds->ds_clp->cl_count);
1514        hdr->ds_clp = ds->ds_clp;
1515        hdr->ds_commit_idx = idx;
1516        fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1517        if (fh)
1518                hdr->args.fh = fh;
1519
1520        /*
1521         * Note that if we ever decide to split across DSes,
1522         * then we may need to handle dense-like offsets.
1523         */
1524        hdr->args.offset = offset;
1525
1526        /* Perform an asynchronous write */
1527        nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1528                          vers == 3 ? &ff_layout_write_call_ops_v3 :
1529                                      &ff_layout_write_call_ops_v4,
1530                          sync, RPC_TASK_SOFTCONN);
1531        return PNFS_ATTEMPTED;
1532}
1533
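    /*
     * Flexfiles keeps one commit bucket per mirror, so the commit index
     * maps one-to-one onto the DS (mirror) index.
     */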
1534static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1535{
1536        return i;
1537}
1538
1539static struct nfs_fh *
1540select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1541{
1542        struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
1543
1544        /* FIXME: Assumes that there is only one NFS version available
1545         * for the DS.
1546         */
1547        return &flseg->mirror_array[i]->fh_versions[0];
1548}
1549
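    /*
     * Send COMMIT to the DS backing this commit bucket.  If the DS, its
     * rpc_clnt or its credential cannot be set up, the queued requests
     * are handed back for resending through the MDS and -EAGAIN is
     * returned, so no writes are lost.
     */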
1550static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
1551{
1552        struct pnfs_layout_segment *lseg = data->lseg;
1553        struct nfs4_pnfs_ds *ds;
1554        struct rpc_clnt *ds_clnt;
1555        struct rpc_cred *ds_cred;
1556        u32 idx;
1557        int vers;
1558        struct nfs_fh *fh;
1559
1560        idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1561        ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1562        if (!ds)
1563                goto out_err;
1564
1565        ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1566                                                   data->inode);
1567        if (IS_ERR(ds_clnt))
1568                goto out_err;
1569
1570        ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
1571        if (IS_ERR(ds_cred))
1572                goto out_err;
1573
1574        vers = nfs4_ff_layout_ds_version(lseg, idx);
1575
1576        dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
1577                data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
1578                vers);
1579        data->commit_done_cb = ff_layout_commit_done_cb;
1580        data->cred = ds_cred;
1581        atomic_inc(&ds->ds_clp->cl_count);
1582        data->ds_clp = ds->ds_clp;
1583        fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1584        if (fh)
1585                data->args.fh = fh;
1586
1587        return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
1588                                   vers == 3 ? &ff_layout_commit_call_ops_v3 :
1589                                               &ff_layout_commit_call_ops_v4,
1590                                   how, RPC_TASK_SOFTCONN);
1591out_err:
1592        pnfs_generic_prepare_to_resend_writes(data);
1593        pnfs_generic_commit_release(data);
1594        return -EAGAIN;
1595}
1596
1597static int
1598ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1599                           int how, struct nfs_commit_info *cinfo)
1600{
1601        return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1602                                            ff_layout_initiate_commit);
1603}
1604
1605static struct pnfs_ds_commit_info *
1606ff_layout_get_ds_info(struct inode *inode)
1607{
1608        struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1609
1610        if (layout == NULL)
1611                return NULL;
1612
1613        return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
1614}
1615
1616static void
1617ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
1618{
1619        nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
1620                                                  id_node));
1621}
1622
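    /*
     * Encode the ff_ioerr4 array for LAYOUTRETURN.  The entry count is
     * not known up front, so 4 bytes are reserved and backfilled once
     * ff_layout_encode_ds_ioerr() has walked the error list.
     */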
1623static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
1624                                  struct xdr_stream *xdr,
1625                                  const struct nfs4_layoutreturn_args *args)
1626{
1627        struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
1628        __be32 *start;
1629        int count = 0, ret = 0;
1630
1631        start = xdr_reserve_space(xdr, 4);
1632        if (unlikely(!start))
1633                return -E2BIG;
1634
1635        /* This assumes we always return _ALL_ layouts */
1636        spin_lock(&hdr->plh_inode->i_lock);
1637        ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
1638        spin_unlock(&hdr->plh_inode->i_lock);
1639
1640        *start = cpu_to_be32(count);
1641
1642        return ret;
1643}
1644
1645/* report nothing for now */
1646static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
1647                                     struct xdr_stream *xdr,
1648                                     const struct nfs4_layoutreturn_args *args)
1649{
1650        __be32 *p;
1651
1652        p = xdr_reserve_space(xdr, 4);
1653        if (likely(p))
1654                *p = cpu_to_be32(0);
1655}
1656
1657static struct nfs4_deviceid_node *
1658ff_layout_alloc_deviceid_node(struct nfs_server *server,
1659                              struct pnfs_device *pdev, gfp_t gfp_flags)
1660{
1661        struct nfs4_ff_layout_ds *dsaddr;
1662
1663        dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
1664        if (!dsaddr)
1665                return NULL;
1666        return &dsaddr->id_node;
1667}
1668
1669static void
1670ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
1671                              struct xdr_stream *xdr,
1672                              const struct nfs4_layoutreturn_args *args)
1673{
1674        struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
1675        __be32 *start;
1676
1677        dprintk("%s: Begin\n", __func__);
1678        start = xdr_reserve_space(xdr, 4);
1679        BUG_ON(!start);
1680
1681        if (ff_layout_encode_ioerr(flo, xdr, args))
1682                goto out;
1683
1684        ff_layout_encode_iostats(flo, xdr, args);
1685out:
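            /*
             * Backfill the opaque length: xdr->p now points one word past
             * the last word written, so the distance from start, less the
             * length word itself, times 4 gives the byte count.
             */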
1686        *start = cpu_to_be32((xdr->p - start - 1) * 4);
1687        dprintk("%s: Return\n", __func__);
1688}
1689
1690static int
1691ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
1692{
1693        const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
1694
1695        return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
1696}
1697
1698static size_t
1699ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
1700                          const int buflen)
1701{
1702        const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
1703        const struct in6_addr *addr = &sin6->sin6_addr;
1704
1705        /*
1706         * RFC 4291, Section 2.2.2
1707         *
1708         * Shorthanded ANY address
1709         */
1710        if (ipv6_addr_any(addr))
1711                return snprintf(buf, buflen, "::");
1712
1713        /*
1714         * RFC 4291, Section 2.2.2
1715         *
1716         * Shorthanded loopback address
1717         */
1718        if (ipv6_addr_loopback(addr))
1719                return snprintf(buf, buflen, "::1");
1720
1721        /*
1722         * RFC 4291, Section 2.2.3
1723         *
1724         * Special presentation address format for mapped v4
1725         * addresses.
1726         */
1727        if (ipv6_addr_v4mapped(addr))
1728                return snprintf(buf, buflen, "::ffff:%pI4",
1729                                        &addr->s6_addr32[3]);
1730
1731        /*
1732         * RFC 4291, Section 2.2.1
1733         */
1734        return snprintf(buf, buflen, "%pI6c", addr);
1735}
1736
1737/* Derived from rpc_sockaddr2uaddr */
1738static void
1739ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
1740{
1741        struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
1742        char portbuf[RPCBIND_MAXUADDRPLEN];
1743        char addrbuf[RPCBIND_MAXUADDRLEN];
1744        char *netid;
1745        unsigned short port;
1746        int len, netid_len;
1747        __be32 *p;
1748
1749        switch (sap->sa_family) {
1750        case AF_INET:
1751                if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
1752                        return;
1753                port = ntohs(((struct sockaddr_in *)sap)->sin_port);
1754                netid = "tcp";
1755                netid_len = 3;
1756                break;
1757        case AF_INET6:
1758                if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
1759                        return;
1760                port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
1761                netid = "tcp6";
1762                netid_len = 4;
1763                break;
1764        default:
1765                /* we only support tcp and tcp6 */
1766                WARN_ON_ONCE(1);
1767                return;
1768        }
1769
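            /*
             * Universal address format (RFC 5665): the port is appended to
             * the presentation address as two dot-separated decimal octets,
             * high byte first -- port 2049 encodes as ".8.1".
             */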
1770        snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
1771        len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
1772
1773        p = xdr_reserve_space(xdr, 4 + netid_len);
1774        xdr_encode_opaque(p, netid, netid_len);
1775
1776        p = xdr_reserve_space(xdr, 4 + len);
1777        xdr_encode_opaque(p, addrbuf, len);
1778}
1779
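    /*
     * nfstime4 on the wire is a 64-bit seconds field followed by a
     * 32-bit nseconds field: 12 bytes total.
     */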
1780static void
1781ff_layout_encode_nfstime(struct xdr_stream *xdr,
1782                         ktime_t t)
1783{
1784        struct timespec64 ts;
1785        __be32 *p;
1786
1787        p = xdr_reserve_space(xdr, 12);
1788        ts = ktime_to_timespec64(t);
1789        p = xdr_encode_hyper(p, ts.tv_sec);
1790        *p++ = cpu_to_be32(ts.tv_nsec);
1791}
1792
1793static void
1794ff_layout_encode_io_latency(struct xdr_stream *xdr,
1795                            struct nfs4_ff_io_stat *stat)
1796{
1797        __be32 *p;
1798
1799        p = xdr_reserve_space(xdr, 5 * 8);
1800        p = xdr_encode_hyper(p, stat->ops_requested);
1801        p = xdr_encode_hyper(p, stat->bytes_requested);
1802        p = xdr_encode_hyper(p, stat->ops_completed);
1803        p = xdr_encode_hyper(p, stat->bytes_completed);
1804        p = xdr_encode_hyper(p, stat->bytes_not_delivered);
1805        ff_layout_encode_nfstime(xdr, stat->total_busy_time);
1806        ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
1807}
1808
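    /*
     * Encode one layoutupdate body for LAYOUTSTATS: DS netaddr and file
     * handle, read and write ff_io_latency4 blocks, the time since the
     * mirror was set up, and a trailing boolean (hardcoded false).  As
     * in layoutreturn, the opaque length is reserved first and
     * backfilled at the end.
     */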
1809static void
1810ff_layout_encode_layoutstats(struct xdr_stream *xdr,
1811                             struct nfs42_layoutstat_args *args,
1812                             struct nfs42_layoutstat_devinfo *devinfo)
1813{
1814        struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
1815        struct nfs4_pnfs_ds_addr *da;
1816        struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
1817        struct nfs_fh *fh = &mirror->fh_versions[0];
1818        __be32 *p, *start;
1819
1820        da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
1821        dprintk("%s: DS %s: encoding address %s\n",
1822                __func__, ds->ds_remotestr, da->da_remotestr);
1823        /* layoutupdate length */
1824        start = xdr_reserve_space(xdr, 4);
1825        /* netaddr4 */
1826        ff_layout_encode_netaddr(xdr, da);
1827        /* nfs_fh4 */
1828        p = xdr_reserve_space(xdr, 4 + fh->size);
1829        xdr_encode_opaque(p, fh->data, fh->size);
1830        /* ff_io_latency4 read */
1831        spin_lock(&mirror->lock);
1832        ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
1833        /* ff_io_latency4 write */
1834        ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
1835        spin_unlock(&mirror->lock);
1836        /* nfstime4 */
1837        ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
1838        /* bool */
1839        p = xdr_reserve_space(xdr, 4);
1840        *p = cpu_to_be32(false);
1841
1842        *start = cpu_to_be32((xdr->p - start - 1) * 4);
1843}
1844
1845static bool
1846ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
1847                               struct pnfs_layout_segment *pls,
1848                               int *dev_count, int dev_limit)
1849{
1850        struct nfs4_ff_layout_mirror *mirror;
1851        struct nfs4_deviceid_node *dev;
1852        struct nfs42_layoutstat_devinfo *devinfo;
1853        int i;
1854
1855        for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
1856                if (*dev_count >= dev_limit)
1857                        break;
1858                mirror = FF_LAYOUT_COMP(pls, i);
1859                if (!mirror || !mirror->mirror_ds)
1860                        continue;
1861                dev = FF_LAYOUT_DEVID_NODE(pls, i);
1862                devinfo = &args->devinfo[*dev_count];
1863                memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
1864                devinfo->offset = pls->pls_range.offset;
1865                devinfo->length = pls->pls_range.length;
1866                /* well, we don't really know if IO is continuous or not! */
1867                devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
1868                devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
1869                devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
1870                devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
1871                devinfo->layout_type = LAYOUT_FLEX_FILES;
1872                devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
1873                devinfo->layout_private = mirror;
1874                /* lseg refcount put in cleanup_layoutstats */
1875                pnfs_get_lseg(pls);
1876
1877                ++(*dev_count);
1878        }
1879
1880        return *dev_count < dev_limit;
1881}
1882
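    /*
     * Two passes under the inode spinlock: first count mirrors across
     * all layout segments to size the devinfo array (capped at
     * PNFS_LAYOUTSTATS_MAXDEV), then fill it in.  The lock is dropped
     * for the allocation, so the fill pass re-checks the limit.
     */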
1883static int
1884ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
1885{
1886        struct pnfs_layout_segment *pls;
1887        int dev_count = 0;
1888
1889        spin_lock(&args->inode->i_lock);
1890        list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
1891                dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
1892        }
1893        spin_unlock(&args->inode->i_lock);
1894        /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
1895        if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
1896                dprintk("%s: truncating devinfo to limit (%d:%d)\n",
1897                        __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
1898                dev_count = PNFS_LAYOUTSTATS_MAXDEV;
1899        }
1900        args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_KERNEL);
1901        if (!args->devinfo)
1902                return -ENOMEM;
1903
1904        dev_count = 0;
1905        spin_lock(&args->inode->i_lock);
1906        list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
1907                if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
1908                                                    PNFS_LAYOUTSTATS_MAXDEV)) {
1909                        break;
1910                }
1911        }
1912        spin_unlock(&args->inode->i_lock);
1913        args->num_dev = dev_count;
1914
1915        return 0;
1916}
1917
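    /*
     * Drop the lseg references taken in ff_layout_mirror_prepare_stats().
     */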
1918static void
1919ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
1920{
1921        struct nfs4_ff_layout_mirror *mirror;
1922        int i;
1923
1924        for (i = 0; i < data->args.num_dev; i++) {
1925                mirror = data->args.devinfo[i].layout_private;
1926                data->args.devinfo[i].layout_private = NULL;
1927                pnfs_put_lseg(mirror->lseg);
1928        }
1929}
1930
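    /*
     * Layout driver operations table.  Generic pnfs commit helpers are
     * reused wherever flexfiles needs nothing special; LAYOUT_FLEX_FILES
     * is layout type 4, matching the "nfs-layouttype4-4" module alias
     * below.
     */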
1931static struct pnfs_layoutdriver_type flexfilelayout_type = {
1932        .id                     = LAYOUT_FLEX_FILES,
1933        .name                   = "LAYOUT_FLEX_FILES",
1934        .owner                  = THIS_MODULE,
1935        .alloc_layout_hdr       = ff_layout_alloc_layout_hdr,
1936        .free_layout_hdr        = ff_layout_free_layout_hdr,
1937        .alloc_lseg             = ff_layout_alloc_lseg,
1938        .free_lseg              = ff_layout_free_lseg,
1939        .pg_read_ops            = &ff_layout_pg_read_ops,
1940        .pg_write_ops           = &ff_layout_pg_write_ops,
1941        .get_ds_info            = ff_layout_get_ds_info,
1942        .free_deviceid_node     = ff_layout_free_deviceid_node,
1943        .mark_request_commit    = pnfs_layout_mark_request_commit,
1944        .clear_request_commit   = pnfs_generic_clear_request_commit,
1945        .scan_commit_lists      = pnfs_generic_scan_commit_lists,
1946        .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
1947        .commit_pagelist        = ff_layout_commit_pagelist,
1948        .read_pagelist          = ff_layout_read_pagelist,
1949        .write_pagelist         = ff_layout_write_pagelist,
1950        .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
1951        .encode_layoutreturn    = ff_layout_encode_layoutreturn,
1952        .sync                   = pnfs_nfs_generic_sync,
1953        .prepare_layoutstats    = ff_layout_prepare_layoutstats,
1954        .cleanup_layoutstats    = ff_layout_cleanup_layoutstats,
1955};
1956
1957static int __init nfs4flexfilelayout_init(void)
1958{
1959        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
1960               __func__);
1961        return pnfs_register_layoutdriver(&flexfilelayout_type);
1962}
1963
1964static void __exit nfs4flexfilelayout_exit(void)
1965{
1966        printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
1967               __func__);
1968        pnfs_unregister_layoutdriver(&flexfilelayout_type);
1969}
1970
1971MODULE_ALIAS("nfs-layouttype4-4");
1972
1973MODULE_LICENSE("GPL");
1974MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
1975
1976module_init(nfs4flexfilelayout_init);
1977module_exit(nfs4flexfilelayout_exit);
1978