linux/fs/nfs/flexfilelayout/flexfilelayoutdev.c
<<
>>
Prefs
   1/*
   2 * Device operations for the pNFS NFSv4 flexfile layout driver.
   3 *
   4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
   5 *
   6 * Tao Peng <bergwolf@primarydata.com>
   7 */
   8
   9#include <linux/nfs_fs.h>
  10#include <linux/vmalloc.h>
  11#include <linux/module.h>
  12#include <linux/sunrpc/addr.h>
  13
  14#include "../internal.h"
  15#include "../nfs4session.h"
  16#include "flexfilelayout.h"
  17
  18#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
  19
/*
 * Timeout and retransmit count handed to nfs4_pnfs_ds_connect() when a data
 * server connection is set up.  Both are exported as module parameters
 * (mode 0644) at the end of this file.
 */
static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
  22
  23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
  24{
  25        if (mirror_ds)
  26                nfs4_put_deviceid_node(&mirror_ds->id_node);
  27}
  28
  29void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
  30{
  31        nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
  32        nfs4_pnfs_ds_put(mirror_ds->ds);
  33        kfree_rcu(mirror_ds, id_node.rcu);
  34}
  35
  36/* Decode opaque device data and construct new_ds using it */
  37struct nfs4_ff_layout_ds *
  38nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
  39                            gfp_t gfp_flags)
  40{
  41        struct xdr_stream stream;
  42        struct xdr_buf buf;
  43        struct page *scratch;
  44        struct list_head dsaddrs;
  45        struct nfs4_pnfs_ds_addr *da;
  46        struct nfs4_ff_layout_ds *new_ds = NULL;
  47        struct nfs4_ff_ds_version *ds_versions = NULL;
  48        u32 mp_count;
  49        u32 version_count;
  50        __be32 *p;
  51        int i, ret = -ENOMEM;
  52
  53        /* set up xdr stream */
  54        scratch = alloc_page(gfp_flags);
  55        if (!scratch)
  56                goto out_err;
  57
  58        new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
  59        if (!new_ds)
  60                goto out_scratch;
  61
  62        nfs4_init_deviceid_node(&new_ds->id_node,
  63                                server,
  64                                &pdev->dev_id);
  65        INIT_LIST_HEAD(&dsaddrs);
  66
  67        xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
  68        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
  69
  70        /* multipath count */
  71        p = xdr_inline_decode(&stream, 4);
  72        if (unlikely(!p))
  73                goto out_err_drain_dsaddrs;
  74        mp_count = be32_to_cpup(p);
  75        dprintk("%s: multipath ds count %d\n", __func__, mp_count);
  76
  77        for (i = 0; i < mp_count; i++) {
  78                /* multipath ds */
  79                da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
  80                                            &stream, gfp_flags);
  81                if (da)
  82                        list_add_tail(&da->da_node, &dsaddrs);
  83        }
  84        if (list_empty(&dsaddrs)) {
  85                dprintk("%s: no suitable DS addresses found\n",
  86                        __func__);
  87                ret = -ENOMEDIUM;
  88                goto out_err_drain_dsaddrs;
  89        }
  90
  91        /* version count */
  92        p = xdr_inline_decode(&stream, 4);
  93        if (unlikely(!p))
  94                goto out_err_drain_dsaddrs;
  95        version_count = be32_to_cpup(p);
  96        dprintk("%s: version count %d\n", __func__, version_count);
  97
  98        ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
  99                              gfp_flags);
 100        if (!ds_versions)
 101                goto out_scratch;
 102
 103        for (i = 0; i < version_count; i++) {
 104                /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
 105                 * tightly_coupled(4) */
 106                p = xdr_inline_decode(&stream, 20);
 107                if (unlikely(!p))
 108                        goto out_err_drain_dsaddrs;
 109                ds_versions[i].version = be32_to_cpup(p++);
 110                ds_versions[i].minor_version = be32_to_cpup(p++);
 111                ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
 112                ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
 113                ds_versions[i].tightly_coupled = be32_to_cpup(p);
 114
 115                if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
 116                        ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
 117                if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
 118                        ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
 119
 120                if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
 121                        dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
 122                                i, ds_versions[i].version,
 123                                ds_versions[i].minor_version);
 124                        ret = -EPROTONOSUPPORT;
 125                        goto out_err_drain_dsaddrs;
 126                }
 127
 128                dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
 129                        __func__, i, ds_versions[i].version,
 130                        ds_versions[i].minor_version,
 131                        ds_versions[i].rsize,
 132                        ds_versions[i].wsize,
 133                        ds_versions[i].tightly_coupled);
 134        }
 135
 136        new_ds->ds_versions = ds_versions;
 137        new_ds->ds_versions_cnt = version_count;
 138
 139        new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
 140        if (!new_ds->ds)
 141                goto out_err_drain_dsaddrs;
 142
 143        /* If DS was already in cache, free ds addrs */
 144        while (!list_empty(&dsaddrs)) {
 145                da = list_first_entry(&dsaddrs,
 146                                      struct nfs4_pnfs_ds_addr,
 147                                      da_node);
 148                list_del_init(&da->da_node);
 149                kfree(da->da_remotestr);
 150                kfree(da);
 151        }
 152
 153        __free_page(scratch);
 154        return new_ds;
 155
 156out_err_drain_dsaddrs:
 157        while (!list_empty(&dsaddrs)) {
 158                da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
 159                                      da_node);
 160                list_del_init(&da->da_node);
 161                kfree(da->da_remotestr);
 162                kfree(da);
 163        }
 164
 165        kfree(ds_versions);
 166out_scratch:
 167        __free_page(scratch);
 168out_err:
 169        kfree(new_ds);
 170
 171        dprintk("%s ERROR: returning %d\n", __func__, ret);
 172        return NULL;
 173}
 174
 175static u64
 176end_offset(u64 start, u64 len)
 177{
 178        u64 end;
 179
 180        end = start + len;
 181        return end >= start ? end : NFS4_MAX_UINT64;
 182}
 183
 184static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
 185                            u64 offset, u64 length)
 186{
 187        u64 end;
 188
 189        end = max_t(u64, end_offset(err->offset, err->length),
 190                    end_offset(offset, length));
 191        err->offset = min_t(u64, err->offset, offset);
 192        err->length = end - err->offset;
 193}
 194
 195static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err,  u64 offset,
 196                               u64 length, int status, enum nfs_opnum4 opnum,
 197                               nfs4_stateid *stateid,
 198                               struct nfs4_deviceid *deviceid)
 199{
 200        return err->status == status && err->opnum == opnum &&
 201               nfs4_stateid_match(&err->stateid, stateid) &&
 202               !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
 203               end_offset(err->offset, err->length) >= offset &&
 204               err->offset <= end_offset(offset, length);
 205}
 206
 207static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
 208                           struct nfs4_ff_layout_ds_err *new)
 209{
 210        if (!ds_error_can_merge(old, new->offset, new->length, new->status,
 211                                new->opnum, &new->stateid, &new->deviceid))
 212                return false;
 213
 214        extend_ds_error(old, new->offset, new->length);
 215        return true;
 216}
 217
 218static bool
 219ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
 220                              struct nfs4_ff_layout_ds_err *dserr)
 221{
 222        struct nfs4_ff_layout_ds_err *err;
 223
 224        list_for_each_entry(err, &flo->error_list, list) {
 225                if (merge_ds_error(err, dserr)) {
 226                        return true;
 227                }
 228        }
 229
 230        list_add(&dserr->list, &flo->error_list);
 231        return false;
 232}
 233
 234static bool
 235ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
 236                          u64 length, int status, enum nfs_opnum4 opnum,
 237                          nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
 238{
 239        bool found = false;
 240        struct nfs4_ff_layout_ds_err *err;
 241
 242        list_for_each_entry(err, &flo->error_list, list) {
 243                if (ds_error_can_merge(err, offset, length, status, opnum,
 244                                       stateid, deviceid)) {
 245                        found = true;
 246                        extend_ds_error(err, offset, length);
 247                        break;
 248                }
 249        }
 250
 251        return found;
 252}
 253
 254int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
 255                             struct nfs4_ff_layout_mirror *mirror, u64 offset,
 256                             u64 length, int status, enum nfs_opnum4 opnum,
 257                             gfp_t gfp_flags)
 258{
 259        struct nfs4_ff_layout_ds_err *dserr;
 260        bool needfree;
 261
 262        if (status == 0)
 263                return 0;
 264
 265        if (mirror->mirror_ds == NULL)
 266                return -EINVAL;
 267
 268        spin_lock(&flo->generic_hdr.plh_inode->i_lock);
 269        if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
 270                                      &mirror->stateid,
 271                                      &mirror->mirror_ds->id_node.deviceid)) {
 272                spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
 273                return 0;
 274        }
 275        spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
 276        dserr = kmalloc(sizeof(*dserr), gfp_flags);
 277        if (!dserr)
 278                return -ENOMEM;
 279
 280        INIT_LIST_HEAD(&dserr->list);
 281        dserr->offset = offset;
 282        dserr->length = length;
 283        dserr->status = status;
 284        dserr->opnum = opnum;
 285        nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
 286        memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
 287               NFS4_DEVICEID4_SIZE);
 288
 289        spin_lock(&flo->generic_hdr.plh_inode->i_lock);
 290        needfree = ff_layout_add_ds_error_locked(flo, dserr);
 291        spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
 292        if (needfree)
 293                kfree(dserr);
 294
 295        return 0;
 296}
 297
 298/* currently we only support AUTH_NONE and AUTH_SYS */
 299static rpc_authflavor_t
 300nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
 301{
 302        if (mirror->uid == (u32)-1)
 303                return RPC_AUTH_NULL;
 304        return RPC_AUTH_UNIX;
 305}
 306
 307/* fetch cred for NFSv3 DS */
 308static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
 309                                      struct nfs4_pnfs_ds *ds)
 310{
 311        if (ds->ds_clp && !mirror->cred &&
 312            mirror->mirror_ds->ds_versions[0].version == 3) {
 313                struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
 314                struct rpc_cred *cred;
 315                struct auth_cred acred = {
 316                        .uid = make_kuid(&init_user_ns, mirror->uid),
 317                        .gid = make_kgid(&init_user_ns, mirror->gid),
 318                };
 319
 320                /* AUTH_NULL ignores acred */
 321                cred = auth->au_ops->lookup_cred(auth, &acred, 0);
 322                if (IS_ERR(cred)) {
 323                        dprintk("%s: lookup_cred failed with %ld\n",
 324                                __func__, PTR_ERR(cred));
 325                        return PTR_ERR(cred);
 326                } else {
 327                        if (cmpxchg(&mirror->cred, NULL, cred))
 328                                put_rpccred(cred);
 329                }
 330        }
 331        return 0;
 332}
 333
 334struct nfs_fh *
 335nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
 336{
 337        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
 338        struct nfs_fh *fh = NULL;
 339        struct nfs4_deviceid_node *devid;
 340
 341        if (mirror == NULL || mirror->mirror_ds == NULL ||
 342            mirror->mirror_ds->ds == NULL) {
 343                printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
 344                        __func__, mirror_idx);
 345                if (mirror && mirror->mirror_ds) {
 346                        devid = &mirror->mirror_ds->id_node;
 347                        pnfs_generic_mark_devid_invalid(devid);
 348                }
 349                goto out;
 350        }
 351
 352        /* FIXME: For now assume there is only 1 version available for the DS */
 353        fh = &mirror->fh_versions[0];
 354out:
 355        return fh;
 356}
 357
 358/* Upon return, either ds is connected, or ds is NULL */
 359struct nfs4_pnfs_ds *
 360nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 361                          bool fail_return)
 362{
 363        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
 364        struct nfs4_pnfs_ds *ds = NULL;
 365        struct nfs4_deviceid_node *devid;
 366        struct inode *ino = lseg->pls_layout->plh_inode;
 367        struct nfs_server *s = NFS_SERVER(ino);
 368        unsigned int max_payload;
 369        rpc_authflavor_t flavor;
 370
 371        if (mirror == NULL || mirror->mirror_ds == NULL ||
 372            mirror->mirror_ds->ds == NULL) {
 373                printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 374                        __func__, ds_idx);
 375                if (mirror && mirror->mirror_ds) {
 376                        devid = &mirror->mirror_ds->id_node;
 377                        pnfs_generic_mark_devid_invalid(devid);
 378                }
 379                goto out;
 380        }
 381
 382        devid = &mirror->mirror_ds->id_node;
 383        if (ff_layout_test_devid_unavailable(devid))
 384                goto out;
 385
 386        ds = mirror->mirror_ds->ds;
 387        /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
 388        smp_rmb();
 389        if (ds->ds_clp)
 390                goto out_update_creds;
 391
 392        flavor = nfs4_ff_layout_choose_authflavor(mirror);
 393
 394        /* FIXME: For now we assume the server sent only one version of NFS
 395         * to use for the DS.
 396         */
 397        nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
 398                             dataserver_retrans,
 399                             mirror->mirror_ds->ds_versions[0].version,
 400                             mirror->mirror_ds->ds_versions[0].minor_version,
 401                             flavor);
 402
 403        /* connect success, check rsize/wsize limit */
 404        if (ds->ds_clp) {
 405                max_payload =
 406                        nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
 407                                       NULL);
 408                if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
 409                        mirror->mirror_ds->ds_versions[0].rsize = max_payload;
 410                if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
 411                        mirror->mirror_ds->ds_versions[0].wsize = max_payload;
 412        } else {
 413                ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 414                                         mirror, lseg->pls_range.offset,
 415                                         lseg->pls_range.length, NFS4ERR_NXIO,
 416                                         OP_ILLEGAL, GFP_NOIO);
 417                if (fail_return) {
 418                        pnfs_error_mark_layout_for_return(ino, lseg);
 419                        if (ff_layout_has_available_ds(lseg))
 420                                pnfs_set_retry_layoutget(lseg->pls_layout);
 421                        else
 422                                pnfs_clear_retry_layoutget(lseg->pls_layout);
 423
 424                } else {
 425                        if (ff_layout_has_available_ds(lseg))
 426                                set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
 427                                        &lseg->pls_layout->plh_flags);
 428                        else {
 429                                pnfs_error_mark_layout_for_return(ino, lseg);
 430                                pnfs_clear_retry_layoutget(lseg->pls_layout);
 431                        }
 432                }
 433        }
 434out_update_creds:
 435        if (ff_layout_update_mirror_cred(mirror, ds))
 436                ds = NULL;
 437out:
 438        return ds;
 439}
 440
 441struct rpc_cred *
 442ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
 443                      struct rpc_cred *mdscred)
 444{
 445        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
 446        struct rpc_cred *cred = ERR_PTR(-EINVAL);
 447
 448        if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
 449                goto out;
 450
 451        if (mirror && mirror->cred)
 452                cred = mirror->cred;
 453        else
 454                cred = mdscred;
 455out:
 456        return cred;
 457}
 458
 459/**
 460* Find or create a DS rpc client with th MDS server rpc client auth flavor
 461* in the nfs_client cl_ds_clients list.
 462*/
 463struct rpc_clnt *
 464nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
 465                                 struct nfs_client *ds_clp, struct inode *inode)
 466{
 467        struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
 468
 469        switch (mirror->mirror_ds->ds_versions[0].version) {
 470        case 3:
 471                /* For NFSv3 DS, flavor is set when creating DS connections */
 472                return ds_clp->cl_rpcclient;
 473        case 4:
 474                return nfs4_find_or_create_ds_client(ds_clp, inode);
 475        default:
 476                BUG();
 477        }
 478}
 479
 480static bool is_range_intersecting(u64 offset1, u64 length1,
 481                                  u64 offset2, u64 length2)
 482{
 483        u64 end1 = end_offset(offset1, length1);
 484        u64 end2 = end_offset(offset2, length2);
 485
 486        return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
 487               (end2 == NFS4_MAX_UINT64 || end2 > offset1);
 488}
 489
 490/* called with inode i_lock held */
 491int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
 492                              struct xdr_stream *xdr, int *count,
 493                              const struct pnfs_layout_range *range)
 494{
 495        struct nfs4_ff_layout_ds_err *err, *n;
 496        __be32 *p;
 497
 498        list_for_each_entry_safe(err, n, &flo->error_list, list) {
 499                if (!is_range_intersecting(err->offset, err->length,
 500                                           range->offset, range->length))
 501                        continue;
 502                /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
 503                 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
 504                 */
 505                p = xdr_reserve_space(xdr,
 506                                24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
 507                if (unlikely(!p))
 508                        return -ENOBUFS;
 509                p = xdr_encode_hyper(p, err->offset);
 510                p = xdr_encode_hyper(p, err->length);
 511                p = xdr_encode_opaque_fixed(p, &err->stateid,
 512                                            NFS4_STATEID_SIZE);
 513                p = xdr_encode_opaque_fixed(p, &err->deviceid,
 514                                            NFS4_DEVICEID4_SIZE);
 515                *p++ = cpu_to_be32(err->status);
 516                *p++ = cpu_to_be32(err->opnum);
 517                *count += 1;
 518                list_del(&err->list);
 519                dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
 520                        __func__, err->offset, err->length, err->status,
 521                        err->opnum, *count);
 522                kfree(err);
 523        }
 524
 525        return 0;
 526}
 527
 528bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
 529{
 530        struct nfs4_ff_layout_mirror *mirror;
 531        struct nfs4_deviceid_node *devid;
 532        int idx;
 533
 534        for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
 535                mirror = FF_LAYOUT_COMP(lseg, idx);
 536                if (mirror && mirror->mirror_ds) {
 537                        devid = &mirror->mirror_ds->id_node;
 538                        if (!ff_layout_test_devid_unavailable(devid))
 539                                return true;
 540                }
 541        }
 542
 543        return false;
 544}
 545
/* Expose dataserver_retrans as an unsigned module parameter, mode 0644. */
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
			"retries a request before it attempts further "
			" recovery  action.");
/* Expose dataserver_timeo as an unsigned module parameter, mode 0644. */
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
			"NFSv4.1  client  waits for a response from a "
			" data server before it retries an NFS request.");
 554