/* linux/fs/nfs/nfs4filelayoutdev.c */
/*
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  Copyright (c) 2002
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *  Garth Goodson   <Garth.Goodson@netapp.com>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */
  30
  31#include <linux/nfs_fs.h>
  32#include <linux/vmalloc.h>
  33#include <linux/module.h>
  34#include <linux/sunrpc/addr.h>
  35
  36#include "internal.h"
  37#include "nfs4session.h"
  38#include "nfs4filelayout.h"
  39
  40#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
  41
  42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
  43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
  44
  45/*
  46 * Data server cache
  47 *
  48 * Data servers can be mapped to different device ids.
  49 * nfs4_pnfs_ds reference counting
  50 *   - set to 1 on allocation
  51 *   - incremented when a device id maps a data server already in the cache.
  52 *   - decremented when deviceid is removed from the cache.
  53 */
  54static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
  55static LIST_HEAD(nfs4_data_server_cache);
  56
  57/* Debug routines */
  58void
  59print_ds(struct nfs4_pnfs_ds *ds)
  60{
  61        if (ds == NULL) {
  62                printk("%s NULL device\n", __func__);
  63                return;
  64        }
  65        printk("        ds %s\n"
  66                "        ref count %d\n"
  67                "        client %p\n"
  68                "        cl_exchange_flags %x\n",
  69                ds->ds_remotestr,
  70                atomic_read(&ds->ds_count), ds->ds_clp,
  71                ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
  72}
  73
  74static bool
  75same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
  76{
  77        struct sockaddr_in *a, *b;
  78        struct sockaddr_in6 *a6, *b6;
  79
  80        if (addr1->sa_family != addr2->sa_family)
  81                return false;
  82
  83        switch (addr1->sa_family) {
  84        case AF_INET:
  85                a = (struct sockaddr_in *)addr1;
  86                b = (struct sockaddr_in *)addr2;
  87
  88                if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
  89                    a->sin_port == b->sin_port)
  90                        return true;
  91                break;
  92
  93        case AF_INET6:
  94                a6 = (struct sockaddr_in6 *)addr1;
  95                b6 = (struct sockaddr_in6 *)addr2;
  96
  97                /* LINKLOCAL addresses must have matching scope_id */
  98                if (ipv6_addr_scope(&a6->sin6_addr) ==
  99                    IPV6_ADDR_SCOPE_LINKLOCAL &&
 100                    a6->sin6_scope_id != b6->sin6_scope_id)
 101                        return false;
 102
 103                if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
 104                    a6->sin6_port == b6->sin6_port)
 105                        return true;
 106                break;
 107
 108        default:
 109                dprintk("%s: unhandled address family: %u\n",
 110                        __func__, addr1->sa_family);
 111                return false;
 112        }
 113
 114        return false;
 115}
 116
 117static bool
 118_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
 119                               const struct list_head *dsaddrs2)
 120{
 121        struct nfs4_pnfs_ds_addr *da1, *da2;
 122
 123        /* step through both lists, comparing as we go */
 124        for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
 125             da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
 126             da1 != NULL && da2 != NULL;
 127             da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
 128             da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
 129                if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
 130                                   (struct sockaddr *)&da2->da_addr))
 131                        return false;
 132        }
 133        if (da1 == NULL && da2 == NULL)
 134                return true;
 135
 136        return false;
 137}
 138
 139/*
 140 * Lookup DS by addresses.  nfs4_ds_cache_lock is held
 141 */
 142static struct nfs4_pnfs_ds *
 143_data_server_lookup_locked(const struct list_head *dsaddrs)
 144{
 145        struct nfs4_pnfs_ds *ds;
 146
 147        list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
 148                if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
 149                        return ds;
 150        return NULL;
 151}
 152
 153/*
 154 * Create an rpc connection to the nfs4_pnfs_ds data server
 155 * Currently only supports IPv4 and IPv6 addresses
 156 */
 157static int
 158nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
 159{
 160        struct nfs_client *clp = ERR_PTR(-EIO);
 161        struct nfs4_pnfs_ds_addr *da;
 162        int status = 0;
 163
 164        dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
 165                mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
 166
 167        list_for_each_entry(da, &ds->ds_addrs, da_node) {
 168                dprintk("%s: DS %s: trying address %s\n",
 169                        __func__, ds->ds_remotestr, da->da_remotestr);
 170
 171                clp = nfs4_set_ds_client(mds_srv->nfs_client,
 172                                        (struct sockaddr *)&da->da_addr,
 173                                        da->da_addrlen, IPPROTO_TCP,
 174                                        dataserver_timeo, dataserver_retrans);
 175                if (!IS_ERR(clp))
 176                        break;
 177        }
 178
 179        if (IS_ERR(clp)) {
 180                status = PTR_ERR(clp);
 181                goto out;
 182        }
 183
 184        status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
 185        if (status)
 186                goto out_put;
 187
 188        ds->ds_clp = clp;
 189        dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
 190out:
 191        return status;
 192out_put:
 193        nfs_put_client(clp);
 194        goto out;
 195}
 196
 197static void
 198destroy_ds(struct nfs4_pnfs_ds *ds)
 199{
 200        struct nfs4_pnfs_ds_addr *da;
 201
 202        dprintk("--> %s\n", __func__);
 203        ifdebug(FACILITY)
 204                print_ds(ds);
 205
 206        if (ds->ds_clp)
 207                nfs_put_client(ds->ds_clp);
 208
 209        while (!list_empty(&ds->ds_addrs)) {
 210                da = list_first_entry(&ds->ds_addrs,
 211                                      struct nfs4_pnfs_ds_addr,
 212                                      da_node);
 213                list_del_init(&da->da_node);
 214                kfree(da->da_remotestr);
 215                kfree(da);
 216        }
 217
 218        kfree(ds->ds_remotestr);
 219        kfree(ds);
 220}
 221
 222void
 223nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 224{
 225        struct nfs4_pnfs_ds *ds;
 226        int i;
 227
 228        nfs4_print_deviceid(&dsaddr->id_node.deviceid);
 229
 230        for (i = 0; i < dsaddr->ds_num; i++) {
 231                ds = dsaddr->ds_list[i];
 232                if (ds != NULL) {
 233                        if (atomic_dec_and_lock(&ds->ds_count,
 234                                                &nfs4_ds_cache_lock)) {
 235                                list_del_init(&ds->ds_node);
 236                                spin_unlock(&nfs4_ds_cache_lock);
 237                                destroy_ds(ds);
 238                        }
 239                }
 240        }
 241        kfree(dsaddr->stripe_indices);
 242        kfree(dsaddr);
 243}
 244
 245/*
 246 * Create a string with a human readable address and port to avoid
 247 * complicated setup around many dprinks.
 248 */
 249static char *
 250nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
 251{
 252        struct nfs4_pnfs_ds_addr *da;
 253        char *remotestr;
 254        size_t len;
 255        char *p;
 256
 257        len = 3;        /* '{', '}' and eol */
 258        list_for_each_entry(da, dsaddrs, da_node) {
 259                len += strlen(da->da_remotestr) + 1;    /* string plus comma */
 260        }
 261
 262        remotestr = kzalloc(len, gfp_flags);
 263        if (!remotestr)
 264                return NULL;
 265
 266        p = remotestr;
 267        *(p++) = '{';
 268        len--;
 269        list_for_each_entry(da, dsaddrs, da_node) {
 270                size_t ll = strlen(da->da_remotestr);
 271
 272                if (ll > len)
 273                        goto out_err;
 274
 275                memcpy(p, da->da_remotestr, ll);
 276                p += ll;
 277                len -= ll;
 278
 279                if (len < 1)
 280                        goto out_err;
 281                (*p++) = ',';
 282                len--;
 283        }
 284        if (len < 2)
 285                goto out_err;
 286        *(p++) = '}';
 287        *p = '\0';
 288        return remotestr;
 289out_err:
 290        kfree(remotestr);
 291        return NULL;
 292}
 293
 294static struct nfs4_pnfs_ds *
 295nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
 296{
 297        struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
 298        char *remotestr;
 299
 300        if (list_empty(dsaddrs)) {
 301                dprintk("%s: no addresses defined\n", __func__);
 302                goto out;
 303        }
 304
 305        ds = kzalloc(sizeof(*ds), gfp_flags);
 306        if (!ds)
 307                goto out;
 308
 309        /* this is only used for debugging, so it's ok if its NULL */
 310        remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
 311
 312        spin_lock(&nfs4_ds_cache_lock);
 313        tmp_ds = _data_server_lookup_locked(dsaddrs);
 314        if (tmp_ds == NULL) {
 315                INIT_LIST_HEAD(&ds->ds_addrs);
 316                list_splice_init(dsaddrs, &ds->ds_addrs);
 317                ds->ds_remotestr = remotestr;
 318                atomic_set(&ds->ds_count, 1);
 319                INIT_LIST_HEAD(&ds->ds_node);
 320                ds->ds_clp = NULL;
 321                list_add(&ds->ds_node, &nfs4_data_server_cache);
 322                dprintk("%s add new data server %s\n", __func__,
 323                        ds->ds_remotestr);
 324        } else {
 325                kfree(remotestr);
 326                kfree(ds);
 327                atomic_inc(&tmp_ds->ds_count);
 328                dprintk("%s data server %s found, inc'ed ds_count to %d\n",
 329                        __func__, tmp_ds->ds_remotestr,
 330                        atomic_read(&tmp_ds->ds_count));
 331                ds = tmp_ds;
 332        }
 333        spin_unlock(&nfs4_ds_cache_lock);
 334out:
 335        return ds;
 336}
 337
 338/*
 339 * Currently only supports ipv4, ipv6 and one multi-path address.
 340 */
 341static struct nfs4_pnfs_ds_addr *
 342decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
 343{
 344        struct nfs4_pnfs_ds_addr *da = NULL;
 345        char *buf, *portstr;
 346        __be16 port;
 347        int nlen, rlen;
 348        int tmp[2];
 349        __be32 *p;
 350        char *netid, *match_netid;
 351        size_t len, match_netid_len;
 352        char *startsep = "";
 353        char *endsep = "";
 354
 355
 356        /* r_netid */
 357        p = xdr_inline_decode(streamp, 4);
 358        if (unlikely(!p))
 359                goto out_err;
 360        nlen = be32_to_cpup(p++);
 361
 362        p = xdr_inline_decode(streamp, nlen);
 363        if (unlikely(!p))
 364                goto out_err;
 365
 366        netid = kmalloc(nlen+1, gfp_flags);
 367        if (unlikely(!netid))
 368                goto out_err;
 369
 370        netid[nlen] = '\0';
 371        memcpy(netid, p, nlen);
 372
 373        /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
 374        p = xdr_inline_decode(streamp, 4);
 375        if (unlikely(!p))
 376                goto out_free_netid;
 377        rlen = be32_to_cpup(p);
 378
 379        p = xdr_inline_decode(streamp, rlen);
 380        if (unlikely(!p))
 381                goto out_free_netid;
 382
 383        /* port is ".ABC.DEF", 8 chars max */
 384        if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
 385                dprintk("%s: Invalid address, length %d\n", __func__,
 386                        rlen);
 387                goto out_free_netid;
 388        }
 389        buf = kmalloc(rlen + 1, gfp_flags);
 390        if (!buf) {
 391                dprintk("%s: Not enough memory\n", __func__);
 392                goto out_free_netid;
 393        }
 394        buf[rlen] = '\0';
 395        memcpy(buf, p, rlen);
 396
 397        /* replace port '.' with '-' */
 398        portstr = strrchr(buf, '.');
 399        if (!portstr) {
 400                dprintk("%s: Failed finding expected dot in port\n",
 401                        __func__);
 402                goto out_free_buf;
 403        }
 404        *portstr = '-';
 405
 406        /* find '.' between address and port */
 407        portstr = strrchr(buf, '.');
 408        if (!portstr) {
 409                dprintk("%s: Failed finding expected dot between address and "
 410                        "port\n", __func__);
 411                goto out_free_buf;
 412        }
 413        *portstr = '\0';
 414
 415        da = kzalloc(sizeof(*da), gfp_flags);
 416        if (unlikely(!da))
 417                goto out_free_buf;
 418
 419        INIT_LIST_HEAD(&da->da_node);
 420
 421        if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
 422                      sizeof(da->da_addr))) {
 423                dprintk("%s: error parsing address %s\n", __func__, buf);
 424                goto out_free_da;
 425        }
 426
 427        portstr++;
 428        sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
 429        port = htons((tmp[0] << 8) | (tmp[1]));
 430
 431        switch (da->da_addr.ss_family) {
 432        case AF_INET:
 433                ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
 434                da->da_addrlen = sizeof(struct sockaddr_in);
 435                match_netid = "tcp";
 436                match_netid_len = 3;
 437                break;
 438
 439        case AF_INET6:
 440                ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
 441                da->da_addrlen = sizeof(struct sockaddr_in6);
 442                match_netid = "tcp6";
 443                match_netid_len = 4;
 444                startsep = "[";
 445                endsep = "]";
 446                break;
 447
 448        default:
 449                dprintk("%s: unsupported address family: %u\n",
 450                        __func__, da->da_addr.ss_family);
 451                goto out_free_da;
 452        }
 453
 454        if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
 455                dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
 456                        __func__, netid, match_netid);
 457                goto out_free_da;
 458        }
 459
 460        /* save human readable address */
 461        len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
 462        da->da_remotestr = kzalloc(len, gfp_flags);
 463
 464        /* NULL is ok, only used for dprintk */
 465        if (da->da_remotestr)
 466                snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
 467                         buf, endsep, ntohs(port));
 468
 469        dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
 470        kfree(buf);
 471        kfree(netid);
 472        return da;
 473
 474out_free_da:
 475        kfree(da);
 476out_free_buf:
 477        dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
 478        kfree(buf);
 479out_free_netid:
 480        kfree(netid);
 481out_err:
 482        return NULL;
 483}
 484
 485/* Decode opaque device data and return the result */
 486static struct nfs4_file_layout_dsaddr*
 487decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
 488{
 489        int i;
 490        u32 cnt, num;
 491        u8 *indexp;
 492        __be32 *p;
 493        u8 *stripe_indices;
 494        u8 max_stripe_index;
 495        struct nfs4_file_layout_dsaddr *dsaddr = NULL;
 496        struct xdr_stream stream;
 497        struct xdr_buf buf;
 498        struct page *scratch;
 499        struct list_head dsaddrs;
 500        struct nfs4_pnfs_ds_addr *da;
 501
 502        /* set up xdr stream */
 503        scratch = alloc_page(gfp_flags);
 504        if (!scratch)
 505                goto out_err;
 506
 507        xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
 508        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 509
 510        /* Get the stripe count (number of stripe index) */
 511        p = xdr_inline_decode(&stream, 4);
 512        if (unlikely(!p))
 513                goto out_err_free_scratch;
 514
 515        cnt = be32_to_cpup(p);
 516        dprintk("%s stripe count  %d\n", __func__, cnt);
 517        if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
 518                printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
 519                       "supported maximum %d\n", __func__,
 520                        cnt, NFS4_PNFS_MAX_STRIPE_CNT);
 521                goto out_err_free_scratch;
 522        }
 523
 524        /* read stripe indices */
 525        stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
 526        if (!stripe_indices)
 527                goto out_err_free_scratch;
 528
 529        p = xdr_inline_decode(&stream, cnt << 2);
 530        if (unlikely(!p))
 531                goto out_err_free_stripe_indices;
 532
 533        indexp = &stripe_indices[0];
 534        max_stripe_index = 0;
 535        for (i = 0; i < cnt; i++) {
 536                *indexp = be32_to_cpup(p++);
 537                max_stripe_index = max(max_stripe_index, *indexp);
 538                indexp++;
 539        }
 540
 541        /* Check the multipath list count */
 542        p = xdr_inline_decode(&stream, 4);
 543        if (unlikely(!p))
 544                goto out_err_free_stripe_indices;
 545
 546        num = be32_to_cpup(p);
 547        dprintk("%s ds_num %u\n", __func__, num);
 548        if (num > NFS4_PNFS_MAX_MULTI_CNT) {
 549                printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
 550                        "supported maximum %d\n", __func__,
 551                        num, NFS4_PNFS_MAX_MULTI_CNT);
 552                goto out_err_free_stripe_indices;
 553        }
 554
 555        /* validate stripe indices are all < num */
 556        if (max_stripe_index >= num) {
 557                printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
 558                        __func__, max_stripe_index, num);
 559                goto out_err_free_stripe_indices;
 560        }
 561
 562        dsaddr = kzalloc(sizeof(*dsaddr) +
 563                        (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
 564                        gfp_flags);
 565        if (!dsaddr)
 566                goto out_err_free_stripe_indices;
 567
 568        dsaddr->stripe_count = cnt;
 569        dsaddr->stripe_indices = stripe_indices;
 570        stripe_indices = NULL;
 571        dsaddr->ds_num = num;
 572        nfs4_init_deviceid_node(&dsaddr->id_node,
 573                                NFS_SERVER(ino)->pnfs_curr_ld,
 574                                NFS_SERVER(ino)->nfs_client,
 575                                &pdev->dev_id);
 576
 577        INIT_LIST_HEAD(&dsaddrs);
 578
 579        for (i = 0; i < dsaddr->ds_num; i++) {
 580                int j;
 581                u32 mp_count;
 582
 583                p = xdr_inline_decode(&stream, 4);
 584                if (unlikely(!p))
 585                        goto out_err_free_deviceid;
 586
 587                mp_count = be32_to_cpup(p); /* multipath count */
 588                for (j = 0; j < mp_count; j++) {
 589                        da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
 590                                            &stream, gfp_flags);
 591                        if (da)
 592                                list_add_tail(&da->da_node, &dsaddrs);
 593                }
 594                if (list_empty(&dsaddrs)) {
 595                        dprintk("%s: no suitable DS addresses found\n",
 596                                __func__);
 597                        goto out_err_free_deviceid;
 598                }
 599
 600                dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
 601                if (!dsaddr->ds_list[i])
 602                        goto out_err_drain_dsaddrs;
 603
 604                /* If DS was already in cache, free ds addrs */
 605                while (!list_empty(&dsaddrs)) {
 606                        da = list_first_entry(&dsaddrs,
 607                                              struct nfs4_pnfs_ds_addr,
 608                                              da_node);
 609                        list_del_init(&da->da_node);
 610                        kfree(da->da_remotestr);
 611                        kfree(da);
 612                }
 613        }
 614
 615        __free_page(scratch);
 616        return dsaddr;
 617
 618out_err_drain_dsaddrs:
 619        while (!list_empty(&dsaddrs)) {
 620                da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
 621                                      da_node);
 622                list_del_init(&da->da_node);
 623                kfree(da->da_remotestr);
 624                kfree(da);
 625        }
 626out_err_free_deviceid:
 627        nfs4_fl_free_deviceid(dsaddr);
 628        /* stripe_indicies was part of dsaddr */
 629        goto out_err_free_scratch;
 630out_err_free_stripe_indices:
 631        kfree(stripe_indices);
 632out_err_free_scratch:
 633        __free_page(scratch);
 634out_err:
 635        dprintk("%s ERROR: returning NULL\n", __func__);
 636        return NULL;
 637}
 638
 639/*
 640 * Decode the opaque device specified in 'dev' and add it to the cache of
 641 * available devices.
 642 */
 643static struct nfs4_file_layout_dsaddr *
 644decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
 645{
 646        struct nfs4_deviceid_node *d;
 647        struct nfs4_file_layout_dsaddr *n, *new;
 648
 649        new = decode_device(inode, dev, gfp_flags);
 650        if (!new) {
 651                printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
 652                        __func__);
 653                return NULL;
 654        }
 655
 656        d = nfs4_insert_deviceid_node(&new->id_node);
 657        n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
 658        if (n != new) {
 659                nfs4_fl_free_deviceid(new);
 660                return n;
 661        }
 662
 663        return new;
 664}
 665
 666/*
 667 * Retrieve the information for dev_id, add it to the list
 668 * of available devices, and return it.
 669 */
 670struct nfs4_file_layout_dsaddr *
 671filelayout_get_device_info(struct inode *inode,
 672                struct nfs4_deviceid *dev_id,
 673                struct rpc_cred *cred,
 674                gfp_t gfp_flags)
 675{
 676        struct pnfs_device *pdev = NULL;
 677        u32 max_resp_sz;
 678        int max_pages;
 679        struct page **pages = NULL;
 680        struct nfs4_file_layout_dsaddr *dsaddr = NULL;
 681        int rc, i;
 682        struct nfs_server *server = NFS_SERVER(inode);
 683
 684        /*
 685         * Use the session max response size as the basis for setting
 686         * GETDEVICEINFO's maxcount
 687         */
 688        max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
 689        max_pages = nfs_page_array_len(0, max_resp_sz);
 690        dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
 691                __func__, inode, max_resp_sz, max_pages);
 692
 693        pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
 694        if (pdev == NULL)
 695                return NULL;
 696
 697        pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
 698        if (pages == NULL) {
 699                kfree(pdev);
 700                return NULL;
 701        }
 702        for (i = 0; i < max_pages; i++) {
 703                pages[i] = alloc_page(gfp_flags);
 704                if (!pages[i])
 705                        goto out_free;
 706        }
 707
 708        memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
 709        pdev->layout_type = LAYOUT_NFSV4_1_FILES;
 710        pdev->pages = pages;
 711        pdev->pgbase = 0;
 712        pdev->pglen = max_resp_sz;
 713        pdev->mincount = 0;
 714        pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
 715
 716        rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
 717        dprintk("%s getdevice info returns %d\n", __func__, rc);
 718        if (rc)
 719                goto out_free;
 720
 721        /*
 722         * Found new device, need to decode it and then add it to the
 723         * list of known devices for this mountpoint.
 724         */
 725        dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
 726out_free:
 727        for (i = 0; i < max_pages; i++)
 728                __free_page(pages[i]);
 729        kfree(pages);
 730        kfree(pdev);
 731        dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
 732        return dsaddr;
 733}
 734
 735void
 736nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 737{
 738        nfs4_put_deviceid_node(&dsaddr->id_node);
 739}
 740
 741/*
 742 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
 743 * Then: ((res + fsi) % dsaddr->stripe_count)
 744 */
 745u32
 746nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
 747{
 748        struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
 749        u64 tmp;
 750
 751        tmp = offset - flseg->pattern_offset;
 752        do_div(tmp, flseg->stripe_unit);
 753        tmp += flseg->first_stripe_index;
 754        return do_div(tmp, flseg->dsaddr->stripe_count);
 755}
 756
 757u32
 758nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
 759{
 760        return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
 761}
 762
 763struct nfs_fh *
 764nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
 765{
 766        struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
 767        u32 i;
 768
 769        if (flseg->stripe_type == STRIPE_SPARSE) {
 770                if (flseg->num_fh == 1)
 771                        i = 0;
 772                else if (flseg->num_fh == 0)
 773                        /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
 774                        return NULL;
 775                else
 776                        i = nfs4_fl_calc_ds_index(lseg, j);
 777        } else
 778                i = j;
 779        return flseg->fh_array[i];
 780}
 781
 782static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
 783{
 784        might_sleep();
 785        wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
 786                        nfs_wait_bit_killable, TASK_KILLABLE);
 787}
 788
 789static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
 790{
 791        smp_mb__before_clear_bit();
 792        clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
 793        smp_mb__after_clear_bit();
 794        wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
 795}
 796
 797
 798struct nfs4_pnfs_ds *
 799nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
 800{
 801        struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
 802        struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
 803        struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
 804
 805        if (filelayout_test_devid_unavailable(devid))
 806                return NULL;
 807
 808        if (ds == NULL) {
 809                printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
 810                        __func__, ds_idx);
 811                filelayout_mark_devid_invalid(devid);
 812                return NULL;
 813        }
 814        if (ds->ds_clp)
 815                return ds;
 816
 817        if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
 818                struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
 819                int err;
 820
 821                err = nfs4_ds_connect(s, ds);
 822                if (err) {
 823                        nfs4_mark_deviceid_unavailable(devid);
 824                        ds = NULL;
 825                }
 826                nfs4_clear_ds_conn_bit(ds);
 827        } else {
 828                /* Either ds is connected, or ds is NULL */
 829                nfs4_wait_ds_connect(ds);
 830        }
 831        return ds;
 832}
 833
 834module_param(dataserver_retrans, uint, 0644);
 835MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
 836                        "retries a request before it attempts further "
 837                        " recovery  action.");
 838module_param(dataserver_timeo, uint, 0644);
 839MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
 840                        "NFSv4.1  client  waits for a response from a "
 841                        " data server before it retries an NFS request.");
 842