linux/net/sunrpc/xprtrdma/xprt_rdma.h
/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
#define _LINUX_SUNRPC_XPRT_RDMA_H

#include <linux/wait.h>			/* wait_queue_head_t, etc */
#include <linux/spinlock.h>		/* spinlock_t, etc */
#include <linux/atomic.h>		/* atomic_t, etc */

#include <rdma/rdma_cm.h>		/* RDMA connection api */
#include <rdma/ib_verbs.h>		/* RDMA verbs api */

#include <linux/sunrpc/clnt.h>		/* rpc_xprt */
#include <linux/sunrpc/rpc_rdma.h>	/* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h>	/* xprt parameters */

#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */

/*
 * Interface Adapter -- one per transport instance
 */
struct rpcrdma_ia {
	struct rdma_cm_id	*ri_id;
	struct ib_pd		*ri_pd;
	struct ib_mr		*ri_bind_mem;
	u32			ri_dma_lkey;
	int			ri_have_dma_lkey;
	struct completion	ri_done;
	int			ri_async_rc;
	enum rpcrdma_memreg	ri_memreg_strategy;
};

/*
 * RDMA Endpoint -- one per transport instance
 */

struct rpcrdma_ep {
	atomic_t		rep_cqcount;
	int			rep_cqinit;
	int			rep_connected;
	struct rpcrdma_ia	*rep_ia;
	struct ib_cq		*rep_cq;
	struct ib_qp_init_attr	rep_attr;
	wait_queue_head_t	rep_connect_wait;
	struct ib_sge		rep_pad;	/* holds zeroed pad */
	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
	void			(*rep_func)(struct rpcrdma_ep *);
	struct rpc_xprt		*rep_xprt;	/* for rep_func */
	struct rdma_conn_param	rep_remote_cma;
	struct sockaddr_storage	rep_remote_addr;
};

#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
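
/*
 * INIT_CQCOUNT/DECR_CQCOUNT implement completion-signal mitigation:
 * rather than requesting a completion for every send, the send path
 * counts down from rep_cqinit and asks for a signalled completion
 * only when the counter runs out.  A sketch of the pattern (the
 * authoritative logic lives in verbs.c):
 *
 *	if (DECR_CQCOUNT(ep) > 0)
 *		send_wr.send_flags = 0;		/- unsignalled send
 *	else {
 *		INIT_CQCOUNT(ep);		/- reset the countdown
 *		send_wr.send_flags = IB_SEND_SIGNALED;
 *	}
 */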

/*
 * struct rpcrdma_rep -- this structure encapsulates state required to recv
 * and complete a reply, asynchronously. It needs several pieces of
 * state:
 *   o recv buffer (posted to provider)
 *   o ib_sge (also donated to provider)
 *   o status of reply (length, success or not)
 *   o bookkeeping state to get run by tasklet (list, etc)
 *
 * These are allocated during initialization, per-transport instance;
 * however, the tasklet execution list itself is global, as it should
 * always be pretty short.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 */

/* temporary static scatter/gather max */
#define RPCRDMA_MAX_DATA_SEGS	(64)	/* max scatter/gather */
#define RPCRDMA_MAX_SEGS	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
#define MAX_RPCRDMAHDR	(\
	/* max supported RPC/RDMA header */ \
	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
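
/*
 * MAX_RPCRDMAHDR sizes the worst-case transport header: the fixed
 * RPC/RDMA header (struct rpcrdma_msg), a read chunk list entry for
 * every supported segment, and (on our reading of the arithmetic)
 * extra u32 words for the XDR chunk-list discriminator/terminator
 * positions.  With RPCRDMA_MAX_DATA_SEGS = 64, that reserves room
 * for 66 read chunk entries.
 */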

struct rpcrdma_buffer;

struct rpcrdma_rep {
	unsigned int	rr_len;		/* actual received reply length */
	struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
	struct list_head rr_list;	/* tasklet list */
	wait_queue_head_t rr_unbind;	/* optional unbind wait */
	struct ib_sge	rr_iov;		/* for posting */
	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
};

/*
 * struct rpcrdma_req -- structure central to the request/reply sequence.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 *
 * It includes pre-registered buffer memory for send AND recv.
 * The recv buffer, however, is not owned by this structure, and
 * is "donated" to the hardware when a recv is posted. When a
 * reply is handled, the recv buffer used is given back to the
 * struct rpcrdma_req associated with the request.
 *
 * In addition to the basic memory, this structure includes an array
 * of iovs for send operations. The reason is that the iovs passed to
 * ib_post_{send,recv} must not be modified until the work request
 * completes.
 *
 * NOTES:
 *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
 *     marshal. The number needed varies depending on the iov lists that
 *     are passed to us, the memory registration mode we are in, and,
 *     if physical addressing is used, the layout.
 */

struct rpcrdma_mr_seg {		/* chunk descriptors */
	union {				/* chunk memory handles */
		struct ib_mr	*rl_mr;		/* if registered directly */
		struct rpcrdma_mw {		/* if registered from region */
			union {
				struct ib_mw	*mw;
				struct ib_fmr	*fmr;
				struct {
					struct ib_fast_reg_page_list *fr_pgl;
					struct ib_mr *fr_mr;
					enum { FRMR_IS_INVALID, FRMR_IS_VALID } state;
				} frmr;
			} r;
			struct list_head mw_list;
		} *rl_mw;
	} mr_chunk;
	u64		mr_base;	/* registration result */
	u32		mr_rkey;	/* registration result */
	u32		mr_len;		/* length of chunk or segment */
	int		mr_nsegs;	/* number of segments in chunk or 0 */
	enum dma_data_direction mr_dir;	/* segment mapping direction */
	dma_addr_t	mr_dma;		/* segment mapping address */
	size_t		mr_dmalen;	/* segment mapping length */
	struct page	*mr_page;	/* owning page, if any */
	char		*mr_offset;	/* kva if no page, else offset */
};
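
/*
 * Which member of the rl_mw union is live follows the interface
 * adapter's ri_memreg_strategy: "mw" for memory windows, "fmr" for
 * Fast Memory Regions, and "frmr" for fast registration work
 * requests, which additionally track whether the MR currently holds
 * a valid registration.  This note is descriptive; the registration
 * paths in verbs.c are the authority.
 */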

struct rpcrdma_req {
	size_t		rl_size;	/* actual length of buffer */
	unsigned int	rl_niovs;	/* 0, 2 or 4 */
	unsigned int	rl_nchunks;	/* non-zero if chunks */
	unsigned int	rl_connect_cookie;	/* retry detection */
	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
	struct ib_sge	rl_send_iov[4];	/* for active requests */
	struct ib_sge	rl_iov;		/* for posting */
	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
	char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
	__u32		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
};
#define rpcr_to_rdmar(r) \
	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
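
/*
 * The transport's buffer allocator hands out rl_xdr_buf as the RPC
 * request's rq_buffer, so rpcr_to_rdmar() can recover the owning
 * rpcrdma_req from any rpc_rqst.  Typical use (sketch):
 *
 *	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 */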

/*
 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
 * inline requests/replies, and client/server credits.
 *
 * One of these is associated with a transport instance.
 */
struct rpcrdma_buffer {
	spinlock_t	rb_lock;	/* protects indexes */
	atomic_t	rb_credits;	/* most recent server credits */
	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
	int		rb_max_requests;/* client max requests */
	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
	int		rb_send_index;
	struct rpcrdma_req	**rb_send_bufs;
	int		rb_recv_index;
	struct rpcrdma_rep	**rb_recv_bufs;
	char		*rb_pool;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
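
/*
 * rdmab_to_ia() relies on rx_buf being embedded in struct rpcrdma_xprt
 * (defined below), so a buffer pointer alone is enough to reach the
 * transport's interface adapter without a back-pointer.
 */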

/*
 * Internal structure for transport instance creation. This
 * exists primarily for modularity.
 *
 * This data should be set with mount options.
 */
struct rpcrdma_create_data_internal {
	struct sockaddr_storage	addr;	/* RDMA server address */
	unsigned int	max_requests;	/* max requests (slots) in flight */
	unsigned int	rsize;		/* mount rsize - max read hdr+data */
	unsigned int	wsize;		/* mount wsize - max write hdr+data */
	unsigned int	inline_rsize;	/* max non-rdma read data payload */
	unsigned int	inline_wsize;	/* max non-rdma write data payload */
	unsigned int	padding;	/* non-rdma write header padding */
};

#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)

#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)

#define RPCRDMA_INLINE_PAD_VALUE(rq)\
	rpcx_to_rdmad(rq->rq_task->tk_xprt).padding

/*
 * Statistics for RPCRDMA
 */
struct rpcrdma_stats {
	unsigned long		read_chunk_count;
	unsigned long		write_chunk_count;
	unsigned long		reply_chunk_count;

	unsigned long long	total_rdma_request;
	unsigned long long	total_rdma_reply;

	unsigned long long	pullup_copy_count;
	unsigned long long	fixup_copy_count;
	unsigned long		hardway_register_count;
	unsigned long		failed_marshal_count;
	unsigned long		bad_reply_count;
};

/*
 * RPCRDMA transport -- encapsulates the structures above for
 * integration with RPC.
 *
 * The contained structures are embedded, not pointers,
 * for convenience. This structure need not be visible externally.
 *
 * It is allocated and initialized during mount, and released
 * during unmount.
 */
struct rpcrdma_xprt {
	struct rpc_xprt		xprt;
	struct rpcrdma_ia	rx_ia;
	struct rpcrdma_ep	rx_ep;
	struct rpcrdma_buffer	rx_buf;
	struct rpcrdma_create_data_internal rx_data;
	struct delayed_work	rdma_connect;
	struct rpcrdma_stats	rx_stats;
};

#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
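
/*
 * rpcx_to_rdmax() recovers the containing rpcrdma_xprt from a generic
 * rpc_xprt, and rpcx_to_rdmad() then yields its creation data, e.g.
 * (sketch):
 *
 *	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 *	unsigned int rsize = rpcx_to_rdmad(xprt).inline_rsize;
 */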

/* Setting this to 0 ensures interoperability with early servers.
 * Setting this to 1 enhances certain unaligned read/write performance.
 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;

/*
 * Interface Adapter calls - xprtrdma/verbs.c
 */
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);

/*
 * Endpoint calls - xprtrdma/verbs.c
 */
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);

int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_req *);
int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_rep *);

/*
 * Buffer calls - xprtrdma/verbs.c
 */
int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
				struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);

struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);

int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
				struct ib_mr **, struct ib_sge *);
int rpcrdma_deregister_internal(struct rpcrdma_ia *,
				struct ib_mr *, struct ib_sge *);

int rpcrdma_register_external(struct rpcrdma_mr_seg *,
				int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
				struct rpcrdma_xprt *, void *);

/*
 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 */
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);

/*
 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
 */
int rpcrdma_marshal_req(struct rpc_rqst *);

/* Temporary NFS request map cache. Created in svc_rdma.c  */
extern struct kmem_cache *svc_rdma_map_cachep;
/* WR context cache. Created in svc_rdma.c  */
extern struct kmem_cache *svc_rdma_ctxt_cachep;
/* Workqueue created in svc_rdma.c */
extern struct workqueue_struct *svc_rdma_wq;

#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */