linux/net/sunrpc/xprtrdma/xprt_rdma.h
/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
#define _LINUX_SUNRPC_XPRT_RDMA_H

#include <linux/wait.h>			/* wait_queue_head_t, etc */
#include <linux/spinlock.h>		/* spinlock_t, etc */
#include <asm/atomic.h>			/* atomic_t, etc */

#include <rdma/rdma_cm.h>		/* RDMA connection api */
#include <rdma/ib_verbs.h>		/* RDMA verbs api */

#include <linux/sunrpc/clnt.h>		/* rpc_xprt */
#include <linux/sunrpc/rpc_rdma.h>	/* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h>	/* xprt parameters */

#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */

/*
 * Interface Adapter -- one per transport instance
 */
struct rpcrdma_ia {
	struct rdma_cm_id	*ri_id;
	struct ib_pd		*ri_pd;
	struct ib_mr		*ri_bind_mem;
	u32			ri_dma_lkey;
	int			ri_have_dma_lkey;
	struct completion	ri_done;
	int			ri_async_rc;
	enum rpcrdma_memreg	ri_memreg_strategy;
};

/*
 * RDMA Endpoint -- one per transport instance
 */

struct rpcrdma_ep {
	atomic_t		rep_cqcount;
	int			rep_cqinit;
	int			rep_connected;
	struct rpcrdma_ia	*rep_ia;
	struct ib_cq		*rep_cq;
	struct ib_qp_init_attr	rep_attr;
	wait_queue_head_t	rep_connect_wait;
	struct ib_sge		rep_pad;	/* holds zeroed pad */
	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
	void			(*rep_func)(struct rpcrdma_ep *);
	struct rpc_xprt		*rep_xprt;	/* for rep_func */
	struct rdma_conn_param	rep_remote_cma;
	struct sockaddr_storage	rep_remote_addr;
};

#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
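
/*
 * How these are used (a sketch after xprtrdma/verbs.c, illustrative
 * rather than verbatim): the send path posts unsignaled work requests
 * until rep_cqcount is exhausted, then requests a signaled completion
 * and resets the count, bounding completion-queue interrupt load:
 *
 *	if (DECR_CQCOUNT(ep) > 0)
 *		send_wr.send_flags = 0;			// unsignaled
 *	else {
 *		INIT_CQCOUNT(ep);			// re-arm the count
 *		send_wr.send_flags = IB_SEND_SIGNALED;	// ask for completion
 *	}
 */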

/*
 * struct rpcrdma_rep -- this structure encapsulates state required to recv
 * and complete a reply, asynchronously. It needs several pieces of
 * state:
 *   o recv buffer (posted to provider)
 *   o ib_sge (also donated to provider)
 *   o status of reply (length, success or not)
 *   o bookkeeping state to get run by tasklet (list, etc)
 *
 * These are allocated during initialization, per-transport instance;
 * however, the tasklet execution list itself is global, as it should
 * always be pretty short.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 */

/* temporary static scatter/gather max */
#define RPCRDMA_MAX_DATA_SEGS	(8)	/* max scatter/gather */
#define RPCRDMA_MAX_SEGS	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
#define MAX_RPCRDMAHDR	(\
	/* max supported RPC/RDMA header */ \
	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
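
/*
 * One reading of the bound above (offered for clarity; the expression
 * itself is authoritative): the fixed transport header, two 32-bit
 * chunk-list discriminators, a worst-case read chunk list with one
 * entry per segment, and a final 32-bit list terminator.
 */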

struct rpcrdma_buffer;

struct rpcrdma_rep {
	unsigned int	rr_len;		/* actual received reply length */
	struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
	struct list_head rr_list;	/* tasklet list */
	wait_queue_head_t rr_unbind;	/* optional unbind wait */
	struct ib_sge	rr_iov;		/* for posting */
	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
};

/*
 * struct rpcrdma_req -- structure central to the request/reply sequence.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 *
 * It includes pre-registered buffer memory for send AND recv.
 * The recv buffer, however, is not owned by this structure, and
 * is "donated" to the hardware when a recv is posted. When a
 * reply is handled, the recv buffer used is given back to the
 * struct rpcrdma_req associated with the request.
 *
 * In addition to the basic memory, this structure includes an array
 * of iovs for send operations. The reason is that the iovs passed to
 * ib_post_{send,recv} must not be modified until the work request
 * completes.
 *
 * NOTES:
 *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
 *     marshal. The number needed varies depending on the iov lists that
 *     are passed to us, the memory registration mode we are in, and if
 *     physical addressing is used, the layout.
 */

struct rpcrdma_mr_seg {		/* chunk descriptors */
	union {				/* chunk memory handles */
		struct ib_mr	*rl_mr;		/* if registered directly */
		struct rpcrdma_mw {		/* if registered from region */
			union {
				struct ib_mw	*mw;
				struct ib_fmr	*fmr;
				struct {
					struct ib_fast_reg_page_list *fr_pgl;
					struct ib_mr *fr_mr;
				} frmr;
			} r;
			struct list_head mw_list;
		} *rl_mw;
	} mr_chunk;
	u64		mr_base;	/* registration result */
	u32		mr_rkey;	/* registration result */
	u32		mr_len;		/* length of chunk or segment */
	int		mr_nsegs;	/* number of segments in chunk or 0 */
	enum dma_data_direction mr_dir;	/* segment mapping direction */
	dma_addr_t	mr_dma;		/* segment mapping address */
	size_t		mr_dmalen;	/* segment mapping length */
	struct page	*mr_page;	/* owning page, if any */
	char		*mr_offset;	/* kva if no page, else offset */
};

struct rpcrdma_req {
	size_t		rl_size;	/* actual length of buffer */
	unsigned int	rl_niovs;	/* 0, 2 or 4 */
	unsigned int	rl_nchunks;	/* non-zero if chunks */
	unsigned int	rl_connect_cookie;	/* retry detection */
	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
	struct ib_sge	rl_send_iov[4];	/* for active requests */
	struct ib_sge	rl_iov;		/* for posting */
	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
	char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
	__u32		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
};
#define rpcr_to_rdmar(r) \
	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])
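
/*
 * The RPC buffer handed to the upper layers points at rl_xdr_buf[]
 * inside the containing rpcrdma_req, so container_of() can walk back
 * to the request.  An illustrative use, assuming rqst is a struct
 * rpc_rqst * whose buffer this transport allocated:
 *
 *	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 */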

/*
 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
 * inline requests/replies, and client/server credits.
 *
 * One of these is associated with a transport instance
 */
struct rpcrdma_buffer {
	spinlock_t	rb_lock;	/* protects indexes */
	atomic_t	rb_credits;	/* most recent server credits */
	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
	int		rb_max_requests;/* client max requests */
	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
	int		rb_send_index;
	struct rpcrdma_req	**rb_send_bufs;
	int		rb_recv_index;
	struct rpcrdma_rep	**rb_recv_bufs;
	char		*rb_pool;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
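
/*
 * Same container_of() idiom as rpcr_to_rdmar(): walk from the embedded
 * rx_buf back to the owning rpcrdma_xprt, then take its interface
 * adapter.  An illustrative use from buffer management code:
 *
 *	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
 */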

/*
 * Internal structure for transport instance creation. This
 * exists primarily for modularity.
 *
 * This data should be set with mount options
 */
struct rpcrdma_create_data_internal {
	struct sockaddr_storage	addr;	/* RDMA server address */
	unsigned int	max_requests;	/* max requests (slots) in flight */
	unsigned int	rsize;		/* mount rsize - max read hdr+data */
	unsigned int	wsize;		/* mount wsize - max write hdr+data */
	unsigned int	inline_rsize;	/* max non-rdma read data payload */
	unsigned int	inline_wsize;	/* max non-rdma write data payload */
	unsigned int	padding;	/* non-rdma write header padding */
};

#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)

#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)

#define RPCRDMA_INLINE_PAD_VALUE(rq)\
	rpcx_to_rdmad(rq->rq_task->tk_xprt).padding
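
/*
 * Illustrative use of the thresholds (compare the marshaling logic in
 * rpc_rdma.c; this sketch is simplified): payloads at or below the
 * write threshold are sent inline, larger ones move via chunks.
 *
 *	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
 *		...send the whole request inline...
 *	else
 *		...marshal read chunks so the server can pull the data...
 */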

/*
 * Statistics for RPCRDMA
 */
struct rpcrdma_stats {
	unsigned long		read_chunk_count;
	unsigned long		write_chunk_count;
	unsigned long		reply_chunk_count;

	unsigned long long	total_rdma_request;
	unsigned long long	total_rdma_reply;

	unsigned long long	pullup_copy_count;
	unsigned long long	fixup_copy_count;
	unsigned long		hardway_register_count;
	unsigned long		failed_marshal_count;
	unsigned long		bad_reply_count;
};

/*
 * RPCRDMA transport -- encapsulates the structures above for
 * integration with RPC.
 *
 * The contained structures are embedded, not pointers,
 * for convenience. This structure need not be visible externally.
 *
 * It is allocated and initialized during mount, and released
 * during unmount.
 */
struct rpcrdma_xprt {
	struct rpc_xprt		xprt;
	struct rpcrdma_ia	rx_ia;
	struct rpcrdma_ep	rx_ep;
	struct rpcrdma_buffer	rx_buf;
	struct rpcrdma_create_data_internal rx_data;
	struct delayed_work	rdma_connect;
	struct rpcrdma_stats	rx_stats;
};

#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
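
/*
 * Typical use in transport methods, which are handed a struct rpc_xprt *
 * and need the RDMA-specific state (an illustrative sketch, not copied
 * verbatim from transport.c):
 *
 *	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 *
 * r_xprt->rx_ep, r_xprt->rx_buf and rpcx_to_rdmad(xprt).rsize are then
 * available directly.
 */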

/* Setting this to 0 ensures interoperability with early servers.
 * Setting this to 1 enhances certain unaligned read/write performance.
 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;

/*
 * Interface Adapter calls - xprtrdma/verbs.c
 */
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);

/*
 * Endpoint calls - xprtrdma/verbs.c
 */
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);

int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_req *);
int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_rep *);

/*
 * Buffer calls - xprtrdma/verbs.c
 */
int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
				struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);

struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);

int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
				struct ib_mr **, struct ib_sge *);
int rpcrdma_deregister_internal(struct rpcrdma_ia *,
				struct ib_mr *, struct ib_sge *);

int rpcrdma_register_external(struct rpcrdma_mr_seg *,
				int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
				struct rpcrdma_xprt *, void *);
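
/*
 * A hedged sketch of the buffer life cycle: one pre-registered send
 * buffer (with its paired receive buffer) is taken per outgoing call
 * and returned once the reply has been processed.  The local names
 * below are illustrative only.
 *
 *	struct rpcrdma_req *req;
 *
 *	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 *	...marshal the call, then post it via rpcrdma_ep_post()...
 *	rpcrdma_buffer_put(req);	// on completion or teardown
 */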

/*
 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 */
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);

/*
 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
 */
int rpcrdma_marshal_req(struct rpc_rqst *);

#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */