linux/net/sunrpc/xprtrdma/fmr_ops.c
/*
 * Copyright (c) 2015 Oracle.  All rights reserved.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 */

/* Lightweight memory registration using Fast Memory Regions (FMR).
 * Sometimes referred to as MTHCAFMR mode.
 *
 * FMR uses synchronous memory registration and deregistration.
 * FMR registration is known to be fast, but FMR deregistration
 * can take tens of usecs to complete.
 */

/* Normal operation
 *
 * A Memory Region is prepared for RDMA READ or WRITE using the
 * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
 * finished, the Memory Region is unmapped using the ib_unmap_fmr
 * verb (fmr_op_unmap).
 */

#include "xprt_rdma.h"

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY        RPCDBG_TRANS
#endif

/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES    (64)

/* Access mode of externally registered pages */
enum {
        RPCRDMA_FMR_ACCESS_FLAGS        = IB_ACCESS_REMOTE_WRITE |
                                          IB_ACCESS_REMOTE_READ,
};

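/* FMR mode is available only if the device provides the
 * alloc_fmr verb.
 */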
bool
fmr_is_supported(struct rpcrdma_ia *ia)
{
        if (!ia->ri_device->alloc_fmr) {
                pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
                        ia->ri_device->name);
                return false;
        }
        return true;
}

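/* Allocate the resources backing one rpcrdma_mw: an array for
 * page DMA addresses, a scatterlist, and the FMR itself.
 */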
static int
fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw)
{
        static struct ib_fmr_attr fmr_attr = {
                .max_pages      = RPCRDMA_MAX_FMR_SGES,
                .max_maps       = 1,
                .page_shift     = PAGE_SHIFT
        };

        mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
                                       sizeof(u64), GFP_KERNEL);
        if (!mw->fmr.fm_physaddrs)
                goto out_free;

        mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
                            sizeof(*mw->mw_sg), GFP_KERNEL);
        if (!mw->mw_sg)
                goto out_free;

        sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES);

        mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
                                     &fmr_attr);
        if (IS_ERR(mw->fmr.fm_mr))
                goto out_fmr_err;

        return 0;

out_fmr_err:
        dprintk("RPC:       %s: ib_alloc_fmr returned %ld\n", __func__,
                PTR_ERR(mw->fmr.fm_mr));

out_free:
        kfree(mw->mw_sg);
        kfree(mw->fmr.fm_physaddrs);
        return -ENOMEM;
}

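/* Invalidate a single FMR. The ib_unmap_fmr verb takes a list of
 * FMRs, so place this one on a temporary list for the call.
 */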
static int
__fmr_unmap(struct rpcrdma_mw *mw)
{
        LIST_HEAD(l);
        int rc;

        list_add(&mw->fmr.fm_mr->list, &l);
        rc = ib_unmap_fmr(&l);
        list_del_init(&mw->fmr.fm_mr->list);
        return rc;
}

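/* Release all resources associated with one rpcrdma_mw,
 * including the FMR itself.
 */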
static void
fmr_op_release_mr(struct rpcrdma_mw *r)
{
        LIST_HEAD(unmap_list);
        int rc;

        /* Ensure MW is not on any rl_registered list */
        if (!list_empty(&r->mw_list))
                list_del(&r->mw_list);

        kfree(r->fmr.fm_physaddrs);
        kfree(r->mw_sg);

        /* In case this one was left mapped, try to unmap it
         * to prevent dealloc_fmr from failing with EBUSY
         */
        rc = __fmr_unmap(r);
        if (rc)
                pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
                       r, rc);

        rc = ib_dealloc_fmr(r->fmr.fm_mr);
        if (rc)
                pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
                       r, rc);

        kfree(r);
}

/* Reset of a single FMR.
 */
static void
fmr_op_recover_mr(struct rpcrdma_mw *mw)
{
        struct rpcrdma_xprt *r_xprt = mw->mw_xprt;
        int rc;

        /* ORDER: invalidate first */
        rc = __fmr_unmap(mw);

        /* ORDER: then DMA unmap */
        ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
                        mw->mw_sg, mw->mw_nents, mw->mw_dir);
        if (rc)
                goto out_release;

        rpcrdma_put_mw(r_xprt, mw);
        r_xprt->rx_stats.mrs_recovered++;
        return;

out_release:
        pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw);
        r_xprt->rx_stats.mrs_orphaned++;

        spin_lock(&r_xprt->rx_buf.rb_mwlock);
        list_del(&mw->mw_all);
        spin_unlock(&r_xprt->rx_buf.rb_mwlock);

        fmr_op_release_mr(mw);
}

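/* On transport open, set the maximum number of chunk segments
 * needed to convey RPCRDMA_MAX_DATA_SEGS pages when each FMR
 * maps up to RPCRDMA_MAX_FMR_SGES pages.
 */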
static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
            struct rpcrdma_create_data_internal *cdata)
{
        ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
                                RPCRDMA_MAX_FMR_SGES);
        return 0;
}

/* FMR mode conveys up to 64 pages of payload per chunk segment.
 */
static size_t
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
        return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
                     RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}

/* Use the ib_map_phys_fmr() verb to register a memory region
 * for remote access via RDMA READ or RDMA WRITE.
 */
static int
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
           int nsegs, bool writing, struct rpcrdma_mw **out)
{
        struct rpcrdma_mr_seg *seg1 = seg;
        int len, pageoff, i, rc;
        struct rpcrdma_mw *mw;
        u64 *dma_pages;

        mw = rpcrdma_get_mw(r_xprt);
        if (!mw)
                return -ENOBUFS;

        pageoff = offset_in_page(seg1->mr_offset);
        seg1->mr_offset -= pageoff;     /* start of page */
        seg1->mr_len += pageoff;
        len = -pageoff;
        if (nsegs > RPCRDMA_MAX_FMR_SGES)
                nsegs = RPCRDMA_MAX_FMR_SGES;
        for (i = 0; i < nsegs;) {
                if (seg->mr_page)
                        sg_set_page(&mw->mw_sg[i],
                                    seg->mr_page,
                                    seg->mr_len,
                                    offset_in_page(seg->mr_offset));
                else
                        sg_set_buf(&mw->mw_sg[i], seg->mr_offset,
                                   seg->mr_len);
                len += seg->mr_len;
                ++seg;
                ++i;
                /* Check for holes */
                if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
                    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
                        break;
        }
        mw->mw_nents = i;
        mw->mw_dir = rpcrdma_data_dir(writing);
        if (i == 0)
                goto out_dmamap_err;

        if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device,
                           mw->mw_sg, mw->mw_nents, mw->mw_dir))
                goto out_dmamap_err;

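        /* Pass the page DMA addresses to the device to build the
         * FMR mapping. The resulting rkey becomes the handle that
         * is conveyed in this RPC's chunk lists.
         */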
        for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++)
                dma_pages[i] = sg_dma_address(&mw->mw_sg[i]);
        rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents,
                             dma_pages[0]);
        if (rc)
                goto out_maperr;

        mw->mw_handle = mw->fmr.fm_mr->rkey;
        mw->mw_length = len;
        mw->mw_offset = dma_pages[0] + pageoff;

        *out = mw;
        return mw->mw_nents;

out_dmamap_err:
        pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n",
               mw->mw_sg, mw->mw_nents);
        rpcrdma_defer_mr_recovery(mw);
        return -EIO;

out_maperr:
        pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
               len, (unsigned long long)dma_pages[0],
               pageoff, mw->mw_nents, rc);
        rpcrdma_defer_mr_recovery(mw);
        return -EIO;
}

/* Invalidate all memory regions that were registered for "req".
 *
 * Sleeps until it is safe for the host CPU to access the
 * previously mapped memory regions.
 *
 * Caller ensures that req->rl_registered is not empty.
 */
static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
        struct rpcrdma_mw *mw, *tmp;
        LIST_HEAD(unmap_list);
        int rc;

        dprintk("RPC:       %s: req %p\n", __func__, req);

        /* ORDER: Invalidate all of the req's MRs first
         *
         * ib_unmap_fmr() is slow, so use a single call instead
         * of one call per mapped FMR.
         */
        list_for_each_entry(mw, &req->rl_registered, mw_list)
                list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
        r_xprt->rx_stats.local_inv_needed++;
        rc = ib_unmap_fmr(&unmap_list);
        if (rc)
                goto out_reset;

        /* ORDER: Now DMA unmap all of the req's MRs, and return
         * them to the free MW list.
         */
        list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
                list_del_init(&mw->mw_list);
                list_del_init(&mw->fmr.fm_mr->list);
                ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
                                mw->mw_sg, mw->mw_nents, mw->mw_dir);
                rpcrdma_put_mw(r_xprt, mw);
        }

        return;

out_reset:
        pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);

        list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
                list_del_init(&mw->fmr.fm_mr->list);
                fmr_op_recover_mr(mw);
        }
}

/* Use a slow, safe mechanism to invalidate all memory regions
 * that were registered for "req".
 */
static void
fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
                  bool sync)
{
        struct rpcrdma_mw *mw;

        while (!list_empty(&req->rl_registered)) {
                mw = rpcrdma_pop_mw(&req->rl_registered);
                if (sync)
                        fmr_op_recover_mr(mw);
                else
                        rpcrdma_defer_mr_recovery(mw);
        }
}

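/* Handlers invoked by the generic transport code when FMR
 * memory registration is in use.
 */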
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
        .ro_map                         = fmr_op_map,
        .ro_unmap_sync                  = fmr_op_unmap_sync,
        .ro_unmap_safe                  = fmr_op_unmap_safe,
        .ro_recover_mr                  = fmr_op_recover_mr,
        .ro_open                        = fmr_op_open,
        .ro_maxpages                    = fmr_op_maxpages,
        .ro_init_mr                     = fmr_op_init_mr,
        .ro_release_mr                  = fmr_op_release_mr,
        .ro_displayname                 = "fmr",
        .ro_send_w_inv_ok               = 0,
};