1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50#include <linux/pci.h>
51#include <linux/slab.h>
52
53#include "xprt_rdma.h"
54
55
56
57
58
59#ifdef RPC_DEBUG
60# define RPCDBG_FACILITY RPCDBG_TRANS
61#endif
62
63
64
65
66
67
68
69
70
71
72
/*
 * Module-global lock and list funneling completed receives into the
 * reply tasklet below; shared by all transport instances.
 */
static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);
75
76static void
77rpcrdma_run_tasklet(unsigned long data)
78{
79 struct rpcrdma_rep *rep;
80 void (*func)(struct rpcrdma_rep *);
81 unsigned long flags;
82
83 data = data;
84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85 while (!list_empty(&rpcrdma_tasklets_g)) {
86 rep = list_entry(rpcrdma_tasklets_g.next,
87 struct rpcrdma_rep, rr_list);
88 list_del(&rep->rr_list);
89 func = rep->rr_func;
90 rep->rr_func = NULL;
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92
93 if (func)
94 func(rep);
95 else
96 rpcrdma_recv_buffer_put(rep);
97
98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 }
100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
101}
102
/* Single module-wide tasklet; the 0UL cookie is unused by the handler. */
static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
/*
 * Queue @rep on the global reply list and kick the reply tasklet.
 * Callable from (hard) interrupt context: the list is protected by an
 * irqsave spinlock, and tasklet_schedule() is interrupt-safe.
 */
static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}
115
/*
 * Asynchronous QP error handler (ep->rep_attr.event_handler).
 * If the endpoint is currently connected (rep_connected == 1), mark it
 * failed with -EIO, notify the transport via rep_func, and wake anyone
 * blocked in connect/disconnect waits.
 */
static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}
129
/*
 * Asynchronous CQ error handler (passed to ib_create_cq).
 * Same recovery action as the QP error upcall above: flag the endpoint
 * as failed and wake waiters.
 */
static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}
143
/*
 * Dispatch one work completion.
 *
 * wr_id carries either an rpcrdma_rep (receives) or an rpcrdma_mw
 * (fast-register / local-invalidate work requests); which one is
 * determined by the completion opcode. A NULL wr_id is a completion
 * nobody cares about (e.g. an unsignaled send) and is ignored.
 */
static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_mw *frmr;
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* completion with no associated context: ignore */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		/* Flushed/errored completion: mark the reply length invalid
		 * and hand it to the tasklet for cleanup, unless wr_id was
		 * actually an MW (FRMR/LOCAL_INV), which has no rep to put.
		 */
		dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
			__func__, wc->opcode, wc->status);
		rep->rr_len = ~0U;
		if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
			rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_FAST_REG_MR:
		/* wr_id is really an rpcrdma_mw for FRMR completions */
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_VALID;
		break;
	case IB_WC_LOCAL_INV:
		frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		frmr->r.frmr.state = FRMR_IS_INVALID;
		break;
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Harvest the server's credit grant from the RPC/RDMA
		 * header, clamping it to [1, rb_max_requests]. The 16-byte
		 * minimum ensures the rm_credit field is present.
		 */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
				(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through - both RECV and BIND_MW hand the rep to the
		 * tasklet for upcall processing */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}
209
210static inline int
211rpcrdma_cq_poll(struct ib_cq *cq)
212{
213 struct ib_wc wc;
214 int rc;
215
216 for (;;) {
217 rc = ib_poll_cq(cq, 1, &wc);
218 if (rc < 0) {
219 dprintk("RPC: %s: ib_poll_cq failed %i\n",
220 __func__, rc);
221 return rc;
222 }
223 if (rc == 0)
224 break;
225
226 rpcrdma_event_process(&wc);
227 }
228
229 return 0;
230}
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
/*
 * CQ completion upcall (interrupt context).
 *
 * Poll-then-rearm-then-poll: the second poll closes the race where a
 * completion arrives between draining the queue and re-requesting
 * notification, which would otherwise be lost until the next event.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_cq_poll(cq);
}
265
#ifdef RPC_DEBUG
/* Human-readable names for RDMA_CM_EVENT_* codes 0..11 (debug only).
 * Must stay in sync with enum rdma_cm_event_type ordering.
 */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif
282
/*
 * RDMA connection-manager event handler (rdma_create_id callback).
 *
 * Address/route resolution events complete ia->ri_done so the waiter in
 * rpcrdma_create_id() can proceed, with the outcome in ia->ri_async_rc.
 * Connection state events collapse into a connstate value (1 connected,
 * negative errno otherwise) which is published in ep->rep_connected
 * before notifying the transport (rep_func) and waking connect waiters.
 *
 * Always returns 0: destroying the cm_id from this callback is not
 * done here, so no event is reported as fatal to the CM.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/* Query negotiated RD-atomic depths for the debug report */
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		/* NOTE(review): "<= 11" assumes conn[] covers exactly events
		 * 0..11; keep in sync with the table above. The dprintk
		 * referencing conn[] compiles out when !RPC_DEBUG. */
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* Reset to a single credit until the server re-grants */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
380
/*
 * Create a cm_id and synchronously resolve the server's address and
 * route. Each resolve step is asynchronous: the CM upcall records the
 * outcome in ia->ri_async_rc and completes ia->ri_done, which we wait
 * on with a timeout. ri_async_rc is preset to -ETIMEDOUT so a timed-out
 * (or interrupted) wait reads as failure.
 *
 * Returns the cm_id on success, or an ERR_PTR; the partially set-up id
 * is destroyed on any failure after creation.
 */
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}
430
431
432
433
434static void
435rpcrdma_clean_cq(struct ib_cq *cq)
436{
437 struct ib_wc wc;
438 int count = 0;
439
440 while (1 == ib_poll_cq(cq, 1, &wc))
441 ++count;
442
443 if (count)
444 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
445 __func__, count, wc.opcode);
446}
447
448
449
450
451
452
453
454
455
456
/*
 * rpcrdma_ia_open - open an interface adapter for a transport
 * @xprt: transport instance being set up
 * @addr: server address to resolve
 * @memreg: requested memory registration strategy (RPCRDMA_* mode)
 *
 * Creates and resolves the cm_id, allocates a PD, then negotiates the
 * memory registration strategy: the requested mode is downgraded when
 * the adapter lacks the required capability (memory windows, FMR, or
 * FRMR/local-DMA-lkey support). Finally, modes that register all of
 * physical memory or use memory windows obtain a DMA MR with the
 * appropriate access rights.
 *
 * Returns 0 on success; a negative errno on failure, with the cm_id
 * destroyed. The chosen strategy is recorded in ia->ri_memreg_strategy.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/* Query device capabilities to validate the requested strategy */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	/* First pass: downgrade modes the adapter cannot support */
	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* FRMR needs both fast-register extensions and a DMA lkey */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/* Second pass: acquire a DMA MR where the (possibly downgraded)
	 * strategy needs one; on failure fall back to RPCRDMA_REGISTER
	 * rather than failing the open.
	 */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}
604
605
606
607
608
609
/*
 * rpcrdma_ia_close - tear down an interface adapter
 *
 * Releases, in dependency order, the DMA MR (if any), the QP and cm_id,
 * and finally the PD. Safe to call on a partially opened ia: every
 * resource is checked for NULL/IS_ERR before release.
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}
633
634
635
636
/*
 * rpcrdma_ep_create - size and create the completion queue for an endpoint
 *
 * Computes QP attribute limits from the mount parameters and device
 * capabilities (scaling send WRs for registration-heavy strategies:
 * FRMR issues up to 7 WRs per RPC, memory windows 2 per segment),
 * creates a single CQ shared by send and receive, arms it, and fills
 * in the connection parameters used later by rdma_connect().
 *
 * Returns 0 or a negative errno; the CQ is destroyed on late failure.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Each RPC may need up to 7 WRs (fast-register, chained
		 * sends, local invalidates); shrink max_requests if the
		 * scaled total exceeds the device limit.
		 */
		ep->rep_attr.cap.max_send_wr *= 7;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			cdata->max_requests = devattr.max_qp_wr / 7;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* bind/unbind WRs: 2 per segment, plus one extra */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion: half the SQ depth,
	 * reduced by the window slots for MW strategies; disabled (0)
	 * when the result would be trivially small */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/* One CQ serves both send and receive; sized for both plus one */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}
777
778
779
780
781
782
783
784
785
786
787
/*
 * rpcrdma_ep_destroy - tear down an endpoint
 *
 * Disconnects and destroys the QP (if any), releases the zero-padding
 * MR, then drains and destroys the CQ. Returns the ib_destroy_cq()
 * result; earlier failures are only logged.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}
819
820
821
822
/*
 * rpcrdma_ep_connect - connect (or reconnect) an endpoint
 *
 * On reconnect (rep_connected != 0) the old cm_id/QP are replaced with
 * freshly resolved ones; reconnecting across a device change is refused
 * with -ENETDOWN. After rdma_connect() the caller sleeps until the CM
 * upcall publishes the outcome in ep->rep_connected. Two retry paths
 * exist: plain -ECONNREFUSED (bounded by RDMA_CONNECT_RETRY_MAX), and a
 * workaround for peers that reject mismatched initiator/responder
 * depths, which retries once with equalized values.
 *
 * Returns 0 on success or a negative errno, also recorded in
 * ep->rep_connected on failure.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

	/* Theoretically a client initiator depth of 0 means the server
	 * cannot initiate RDMA Reads to us. Old Tavor HCAs misbehave at
	 * the default MTU, so clamp them to 1024.
	 * NOTE(review): strnicmp is the old kernel spelling of
	 * strncasecmp - verify against the target kernel version.
	 */
	if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
		struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
		if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
		    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
		     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
			struct ib_qp_attr attr = {
				.path_mtu = IB_MTU_1024
			};
			rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}
931
932
933
934
935
936
937
938
939
940
/*
 * rpcrdma_ep_disconnect - initiate and wait for a disconnect
 *
 * Drains stale completions, asks the CM to disconnect, and - if the
 * request was accepted - waits for the CM upcall to move
 * ep->rep_connected away from 1. Returns the rdma_disconnect() result;
 * a failure is also recorded in ep->rep_connected.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
							ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}
960
961
962
963
/*
 * rpcrdma_buffer_create - allocate the per-transport buffer pool
 *
 * Carves one kzalloc'd region into, in order: the rb_send_bufs pointer
 * array, the rb_recv_bufs pointer array, the inline-padding area, and
 * (strategy-dependent) the rpcrdma_mw array. MWs/FMRs/FRMRs are then
 * allocated and chained on rb_mws, and finally the per-request send and
 * reply buffers are allocated and registered. The pointer arithmetic
 * below must match the sizing arithmetic exactly.
 *
 * Returns 0 or a negative errno; on error the partially built pool is
 * released via rpcrdma_buffer_destroy().
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Total size of the pool: both pointer arrays, the padding
	 * area, and the MW array sized per registration strategy.
	 */
	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/* Register the zeroed padding buffer, if one was requested */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/* Allocate the MW/FMR/FRMR resources and chain them on rb_mws */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
							 RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC: %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl =
				ib_alloc_fast_reg_page_list(ia->ri_id->device,
							    RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC: %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/* Allocate and register the send and reply buffers for each
	 * request slot; only the struct header is zeroed, the payload
	 * area is left uninitialized.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE!
		 * Typically this is 4K for 1K writes, so round up. */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}
1165
1166
1167
1168
1169
1170
1171
/*
 * rpcrdma_buffer_destroy - release everything built by buffer_create
 *
 * Deregisters and frees the per-request reply and send buffers, frees
 * the MW/FMR/FRMR resources chained on rb_mws (done under the first
 * live send buffer, since the MW list exists only when sends do), and
 * finally frees the single pool allocation. Tolerates a partially
 * constructed pool: every array and element is NULL-checked.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4.  arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			/* Drain the MW free list the first time through */
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_FRMR:
					rc = ib_dereg_mr(r->r.frmr.fr_mr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dereg_mr"
							" failed %i\n",
							__func__, rc);
					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
					break;
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	kfree(buf->rb_pool);
}
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
/*
 * rpcrdma_buffer_get - take a send buffer (and resources) from the pool
 *
 * Returns a request buffer with a reply buffer pre-attached (unless
 * extra receives are already outstanding) and, for MW-based strategies,
 * one rpcrdma_mw hooked into each of the request's RPCRDMA_MAX_SEGS
 * segment slots. Returns NULL when the send-buffer array is exhausted.
 * All pool state is manipulated under rb_lock.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		/* Fill segment slots from the back; assumes the free list
		 * holds at least RPCRDMA_MAX_SEGS entries per request. */
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}
1288
1289
1290
1291
1292
/*
 * rpcrdma_buffer_put - return a send buffer and its resources to the pool
 *
 * Undoes rpcrdma_buffer_get(): re-stacks the request (and any attached
 * reply buffer), and for MW-based strategies returns all RPCRDMA_MAX_SEGS
 * per-segment rpcrdma_mw entries to rb_mws. Slot 0 is handled last so
 * its MW ends up at the tail, preserving cycling order. The caller must
 * have unmarshaled all chunks first (rl_nchunks == 0).
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
					&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
1336
1337
1338
1339
1340
1341
/*
 * Attach a reply buffer to @req if one is available, without taking a
 * send buffer. Used when draining extra outstanding receives.
 *
 * NOTE(review): when rl_iov.length == 0, @req is actually a padding
 * pseudo-request whose rl_buffer points at the owning request - the
 * double indirection below chases that; confirm against callers.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
1357
1358
1359
1360
1361
1362
/*
 * Return a reply buffer to its owning pool, clearing any stale upcall
 * pointer first. Called e.g. by the reply tasklet when no rr_func was
 * registered for a completed receive.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
1374
1375
1376
1377
1378
1379int
1380rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1381 struct ib_mr **mrp, struct ib_sge *iov)
1382{
1383 struct ib_phys_buf ipb;
1384 struct ib_mr *mr;
1385 int rc;
1386
1387
1388
1389
1390 iov->addr = ib_dma_map_single(ia->ri_id->device,
1391 va, len, DMA_BIDIRECTIONAL);
1392 iov->length = len;
1393
1394 if (ia->ri_have_dma_lkey) {
1395 *mrp = NULL;
1396 iov->lkey = ia->ri_dma_lkey;
1397 return 0;
1398 } else if (ia->ri_bind_mem != NULL) {
1399 *mrp = NULL;
1400 iov->lkey = ia->ri_bind_mem->lkey;
1401 return 0;
1402 }
1403
1404 ipb.addr = iov->addr;
1405 ipb.size = iov->length;
1406 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1407 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1408
1409 dprintk("RPC: %s: phys convert: 0x%llx "
1410 "registered 0x%llx length %d\n",
1411 __func__, (unsigned long long)ipb.addr,
1412 (unsigned long long)iov->addr, len);
1413
1414 if (IS_ERR(mr)) {
1415 *mrp = NULL;
1416 rc = PTR_ERR(mr);
1417 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1418 } else {
1419 *mrp = mr;
1420 iov->lkey = mr->lkey;
1421 rc = 0;
1422 }
1423
1424 return rc;
1425}
1426
1427int
1428rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429 struct ib_mr *mr, struct ib_sge *iov)
1430{
1431 int rc;
1432
1433 ib_dma_unmap_single(ia->ri_id->device,
1434 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1435
1436 if (NULL == mr)
1437 return 0;
1438
1439 rc = ib_dereg_mr(mr);
1440 if (rc)
1441 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1442 return rc;
1443}
1444
1445
1446
1447
1448
1449static void
1450rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1451{
1452 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1453 seg->mr_dmalen = seg->mr_len;
1454 if (seg->mr_page)
1455 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1456 seg->mr_page, offset_in_page(seg->mr_offset),
1457 seg->mr_dmalen, seg->mr_dir);
1458 else
1459 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1460 seg->mr_offset,
1461 seg->mr_dmalen, seg->mr_dir);
1462 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1464 __func__,
1465 (unsigned long long)seg->mr_dma,
1466 seg->mr_offset, seg->mr_dmalen);
1467 }
1468}
1469
1470static void
1471rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1472{
1473 if (seg->mr_page)
1474 ib_dma_unmap_page(ia->ri_id->device,
1475 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1476 else
1477 ib_dma_unmap_single(ia->ri_id->device,
1478 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1479}
1480
/*
 * Register up to *nsegs segments for RDMA with a fast-registration
 * memory region (FRMR) work request.  Consecutive page-aligned
 * segments are coalesced into one registration; if the FRMR is still
 * valid from a prior use, a LOCAL_INV WR is chained ahead of the
 * FAST_REG_MR WR.  On success seg1 (the first segment) carries the
 * rkey/base/length of the combined mapping and *nsegs is updated to
 * the number of segments actually registered.  Returns 0 or the
 * ib_post_send() error code.
 */
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;

	u8 key;
	int len, pageoff;
	int i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;			/* cancels the pageoff padded onto mr_len above */
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
		len += seg->mr_len;
		BUG_ON(seg->mr_len > PAGE_SIZE);
		++seg;
		++i;
		/* Check for holes - stop coalescing at the first segment
		 * that does not start or end on a page boundary. */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
		dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
			__func__,
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
		/* Invalidate before re-using: chain a LOCAL_INV WR in
		 * front of the FAST_REG_MR WR (filled in below). */
		memset(&invalidate_wr, 0, sizeof invalidate_wr);
		invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
		invalidate_wr.next = &frmr_wr;
		invalidate_wr.opcode = IB_WR_LOCAL_INV;
		invalidate_wr.send_flags = IB_SEND_SIGNALED;
		invalidate_wr.ex.invalidate_rkey =
			seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		DECR_CQCOUNT(&r_xprt->rx_ep);
		post_wr = &invalidate_wr;
	} else
		post_wr = &frmr_wr;

	/* Bump the low byte of the rkey so the new registration uses a
	 * fresh key, defeating stale remote access. */
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare the FAST_REG_MR work request. */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = IB_SEND_SIGNALED;
	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = i;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
	BUG_ON(frmr_wr.wr.fast_reg.length < len);
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);

	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		/* Undo the DMA mappings made in the loop above. */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1568
1569static int
1570rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1571 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1572{
1573 struct rpcrdma_mr_seg *seg1 = seg;
1574 struct ib_send_wr invalidate_wr, *bad_wr;
1575 int rc;
1576
1577 while (seg1->mr_nsegs--)
1578 rpcrdma_unmap_one(ia, seg++);
1579
1580 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1581 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1582 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1583 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1584 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1585 DECR_CQCOUNT(&r_xprt->rx_ep);
1586
1587 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1588 if (rc)
1589 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1590 " status %i\n", __func__, rc);
1591 return rc;
1592}
1593
/*
 * Register up to *nsegs segments with a pre-allocated FMR via
 * ib_map_phys_fmr().  Consecutive page-aligned segments are
 * coalesced into one mapping.  On success seg1 holds the combined
 * rkey/base/length and *nsegs is updated to the count mapped.
 * Returns 0 or the ib_map_phys_fmr() error code.
 */
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;			/* cancels the pageoff padded onto mr_len above */
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes - stop coalescing at the first segment
		 * that does not start or end on a page boundary. */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		/* Undo the DMA mappings made in the loop above. */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1637
1638static int
1639rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1640 struct rpcrdma_ia *ia)
1641{
1642 struct rpcrdma_mr_seg *seg1 = seg;
1643 LIST_HEAD(l);
1644 int rc;
1645
1646 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1647 rc = ib_unmap_fmr(&l);
1648 while (seg1->mr_nsegs--)
1649 rpcrdma_unmap_one(ia, seg++);
1650 if (rc)
1651 dprintk("RPC: %s: failed ib_unmap_fmr,"
1652 " status %i\n", __func__, rc);
1653 return rc;
1654}
1655
1656static int
1657rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1658 int *nsegs, int writing, struct rpcrdma_ia *ia,
1659 struct rpcrdma_xprt *r_xprt)
1660{
1661 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1662 IB_ACCESS_REMOTE_READ);
1663 struct ib_mw_bind param;
1664 int rc;
1665
1666 *nsegs = 1;
1667 rpcrdma_map_one(ia, seg, writing);
1668 param.mr = ia->ri_bind_mem;
1669 param.wr_id = 0ULL;
1670 param.addr = seg->mr_dma;
1671 param.length = seg->mr_len;
1672 param.send_flags = 0;
1673 param.mw_access_flags = mem_priv;
1674
1675 DECR_CQCOUNT(&r_xprt->rx_ep);
1676 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
1677 if (rc) {
1678 dprintk("RPC: %s: failed ib_bind_mw "
1679 "%u@0x%llx status %i\n",
1680 __func__, seg->mr_len,
1681 (unsigned long long)seg->mr_dma, rc);
1682 rpcrdma_unmap_one(ia, seg);
1683 } else {
1684 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1685 seg->mr_base = param.addr;
1686 seg->mr_nsegs = 1;
1687 }
1688 return rc;
1689}
1690
1691static int
1692rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1693 struct rpcrdma_ia *ia,
1694 struct rpcrdma_xprt *r_xprt, void **r)
1695{
1696 struct ib_mw_bind param;
1697 LIST_HEAD(l);
1698 int rc;
1699
1700 BUG_ON(seg->mr_nsegs != 1);
1701 param.mr = ia->ri_bind_mem;
1702 param.addr = 0ULL;
1703 param.length = 0;
1704 param.mw_access_flags = 0;
1705 if (*r) {
1706 param.wr_id = (u64) (unsigned long) *r;
1707 param.send_flags = IB_SEND_SIGNALED;
1708 INIT_CQCOUNT(&r_xprt->rx_ep);
1709 } else {
1710 param.wr_id = 0ULL;
1711 param.send_flags = 0;
1712 DECR_CQCOUNT(&r_xprt->rx_ep);
1713 }
1714 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
1715 rpcrdma_unmap_one(ia, seg);
1716 if (rc)
1717 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1718 " status %i\n", __func__, rc);
1719 else
1720 *r = NULL;
1721 return rc;
1722}
1723
/*
 * Default (per-call) registration: build a physical buffer list from
 * up to *nsegs coalesced page-aligned segments and register it with
 * ib_reg_phys_mr().  On success seg1 holds the new MR, rkey and
 * combined length; *nsegs is updated to the count registered.
 * Returns 0 or the ib_reg_phys_mr() error code.
 */
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Check for holes - stop coalescing at the first segment
		 * that does not start or end on a page boundary. */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		/* Undo the DMA mappings made in the loop above. */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1767
1768static int
1769rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1770 struct rpcrdma_ia *ia)
1771{
1772 struct rpcrdma_mr_seg *seg1 = seg;
1773 int rc;
1774
1775 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1776 seg1->mr_chunk.rl_mr = NULL;
1777 while (seg1->mr_nsegs--)
1778 rpcrdma_unmap_one(ia, seg++);
1779 if (rc)
1780 dprintk("RPC: %s: failed ib_dereg_mr,"
1781 " status %i\n", __func__, rc);
1782 return rc;
1783}
1784
/*
 * Register up to nsegs memory segments for RDMA, dispatching on the
 * interface's memory registration strategy.  Returns the number of
 * segments actually registered, or -1 if registration failed.
 */
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		/* All memory is pre-registered: just map one segment and
		 * expose the all-memory region's rkey. */
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using fast registration memory regions. */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using FMRs. */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory window binds. */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default: a fresh physical registration each time. */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}
1830
/*
 * Deregister the segments registered by rpcrdma_register_external(),
 * dispatching on the same memory registration strategy.  When r is a
 * reply buffer, its rr_func callback is run after the deregistration
 * has been initiated (for MEMWINDOWS, only if the unbind could not be
 * posted signaled).  Returns the original segment count.
 */
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* May clear r when the signaled unbind was posted, in which
		 * case the completion handler runs the callback instead. */
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		/* NOTE(review): func is invoked without a NULL check here,
		 * unlike the tasklet path — presumably callers always set
		 * rr_func when passing a reply; confirm. */
		func(rep);	/* dereg done, safe to run callback */
	}
	return nsegs;
}
1873
1874
1875
1876
1877
1878
/*
 * Post a SEND work request for an RPC call.  If a reply buffer is
 * attached, its receive WR is posted first so a receive is
 * outstanding before the request goes on the wire.  The send is
 * posted unsignaled until the endpoint's CQ credit count runs out,
 * at which point one signaled send replenishes it.
 * Returns 0 or the posting error code.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	/* NOTE(review): send_wr is only partially initialized; fields not
	 * assigned below keep stack garbage — presumably ignored for
	 * IB_WR_SEND, but confirm against the verbs provider. */
	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	/* Sync the optional 4th sge (present only for 4-iov sends)... */
	if (send_wr.num_sge == 4)
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	/* ...then the payload and header iovs, always present. */
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else {
		/* Credits exhausted: request a completion and reset. */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}
1925
1926
1927
1928
1929int
1930rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1931 struct rpcrdma_ep *ep,
1932 struct rpcrdma_rep *rep)
1933{
1934 struct ib_recv_wr recv_wr, *recv_wr_fail;
1935 int rc;
1936
1937 recv_wr.next = NULL;
1938 recv_wr.wr_id = (u64) (unsigned long) rep;
1939 recv_wr.sg_list = &rep->rr_iov;
1940 recv_wr.num_sge = 1;
1941
1942 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1943 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1944
1945 DECR_CQCOUNT(ep);
1946 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1947
1948 if (rc)
1949 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1950 rc);
1951 return rc;
1952}
1953