1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50#include <linux/pci.h>
51#include <linux/slab.h>
52
53#include "xprt_rdma.h"
54
55
56
57
58
59#ifdef RPC_DEBUG
60# define RPCDBG_FACILITY RPCDBG_TRANS
61#endif
62
63
64
65
66
67
68
69
70
71
72
/*
 * Global reply-dispatch state shared by all RPC/RDMA transport
 * instances: a list of rpcrdma_rep's awaiting tasklet processing,
 * protected by a single spinlock.
 */
static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);
75
/*
 * Tasklet body: drain rpcrdma_tasklets_g, invoking each reply's
 * rr_func handler, or recycling the reply buffer when no handler
 * was set. Runs in tasklet (softirq) context.
 */
static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	data = data;	/* self-assignment silences "unused parameter" */
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		/* Snapshot and clear rr_func under the lock so the
		 * handler runs at most once for this reply. */
		func = rep->rr_func;
		rep->rr_func = NULL;
		/* Drop the global lock while calling out: the handler
		 * (or rpcrdma_recv_buffer_put) takes other locks and may
		 * re-queue work onto this list. */
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}
102
/* Single global tasklet; scheduled from the CQ completion path. */
static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104
/*
 * Queue a received reply for deferred processing and kick the
 * global tasklet. Safe to call from the CQ upcall (interrupt)
 * context, hence the irqsave locking.
 */
static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}
115
116static void
117rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118{
119 struct rpcrdma_ep *ep = context;
120
121 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122 __func__, event->event, event->device->name, context);
123 if (ep->rep_connected == 1) {
124 ep->rep_connected = -EIO;
125 ep->rep_func(ep);
126 wake_up_all(&ep->rep_connect_wait);
127 }
128}
129
130static void
131rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132{
133 struct rpcrdma_ep *ep = context;
134
135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136 __func__, event->event, event->device->name, context);
137 if (ep->rep_connected == 1) {
138 ep->rep_connected = -EIO;
139 ep->rep_func(ep);
140 wake_up_all(&ep->rep_connect_wait);
141 }
142}
143
/*
 * Process one work completion. The wr_id carries the rpcrdma_rep
 * pointer for receives; a NULL wr_id (e.g. sends posted without a
 * context) is ignored. Failed completions flag the reply invalid
 * (rr_len = ~0U) and hand it to the tasklet for cleanup.
 */
static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* completion with no reply context -- nothing to do */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: %s WC status %X, connection lost\n",
			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
			 wc->status);
		rep->rr_len = ~0U;	/* mark reply invalid */
		rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		/* Make the received data visible to the CPU before
		 * parsing the inline header. */
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* If the message is large enough to hold an RPC/RDMA
		 * header, extract the server's credit grant and clamp
		 * it to [1, rb_max_requests]. */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock -- keep at least one credit */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through -- both receives and MW binds are
		 * completed by handing the rep to the tasklet */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}
200
201static inline int
202rpcrdma_cq_poll(struct ib_cq *cq)
203{
204 struct ib_wc wc;
205 int rc;
206
207 for (;;) {
208 rc = ib_poll_cq(cq, 1, &wc);
209 if (rc < 0) {
210 dprintk("RPC: %s: ib_poll_cq failed %i\n",
211 __func__, rc);
212 return rc;
213 }
214 if (rc == 0)
215 break;
216
217 rpcrdma_event_process(&wc);
218 }
219
220 return 0;
221}
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238static void
239rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
240{
241 int rc;
242
243 rc = rpcrdma_cq_poll(cq);
244 if (rc)
245 return;
246
247 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
248 if (rc) {
249 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
250 __func__, rc);
251 return;
252 }
253
254 rpcrdma_cq_poll(cq);
255}
256
#ifdef RPC_DEBUG
/*
 * Human-readable names for RDMA CM events, indexed by event number.
 * Order must match enum rdma_cm_event_type -- TODO confirm against
 * the rdma_cm header when the enum grows.
 */
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif
273
/*
 * RDMA CM event handler for the connection id.
 *
 * Address/route resolution events complete the ia->ri_done
 * completion that rpcrdma_create_id() waits on, recording the
 * outcome in ri_async_rc. Connection-state events set
 * ep->rep_connected (1 = connected, negative errno = failed/closed),
 * run the transport callback, and wake connect waiters.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/* Query the negotiated RD-atomic depths for debug output. */
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		/* NOTE(review): "11" hard-codes the last index of conn[];
		 * ARRAY_SIZE(conn) - 1 would track the table automatically. */
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		/* Reset to a single credit until the server grants more. */
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
					__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
371
/*
 * Create an RDMA CM id and synchronously resolve the server address
 * and route. Each resolve step is asynchronous: ri_async_rc is
 * pre-set to -ETIMEDOUT and the CM upcall overwrites it and signals
 * ri_done. If the interruptible wait is cut short by a signal, the
 * pre-set -ETIMEDOUT is what gets returned -- TODO confirm that is
 * the intended signal semantics.
 *
 * Returns the new id, or an ERR_PTR on failure (id destroyed).
 */
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	/* Step 1: resolve the remote address. */
	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	/* Step 2: resolve the route to that address. */
	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}
421
422
423
424
425static void
426rpcrdma_clean_cq(struct ib_cq *cq)
427{
428 struct ib_wc wc;
429 int count = 0;
430
431 while (1 == ib_poll_cq(cq, 1, &wc))
432 ++count;
433
434 if (count)
435 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
436 __func__, count, wc.opcode);
437}
438
439
440
441
442
443
444
445
446
447
/*
 * Open an interface adapter: create/resolve the CM id, allocate a
 * protection domain, probe device capabilities, and pick a workable
 * memory registration strategy.
 *
 * The requested 'memreg' mode is downgraded when the adapter lacks
 * the needed capability (memory windows, FMRs, or FRMR support),
 * falling back to RPCRDMA_ALLPHYSICAL if persistent registration is
 * compiled in, else to the slower RPCRDMA_REGISTER. Modes that need
 * a local DMA MR then allocate ia->ri_bind_mem.
 *
 * Returns 0 on success or a negative errno; on failure the CM id is
 * destroyed.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct ib_device_attr devattr;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/* Probe capabilities so the registration mode can be validated. */
	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out2;
	}

	/* Prefer the device-wide local DMA lkey when available; it
	 * avoids allocating a DMA MR for local buffers. */
	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
	}

	/* First pass: downgrade the requested strategy if the adapter
	 * cannot support it. */
	switch (memreg) {
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
		if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
			dprintk("RPC: %s: MEMWINDOWS registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		if (!ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: MTHCAFMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	case RPCRDMA_FRMR:
		/* FRMR needs both mem-mgt extensions and the local
		 * DMA lkey. */
		if ((devattr.device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
#if RPCRDMA_PERSISTENT_REGISTRATION
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using riskier RPCRDMA_ALLPHYSICAL\n",
				__func__);
			memreg = RPCRDMA_ALLPHYSICAL;
#else
			dprintk("RPC: %s: FRMR registration "
				"specified but not supported by adapter, "
				"using slower RPCRDMA_REGISTER\n",
				__func__);
			memreg = RPCRDMA_REGISTER;
#endif
		}
		break;
	}

	/* Second pass: modes needing a local DMA MR (for binding
	 * windows or mapping local buffers) allocate ri_bind_mem with
	 * the appropriate access rights. */
	switch (memreg) {
	case RPCRDMA_BOUNCEBUFFERS:
	case RPCRDMA_REGISTER:
	case RPCRDMA_FRMR:
		break;
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
#endif
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_MW_BIND;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		if (ia->ri_have_dma_lkey)
			break;	/* dma lkey suffices, no DMA MR needed */
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			/* DMA MR unavailable: fall back, not fatal */
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
		break;
	default:
		printk(KERN_ERR "%s: invalid memory registration mode %d\n",
				__func__, memreg);
		rc = -EINVAL;
		goto out2;
	}
	dprintk("RPC: %s: memory registration strategy is %d\n",
		__func__, memreg);

	/* Record the (possibly downgraded) strategy for later use. */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}
595
596
597
598
599
600
/*
 * Tear down an interface adapter: deregister the DMA MR, destroy the
 * QP and CM id, then deallocate the PD. Order matters -- the PD can
 * only be freed after everything allocated from it is gone. All
 * steps tolerate partially-initialized state (NULL/ERR_PTR checks).
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
}
624
625
626
627
/*
 * Create an unconnected endpoint: size the QP attributes from the
 * transport's request count and the device limits, create and arm a
 * single CQ shared by send and receive, and preload the connection
 * parameters used at connect time.
 *
 * Returns 0 or a negative errno (CQ destroyed on later failure).
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* Never ask for more WRs than the device supports. */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* no SRQ */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		/* Each request may need multiple fastreg/invalidate
		 * WRs in addition to the send itself; budget 7 send
		 * WRs per request, shrinking max_requests if the
		 * device cannot cover that. */
		ep->rep_attr.cap.max_send_wr *= 7;
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
			cdata->max_requests = devattr.max_qp_wr / 7;
			if (!cdata->max_requests)
				return -EINVAL;
			ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Memory windows need bind/unbind WRs per segment. */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	/* 4 send SGEs when inter-chunk padding is in use, else 2. */
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* Request signaled send completions only every rep_cqinit
	 * sends, to cut interrupt load; disabled for small queues. */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 ;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/* Single CQ shared by sends and receives; sized for both
	 * queues plus one slack entry. */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* CM connection parameters used by rpcrdma_ep_connect(). */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client-side: offer responder resources, no initiator depth. */
	ep->rep_remote_cma.initiator_depth = 0;
	if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
		ep->rep_remote_cma.responder_resources = 0;
	else if (devattr.max_qp_rd_atom > 32)	/* arbitrary but plenty */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}
768
769
770
771
772
773
774
775
776
777
778
/*
 * Destroy an endpoint: disconnect and destroy the QP if present,
 * release the padding MR, then flush and destroy the CQ. Returns
 * the result of ib_destroy_cq() (errors are logged, not fatal).
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	/* Padding buffer registered by rpcrdma_buffer_create(). */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}
810
811
812
813
/*
 * Connect (or reconnect) an unconnected endpoint.
 *
 * On reconnect, the old id/QP are torn down and a fresh CM id is
 * resolved -- which must land on the same device, since all MRs and
 * the PD belong to it. After rdma_connect() the caller sleeps until
 * the CM upcall sets rep_connected. Rejections are retried a bounded
 * number of times, once with adjusted responder resources.
 *
 * Returns 0 when connected, else a negative errno (also recorded in
 * ep->rep_connected).
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		/* Reconnect path: discard the old connection state. */
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}

		/* The PD and MRs are tied to the original device; a
		 * new id that resolved to a different device cannot
		 * reuse them. */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END COPY */
		rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

	/* Workaround: Mellanox Tavor HCAs misbehave at MTUs above
	 * 1024 -- clamp the path MTU on those devices. */
if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
	struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
	if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
	    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
	     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
		struct ib_qp_attr attr = {
			.path_mtu = IB_MTU_1024
		};
		rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
	}
}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	/* The CM upcall sets rep_connected and wakes us. */
	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/* A rejection may just mean the server restarted; retry a
	 * bounded number of times. */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Some peers reject mismatched initiator/responder
		 * depths; retry once with them made equal (and
		 * nonzero). */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}
922
923
924
925
926
927
928
929
930
931
932int
933rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
934{
935 int rc;
936
937 rpcrdma_clean_cq(ep->rep_cq);
938 rc = rdma_disconnect(ia->ri_id);
939 if (!rc) {
940
941 wait_event_interruptible(ep->rep_connect_wait,
942 ep->rep_connected != 1);
943 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
944 (ep->rep_connected == 1) ? "still " : "dis");
945 } else {
946 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
947 ep->rep_connected = rc;
948 }
949 return rc;
950}
951
952
953
954
/*
 * Allocate the transport's buffer pool: one kzalloc'd slab carved
 * into the send/recv pointer arrays, the optional padding area, and
 * the memory-registration tracking structures (FRMRs, FMRs, or
 * memory windows, per strategy), followed by the per-request send
 * and reply buffers themselves, each registered for DMA.
 *
 * On any failure, rpcrdma_buffer_destroy() unwinds whatever was
 * allocated so far. Returns 0 or a negative errno.
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;
	struct rpcrdma_mw *r;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);	/* one credit until granted */

	/* Size the single slab: pointer arrays for sends and
	 * receives, the padding area, and the MW tracking structs
	 * required by the registration strategy. */
	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MTHCAFMR:
		/* one extra request's worth, for the padding */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* Allocate and carve up the slab. */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* freed by rpcrdma_buffer_destroy() */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/* Register the zeroed padding area used between write chunks,
	 * if the mount asked for padding. */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/* Populate the free list of registration objects for the
	 * chosen strategy; requests pull from rb_mws as needed. */
	INIT_LIST_HEAD(&buf->rb_mws);
	r = (struct rpcrdma_mw *)p;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
		for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
							 RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_mr)) {
				rc = PTR_ERR(r->r.frmr.fr_mr);
				dprintk("RPC: %s: ib_alloc_fast_reg_mr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			r->r.frmr.fr_pgl =
				ib_alloc_fast_reg_page_list(ia->ri_id->device,
							    RPCRDMA_MAX_SEGS);
			if (IS_ERR(r->r.frmr.fr_pgl)) {
				rc = PTR_ERR(r->r.frmr.fr_pgl);
				dprintk("RPC: %s: "
					"ib_alloc_fast_reg_page_list "
					"failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MTHCAFMR:
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			static struct ib_fmr_attr fa =
				{ RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		break;
	default:
		break;
	}

	/* Allocate and DMA-register the actual send and reply
	 * buffers, one pair per request slot. */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* minimum request size -- presumably chosen to cover
		 * header expansion; TODO confirm rationale for 4096 */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		/* only the struct header needs zeroing */
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}
1156
1157
1158
1159
1160
1161
1162
/*
 * Free everything rpcrdma_buffer_create() allocated, in reverse:
 * deregister and free each reply and request buffer, release the
 * registration objects on rb_mws (per strategy), then free the
 * backing slab. Tolerates partially-built pools (NULL checks), so
 * it doubles as the error-unwind path.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	struct rpcrdma_mw *r;

	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			/* Drain the MW free list; note this runs
			 * inside the per-request loop but empties the
			 * whole list on the first pass. */
			while (!list_empty(&buf->rb_mws)) {
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_FRMR:
					rc = ib_dereg_mr(r->r.frmr.fr_mr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dereg_mr"
							" failed %i\n",
							__func__, rc);
					ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
					break;
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	/* The pointer arrays, padding, and MW structs all live here. */
	kfree(buf->rb_pool);
}
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
/*
 * Take a request buffer (and, when available, a paired reply buffer)
 * off the pool, plus a full complement of registration objects from
 * rb_mws. Returns NULL when the pool is exhausted.
 *
 * NOTE(review): the do/while assumes rb_mws holds at least
 * RPCRDMA_MAX_SEGS entries whenever it is non-empty; list_entry on
 * an emptied list would be a bug -- confirm the pool sizing in
 * rpcrdma_buffer_create() guarantees this.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;
	int i;
	struct rpcrdma_mw *r;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		/* More receives than sends outstanding: no reply
		 * buffer to pair with this request. */
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	/* Attach one registration object per possible segment. */
	if (!list_empty(&buffers->rb_mws)) {
		i = RPCRDMA_MAX_SEGS - 1;
		do {
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}
1279
1280
1281
1282
1283
/*
 * Return a request buffer (and its reply buffer and registration
 * objects, if attached) to the pool. The request must have no
 * registered chunks outstanding (rl_nchunks == 0).
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_FRMR:
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Return every per-segment registration object to
		 * rb_mws; segments 1..MAX-1 first, then segment 0
		 * last (mirrors the fill order in buffer_get). */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
					&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
1327
1328
1329
1330
1331
1332
/*
 * Attach a fresh reply buffer to a request that lost its pairing.
 *
 * NOTE(review): when rl_iov.length == 0 this 'req' appears to be a
 * zero-pad pseudo-request whose rl_buffer field actually points at
 * the owning rpcrdma_req (hence the cast) -- confirm against the
 * callers before relying on this.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
1348
1349
1350
1351
1352
1353
1354void
1355rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1356{
1357 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1358 unsigned long flags;
1359
1360 rep->rr_func = NULL;
1361 spin_lock_irqsave(&buffers->rb_lock, flags);
1362 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1363 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1364}
1365
1366
1367
1368
1369
/*
 * DMA-map a kernel buffer and obtain an lkey for it, filling in
 * *iov. Three cases, cheapest first: the device-wide DMA lkey, the
 * pre-allocated DMA MR's lkey (both set *mrp = NULL), or a per-
 * buffer physical registration (returned in *mrp for later
 * deregistration). Returns 0 or a negative errno.
 */
int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/* Map the buffer for DMA in both directions. */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	/* Slow path: register the single physical segment. */
	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}
1417
1418int
1419rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1420 struct ib_mr *mr, struct ib_sge *iov)
1421{
1422 int rc;
1423
1424 ib_dma_unmap_single(ia->ri_id->device,
1425 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1426
1427 if (NULL == mr)
1428 return 0;
1429
1430 rc = ib_dereg_mr(mr);
1431 if (rc)
1432 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1433 return rc;
1434}
1435
1436
1437
1438
1439
1440static void
1441rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1442{
1443 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1444 seg->mr_dmalen = seg->mr_len;
1445 if (seg->mr_page)
1446 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1447 seg->mr_page, offset_in_page(seg->mr_offset),
1448 seg->mr_dmalen, seg->mr_dir);
1449 else
1450 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1451 seg->mr_offset,
1452 seg->mr_dmalen, seg->mr_dir);
1453}
1454
1455static void
1456rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1457{
1458 if (seg->mr_page)
1459 ib_dma_unmap_page(ia->ri_id->device,
1460 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1461 else
1462 ib_dma_unmap_single(ia->ri_id->device,
1463 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1464}
1465
/*
 * rpcrdma_register_frmr_external - register a chunk via fast registration MR
 * @seg: first segment of the chunk
 * @nsegs: IN: max segments to attempt; OUT: segments actually registered
 * @writing: non-zero if the remote peer will write into this memory
 * @ia: interface adapter owning the QP
 * @r_xprt: transport, for send-queue completion accounting
 *
 * DMA-maps consecutive page-aligned segments into the FRMR page list,
 * then posts an IB_WR_FAST_REG_MR work request advertising them under a
 * freshly incremented rkey. On post failure all mappings are undone.
 * Returns 0 or the ib_post_send() error code.
 */
static int
rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia,
			struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_send_wr frmr_wr, *bad_wr;
	u8 key;
	int len, pageoff;
	int i, rc;

	/* Round the first segment down to a page boundary; len starts
	 * negative so adding the first mr_len cancels the pageoff. */
	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Stop at the first discontiguity: the next segment does not
		 * start on a page boundary, or this one ends mid-page. */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	dprintk("RPC: %s: Using frmr %p to map %d segments\n",
		__func__, seg1->mr_chunk.rl_mw, i);

	/* Bump the key so this registration is distinguishable from the
	 * MR's previous use (low byte of the rkey is the key portion). */
	key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
	ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);

	/* Prepare FRMR WR. NOTE(review): fast_reg.length is computed as
	 * i << PAGE_SHIFT rather than "len" — presumably intentional
	 * (whole pages are advertised); confirm against protocol use. */
	memset(&frmr_wr, 0, sizeof frmr_wr);
	frmr_wr.opcode = IB_WR_FAST_REG_MR;
	frmr_wr.send_flags = 0;			/* unsignaled */
	frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
	frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
	frmr_wr.wr.fast_reg.page_list_len = i;
	frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
	frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
	frmr_wr.wr.fast_reg.access_flags = (writing ?
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
				IB_ACCESS_REMOTE_READ);
	frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);

	if (rc) {
		dprintk("RPC: %s: failed ib_post_send for register,"
			" status %i\n", __func__, rc);
		/* Unwind the DMA mappings made above */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1532
1533static int
1534rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1535 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1536{
1537 struct rpcrdma_mr_seg *seg1 = seg;
1538 struct ib_send_wr invalidate_wr, *bad_wr;
1539 int rc;
1540
1541 while (seg1->mr_nsegs--)
1542 rpcrdma_unmap_one(ia, seg++);
1543
1544 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1545 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1546 invalidate_wr.send_flags = 0;
1547 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1548 DECR_CQCOUNT(&r_xprt->rx_ep);
1549
1550 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1551 if (rc)
1552 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1553 " status %i\n", __func__, rc);
1554 return rc;
1555}
1556
/*
 * rpcrdma_register_fmr_external - register a chunk with an FMR
 * @seg: first segment of the chunk
 * @nsegs: IN: max segments to attempt; OUT: segments actually registered
 * @writing: non-zero if the remote peer will write into this memory
 * @ia: interface adapter
 *
 * DMA-maps consecutive page-aligned segments, collects their physical
 * addresses, and maps them all through one FMR. On failure all segment
 * mappings are undone. Returns 0 or the ib_map_phys_fmr() error.
 */
static int
rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	struct rpcrdma_mr_seg *seg1 = seg;
	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
	int len, pageoff, i, rc;

	/* Round the first segment down to a page boundary; len starts
	 * negative so adding the first mr_len cancels the pageoff. */
	pageoff = offset_in_page(seg1->mr_offset);
	seg1->mr_offset -= pageoff;	/* start of page */
	seg1->mr_len += pageoff;
	len = -pageoff;
	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		physaddrs[i] = seg->mr_dma;
		len += seg->mr_len;
		++seg;
		++i;
		/* Stop at the first discontiguity: the next segment does not
		 * start on a page boundary, or this one ends mid-page. */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
			break;
	}
	rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
				physaddrs, i, seg1->mr_dma);
	if (rc) {
		dprintk("RPC: %s: failed ib_map_phys_fmr "
			"%u@0x%llx+%i (%d)... status %i\n", __func__,
			len, (unsigned long long)seg1->mr_dma,
			pageoff, i, rc);
		/* Unwind the DMA mappings made above */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
		seg1->mr_base = seg1->mr_dma + pageoff;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1600
1601static int
1602rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1603 struct rpcrdma_ia *ia)
1604{
1605 struct rpcrdma_mr_seg *seg1 = seg;
1606 LIST_HEAD(l);
1607 int rc;
1608
1609 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1610 rc = ib_unmap_fmr(&l);
1611 while (seg1->mr_nsegs--)
1612 rpcrdma_unmap_one(ia, seg++);
1613 if (rc)
1614 dprintk("RPC: %s: failed ib_unmap_fmr,"
1615 " status %i\n", __func__, rc);
1616 return rc;
1617}
1618
1619static int
1620rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1621 int *nsegs, int writing, struct rpcrdma_ia *ia,
1622 struct rpcrdma_xprt *r_xprt)
1623{
1624 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1625 IB_ACCESS_REMOTE_READ);
1626 struct ib_mw_bind param;
1627 int rc;
1628
1629 *nsegs = 1;
1630 rpcrdma_map_one(ia, seg, writing);
1631 param.mr = ia->ri_bind_mem;
1632 param.wr_id = 0ULL;
1633 param.addr = seg->mr_dma;
1634 param.length = seg->mr_len;
1635 param.send_flags = 0;
1636 param.mw_access_flags = mem_priv;
1637
1638 DECR_CQCOUNT(&r_xprt->rx_ep);
1639 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
1640 if (rc) {
1641 dprintk("RPC: %s: failed ib_bind_mw "
1642 "%u@0x%llx status %i\n",
1643 __func__, seg->mr_len,
1644 (unsigned long long)seg->mr_dma, rc);
1645 rpcrdma_unmap_one(ia, seg);
1646 } else {
1647 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1648 seg->mr_base = param.addr;
1649 seg->mr_nsegs = 1;
1650 }
1651 return rc;
1652}
1653
1654static int
1655rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1656 struct rpcrdma_ia *ia,
1657 struct rpcrdma_xprt *r_xprt, void **r)
1658{
1659 struct ib_mw_bind param;
1660 LIST_HEAD(l);
1661 int rc;
1662
1663 BUG_ON(seg->mr_nsegs != 1);
1664 param.mr = ia->ri_bind_mem;
1665 param.addr = 0ULL;
1666 param.length = 0;
1667 param.mw_access_flags = 0;
1668 if (*r) {
1669 param.wr_id = (u64) (unsigned long) *r;
1670 param.send_flags = IB_SEND_SIGNALED;
1671 INIT_CQCOUNT(&r_xprt->rx_ep);
1672 } else {
1673 param.wr_id = 0ULL;
1674 param.send_flags = 0;
1675 DECR_CQCOUNT(&r_xprt->rx_ep);
1676 }
1677 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m);
1678 rpcrdma_unmap_one(ia, seg);
1679 if (rc)
1680 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1681 " status %i\n", __func__, rc);
1682 else
1683 *r = NULL;
1684 return rc;
1685}
1686
/*
 * rpcrdma_register_default_external - register a chunk with a phys MR
 * @seg: first segment of the chunk
 * @nsegs: IN: max segments to attempt; OUT: segments actually registered
 * @writing: non-zero grants REMOTE_WRITE, else REMOTE_READ
 * @ia: interface adapter providing the PD
 *
 * Fallback strategy: DMA-map consecutive page-aligned segments and
 * register them as one physical MR. On failure all segment mappings
 * are undone. Returns 0 or the ib_reg_phys_mr() error.
 */
static int
rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
			int *nsegs, int writing, struct rpcrdma_ia *ia)
{
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
	int len, i, rc = 0;

	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
		*nsegs = RPCRDMA_MAX_DATA_SEGS;
	for (len = 0, i = 0; i < *nsegs;) {
		rpcrdma_map_one(ia, seg, writing);
		ipb[i].addr = seg->mr_dma;
		ipb[i].size = seg->mr_len;
		len += seg->mr_len;
		++seg;
		++i;
		/* Stop at the first discontiguity: the next segment does not
		 * start on a page boundary, or this one ends mid-page. */
		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
		    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
			break;
	}
	/* ib_reg_phys_mr() may adjust mr_base (iova) in place */
	seg1->mr_base = seg1->mr_dma;
	seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
				ipb, i, mem_priv, &seg1->mr_base);
	if (IS_ERR(seg1->mr_chunk.rl_mr)) {
		rc = PTR_ERR(seg1->mr_chunk.rl_mr);
		dprintk("RPC: %s: failed ib_reg_phys_mr "
			"%u@0x%llx (%d)... status %i\n",
			__func__, len,
			(unsigned long long)seg1->mr_dma, i, rc);
		/* Unwind the DMA mappings made above */
		while (i--)
			rpcrdma_unmap_one(ia, --seg);
	} else {
		seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
		seg1->mr_nsegs = i;
		seg1->mr_len = len;
	}
	*nsegs = i;
	return rc;
}
1730
1731static int
1732rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1733 struct rpcrdma_ia *ia)
1734{
1735 struct rpcrdma_mr_seg *seg1 = seg;
1736 int rc;
1737
1738 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1739 seg1->mr_chunk.rl_mr = NULL;
1740 while (seg1->mr_nsegs--)
1741 rpcrdma_unmap_one(ia, seg++);
1742 if (rc)
1743 dprintk("RPC: %s: failed ib_dereg_mr,"
1744 " status %i\n", __func__, rc);
1745 return rc;
1746}
1747
/*
 * rpcrdma_register_external - register a chunk using the active strategy
 * @seg: first segment of the chunk
 * @nsegs: maximum number of segments to register
 * @writing: non-zero if the remote peer will write into this memory
 * @r_xprt: transport (selects the memreg strategy and QP)
 *
 * Dispatches to the strategy chosen at mount/connect time. Returns the
 * number of segments actually registered, or -1 on any failure.
 */
int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		/* All memory is pre-registered; just map and expose
		 * one segment under the persistent rkey. */
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using frmr registration */
	case RPCRDMA_FRMR:
		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Registration using fmr memory registration */
	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
		break;

	/* Default registration each time */
	default:
		rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}
1793
/*
 * rpcrdma_deregister_external - undo rpcrdma_register_external()
 * @seg: first segment of the registered chunk
 * @r_xprt: transport (selects the memreg strategy and QP)
 * @r: optional reply context; for the memwin strategies it rides the
 *     unbind completion, otherwise its rr_func callback is run here
 *
 * Returns the number of segments that had been registered.
 */
int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_FRMR:
		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
		break;

	case RPCRDMA_MTHCAFMR:
		rc = rpcrdma_deregister_fmr_external(seg, ia);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* On success *r is cleared and the completion handler
		 * runs the callback instead of the code below. */
		rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
		break;

	default:
		rc = rpcrdma_deregister_default_external(seg, ia);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		/* NOTE(review): func is called without a NULL check,
		 * unlike the tasklet path which falls back to
		 * rpcrdma_recv_buffer_put() — presumably callers always
		 * set rr_func here; confirm. */
		func(rep);
	}
	return nsegs;
}
1836
1837
1838
1839
1840
1841
/*
 * rpcrdma_ep_post - post an RPC request for sending
 * @ia: interface adapter owning the QP
 * @ep: endpoint, for completion-signaling accounting
 * @req: request with rl_send_iov[] already built
 *
 * If a reply buffer is attached, post the matching RECV first so the
 * reply cannot arrive with nowhere to land. Then DMA-sync the send
 * iovecs to the device and post a SEND WR, signaling a completion only
 * when the CQ counter runs out. Returns 0 or a posting error.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;	/* receive side now owns it */
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	/* NOTE(review): iovs 0, 1, and (when present) 3 are synced but
	 * iov[2] is not — presumably it covers separately-registered or
	 * constant data; confirm against the marshaling code. */
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	/* Only request a completion every so often, to keep the send CQ
	 * from overflowing while still reclaiming send-queue slots. */
	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else {
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}
1888
1889
1890
1891
/*
 * rpcrdma_ep_post_recv - post a single receive buffer
 * @ia: interface adapter owning the QP
 * @ep: endpoint, for completion accounting
 * @rep: reply buffer; its pointer doubles as the wr_id cookie so the
 *       completion handler can find it again
 *
 * Syncs the buffer for CPU access before handing it to the HCA.
 * Returns 0 or the ib_post_recv() error code.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;	/* cookie for upcall */
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}
1916