#include <linux/spinlock.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#include <linux/sunrpc/xdr.h>
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

#define RPCDBG_FACILITY		RPCDBG_SVCXPRT

static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);

static inline struct svc_rdma_recv_ctxt *
svc_rdma_next_recv_ctxt(struct list_head *list)
{
        return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt,
                                        rc_list);
}

static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_recv_ctxt *ctxt;
        dma_addr_t addr;
        void *buffer;

        ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
        if (!ctxt)
                goto fail0;
        buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
        if (!buffer)
                goto fail1;
        addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
                                 rdma->sc_max_req_size, DMA_FROM_DEVICE);
        if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
                goto fail2;

        ctxt->rc_recv_wr.next = NULL;
        ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
        ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
        ctxt->rc_recv_wr.num_sge = 1;
        ctxt->rc_cqe.done = svc_rdma_wc_receive;
        ctxt->rc_recv_sge.addr = addr;
        ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
        ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
        ctxt->rc_recv_buf = buffer;
        ctxt->rc_temp = false;
        return ctxt;

fail2:
        kfree(buffer);
fail1:
        kfree(ctxt);
fail0:
        return NULL;
}

static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
                                       struct svc_rdma_recv_ctxt *ctxt)
{
        ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
                            ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
        kfree(ctxt->rc_recv_buf);
        kfree(ctxt);
}
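/**
 * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt
 * @rdma: svcxprt_rdma being torn down
 *
 */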
void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_recv_ctxt *ctxt;
        struct llist_node *node;

        while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
                ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
                svc_rdma_recv_ctxt_destroy(rdma, ctxt);
        }
}

static struct svc_rdma_recv_ctxt *
svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_recv_ctxt *ctxt;
        struct llist_node *node;

        node = llist_del_first(&rdma->sc_recv_ctxts);
        if (!node)
                goto out_empty;
        ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);

out:
        ctxt->rc_page_count = 0;
        return ctxt;

out_empty:
        ctxt = svc_rdma_recv_ctxt_alloc(rdma);
        if (!ctxt)
                return NULL;
        goto out;
}
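/**
 * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list
 * @rdma: controlling svcxprt_rdma
 * @ctxt: object to return to the free list
 *
 * Pages left in rc_pages are released. The ctxt is then either
 * pushed back onto sc_recv_ctxts for reuse or, if it was marked
 * temporary, destroyed outright.
 */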
void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
                            struct svc_rdma_recv_ctxt *ctxt)
{
        unsigned int i;

        for (i = 0; i < ctxt->rc_page_count; i++)
                put_page(ctxt->rc_pages[i]);

        if (!ctxt->rc_temp)
                llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
        else
                svc_rdma_recv_ctxt_destroy(rdma, ctxt);
}

static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
                                struct svc_rdma_recv_ctxt *ctxt)
{
        int ret;

        svc_xprt_get(&rdma->sc_xprt);
        ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
        trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret);
        if (ret)
                goto err_post;
        return 0;

err_post:
        svc_rdma_recv_ctxt_put(rdma, ctxt);
        svc_xprt_put(&rdma->sc_xprt);
        return ret;
}

static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_recv_ctxt *ctxt;

        ctxt = svc_rdma_recv_ctxt_get(rdma);
        if (!ctxt)
                return -ENOMEM;
        return __svc_rdma_post_recv(rdma, ctxt);
}
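/**
 * svc_rdma_post_recvs - Post initial set of Recv WRs
 * @rdma: fresh svcxprt_rdma
 *
 * Returns true if successful, otherwise false.
 */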
bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_recv_ctxt *ctxt;
        unsigned int i;
        int ret;

        for (i = 0; i < rdma->sc_max_requests; i++) {
                ctxt = svc_rdma_recv_ctxt_get(rdma);
                if (!ctxt)
                        return false;
                ctxt->rc_temp = true;
                ret = __svc_rdma_post_recv(rdma, ctxt);
                if (ret)
                        return false;
        }
        return true;
}
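/**
 * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
 * @cq: Completion Queue context
 * @wc: Work Completion object
 *
 * NB: The svc_xprt/svcxprt_rdma is pinned (see __svc_rdma_post_recv)
 * whenever it's possible that this completion handler could run.
 */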
static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
        struct svcxprt_rdma *rdma = cq->cq_context;
        struct ib_cqe *cqe = wc->wr_cqe;
        struct svc_rdma_recv_ctxt *ctxt;

        trace_svcrdma_wc_receive(wc);

        /* WARNING: Only wc->wr_cqe and wc->status are reliable here */
        ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);

        if (wc->status != IB_WC_SUCCESS)
                goto flushed;

        if (svc_rdma_post_recv(rdma))
                goto post_err;

        /* All wc fields are now known to be valid */
        ctxt->rc_byte_len = wc->byte_len;
        ib_dma_sync_single_for_cpu(rdma->sc_pd->device,
                                   ctxt->rc_recv_sge.addr,
                                   wc->byte_len, DMA_FROM_DEVICE);

        spin_lock(&rdma->sc_rq_dto_lock);
        list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q);
        set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
        spin_unlock(&rdma->sc_rq_dto_lock);
        if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
                svc_xprt_enqueue(&rdma->sc_xprt);
        goto out;

flushed:
post_err:
        svc_rdma_recv_ctxt_put(rdma, ctxt);
        set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
        svc_xprt_enqueue(&rdma->sc_xprt);
out:
        svc_xprt_put(&rdma->sc_xprt);
}
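/**
 * svc_rdma_flush_recv_queues - Drain pending Receive work
 * @rdma: svcxprt_rdma being shut down
 *
 */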
void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
{
        struct svc_rdma_recv_ctxt *ctxt;

        while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
                list_del(&ctxt->rc_list);
                svc_rdma_recv_ctxt_put(rdma, ctxt);
        }
        while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
                list_del(&ctxt->rc_list);
                svc_rdma_recv_ctxt_put(rdma, ctxt);
        }
}

static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
                                   struct svc_rdma_recv_ctxt *ctxt)
{
        struct xdr_buf *arg = &rqstp->rq_arg;

        arg->head[0].iov_base = ctxt->rc_recv_buf;
        arg->head[0].iov_len = ctxt->rc_byte_len;
        arg->tail[0].iov_base = NULL;
        arg->tail[0].iov_len = 0;
        arg->page_len = 0;
        arg->page_base = 0;
        arg->buflen = ctxt->rc_byte_len;
        arg->len = ctxt->rc_byte_len;
}
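/* This accommodates the largest possible Write chunk,
 * in one segment.
 */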
#define MAX_BYTES_WRITE_SEG	((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
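/* This accommodates the largest possible Position-Zero
 * and normal Read chunk.
 */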
#define MAX_BYTES_SPECIAL_SEG	((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
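/* Sanity check the Read list.
 *
 * Sanity checks:
 * - All Read segments must carry the same position.
 * - Each segment's size is bounded by MAX_BYTES_SPECIAL_SEG.
 * - The list must not run past the end of the received header.
 *
 * Returns pointer to the following Write list, or NULL if the
 * Read list is invalid.
 */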
static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end)
{
        u32 position;
        bool first;

        first = true;
        while (*p++ != xdr_zero) {
                if (first) {
                        position = be32_to_cpup(p++);
                        first = false;
                } else if (be32_to_cpup(p++) != position) {
                        return NULL;
                }
                p++;		/* handle */
                if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG)
                        return NULL;
                p += 2;		/* offset */

                if (p > end)
                        return NULL;
        }
        return p;
}
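/* Sanity check a single Write chunk: each segment's size must not
 * exceed @maxlen, and the chunk must not run past the end of the
 * received header.
 *
 * Returns pointer to the first word following the chunk, or NULL
 * if the chunk is invalid.
 */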
static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end,
                                     u32 maxlen)
{
        u32 i, segcount;

        segcount = be32_to_cpup(p++);
        for (i = 0; i < segcount; i++) {
                p++;		/* handle */
                if (be32_to_cpup(p++) > maxlen)
                        return NULL;
                p += 2;		/* offset */

                if (p > end)
                        return NULL;
        }

        return p;
}
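/* Sanity check the Write list.
 *
 * Implementation limits:
 * - This implementation accepts no more than two Write chunks.
 *
 * Sanity checks:
 * - Write list does not overflow the received header.
 * - Segment size is bounded by MAX_BYTES_WRITE_SEG.
 *
 * Returns pointer to the following Reply chunk, or NULL if the
 * Write list is invalid.
 */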
static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end)
{
        u32 chcount;

        chcount = 0;
        while (*p++ != xdr_zero) {
                p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG);
                if (!p)
                        return NULL;
                if (chcount++ > 1)
                        return NULL;
        }
        return p;
}
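/* Sanity check the Reply chunk.
 *
 * Sanity checks:
 * - Reply chunk does not overflow the received header.
 * - Segment size is bounded by MAX_BYTES_SPECIAL_SEG.
 *
 * Returns pointer to the following RPC header, or NULL if the
 * Reply chunk is invalid.
 */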
static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end)
{
        if (*p++ != xdr_zero) {
                p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG);
                if (!p)
                        return NULL;
        }
        return p;
}
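/* RPC-over-RDMA Version One private extension: Remote Invalidation.
 * Responder's choice: requester signals it can handle Send With
 * Invalidate, and responder chooses one R_key to invalidate.
 *
 * If there is exactly one distinct R_key in the received transport
 * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
 *
 * Perform this operation while the received transport header is
 * still in the CPU cache.
 */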
static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
                                  struct svc_rdma_recv_ctxt *ctxt)
{
        __be32 inv_rkey, *p;
        u32 i, segcount;

        ctxt->rc_inv_rkey = 0;

        if (!rdma->sc_snd_w_inv)
                return;

        inv_rkey = xdr_zero;
        p = ctxt->rc_recv_buf;
        p += rpcrdma_fixed_maxsz;

        /* Read list */
        while (*p++ != xdr_zero) {
                p++;	/* position */
                if (inv_rkey == xdr_zero)
                        inv_rkey = *p;
                else if (inv_rkey != *p)
                        return;
                p += 4;
        }

        /* Write list */
        while (*p++ != xdr_zero) {
                segcount = be32_to_cpup(p++);
                for (i = 0; i < segcount; i++) {
                        if (inv_rkey == xdr_zero)
                                inv_rkey = *p;
                        else if (inv_rkey != *p)
                                return;
                        p += 4;
                }
        }

        /* Reply chunk */
        if (*p++ != xdr_zero) {
                segcount = be32_to_cpup(p++);
                for (i = 0; i < segcount; i++) {
                        if (inv_rkey == xdr_zero)
                                inv_rkey = *p;
                        else if (inv_rkey != *p)
                                return;
                        p += 4;
                }
        }

        ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
}
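/* On entry, xdr->head[0].iov_base points to first byte in the
 * RPC-over-RDMA header.
 *
 * On successful exit, head[0] points to first byte past the
 * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
 * The length of the RPC-over-RDMA header is returned.
 *
 * Assumptions:
 * - The transport header is entirely contained in the head iovec.
 */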
static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
{
        __be32 *p, *end, *rdma_argp;
        unsigned int hdr_len;

        /* Verify that there's enough bytes for header + something */
        if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
                goto out_short;

        rdma_argp = rq_arg->head[0].iov_base;
        if (*(rdma_argp + 1) != rpcrdma_version)
                goto out_version;

        switch (*(rdma_argp + 3)) {
        case rdma_msg:
                break;
        case rdma_nomsg:
                break;

        case rdma_done:
                goto out_drop;

        case rdma_error:
                goto out_drop;

        default:
                goto out_proc;
        }

        end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
        p = xdr_check_read_list(rdma_argp + 4, end);
        if (!p)
                goto out_inval;
        p = xdr_check_write_list(p, end);
        if (!p)
                goto out_inval;
        p = xdr_check_reply_chunk(p, end);
        if (!p)
                goto out_inval;
        if (p > end)
                goto out_inval;

        rq_arg->head[0].iov_base = p;
        hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
        rq_arg->head[0].iov_len -= hdr_len;
        rq_arg->len -= hdr_len;
        trace_svcrdma_decode_rqst(rdma_argp, hdr_len);
        return hdr_len;

out_short:
        trace_svcrdma_decode_short(rq_arg->len);
        return -EINVAL;

out_version:
        trace_svcrdma_decode_badvers(rdma_argp);
        return -EPROTONOSUPPORT;

out_drop:
        trace_svcrdma_decode_drop(rdma_argp);
        return 0;

out_proc:
        trace_svcrdma_decode_badproc(rdma_argp);
        return -EINVAL;

out_inval:
        trace_svcrdma_decode_parse(rdma_argp);
        return -EINVAL;
}

static void rdma_read_complete(struct svc_rqst *rqstp,
                               struct svc_rdma_recv_ctxt *head)
{
        int page_no;

        /* Move Read chunk pages to rqstp.rq_pages, where the
         * pages will be released when svc_rqst's are sent to the
         * client.
         */
        for (page_no = 0; page_no < head->rc_page_count; page_no++) {
                put_page(rqstp->rq_pages[page_no]);
                rqstp->rq_pages[page_no] = head->rc_pages[page_no];
        }
        head->rc_page_count = 0;

        /* Point rq_arg.pages past header */
        rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count];
        rqstp->rq_arg.page_len = head->rc_arg.page_len;

        /* rq_respages starts after the last arg page */
        rqstp->rq_respages = &rqstp->rq_pages[page_no];
        rqstp->rq_next_page = rqstp->rq_respages + 1;

        /* Rebuild rq_arg head and tail. */
        rqstp->rq_arg.head[0] = head->rc_arg.head[0];
        rqstp->rq_arg.tail[0] = head->rc_arg.tail[0];
        rqstp->rq_arg.len = head->rc_arg.len;
        rqstp->rq_arg.buflen = head->rc_arg.buflen;
}

static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
                                __be32 *rdma_argp, int status)
{
        struct svc_rdma_send_ctxt *ctxt;
        unsigned int length;
        __be32 *p;
        int ret;

        ctxt = svc_rdma_send_ctxt_get(xprt);
        if (!ctxt)
                return;

        p = ctxt->sc_xprt_buf;
        *p++ = *rdma_argp;
        *p++ = *(rdma_argp + 1);
        *p++ = xprt->sc_fc_credits;
        *p++ = rdma_error;
        switch (status) {
        case -EPROTONOSUPPORT:
                *p++ = err_vers;
                *p++ = rpcrdma_version;
                *p++ = rpcrdma_version;
                trace_svcrdma_err_vers(*rdma_argp);
                break;
        default:
                *p++ = err_chunk;
                trace_svcrdma_err_chunk(*rdma_argp);
        }
        length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf;
        svc_rdma_sync_reply_hdr(xprt, ctxt, length);

        ctxt->sc_send_wr.opcode = IB_WR_SEND;
        ret = svc_rdma_send(xprt, &ctxt->sc_send_wr);
        if (ret)
                svc_rdma_send_ctxt_put(xprt, ctxt);
}
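/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */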
static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
                                          __be32 *rdma_resp)
{
        __be32 *p;

        if (!xprt->xpt_bc_xprt)
                return false;

        p = rdma_resp + 3;
        if (*p++ != rdma_msg)
                return false;

        if (*p++ != xdr_zero)
                return false;
        if (*p++ != xdr_zero)
                return false;
        if (*p++ != xdr_zero)
                return false;

        /* XID sanity */
        if (*p++ != *rdma_resp)
                return false;
        /* call direction */
        if (*p == cpu_to_be32(RPC_CALL))
                return false;

        return true;
}
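/**
 * svc_rdma_recvfrom - Receive an RPC call
 * @rqstp: request structure into which to receive an RPC Call
 *
 * Returns:
 *	The positive number of bytes in the RPC Call message,
 *	%0 if there were no Calls ready to return,
 *	or a negative errno on failure.
 *
 * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only
 * when there are no remaining ctxt's to process.
 *
 * The next ctxt is removed from the "receive" lists.
 *
 * - If the ctxt completes a Read, then finish assembling the Call
 *   message and return the number of bytes in the message.
 *
 * - If the ctxt completes a Receive, then construct the Call
 *   message from the contents of the Receive buffer.
 *
 *   - If there are no Read chunks in this message, then finish
 *     assembling the Call message and return the number of bytes
 *     in the message.
 *
 *   - If there are Read chunks in this message, post Read WRs to
 *     pull that payload and return %0.
 */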
int svc_rdma_recvfrom(struct svc_rqst *rqstp)
{
        struct svc_xprt *xprt = rqstp->rq_xprt;
        struct svcxprt_rdma *rdma_xprt =
                container_of(xprt, struct svcxprt_rdma, sc_xprt);
        struct svc_rdma_recv_ctxt *ctxt;
        __be32 *p;
        int ret;

        spin_lock(&rdma_xprt->sc_rq_dto_lock);
        ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
        if (ctxt) {
                list_del(&ctxt->rc_list);
                spin_unlock(&rdma_xprt->sc_rq_dto_lock);
                rdma_read_complete(rqstp, ctxt);
                goto complete;
        }
        ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
        if (!ctxt) {
                /* No new incoming requests, terminate the loop */
                clear_bit(XPT_DATA, &xprt->xpt_flags);
                spin_unlock(&rdma_xprt->sc_rq_dto_lock);
                return 0;
        }
        list_del(&ctxt->rc_list);
        spin_unlock(&rdma_xprt->sc_rq_dto_lock);

        atomic_inc(&rdma_stat_recv);

        svc_rdma_build_arg_xdr(rqstp, ctxt);

        rqstp->rq_respages = rqstp->rq_pages;
        rqstp->rq_next_page = rqstp->rq_respages;

        p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
        ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
        if (ret < 0)
                goto out_err;
        if (ret == 0)
                goto out_drop;
        rqstp->rq_xprt_hlen = ret;

        if (svc_rdma_is_backchannel_reply(xprt, p)) {
                ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p,
                                               &rqstp->rq_arg);
                svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
                return ret;
        }
        svc_rdma_get_inv_rkey(rdma_xprt, ctxt);

        p += rpcrdma_fixed_maxsz;
        if (*p != xdr_zero)
                goto out_readchunk;

complete:
        rqstp->rq_xprt_ctxt = ctxt;
        rqstp->rq_prot = IPPROTO_MAX;
        svc_xprt_copy_addrs(rqstp, xprt);
        return rqstp->rq_arg.len;

out_readchunk:
        ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p);
        if (ret < 0)
                goto out_postfail;
        return 0;

out_err:
        svc_rdma_send_error(rdma_xprt, p, ret);
        svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
        return 0;

out_postfail:
        if (ret == -EINVAL)
                svc_rdma_send_error(rdma_xprt, p, ret);
        svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
        return ret;

out_drop:
        svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
        return 0;
}