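/*
 * Server-side RPC/RDMA receive path: build the svc_rqst argument buffer
 * from RDMA_RECV completions and, when the request carries a read chunk
 * list, post the RDMA_READs needed to pull the client's payload into
 * the server's pages.
 */
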
#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/sunrpc/svc_rdma.h>

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
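
/*
 * Replace the pages in the rq_pages array with the pages from the SGE in
 * the RDMA_RECV completion. The SGL should contain full pages up until
 * the last one.
 */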
static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
                               struct svc_rdma_op_ctxt *ctxt,
                               u32 byte_count)
{
        struct page *page;
        u32 bc;
        int sge_no;

        /* Swap the page in the SGE with the page in rq_pages */
        page = ctxt->pages[0];
        put_page(rqstp->rq_pages[0]);
        rqstp->rq_pages[0] = page;

        /* Set up the XDR head */
        rqstp->rq_arg.head[0].iov_base = page_address(page);
        rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
        rqstp->rq_arg.len = byte_count;
        rqstp->rq_arg.buflen = byte_count;

        /* Compute bytes past head in the SGL */
        bc = byte_count - rqstp->rq_arg.head[0].iov_len;

        /* If data remains, store it in the pagelist */
        rqstp->rq_arg.page_len = bc;
        rqstp->rq_arg.page_base = 0;
        rqstp->rq_arg.pages = &rqstp->rq_pages[1];
        sge_no = 1;
        while (bc && sge_no < ctxt->count) {
                page = ctxt->pages[sge_no];
                put_page(rqstp->rq_pages[sge_no]);
                rqstp->rq_pages[sge_no] = page;
                bc -= min(bc, ctxt->sge[sge_no].length);
                rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
                sge_no++;
        }
        rqstp->rq_respages = &rqstp->rq_pages[sge_no];

        /* We should never run out of SGEs because the limit is defined to
         * support the maximum allowed RPC data length
         */
        BUG_ON(bc && (sge_no == ctxt->count));
        BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
               != byte_count);
        BUG_ON(rqstp->rq_arg.len != byte_count);

        /* If not all pages in the SGL were used, release the unused ones */
        bc = sge_no;
        while (sge_no < ctxt->count) {
                page = ctxt->pages[sge_no++];
                put_page(page);
        }
        ctxt->count = bc;

        /* Set up tail */
        rqstp->rq_arg.tail[0].iov_base = NULL;
        rqstp->rq_arg.tail[0].iov_len = 0;
}
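
/*
 * Encode the read chunk list as an array of IB SGEs.
 *
 * Output:
 * - rpl_map->sge[]: kvec entries pointing into rqstp->rq_arg.pages[]
 * - chl_map->ch[]: starting sge index and sge count for each chunk
 *   in the read list
 *
 * Returns the number of sge entries built.
 */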
static int map_read_chunks(struct svcxprt_rdma *xprt,
                           struct svc_rqst *rqstp,
                           struct svc_rdma_op_ctxt *head,
                           struct rpcrdma_msg *rmsgp,
                           struct svc_rdma_req_map *rpl_map,
                           struct svc_rdma_req_map *chl_map,
                           int ch_count,
                           int byte_count)
{
        int sge_no;
        int sge_bytes;
        int page_off;
        int page_no;
        int ch_bytes;
        int ch_no;
        struct rpcrdma_read_chunk *ch;

        sge_no = 0;
        page_no = 0;
        page_off = 0;
        ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
        ch_no = 0;
        ch_bytes = ch->rc_target.rs_length;
        head->arg.head[0] = rqstp->rq_arg.head[0];
        head->arg.tail[0] = rqstp->rq_arg.tail[0];
        head->arg.pages = &head->pages[head->count];
        head->hdr_count = head->count;
        head->arg.page_base = 0;
        head->arg.page_len = ch_bytes;
        head->arg.len = rqstp->rq_arg.len + ch_bytes;
        head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
        head->count++;
        chl_map->ch[0].start = 0;
        while (byte_count) {
                rpl_map->sge[sge_no].iov_base =
                        page_address(rqstp->rq_arg.pages[page_no]) + page_off;
                sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
                rpl_map->sge[sge_no].iov_len = sge_bytes;
                /*
                 * Don't bump head->count here because the same page
                 * may be used by multiple SGE.
                 */
                head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
                rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];

                byte_count -= sge_bytes;
                ch_bytes -= sge_bytes;
                sge_no++;
                /*
                 * If all bytes for this chunk have been mapped to an
                 * SGE, move to the next SGE
                 */
                if (ch_bytes == 0) {
                        chl_map->ch[ch_no].count =
                                sge_no - chl_map->ch[ch_no].start;
                        ch_no++;
                        ch++;
                        chl_map->ch[ch_no].start = sge_no;
                        ch_bytes = ch->rc_target.rs_length;
                        /* If bytes remain, they belong to the next chunk */
                        if (byte_count) {
                                head->arg.page_len += ch_bytes;
                                head->arg.len += ch_bytes;
                                head->arg.buflen += ch_bytes;
                        }
                }
                /*
                 * If this SGE consumed all of the page, move to the
                 * next page
                 */
                if ((sge_bytes + page_off) == PAGE_SIZE) {
                        page_no++;
                        page_off = 0;
                        /*
                         * If there are still bytes left to map, bump
                         * the page count
                         */
                        if (byte_count)
                                head->count++;
                } else
                        page_off += sge_bytes;
        }
        BUG_ON(byte_count != 0);
        return sge_no;
}
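
/*
 * Map the read chunk list by fast-registering the receive pages as a
 * single, virtually contiguous memory region (FRMR).
 *
 * Output:
 * - rpl_map->sge[]: one kvec per chunk, pointing into the FRMR's
 *   virtual address range
 * - chl_map->ch[]: sge index and count (always 1) for each chunk
 *
 * Returns the number of chunks mapped, or a negative errno on failure.
 */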
static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
                                struct svc_rqst *rqstp,
                                struct svc_rdma_op_ctxt *head,
                                struct rpcrdma_msg *rmsgp,
                                struct svc_rdma_req_map *rpl_map,
                                struct svc_rdma_req_map *chl_map,
                                int ch_count,
                                int byte_count)
{
        int page_no;
        int ch_no;
        u32 offset;
        struct rpcrdma_read_chunk *ch;
        struct svc_rdma_fastreg_mr *frmr;
        int ret = 0;

        frmr = svc_rdma_get_frmr(xprt);
        if (IS_ERR(frmr))
                return -ENOMEM;

        head->frmr = frmr;
        head->arg.head[0] = rqstp->rq_arg.head[0];
        head->arg.tail[0] = rqstp->rq_arg.tail[0];
        head->arg.pages = &head->pages[head->count];
        head->hdr_count = head->count;
        head->arg.page_base = 0;
        head->arg.page_len = byte_count;
        head->arg.len = rqstp->rq_arg.len + byte_count;
        head->arg.buflen = rqstp->rq_arg.buflen + byte_count;

        /* Fast register the page list */
        frmr->kva = page_address(rqstp->rq_arg.pages[0]);
        frmr->direction = DMA_FROM_DEVICE;
        frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
        frmr->map_len = byte_count;
        frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
        for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
                frmr->page_list->page_list[page_no] =
                        ib_dma_map_single(xprt->sc_cm_id->device,
                                          page_address(rqstp->rq_arg.pages[page_no]),
                                          PAGE_SIZE, DMA_FROM_DEVICE);
                if (ib_dma_mapping_error(xprt->sc_cm_id->device,
                                         frmr->page_list->page_list[page_no]))
                        goto fatal_err;
                atomic_inc(&xprt->sc_dma_used);
                head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
        }
        head->count += page_no;

        /* rq_respages points one past arg pages */
        rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];

        /* Create the reply and chunk maps */
        offset = 0;
        ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
        for (ch_no = 0; ch_no < ch_count; ch_no++) {
                rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
                rpl_map->sge[ch_no].iov_len = ch->rc_target.rs_length;
                chl_map->ch[ch_no].count = 1;
                chl_map->ch[ch_no].start = ch_no;
                offset += ch->rc_target.rs_length;
                ch++;
        }

        ret = svc_rdma_fastreg(xprt, frmr);
        if (ret)
                goto fatal_err;

        return ch_no;

 fatal_err:
        printk(KERN_ERR "svcrdma: error fast registering xdr for xprt %p\n",
               xprt);
        svc_rdma_put_frmr(xprt, frmr);
        return -EIO;
}
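
/*
 * Populate ctxt->sge[] from the kvec array built by the chunk mapper,
 * advancing *sgl_offset by each mapped length. Without an FRMR, each
 * kvec is DMA-mapped individually and uses the local DMA lkey; with an
 * FRMR, the sge addresses lie inside the fast-registered region and use
 * the FRMR's lkey.
 */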
static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
                             struct svc_rdma_op_ctxt *ctxt,
                             struct svc_rdma_fastreg_mr *frmr,
                             struct kvec *vec,
                             u64 *sgl_offset,
                             int count)
{
        int i;

        ctxt->count = count;
        ctxt->direction = DMA_FROM_DEVICE;
        for (i = 0; i < count; i++) {
                ctxt->sge[i].length = 0; /* in case map fails */
                if (!frmr) {
                        ctxt->sge[i].addr =
                                ib_dma_map_single(xprt->sc_cm_id->device,
                                                  vec[i].iov_base,
                                                  vec[i].iov_len,
                                                  DMA_FROM_DEVICE);
                        if (ib_dma_mapping_error(xprt->sc_cm_id->device,
                                                 ctxt->sge[i].addr))
                                return -EINVAL;
                        ctxt->sge[i].lkey = xprt->sc_dma_lkey;
                        atomic_inc(&xprt->sc_dma_used);
                } else {
                        ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
                        ctxt->sge[i].lkey = frmr->mr->lkey;
                }
                ctxt->sge[i].length = vec[i].iov_len;
                *sgl_offset = *sgl_offset + vec[i].iov_len;
        }
        return 0;
}
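
/*
 * Cap the number of sge's posted with a single RDMA_READ. iWARP devices
 * are limited to one sge per read here, so larger chunk maps are split
 * across multiple reads by the caller.
 */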
static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
{
        if ((RDMA_TRANSPORT_IWARP ==
             rdma_node_get_transport(xprt->sc_cm_id->device->node_type)) &&
            sge_count > 1)
                return 1;
        else
                return min_t(int, sge_count, xprt->sc_max_sge);
}
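
/*
 * Use one or more RDMA_READs to pull the data described by the request's
 * read chunk list into the rq_arg pages.
 *
 * Returns:
 *	0  - no read chunk list was present; the RPC can be processed
 *	     directly from the receive buffer
 *	1  - RDMA_READs were posted; processing is deferred until the
 *	     last read completes
 *	<0 - error mapping the chunks or posting the reads
 */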
static int rdma_read_xdr(struct svcxprt_rdma *xprt,
                         struct rpcrdma_msg *rmsgp,
                         struct svc_rqst *rqstp,
                         struct svc_rdma_op_ctxt *hdr_ctxt)
{
        struct ib_send_wr read_wr;
        struct ib_send_wr inv_wr;
        int err = 0;
        int ch_no;
        int ch_count;
        int byte_count;
        int sge_count;
        u64 sgl_offset;
        struct rpcrdma_read_chunk *ch;
        struct svc_rdma_op_ctxt *ctxt = NULL;
        struct svc_rdma_req_map *rpl_map;
        struct svc_rdma_req_map *chl_map;

        /* If no read list is present, return 0 */
        ch = svc_rdma_get_read_chunk(rmsgp);
        if (!ch)
                return 0;

        svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
        if (ch_count > RPCSVC_MAXPAGES)
                return -EINVAL;

        /* Allocate temporary reply and chunk maps */
        rpl_map = svc_rdma_get_req_map();
        chl_map = svc_rdma_get_req_map();

        if (!xprt->sc_frmr_pg_list_len)
                sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
                                            rpl_map, chl_map, ch_count,
                                            byte_count);
        else
                sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
                                                 rpl_map, chl_map, ch_count,
                                                 byte_count);
        if (sge_count < 0) {
                err = -EIO;
                goto out;
        }

        sgl_offset = 0;
        ch_no = 0;

        for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
             ch->rc_discrim != 0; ch++, ch_no++) {
next_sge:
                ctxt = svc_rdma_get_context(xprt);
                ctxt->direction = DMA_FROM_DEVICE;
                ctxt->frmr = hdr_ctxt->frmr;
                ctxt->read_hdr = NULL;
                clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
                clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);

                /* Prepare READ WR */
                memset(&read_wr, 0, sizeof read_wr);
                read_wr.wr_id = (unsigned long)ctxt;
                read_wr.opcode = IB_WR_RDMA_READ;
                ctxt->wr_op = read_wr.opcode;
                read_wr.send_flags = IB_SEND_SIGNALED;
                read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
                read_wr.wr.rdma.remote_addr =
                        get_unaligned(&(ch->rc_target.rs_offset)) +
                        sgl_offset;
                read_wr.sg_list = ctxt->sge;
                read_wr.num_sge =
                        rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
                err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
                                        &rpl_map->sge[chl_map->ch[ch_no].start],
                                        &sgl_offset,
                                        read_wr.num_sge);
                if (err) {
                        svc_rdma_unmap_dma(ctxt);
                        svc_rdma_put_context(ctxt, 0);
                        goto out;
                }
                if (((ch+1)->rc_discrim == 0) &&
                    (read_wr.num_sge == chl_map->ch[ch_no].count)) {
                        /*
                         * Mark the last RDMA_READ with a bit to
                         * indicate all RPC data has been fetched from
                         * the client and the RPC needs to be enqueued.
                         */
                        set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
                        if (hdr_ctxt->frmr) {
                                set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
                                /*
                                 * Invalidate the local MR used to map the
                                 * data sink.
                                 */
                                if (xprt->sc_dev_caps &
                                    SVCRDMA_DEVCAP_READ_W_INV) {
                                        read_wr.opcode =
                                                IB_WR_RDMA_READ_WITH_INV;
                                        ctxt->wr_op = read_wr.opcode;
                                        read_wr.ex.invalidate_rkey =
                                                ctxt->frmr->mr->lkey;
                                } else {
                                        /* Prepare INVALIDATE WR */
                                        memset(&inv_wr, 0, sizeof inv_wr);
                                        inv_wr.opcode = IB_WR_LOCAL_INV;
                                        inv_wr.send_flags = IB_SEND_SIGNALED;
                                        inv_wr.ex.invalidate_rkey =
                                                hdr_ctxt->frmr->mr->lkey;
                                        read_wr.next = &inv_wr;
                                }
                        }
                        ctxt->read_hdr = hdr_ctxt;
                }
                /* Post the read */
                err = svc_rdma_send(xprt, &read_wr);
                if (err) {
                        printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
                               err);
                        set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
                        svc_rdma_put_context(ctxt, 0);
                        goto out;
                }
                atomic_inc(&rdma_stat_read);

                if (read_wr.num_sge < chl_map->ch[ch_no].count) {
                        chl_map->ch[ch_no].count -= read_wr.num_sge;
                        chl_map->ch[ch_no].start += read_wr.num_sge;
                        goto next_sge;
                }
                sgl_offset = 0;
                err = 1;
        }

 out:
        svc_rdma_put_req_map(rpl_map);
        svc_rdma_put_req_map(chl_map);

        /* Detach arg pages. svc_recv will replenish them */
        for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
                rqstp->rq_pages[ch_no] = NULL;

        /*
         * Detach res pages. svc_release must see a resused count of
         * zero or it will attempt to put them.
         */
        while (rqstp->rq_resused)
                rqstp->rq_respages[--rqstp->rq_resused] = NULL;

        return err;
}
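
/*
 * Finish constructing the RPC after the deferred RDMA_READs have
 * completed: hand the pages saved in the head context back to the
 * rqstp, restore the xdr_buf, and return the length of the request.
 */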
static int rdma_read_complete(struct svc_rqst *rqstp,
                              struct svc_rdma_op_ctxt *head)
{
        int page_no;
        int ret;

        BUG_ON(!head);

        /* Copy RPC pages */
        for (page_no = 0; page_no < head->count; page_no++) {
                put_page(rqstp->rq_pages[page_no]);
                rqstp->rq_pages[page_no] = head->pages[page_no];
        }

        /* Point rq_arg.pages past the RDMA header */
        rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
        rqstp->rq_arg.page_len = head->arg.page_len;
        rqstp->rq_arg.page_base = head->arg.page_base;

        /* rq_respages starts after the last arg page */
        rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
        rqstp->rq_resused = 0;

        /* Rebuild rq_arg head and tail */
        rqstp->rq_arg.head[0] = head->arg.head[0];
        rqstp->rq_arg.tail[0] = head->arg.tail[0];
        rqstp->rq_arg.len = head->arg.len;
        rqstp->rq_arg.buflen = head->arg.buflen;

        /* Free the context */
        svc_rdma_put_context(head, 0);

        /* RDMA does not map to an IP protocol number */
        rqstp->rq_prot = IPPROTO_MAX;
        svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);

        ret = rqstp->rq_arg.head[0].iov_len
                + rqstp->rq_arg.page_len
                + rqstp->rq_arg.tail[0].iov_len;
        dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%d, "
                "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n",
                ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
                rqstp->rq_arg.head[0].iov_len);

        svc_xprt_received(rqstp->rq_xprt);
        return ret;
}
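
/*
 * Set up the rqstp thread context to point to the RQ buffer. If
 * necessary, pull additional data from the client with an RDMA_READ
 * request.
 */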
int svc_rdma_recvfrom(struct svc_rqst *rqstp)
{
        struct svc_xprt *xprt = rqstp->rq_xprt;
        struct svcxprt_rdma *rdma_xprt =
                container_of(xprt, struct svcxprt_rdma, sc_xprt);
        struct svc_rdma_op_ctxt *ctxt = NULL;
        struct rpcrdma_msg *rmsgp;
        int ret = 0;
        int len;

        dprintk("svcrdma: rqstp=%p\n", rqstp);

        spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
        if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
                ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
                                  struct svc_rdma_op_ctxt,
                                  dto_q);
                list_del_init(&ctxt->dto_q);
        }
        if (ctxt) {
                spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
                return rdma_read_complete(rqstp, ctxt);
        }

        if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
                ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
                                  struct svc_rdma_op_ctxt,
                                  dto_q);
                list_del_init(&ctxt->dto_q);
        } else {
                atomic_inc(&rdma_stat_rq_starve);
                clear_bit(XPT_DATA, &xprt->xpt_flags);
                ctxt = NULL;
        }
        spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
        if (!ctxt) {
                /* This is the EAGAIN path. The svc_recv routine will
                 * return -EAGAIN, the nfsd thread will go to call into
                 * svc_recv again and we shouldn't be on the active
                 * transport list
                 */
                if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
                        goto close_out;

                BUG_ON(ret);
                goto out;
        }
        dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
                ctxt, rdma_xprt, rqstp, ctxt->wc_status);
        BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
        atomic_inc(&rdma_stat_recv);

        /* Build up the XDR from the receive buffers */
        rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);

        /* Decode the RDMA header */
        len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
        rqstp->rq_xprt_hlen = len;

        /* If the request is invalid, reply with an error */
        if (len < 0) {
                if (len == -ENOSYS)
                        svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
                goto close_out;
        }

        /* Read read-list data */
        ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
        if (ret > 0) {
                /* read-list posted, defer until data received from client */
                goto defer;
        }
        if (ret < 0) {
                /* Post of read-list failed, free context */
                svc_rdma_put_context(ctxt, 1);
                return 0;
        }

        ret = rqstp->rq_arg.head[0].iov_len
                + rqstp->rq_arg.page_len
                + rqstp->rq_arg.tail[0].iov_len;
        svc_rdma_put_context(ctxt, 0);
 out:
        dprintk("svcrdma: ret=%d, rq_arg.len=%d, "
                "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n",
                ret, rqstp->rq_arg.len,
                rqstp->rq_arg.head[0].iov_base,
                rqstp->rq_arg.head[0].iov_len);
        rqstp->rq_prot = IPPROTO_MAX;
        svc_xprt_copy_addrs(rqstp, xprt);
        svc_xprt_received(xprt);
        return ret;

 close_out:
        if (ctxt)
                svc_rdma_put_context(ctxt, 1);
        dprintk("svcrdma: transport %p is closing\n", xprt);
        /*
         * Set the close bit and enqueue it. svc_recv will see the
         * close bit and call svc_xprt_delete
         */
        set_bit(XPT_CLOSE, &xprt->xpt_flags);
defer:
        svc_xprt_received(xprt);
        return 0;
}