1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/device.h>
36#include <linux/dmapool.h>
37#include <linux/ratelimit.h>
38
39#include "rds_single_path.h"
40#include "rds.h"
41#include "ib.h"
42#include "ib_mr.h"
43
44
45
46
47
48static void rds_ib_send_complete(struct rds_message *rm,
49 int wc_status,
50 void (*complete)(struct rds_message *rm, int status))
51{
52 int notify_status;
53
54 switch (wc_status) {
55 case IB_WC_WR_FLUSH_ERR:
56 return;
57
58 case IB_WC_SUCCESS:
59 notify_status = RDS_RDMA_SUCCESS;
60 break;
61
62 case IB_WC_REM_ACCESS_ERR:
63 notify_status = RDS_RDMA_REMOTE_ERROR;
64 break;
65
66 default:
67 notify_status = RDS_RDMA_OTHER_ERROR;
68 break;
69 }
70 complete(rm, notify_status);
71}
72
73static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
74 struct rm_data_op *op,
75 int wc_status)
76{
77 if (op->op_nents)
78 ib_dma_unmap_sg(ic->i_cm_id->device,
79 op->op_sg, op->op_nents,
80 DMA_TO_DEVICE);
81}
82
83static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
84 struct rm_rdma_op *op,
85 int wc_status)
86{
87 if (op->op_mapped) {
88 ib_dma_unmap_sg(ic->i_cm_id->device,
89 op->op_sg, op->op_nents,
90 op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
91 op->op_mapped = 0;
92 }
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114 rds_ib_send_complete(container_of(op, struct rds_message, rdma),
115 wc_status, rds_rdma_send_complete);
116
117 if (op->op_write)
118 rds_stats_add(s_send_rdma_bytes, op->op_bytes);
119 else
120 rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
121}
122
123static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
124 struct rm_atomic_op *op,
125 int wc_status)
126{
127
128 if (op->op_mapped) {
129 ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
130 DMA_FROM_DEVICE);
131 op->op_mapped = 0;
132 }
133
134 rds_ib_send_complete(container_of(op, struct rds_message, atomic),
135 wc_status, rds_atomic_send_complete);
136
137 if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
138 rds_ib_stats_inc(s_ib_atomic_cswp);
139 else
140 rds_ib_stats_inc(s_ib_atomic_fadd);
141}
142
143
144
145
146
147
148
149
150static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
151 struct rds_ib_send_work *send,
152 int wc_status)
153{
154 struct rds_message *rm = NULL;
155
156
157 switch (send->s_wr.opcode) {
158 case IB_WR_SEND:
159 if (send->s_op) {
160 rm = container_of(send->s_op, struct rds_message, data);
161 rds_ib_send_unmap_data(ic, send->s_op, wc_status);
162 }
163 break;
164 case IB_WR_RDMA_WRITE:
165 case IB_WR_RDMA_READ:
166 if (send->s_op) {
167 rm = container_of(send->s_op, struct rds_message, rdma);
168 rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
169 }
170 break;
171 case IB_WR_ATOMIC_FETCH_AND_ADD:
172 case IB_WR_ATOMIC_CMP_AND_SWP:
173 if (send->s_op) {
174 rm = container_of(send->s_op, struct rds_message, atomic);
175 rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
176 }
177 break;
178 default:
179 printk_ratelimited(KERN_NOTICE
180 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
181 __func__, send->s_wr.opcode);
182 break;
183 }
184
185 send->s_wr.opcode = 0xdead;
186
187 return rm;
188}
189
190void rds_ib_send_init_ring(struct rds_ib_connection *ic)
191{
192 struct rds_ib_send_work *send;
193 u32 i;
194
195 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
196 struct ib_sge *sge;
197
198 send->s_op = NULL;
199
200 send->s_wr.wr_id = i;
201 send->s_wr.sg_list = send->s_sge;
202 send->s_wr.ex.imm_data = 0;
203
204 sge = &send->s_sge[0];
205 sge->addr = ic->i_send_hdrs_dma[i];
206
207 sge->length = sizeof(struct rds_header);
208 sge->lkey = ic->i_pd->local_dma_lkey;
209
210 send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
211 }
212}
213
214void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
215{
216 struct rds_ib_send_work *send;
217 u32 i;
218
219 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
220 if (send->s_op && send->s_wr.opcode != 0xdead)
221 rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
222 }
223}
224
225
226
227
228
229static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
230{
231 if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
232 waitqueue_active(&rds_ib_ring_empty_wait))
233 wake_up(&rds_ib_ring_empty_wait);
234 BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
235}
236
237
238
239
240
241
242
243void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
244{
245 struct rds_message *rm = NULL;
246 struct rds_connection *conn = ic->conn;
247 struct rds_ib_send_work *send;
248 u32 completed;
249 u32 oldest;
250 u32 i = 0;
251 int nr_sig = 0;
252
253
254 rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
255 (unsigned long long)wc->wr_id, wc->status,
256 ib_wc_status_msg(wc->status), wc->byte_len,
257 be32_to_cpu(wc->ex.imm_data));
258 rds_ib_stats_inc(s_ib_tx_cq_event);
259
260 if (wc->wr_id == RDS_IB_ACK_WR_ID) {
261 if (time_after(jiffies, ic->i_ack_queued + HZ / 2))
262 rds_ib_stats_inc(s_ib_tx_stalled);
263 rds_ib_ack_send_complete(ic);
264 return;
265 }
266
267 oldest = rds_ib_ring_oldest(&ic->i_send_ring);
268
269 completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest);
270
271 for (i = 0; i < completed; i++) {
272 send = &ic->i_sends[oldest];
273 if (send->s_wr.send_flags & IB_SEND_SIGNALED)
274 nr_sig++;
275
276 rm = rds_ib_send_unmap_op(ic, send, wc->status);
277
278 if (time_after(jiffies, send->s_queued + HZ / 2))
279 rds_ib_stats_inc(s_ib_tx_stalled);
280
281 if (send->s_op) {
282 if (send->s_op == rm->m_final_op) {
283
284
285
286 rds_message_unmapped(rm);
287 }
288 rds_message_put(rm);
289 send->s_op = NULL;
290 }
291
292 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
293 }
294
295 rds_ib_ring_free(&ic->i_send_ring, completed);
296 rds_ib_sub_signaled(ic, nr_sig);
297
298 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
299 test_bit(0, &conn->c_map_queued))
300 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
301
302
303 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
304 rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c,%d> had status %u (%s), vendor err 0x%x, disconnecting and reconnecting\n",
305 &conn->c_laddr, &conn->c_faddr,
306 conn->c_tos, wc->status,
307 ib_wc_status_msg(wc->status), wc->vendor_err);
308 }
309}
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
356 u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
357{
358 unsigned int avail, posted, got = 0, advertise;
359 long oldval, newval;
360
361 *adv_credits = 0;
362 if (!ic->i_flowctl)
363 return wanted;
364
365try_again:
366 advertise = 0;
367 oldval = newval = atomic_read(&ic->i_credits);
368 posted = IB_GET_POST_CREDITS(oldval);
369 avail = IB_GET_SEND_CREDITS(oldval);
370
371 rdsdebug("wanted=%u credits=%u posted=%u\n",
372 wanted, avail, posted);
373
374
375 if (avail && !posted)
376 avail--;
377
378 if (avail < wanted) {
379 struct rds_connection *conn = ic->i_cm_id->context;
380
381
382 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
383 got = avail;
384 } else {
385
386 got = wanted;
387 }
388 newval -= IB_SET_SEND_CREDITS(got);
389
390
391
392
393
394
395 if (posted && (got || need_posted)) {
396 advertise = min_t(unsigned int, posted, max_posted);
397 newval -= IB_SET_POST_CREDITS(advertise);
398 }
399
400
401 if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
402 goto try_again;
403
404 *adv_credits = advertise;
405 return got;
406}
407
408void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
409{
410 struct rds_ib_connection *ic = conn->c_transport_data;
411
412 if (credits == 0)
413 return;
414
415 rdsdebug("credits=%u current=%u%s\n",
416 credits,
417 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
418 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
419
420 atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
421 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
422 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
423
424 WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
425
426 rds_ib_stats_inc(s_ib_rx_credit_updates);
427}
428
429void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
430{
431 struct rds_ib_connection *ic = conn->c_transport_data;
432
433 if (posted == 0)
434 return;
435
436 atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
437
438
439
440
441
442
443
444
445
446
447
448
449
450 if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
451 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
452}
453
454static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
455 struct rds_ib_send_work *send,
456 bool notify)
457{
458
459
460
461
462
463 if (ic->i_unsignaled_wrs-- == 0 || notify) {
464 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
465 send->s_wr.send_flags |= IB_SEND_SIGNALED;
466 return 1;
467 }
468 return 0;
469}
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
485 unsigned int hdr_off, unsigned int sg, unsigned int off)
486{
487 struct rds_ib_connection *ic = conn->c_transport_data;
488 struct ib_device *dev = ic->i_cm_id->device;
489 struct rds_ib_send_work *send = NULL;
490 struct rds_ib_send_work *first;
491 struct rds_ib_send_work *prev;
492 const struct ib_send_wr *failed_wr;
493 struct scatterlist *scat;
494 u32 pos;
495 u32 i;
496 u32 work_alloc;
497 u32 credit_alloc = 0;
498 u32 posted;
499 u32 adv_credits = 0;
500 int send_flags = 0;
501 int bytes_sent = 0;
502 int ret;
503 int flow_controlled = 0;
504 int nr_sig = 0;
505
506 BUG_ON(off % RDS_FRAG_SIZE);
507 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
508
509
510 if (conn->c_loopback
511 && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
512 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
513 scat = &rm->data.op_sg[sg];
514 ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
515 return sizeof(struct rds_header) + ret;
516 }
517
518
519 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
520 i = 1;
521 else
522 i = DIV_ROUND_UP(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
523
524 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
525 if (work_alloc == 0) {
526 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
527 rds_ib_stats_inc(s_ib_tx_ring_full);
528 ret = -ENOMEM;
529 goto out;
530 }
531
532 if (ic->i_flowctl) {
533 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
534 adv_credits += posted;
535 if (credit_alloc < work_alloc) {
536 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
537 work_alloc = credit_alloc;
538 flow_controlled = 1;
539 }
540 if (work_alloc == 0) {
541 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
542 rds_ib_stats_inc(s_ib_tx_throttle);
543 ret = -ENOMEM;
544 goto out;
545 }
546 }
547
548
549 if (!ic->i_data_op) {
550 if (rm->data.op_nents) {
551 rm->data.op_count = ib_dma_map_sg(dev,
552 rm->data.op_sg,
553 rm->data.op_nents,
554 DMA_TO_DEVICE);
555 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
556 if (rm->data.op_count == 0) {
557 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
558 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
559 ret = -ENOMEM;
560 goto out;
561 }
562 } else {
563 rm->data.op_count = 0;
564 }
565
566 rds_message_addref(rm);
567 rm->data.op_dmasg = 0;
568 rm->data.op_dmaoff = 0;
569 ic->i_data_op = &rm->data;
570
571
572 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
573 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
574 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
575 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
576
577
578
579 if (rm->rdma.op_active) {
580 struct rds_ext_header_rdma ext_hdr;
581
582 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
583 rds_message_add_extension(&rm->m_inc.i_hdr,
584 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
585 }
586 if (rm->m_rdma_cookie) {
587 rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
588 rds_rdma_cookie_key(rm->m_rdma_cookie),
589 rds_rdma_cookie_offset(rm->m_rdma_cookie));
590 }
591
592
593
594
595
596 rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
597 rds_message_make_checksum(&rm->m_inc.i_hdr);
598
599
600
601
602 if (ic->i_flowctl) {
603 rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
604 adv_credits += posted;
605 BUG_ON(adv_credits > 255);
606 }
607 }
608
609
610
611
612
613
614
615 if (rm->rdma.op_active && rm->rdma.op_fence)
616 send_flags = IB_SEND_FENCE;
617
618
619 send = &ic->i_sends[pos];
620 first = send;
621 prev = NULL;
622 scat = &ic->i_data_op->op_sg[rm->data.op_dmasg];
623 i = 0;
624 do {
625 unsigned int len = 0;
626
627
628 send->s_wr.send_flags = send_flags;
629 send->s_wr.opcode = IB_WR_SEND;
630 send->s_wr.num_sge = 1;
631 send->s_wr.next = NULL;
632 send->s_queued = jiffies;
633 send->s_op = NULL;
634
635 send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];
636
637 send->s_sge[0].length = sizeof(struct rds_header);
638 send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
639
640 ib_dma_sync_single_for_cpu(ic->rds_ibdev->dev,
641 ic->i_send_hdrs_dma[pos],
642 sizeof(struct rds_header),
643 DMA_TO_DEVICE);
644 memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
645 sizeof(struct rds_header));
646
647
648
649 if (i < work_alloc
650 && scat != &rm->data.op_sg[rm->data.op_count]) {
651 len = min(RDS_FRAG_SIZE,
652 sg_dma_len(scat) - rm->data.op_dmaoff);
653 send->s_wr.num_sge = 2;
654
655 send->s_sge[1].addr = sg_dma_address(scat);
656 send->s_sge[1].addr += rm->data.op_dmaoff;
657 send->s_sge[1].length = len;
658 send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
659
660 bytes_sent += len;
661 rm->data.op_dmaoff += len;
662 if (rm->data.op_dmaoff == sg_dma_len(scat)) {
663 scat++;
664 rm->data.op_dmasg++;
665 rm->data.op_dmaoff = 0;
666 }
667 }
668
669 rds_ib_set_wr_signal_state(ic, send, false);
670
671
672
673
674 if (ic->i_flowctl && flow_controlled && i == (work_alloc - 1)) {
675 rds_ib_set_wr_signal_state(ic, send, true);
676 send->s_wr.send_flags |= IB_SEND_SOLICITED;
677 }
678
679 if (send->s_wr.send_flags & IB_SEND_SIGNALED)
680 nr_sig++;
681
682 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
683 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
684
685 if (ic->i_flowctl && adv_credits) {
686 struct rds_header *hdr = ic->i_send_hdrs[pos];
687
688
689 hdr->h_credit = adv_credits;
690 rds_message_make_checksum(hdr);
691 adv_credits = 0;
692 rds_ib_stats_inc(s_ib_tx_credit_updates);
693 }
694 ib_dma_sync_single_for_device(ic->rds_ibdev->dev,
695 ic->i_send_hdrs_dma[pos],
696 sizeof(struct rds_header),
697 DMA_TO_DEVICE);
698
699 if (prev)
700 prev->s_wr.next = &send->s_wr;
701 prev = send;
702
703 pos = (pos + 1) % ic->i_send_ring.w_nr;
704 send = &ic->i_sends[pos];
705 i++;
706
707 } while (i < work_alloc
708 && scat != &rm->data.op_sg[rm->data.op_count]);
709
710
711
712 if (hdr_off == 0)
713 bytes_sent += sizeof(struct rds_header);
714
715
716 if (scat == &rm->data.op_sg[rm->data.op_count]) {
717 prev->s_op = ic->i_data_op;
718 prev->s_wr.send_flags |= IB_SEND_SOLICITED;
719 if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED))
720 nr_sig += rds_ib_set_wr_signal_state(ic, prev, true);
721 ic->i_data_op = NULL;
722 }
723
724
725 if (i < work_alloc) {
726 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
727 work_alloc = i;
728 }
729 if (ic->i_flowctl && i < credit_alloc)
730 rds_ib_send_add_credits(conn, credit_alloc - i);
731
732 if (nr_sig)
733 atomic_add(nr_sig, &ic->i_signaled_sends);
734
735
736 failed_wr = &first->s_wr;
737 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
738 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
739 first, &first->s_wr, ret, failed_wr);
740 BUG_ON(failed_wr != &first->s_wr);
741 if (ret) {
742 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
743 "returned %d\n", &conn->c_faddr, ret);
744 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
745 rds_ib_sub_signaled(ic, nr_sig);
746 if (prev->s_op) {
747 ic->i_data_op = prev->s_op;
748 prev->s_op = NULL;
749 }
750
751 rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
752 goto out;
753 }
754
755 ret = bytes_sent;
756out:
757 BUG_ON(adv_credits);
758 return ret;
759}
760
761
762
763
764
765
766int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
767{
768 struct rds_ib_connection *ic = conn->c_transport_data;
769 struct rds_ib_send_work *send = NULL;
770 const struct ib_send_wr *failed_wr;
771 u32 pos;
772 u32 work_alloc;
773 int ret;
774 int nr_sig = 0;
775
776 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
777 if (work_alloc != 1) {
778 rds_ib_stats_inc(s_ib_tx_ring_full);
779 ret = -ENOMEM;
780 goto out;
781 }
782
783
784 send = &ic->i_sends[pos];
785 send->s_queued = jiffies;
786
787 if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
788 send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
789 send->s_atomic_wr.compare_add = op->op_m_cswp.compare;
790 send->s_atomic_wr.swap = op->op_m_cswp.swap;
791 send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask;
792 send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask;
793 } else {
794 send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
795 send->s_atomic_wr.compare_add = op->op_m_fadd.add;
796 send->s_atomic_wr.swap = 0;
797 send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
798 send->s_atomic_wr.swap_mask = 0;
799 }
800 send->s_wr.send_flags = 0;
801 nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
802 send->s_atomic_wr.wr.num_sge = 1;
803 send->s_atomic_wr.wr.next = NULL;
804 send->s_atomic_wr.remote_addr = op->op_remote_addr;
805 send->s_atomic_wr.rkey = op->op_rkey;
806 send->s_op = op;
807 rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
808
809
810 ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
811 rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
812 if (ret != 1) {
813 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
814 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
815 ret = -ENOMEM;
816 goto out;
817 }
818
819
820 send->s_sge[0].addr = sg_dma_address(op->op_sg);
821 send->s_sge[0].length = sg_dma_len(op->op_sg);
822 send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
823
824 rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
825 send->s_sge[0].addr, send->s_sge[0].length);
826
827 if (nr_sig)
828 atomic_add(nr_sig, &ic->i_signaled_sends);
829
830 failed_wr = &send->s_atomic_wr.wr;
831 ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr);
832 rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
833 send, &send->s_atomic_wr, ret, failed_wr);
834 BUG_ON(failed_wr != &send->s_atomic_wr.wr);
835 if (ret) {
836 printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
837 "returned %d\n", &conn->c_faddr, ret);
838 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
839 rds_ib_sub_signaled(ic, nr_sig);
840 goto out;
841 }
842
843 if (unlikely(failed_wr != &send->s_atomic_wr.wr)) {
844 printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
845 BUG_ON(failed_wr != &send->s_atomic_wr.wr);
846 }
847
848out:
849 return ret;
850}
851
852int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
853{
854 struct rds_ib_connection *ic = conn->c_transport_data;
855 struct rds_ib_send_work *send = NULL;
856 struct rds_ib_send_work *first;
857 struct rds_ib_send_work *prev;
858 const struct ib_send_wr *failed_wr;
859 struct scatterlist *scat;
860 unsigned long len;
861 u64 remote_addr = op->op_remote_addr;
862 u32 max_sge = ic->rds_ibdev->max_sge;
863 u32 pos;
864 u32 work_alloc;
865 u32 i;
866 u32 j;
867 int sent;
868 int ret;
869 int num_sge;
870 int nr_sig = 0;
871 u64 odp_addr = op->op_odp_addr;
872 u32 odp_lkey = 0;
873
874
875 if (!op->op_odp_mr) {
876 if (!op->op_mapped) {
877 op->op_count =
878 ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
879 op->op_nents,
880 (op->op_write) ? DMA_TO_DEVICE :
881 DMA_FROM_DEVICE);
882 rdsdebug("ic %p mapping op %p: %d\n", ic, op,
883 op->op_count);
884 if (op->op_count == 0) {
885 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
886 ret = -ENOMEM;
887 goto out;
888 }
889 op->op_mapped = 1;
890 }
891 } else {
892 op->op_count = op->op_nents;
893 odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
894 }
895
896
897
898
899
900 i = DIV_ROUND_UP(op->op_count, max_sge);
901
902 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
903 if (work_alloc != i) {
904 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
905 rds_ib_stats_inc(s_ib_tx_ring_full);
906 ret = -ENOMEM;
907 goto out;
908 }
909
910 send = &ic->i_sends[pos];
911 first = send;
912 prev = NULL;
913 scat = &op->op_sg[0];
914 sent = 0;
915 num_sge = op->op_count;
916
917 for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
918 send->s_wr.send_flags = 0;
919 send->s_queued = jiffies;
920 send->s_op = NULL;
921
922 if (!op->op_notify)
923 nr_sig += rds_ib_set_wr_signal_state(ic, send,
924 op->op_notify);
925
926 send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
927 send->s_rdma_wr.remote_addr = remote_addr;
928 send->s_rdma_wr.rkey = op->op_rkey;
929
930 if (num_sge > max_sge) {
931 send->s_rdma_wr.wr.num_sge = max_sge;
932 num_sge -= max_sge;
933 } else {
934 send->s_rdma_wr.wr.num_sge = num_sge;
935 }
936
937 send->s_rdma_wr.wr.next = NULL;
938
939 if (prev)
940 prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr;
941
942 for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
943 scat != &op->op_sg[op->op_count]; j++) {
944 len = sg_dma_len(scat);
945 if (!op->op_odp_mr) {
946 send->s_sge[j].addr = sg_dma_address(scat);
947 send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
948 } else {
949 send->s_sge[j].addr = odp_addr;
950 send->s_sge[j].lkey = odp_lkey;
951 }
952 send->s_sge[j].length = len;
953
954 sent += len;
955 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
956
957 remote_addr += len;
958 odp_addr += len;
959 scat++;
960 }
961
962 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
963 &send->s_rdma_wr.wr,
964 send->s_rdma_wr.wr.num_sge,
965 send->s_rdma_wr.wr.next);
966
967 prev = send;
968 if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
969 send = ic->i_sends;
970 }
971
972
973 if (scat == &op->op_sg[op->op_count]) {
974 prev->s_op = op;
975 rds_message_addref(container_of(op, struct rds_message, rdma));
976 }
977
978 if (i < work_alloc) {
979 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
980 work_alloc = i;
981 }
982
983 if (nr_sig)
984 atomic_add(nr_sig, &ic->i_signaled_sends);
985
986 failed_wr = &first->s_rdma_wr.wr;
987 ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
988 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
989 first, &first->s_rdma_wr.wr, ret, failed_wr);
990 BUG_ON(failed_wr != &first->s_rdma_wr.wr);
991 if (ret) {
992 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
993 "returned %d\n", &conn->c_faddr, ret);
994 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
995 rds_ib_sub_signaled(ic, nr_sig);
996 goto out;
997 }
998
999 if (unlikely(failed_wr != &first->s_rdma_wr.wr)) {
1000 printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
1001 BUG_ON(failed_wr != &first->s_rdma_wr.wr);
1002 }
1003
1004
1005out:
1006 return ret;
1007}
1008
1009void rds_ib_xmit_path_complete(struct rds_conn_path *cp)
1010{
1011 struct rds_connection *conn = cp->cp_conn;
1012 struct rds_ib_connection *ic = conn->c_transport_data;
1013
1014
1015
1016 rds_ib_attempt_ack(ic);
1017}
1018