1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33#include <linux/kernel.h>
34#include <linux/slab.h>
35#include <linux/pci.h>
36#include <linux/dma-mapping.h>
37#include <rdma/rdma_cm.h>
38
39#include "rds.h"
40#include "ib.h"
41
42static struct kmem_cache *rds_ib_incoming_slab;
43static struct kmem_cache *rds_ib_frag_slab;
44static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
45
46void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
47{
48 struct rds_ib_recv_work *recv;
49 u32 i;
50
51 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
52 struct ib_sge *sge;
53
54 recv->r_ibinc = NULL;
55 recv->r_frag = NULL;
56
57 recv->r_wr.next = NULL;
58 recv->r_wr.wr_id = i;
59 recv->r_wr.sg_list = recv->r_sge;
60 recv->r_wr.num_sge = RDS_IB_RECV_SGE;
61
62 sge = &recv->r_sge[0];
63 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
64 sge->length = sizeof(struct rds_header);
65 sge->lkey = ic->i_mr->lkey;
66
67 sge = &recv->r_sge[1];
68 sge->addr = 0;
69 sge->length = RDS_FRAG_SIZE;
70 sge->lkey = ic->i_mr->lkey;
71 }
72}
73
74
75
76
77
78static void list_splice_entire_tail(struct list_head *from,
79 struct list_head *to)
80{
81 struct list_head *from_last = from->prev;
82
83 list_splice_tail(from_last, to);
84 list_add_tail(from_last, to);
85}
86
87static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
88{
89 struct list_head *tmp;
90
91 tmp = xchg(&cache->xfer, NULL);
92 if (tmp) {
93 if (cache->ready)
94 list_splice_entire_tail(tmp, cache->ready);
95 else
96 cache->ready = tmp;
97 }
98}
99
100static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
101{
102 struct rds_ib_cache_head *head;
103 int cpu;
104
105 cache->percpu = alloc_percpu(struct rds_ib_cache_head);
106 if (!cache->percpu)
107 return -ENOMEM;
108
109 for_each_possible_cpu(cpu) {
110 head = per_cpu_ptr(cache->percpu, cpu);
111 head->first = NULL;
112 head->count = 0;
113 }
114 cache->xfer = NULL;
115 cache->ready = NULL;
116
117 return 0;
118}
119
120int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
121{
122 int ret;
123
124 ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
125 if (!ret) {
126 ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
127 if (ret)
128 free_percpu(ic->i_cache_incs.percpu);
129 }
130
131 return ret;
132}
133
134static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
135 struct list_head *caller_list)
136{
137 struct rds_ib_cache_head *head;
138 int cpu;
139
140 for_each_possible_cpu(cpu) {
141 head = per_cpu_ptr(cache->percpu, cpu);
142 if (head->first) {
143 list_splice_entire_tail(head->first, caller_list);
144 head->first = NULL;
145 }
146 }
147
148 if (cache->ready) {
149 list_splice_entire_tail(cache->ready, caller_list);
150 cache->ready = NULL;
151 }
152}
153
154void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
155{
156 struct rds_ib_incoming *inc;
157 struct rds_ib_incoming *inc_tmp;
158 struct rds_page_frag *frag;
159 struct rds_page_frag *frag_tmp;
160 LIST_HEAD(list);
161
162 rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
163 rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
164 free_percpu(ic->i_cache_incs.percpu);
165
166 list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
167 list_del(&inc->ii_cache_entry);
168 WARN_ON(!list_empty(&inc->ii_frags));
169 kmem_cache_free(rds_ib_incoming_slab, inc);
170 }
171
172 rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
173 rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
174 free_percpu(ic->i_cache_frags.percpu);
175
176 list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
177 list_del(&frag->f_cache_entry);
178 WARN_ON(!list_empty(&frag->f_item));
179 kmem_cache_free(rds_ib_frag_slab, frag);
180 }
181}
182
183
/* Defined further down; declared here for the free/refill helpers. */
static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
static void rds_ib_recv_cache_put(struct list_head *new_item,
				  struct rds_ib_refill_cache *cache);
187
188
189
190static void rds_ib_frag_free(struct rds_ib_connection *ic,
191 struct rds_page_frag *frag)
192{
193 rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
194
195 rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
196}
197
198
199void rds_ib_inc_free(struct rds_incoming *inc)
200{
201 struct rds_ib_incoming *ibinc;
202 struct rds_page_frag *frag;
203 struct rds_page_frag *pos;
204 struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
205
206 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
207
208
209 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
210 list_del_init(&frag->f_item);
211 rds_ib_frag_free(ic, frag);
212 }
213 BUG_ON(!list_empty(&ibinc->ii_frags));
214
215 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
216 rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
217}
218
219static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
220 struct rds_ib_recv_work *recv)
221{
222 if (recv->r_ibinc) {
223 rds_inc_put(&recv->r_ibinc->ii_inc);
224 recv->r_ibinc = NULL;
225 }
226 if (recv->r_frag) {
227 ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
228 rds_ib_frag_free(ic, recv->r_frag);
229 recv->r_frag = NULL;
230 }
231}
232
233void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
234{
235 u32 i;
236
237 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
238 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
239}
240
241static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
242 gfp_t slab_mask)
243{
244 struct rds_ib_incoming *ibinc;
245 struct list_head *cache_item;
246 int avail_allocs;
247
248 cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
249 if (cache_item) {
250 ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
251 } else {
252 avail_allocs = atomic_add_unless(&rds_ib_allocation,
253 1, rds_ib_sysctl_max_recv_allocation);
254 if (!avail_allocs) {
255 rds_ib_stats_inc(s_ib_rx_alloc_limit);
256 return NULL;
257 }
258 ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
259 if (!ibinc) {
260 atomic_dec(&rds_ib_allocation);
261 return NULL;
262 }
263 }
264 INIT_LIST_HEAD(&ibinc->ii_frags);
265 rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
266
267 return ibinc;
268}
269
270static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
271 gfp_t slab_mask, gfp_t page_mask)
272{
273 struct rds_page_frag *frag;
274 struct list_head *cache_item;
275 int ret;
276
277 cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
278 if (cache_item) {
279 frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
280 } else {
281 frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
282 if (!frag)
283 return NULL;
284
285 sg_init_table(&frag->f_sg, 1);
286 ret = rds_page_remainder_alloc(&frag->f_sg,
287 RDS_FRAG_SIZE, page_mask);
288 if (ret) {
289 kmem_cache_free(rds_ib_frag_slab, frag);
290 return NULL;
291 }
292 }
293
294 INIT_LIST_HEAD(&frag->f_item);
295
296 return frag;
297}
298
299static int rds_ib_recv_refill_one(struct rds_connection *conn,
300 struct rds_ib_recv_work *recv, int prefill)
301{
302 struct rds_ib_connection *ic = conn->c_transport_data;
303 struct ib_sge *sge;
304 int ret = -ENOMEM;
305 gfp_t slab_mask = GFP_NOWAIT;
306 gfp_t page_mask = GFP_NOWAIT;
307
308 if (prefill) {
309 slab_mask = GFP_KERNEL;
310 page_mask = GFP_HIGHUSER;
311 }
312
313 if (!ic->i_cache_incs.ready)
314 rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
315 if (!ic->i_cache_frags.ready)
316 rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
317
318
319
320
321
322 if (!recv->r_ibinc) {
323 recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
324 if (!recv->r_ibinc)
325 goto out;
326 }
327
328 WARN_ON(recv->r_frag);
329 recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
330 if (!recv->r_frag)
331 goto out;
332
333 ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
334 1, DMA_FROM_DEVICE);
335 WARN_ON(ret != 1);
336
337 sge = &recv->r_sge[0];
338 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
339 sge->length = sizeof(struct rds_header);
340
341 sge = &recv->r_sge[1];
342 sge->addr = sg_dma_address(&recv->r_frag->f_sg);
343 sge->length = sg_dma_len(&recv->r_frag->f_sg);
344
345 ret = 0;
346out:
347 return ret;
348}
349
350
351
352
353
354
355
356
357void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
358{
359 struct rds_ib_connection *ic = conn->c_transport_data;
360 struct rds_ib_recv_work *recv;
361 struct ib_recv_wr *failed_wr;
362 unsigned int posted = 0;
363 int ret = 0;
364 u32 pos;
365
366 while ((prefill || rds_conn_up(conn)) &&
367 rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
368 if (pos >= ic->i_recv_ring.w_nr) {
369 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
370 pos);
371 break;
372 }
373
374 recv = &ic->i_recvs[pos];
375 ret = rds_ib_recv_refill_one(conn, recv, prefill);
376 if (ret) {
377 break;
378 }
379
380
381 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
382 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
383 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
384 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
385 if (ret) {
386 rds_ib_conn_error(conn, "recv post on "
387 "%pI4 returned %d, disconnecting and "
388 "reconnecting\n", &conn->c_faddr,
389 ret);
390 break;
391 }
392
393 posted++;
394 }
395
396
397 if (ic->i_flowctl && posted)
398 rds_ib_advertise_credits(conn, posted);
399
400 if (ret)
401 rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
402}
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417static void rds_ib_recv_cache_put(struct list_head *new_item,
418 struct rds_ib_refill_cache *cache)
419{
420 unsigned long flags;
421 struct rds_ib_cache_head *chp;
422 struct list_head *old;
423
424 local_irq_save(flags);
425
426 chp = per_cpu_ptr(cache->percpu, smp_processor_id());
427 if (!chp->first)
428 INIT_LIST_HEAD(new_item);
429 else
430 list_add_tail(new_item, chp->first);
431 chp->first = new_item;
432 chp->count++;
433
434 if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
435 goto end;
436
437
438
439
440
441
442
443 do {
444 old = xchg(&cache->xfer, NULL);
445 if (old)
446 list_splice_entire_tail(old, chp->first);
447 old = cmpxchg(&cache->xfer, NULL, chp->first);
448 } while (old);
449
450 chp->first = NULL;
451 chp->count = 0;
452end:
453 local_irq_restore(flags);
454}
455
456static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
457{
458 struct list_head *head = cache->ready;
459
460 if (head) {
461 if (!list_empty(head)) {
462 cache->ready = head->next;
463 list_del_init(head);
464 } else
465 cache->ready = NULL;
466 }
467
468 return head;
469}
470
471int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
472 size_t size)
473{
474 struct rds_ib_incoming *ibinc;
475 struct rds_page_frag *frag;
476 struct iovec *iov = first_iov;
477 unsigned long to_copy;
478 unsigned long frag_off = 0;
479 unsigned long iov_off = 0;
480 int copied = 0;
481 int ret;
482 u32 len;
483
484 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
485 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
486 len = be32_to_cpu(inc->i_hdr.h_len);
487
488 while (copied < size && copied < len) {
489 if (frag_off == RDS_FRAG_SIZE) {
490 frag = list_entry(frag->f_item.next,
491 struct rds_page_frag, f_item);
492 frag_off = 0;
493 }
494 while (iov_off == iov->iov_len) {
495 iov_off = 0;
496 iov++;
497 }
498
499 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
500 to_copy = min_t(size_t, to_copy, size - copied);
501 to_copy = min_t(unsigned long, to_copy, len - copied);
502
503 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
504 "[%p, %u] + %lu\n",
505 to_copy, iov->iov_base, iov->iov_len, iov_off,
506 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
507
508
509 ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
510 frag->f_sg.offset + frag_off,
511 iov->iov_base + iov_off,
512 to_copy);
513 if (ret) {
514 copied = ret;
515 break;
516 }
517
518 iov_off += to_copy;
519 frag_off += to_copy;
520 copied += to_copy;
521 }
522
523 return copied;
524}
525
526
527void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
528{
529 struct ib_send_wr *wr = &ic->i_ack_wr;
530 struct ib_sge *sge = &ic->i_ack_sge;
531
532 sge->addr = ic->i_ack_dma;
533 sge->length = sizeof(struct rds_header);
534 sge->lkey = ic->i_mr->lkey;
535
536 wr->sg_list = sge;
537 wr->num_sge = 1;
538 wr->opcode = IB_WR_SEND;
539 wr->wr_id = RDS_IB_ACK_WR_ID;
540 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
541}
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565#ifndef KERNEL_HAS_ATOMIC64
566static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
567 int ack_required)
568{
569 unsigned long flags;
570
571 spin_lock_irqsave(&ic->i_ack_lock, flags);
572 ic->i_ack_next = seq;
573 if (ack_required)
574 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
575 spin_unlock_irqrestore(&ic->i_ack_lock, flags);
576}
577
578static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
579{
580 unsigned long flags;
581 u64 seq;
582
583 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
584
585 spin_lock_irqsave(&ic->i_ack_lock, flags);
586 seq = ic->i_ack_next;
587 spin_unlock_irqrestore(&ic->i_ack_lock, flags);
588
589 return seq;
590}
591#else
592static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
593 int ack_required)
594{
595 atomic64_set(&ic->i_ack_next, seq);
596 if (ack_required) {
597 smp_mb__before_clear_bit();
598 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
599 }
600}
601
602static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
603{
604 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
605 smp_mb__after_clear_bit();
606
607 return atomic64_read(&ic->i_ack_next);
608}
609#endif
610
611
612static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
613{
614 struct rds_header *hdr = ic->i_ack;
615 struct ib_send_wr *failed_wr;
616 u64 seq;
617 int ret;
618
619 seq = rds_ib_get_ack(ic);
620
621 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
622 rds_message_populate_header(hdr, 0, 0, 0);
623 hdr->h_ack = cpu_to_be64(seq);
624 hdr->h_credit = adv_credits;
625 rds_message_make_checksum(hdr);
626 ic->i_ack_queued = jiffies;
627
628 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
629 if (unlikely(ret)) {
630
631
632
633 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
634 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
635
636 rds_ib_stats_inc(s_ib_ack_send_failure);
637
638 rds_ib_conn_error(ic->conn, "sending ack failed\n");
639 } else
640 rds_ib_stats_inc(s_ib_ack_sent);
641}
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681void rds_ib_attempt_ack(struct rds_ib_connection *ic)
682{
683 unsigned int adv_credits;
684
685 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
686 return;
687
688 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
689 rds_ib_stats_inc(s_ib_ack_send_delayed);
690 return;
691 }
692
693
694 if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
695 rds_ib_stats_inc(s_ib_tx_throttle);
696 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
697 return;
698 }
699
700 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
701 rds_ib_send_ack(ic, adv_credits);
702}
703
704
705
706
707
708void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
709{
710 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
711 rds_ib_attempt_ack(ic);
712}
713
714
715
716
717
718u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
719{
720 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
721 rds_ib_stats_inc(s_ib_ack_send_piggybacked);
722 return rds_ib_get_ack(ic);
723}
724
725
726
727
728
729
730
731
732
733static void rds_ib_cong_recv(struct rds_connection *conn,
734 struct rds_ib_incoming *ibinc)
735{
736 struct rds_cong_map *map;
737 unsigned int map_off;
738 unsigned int map_page;
739 struct rds_page_frag *frag;
740 unsigned long frag_off;
741 unsigned long to_copy;
742 unsigned long copied;
743 uint64_t uncongested = 0;
744 void *addr;
745
746
747 if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
748 return;
749
750 map = conn->c_fcong;
751 map_page = 0;
752 map_off = 0;
753
754 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
755 frag_off = 0;
756
757 copied = 0;
758
759 while (copied < RDS_CONG_MAP_BYTES) {
760 uint64_t *src, *dst;
761 unsigned int k;
762
763 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
764 BUG_ON(to_copy & 7);
765
766 addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
767
768 src = addr + frag_off;
769 dst = (void *)map->m_page_addrs[map_page] + map_off;
770 for (k = 0; k < to_copy; k += 8) {
771
772
773 uncongested |= ~(*src) & *dst;
774 *dst++ = *src++;
775 }
776 kunmap_atomic(addr, KM_SOFTIRQ0);
777
778 copied += to_copy;
779
780 map_off += to_copy;
781 if (map_off == PAGE_SIZE) {
782 map_off = 0;
783 map_page++;
784 }
785
786 frag_off += to_copy;
787 if (frag_off == RDS_FRAG_SIZE) {
788 frag = list_entry(frag->f_item.next,
789 struct rds_page_frag, f_item);
790 frag_off = 0;
791 }
792 }
793
794
795 uncongested = le64_to_cpu(uncongested);
796
797 rds_cong_map_updated(map, uncongested);
798}
799
800
801
802
803
804
805
806struct rds_ib_ack_state {
807 u64 ack_next;
808 u64 ack_recv;
809 unsigned int ack_required:1;
810 unsigned int ack_next_valid:1;
811 unsigned int ack_recv_valid:1;
812};
813
814static void rds_ib_process_recv(struct rds_connection *conn,
815 struct rds_ib_recv_work *recv, u32 data_len,
816 struct rds_ib_ack_state *state)
817{
818 struct rds_ib_connection *ic = conn->c_transport_data;
819 struct rds_ib_incoming *ibinc = ic->i_ibinc;
820 struct rds_header *ihdr, *hdr;
821
822
823
824 rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
825 data_len);
826
827 if (data_len < sizeof(struct rds_header)) {
828 rds_ib_conn_error(conn, "incoming message "
829 "from %pI4 didn't inclue a "
830 "header, disconnecting and "
831 "reconnecting\n",
832 &conn->c_faddr);
833 return;
834 }
835 data_len -= sizeof(struct rds_header);
836
837 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
838
839
840 if (!rds_message_verify_checksum(ihdr)) {
841 rds_ib_conn_error(conn, "incoming message "
842 "from %pI4 has corrupted header - "
843 "forcing a reconnect\n",
844 &conn->c_faddr);
845 rds_stats_inc(s_recv_drop_bad_checksum);
846 return;
847 }
848
849
850 state->ack_recv = be64_to_cpu(ihdr->h_ack);
851 state->ack_recv_valid = 1;
852
853
854 if (ihdr->h_credit)
855 rds_ib_send_add_credits(conn, ihdr->h_credit);
856
857 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
858
859
860
861
862 rds_ib_stats_inc(s_ib_ack_received);
863
864
865
866
867
868
869
870
871
872
873 rds_ib_frag_free(ic, recv->r_frag);
874 recv->r_frag = NULL;
875 return;
876 }
877
878
879
880
881
882
883
884 if (!ibinc) {
885 ibinc = recv->r_ibinc;
886 recv->r_ibinc = NULL;
887 ic->i_ibinc = ibinc;
888
889 hdr = &ibinc->ii_inc.i_hdr;
890 memcpy(hdr, ihdr, sizeof(*hdr));
891 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
892
893 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
894 ic->i_recv_data_rem, hdr->h_flags);
895 } else {
896 hdr = &ibinc->ii_inc.i_hdr;
897
898
899 if (hdr->h_sequence != ihdr->h_sequence ||
900 hdr->h_len != ihdr->h_len ||
901 hdr->h_sport != ihdr->h_sport ||
902 hdr->h_dport != ihdr->h_dport) {
903 rds_ib_conn_error(conn,
904 "fragment header mismatch; forcing reconnect\n");
905 return;
906 }
907 }
908
909 list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
910 recv->r_frag = NULL;
911
912 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
913 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
914 else {
915 ic->i_recv_data_rem = 0;
916 ic->i_ibinc = NULL;
917
918 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
919 rds_ib_cong_recv(conn, ibinc);
920 else {
921 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
922 &ibinc->ii_inc, GFP_ATOMIC,
923 KM_SOFTIRQ0);
924 state->ack_next = be64_to_cpu(hdr->h_sequence);
925 state->ack_next_valid = 1;
926 }
927
928
929
930
931 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
932 rds_stats_inc(s_recv_ack_required);
933 state->ack_required = 1;
934 }
935
936 rds_inc_put(&ibinc->ii_inc);
937 }
938}
939
940
941
942
943
944
945
946
947
948
949void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
950{
951 struct rds_connection *conn = context;
952 struct rds_ib_connection *ic = conn->c_transport_data;
953
954 rdsdebug("conn %p cq %p\n", conn, cq);
955
956 rds_ib_stats_inc(s_ib_rx_cq_call);
957
958 tasklet_schedule(&ic->i_recv_tasklet);
959}
960
961static inline void rds_poll_cq(struct rds_ib_connection *ic,
962 struct rds_ib_ack_state *state)
963{
964 struct rds_connection *conn = ic->conn;
965 struct ib_wc wc;
966 struct rds_ib_recv_work *recv;
967
968 while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
969 rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
970 (unsigned long long)wc.wr_id, wc.status,
971 rds_ib_wc_status_str(wc.status), wc.byte_len,
972 be32_to_cpu(wc.ex.imm_data));
973 rds_ib_stats_inc(s_ib_rx_cq_event);
974
975 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
976
977 ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
978
979
980
981
982
983
984 if (wc.status == IB_WC_SUCCESS) {
985 rds_ib_process_recv(conn, recv, wc.byte_len, state);
986 } else {
987
988 if (rds_conn_up(conn) || rds_conn_connecting(conn))
989 rds_ib_conn_error(conn, "recv completion on %pI4 had "
990 "status %u (%s), disconnecting and "
991 "reconnecting\n", &conn->c_faddr,
992 wc.status,
993 rds_ib_wc_status_str(wc.status));
994 }
995
996
997
998
999
1000
1001 rds_ib_ring_free(&ic->i_recv_ring, 1);
1002 }
1003}
1004
1005void rds_ib_recv_tasklet_fn(unsigned long data)
1006{
1007 struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
1008 struct rds_connection *conn = ic->conn;
1009 struct rds_ib_ack_state state = { 0, };
1010
1011 rds_poll_cq(ic, &state);
1012 ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
1013 rds_poll_cq(ic, &state);
1014
1015 if (state.ack_next_valid)
1016 rds_ib_set_ack(ic, state.ack_next, state.ack_required);
1017 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
1018 rds_send_drop_acked(conn, state.ack_recv, NULL);
1019 ic->i_ack_recv = state.ack_recv;
1020 }
1021 if (rds_conn_up(conn))
1022 rds_ib_attempt_ack(ic);
1023
1024
1025
1026
1027 if (rds_ib_ring_empty(&ic->i_recv_ring))
1028 rds_ib_stats_inc(s_ib_rx_ring_empty);
1029
1030 if (rds_ib_ring_low(&ic->i_recv_ring))
1031 rds_ib_recv_refill(conn, 0);
1032}
1033
1034int rds_ib_recv(struct rds_connection *conn)
1035{
1036 struct rds_ib_connection *ic = conn->c_transport_data;
1037 int ret = 0;
1038
1039 rdsdebug("conn %p\n", conn);
1040 if (rds_conn_up(conn))
1041 rds_ib_attempt_ack(ic);
1042
1043 return ret;
1044}
1045
1046int rds_ib_recv_init(void)
1047{
1048 struct sysinfo si;
1049 int ret = -ENOMEM;
1050
1051
1052 si_meminfo(&si);
1053 rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
1054
1055 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
1056 sizeof(struct rds_ib_incoming),
1057 0, SLAB_HWCACHE_ALIGN, NULL);
1058 if (!rds_ib_incoming_slab)
1059 goto out;
1060
1061 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
1062 sizeof(struct rds_page_frag),
1063 0, SLAB_HWCACHE_ALIGN, NULL);
1064 if (!rds_ib_frag_slab)
1065 kmem_cache_destroy(rds_ib_incoming_slab);
1066 else
1067 ret = 0;
1068out:
1069 return ret;
1070}
1071
1072void rds_ib_recv_exit(void)
1073{
1074 kmem_cache_destroy(rds_ib_incoming_slab);
1075 kmem_cache_destroy(rds_ib_frag_slab);
1076}
1077