#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>

#include "rds_single_path.h"
#include "rds.h"
#include "ib.h"

static struct kmem_cache *rds_ib_incoming_slab;
static struct kmem_cache *rds_ib_frag_slab;
static atomic_t rds_ib_allocation = ATOMIC_INIT(0);

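/*
 * Set up every receive work request in the ring.  SGE 0 of each entry always
 * points at this slot's element of the DMA-mapped header array; SGE 1 is
 * pointed at a data fragment when the ring is refilled.
 */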
void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{
	struct rds_ib_recv_work *recv;
	u32 i;

	for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
		struct ib_sge *sge;

		recv->r_ibinc = NULL;
		recv->r_frag = NULL;

		recv->r_wr.next = NULL;
		recv->r_wr.wr_id = i;
		recv->r_wr.sg_list = recv->r_sge;
		recv->r_wr.num_sge = RDS_IB_RECV_SGE;

		sge = &recv->r_sge[0];
		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
		sge->length = sizeof(struct rds_header);
		sge->lkey = ic->i_pd->local_dma_lkey;

		sge = &recv->r_sge[1];
		sge->addr = 0;
		sge->length = RDS_FRAG_SIZE;
		sge->lkey = ic->i_pd->local_dma_lkey;
	}
}

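/*
 * The frag and inc caches use headless lists: every list_head on them is an
 * entry.  This helper moves the entire list containing 'from', including
 * 'from' itself, onto the tail of 'to'.
 */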
static void list_splice_entire_tail(struct list_head *from,
				    struct list_head *to)
{
	struct list_head *from_last = from->prev;

	list_splice_tail(from_last, to);
	list_add_tail(from_last, to);
}

static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
{
	struct list_head *tmp;

	tmp = xchg(&cache->xfer, NULL);
	if (tmp) {
		if (cache->ready)
			list_splice_entire_tail(tmp, cache->ready);
		else
			cache->ready = tmp;
	}
}

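/* Allocate the per-cpu buckets for one refill cache and reset its lists. */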
static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
{
	struct rds_ib_cache_head *head;
	int cpu;

	cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
	if (!cache->percpu)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		head = per_cpu_ptr(cache->percpu, cpu);
		head->first = NULL;
		head->count = 0;
	}
	cache->xfer = NULL;
	cache->ready = NULL;

	return 0;
}

int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
{
	int ret;

	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
	if (!ret) {
		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
		if (ret)
			free_percpu(ic->i_cache_incs.percpu);
	}

	return ret;
}

static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
					  struct list_head *caller_list)
{
	struct rds_ib_cache_head *head;
	int cpu;

	for_each_possible_cpu(cpu) {
		head = per_cpu_ptr(cache->percpu, cpu);
		if (head->first) {
			list_splice_entire_tail(head->first, caller_list);
			head->first = NULL;
		}
	}

	if (cache->ready) {
		list_splice_entire_tail(cache->ready, caller_list);
		cache->ready = NULL;
	}
}

void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
{
	struct rds_ib_incoming *inc;
	struct rds_ib_incoming *inc_tmp;
	struct rds_page_frag *frag;
	struct rds_page_frag *frag_tmp;
	LIST_HEAD(list);

	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
	free_percpu(ic->i_cache_incs.percpu);

	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
		list_del(&inc->ii_cache_entry);
		WARN_ON(!list_empty(&inc->ii_frags));
		kmem_cache_free(rds_ib_incoming_slab, inc);
	}

	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
	free_percpu(ic->i_cache_frags.percpu);

	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
		list_del(&frag->f_cache_entry);
		WARN_ON(!list_empty(&frag->f_item));
		kmem_cache_free(rds_ib_frag_slab, frag);
	}
}

static void rds_ib_recv_cache_put(struct list_head *new_item,
				  struct rds_ib_refill_cache *cache);
static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);

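/* Recycle a fragment through the per-connection frag cache instead of freeing it. */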
static void rds_ib_frag_free(struct rds_ib_connection *ic,
			     struct rds_page_frag *frag)
{
	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));

	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
	atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
	rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE);
}

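/*
 * Called when the last reference to an incoming message is dropped: recycle
 * its fragments and then the ibinc itself through the connection's caches.
 */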
void rds_ib_inc_free(struct rds_incoming *inc)
{
	struct rds_ib_incoming *ibinc;
	struct rds_page_frag *frag;
	struct rds_page_frag *pos;
	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;

	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);

	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
		list_del_init(&frag->f_item);
		rds_ib_frag_free(ic, frag);
	}
	BUG_ON(!list_empty(&ibinc->ii_frags));

	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
}

static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
				  struct rds_ib_recv_work *recv)
{
	if (recv->r_ibinc) {
		rds_inc_put(&recv->r_ibinc->ii_inc);
		recv->r_ibinc = NULL;
	}
	if (recv->r_frag) {
		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
		rds_ib_frag_free(ic, recv->r_frag);
		recv->r_frag = NULL;
	}
}

void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
{
	u32 i;

	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
}

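/*
 * Get an ibinc to hold a new incoming message: reuse one from the cache when
 * possible, otherwise allocate from the slab, subject to the global
 * rds_ib_sysctl_max_recv_allocation limit.
 */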
static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
						     gfp_t slab_mask)
{
	struct rds_ib_incoming *ibinc;
	struct list_head *cache_item;
	int avail_allocs;

	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
	if (cache_item) {
		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
	} else {
		avail_allocs = atomic_add_unless(&rds_ib_allocation,
						 1, rds_ib_sysctl_max_recv_allocation);
		if (!avail_allocs) {
			rds_ib_stats_inc(s_ib_rx_alloc_limit);
			return NULL;
		}
		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
		if (!ibinc) {
			atomic_dec(&rds_ib_allocation);
			return NULL;
		}
		rds_ib_stats_inc(s_ib_rx_total_incs);
	}
	INIT_LIST_HEAD(&ibinc->ii_frags);
	rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);

	return ibinc;
}

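/*
 * Get a receive fragment: reuse one from the frag cache when possible,
 * otherwise allocate a new frag and back it with RDS_FRAG_SIZE bytes of page
 * memory.
 */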
static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
						    gfp_t slab_mask, gfp_t page_mask)
{
	struct rds_page_frag *frag;
	struct list_head *cache_item;
	int ret;

	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
	if (cache_item) {
		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
		atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs);
		rds_ib_stats_add(s_ib_recv_removed_from_cache, RDS_FRAG_SIZE);
	} else {
		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
		if (!frag)
			return NULL;

		sg_init_table(&frag->f_sg, 1);
		ret = rds_page_remainder_alloc(&frag->f_sg,
					       RDS_FRAG_SIZE, page_mask);
		if (ret) {
			kmem_cache_free(rds_ib_frag_slab, frag);
			return NULL;
		}
		rds_ib_stats_inc(s_ib_rx_total_frags);
	}

	INIT_LIST_HEAD(&frag->f_item);

	return frag;
}

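/*
 * Fill one ring entry: give it an ibinc if it does not already carry one,
 * attach a freshly DMA-mapped data fragment, and point its SGEs at the header
 * slot and the fragment.
 */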
static int rds_ib_recv_refill_one(struct rds_connection *conn,
				  struct rds_ib_recv_work *recv, gfp_t gfp)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct ib_sge *sge;
	int ret = -ENOMEM;
	gfp_t slab_mask = GFP_NOWAIT;
	gfp_t page_mask = GFP_NOWAIT;

	if (gfp & __GFP_DIRECT_RECLAIM) {
		slab_mask = GFP_KERNEL;
		page_mask = GFP_HIGHUSER;
	}

	if (!ic->i_cache_incs.ready)
		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
	if (!ic->i_cache_frags.ready)
		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);

	/*
	 * An ibinc is only consumed from the recv when it held the start of a
	 * message; a recv that carried a continuation fragment keeps its
	 * ibinc, so only allocate one if it is missing.
	 */
	if (!recv->r_ibinc) {
		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
		if (!recv->r_ibinc)
			goto out;
	}

	WARN_ON(recv->r_frag);
	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
	if (!recv->r_frag)
		goto out;

	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
			    1, DMA_FROM_DEVICE);
	WARN_ON(ret != 1);

	sge = &recv->r_sge[0];
	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
	sge->length = sizeof(struct rds_header);

	sge = &recv->r_sge[1];
	sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg);
	sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg);

	ret = 0;
out:
	return ret;
}

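/*
 * Only one context may refill the receive ring at a time; the RDS_RECV_REFILL
 * bit serializes them.  Anyone waiting for the bit to clear sleeps on
 * conn->c_waitq and is woken when it is released.
 */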
static int acquire_refill(struct rds_connection *conn)
{
	return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0;
}

static void release_refill(struct rds_connection *conn)
{
	clear_bit(RDS_RECV_REFILL, &conn->c_flags);

	/*
	 * Waiters are rare, so check waitqueue_active() instead of taking the
	 * wake-up slow path on every release.
	 */
	if (waitqueue_active(&conn->c_waitq))
		wake_up_all(&conn->c_waitq);
}

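/*
 * Refill the receive ring: allocate ring entries and post them as receive
 * work requests until the ring is full, the connection goes down, or an
 * allocation fails.  @prefill is set when filling the ring before the
 * connection is up.
 */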
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_recv_work *recv;
	unsigned int posted = 0;
	int ret = 0;
	bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
	u32 pos;

	/*
	 * The goal here is just to make sure that someone, somewhere is
	 * posting buffers.  If another context already holds the refill bit,
	 * let it do the work.
	 */
	if (!acquire_refill(conn))
		return;

	while ((prefill || rds_conn_up(conn)) &&
	       rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
		if (pos >= ic->i_recv_ring.w_nr) {
			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
			       pos);
			break;
		}

		recv = &ic->i_recvs[pos];
		ret = rds_ib_recv_refill_one(conn, recv, gfp);
		if (ret) {
			break;
		}

		rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv,
			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
			 (long) ib_sg_dma_address(
				ic->i_cm_id->device,
				&recv->r_frag->f_sg));

		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL);
		if (ret) {
			rds_ib_conn_error(conn, "recv post on "
					  "%pI6c returned %d, disconnecting and "
					  "reconnecting\n", &conn->c_faddr,
					  ret);
			break;
		}

		posted++;
	}

	/* We're doing flow control - advertise the newly posted credits. */
	if (ic->i_flowctl && posted)
		rds_ib_advertise_credits(conn, posted);

	/* Return the ring slot we allocated but failed to post. */
	if (ret)
		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);

	release_refill(conn);

	/*
	 * Keep the ring from running dry: if it is merely low and we were
	 * allowed to sleep we may have raced with the interrupt-path refill,
	 * and if it is completely empty the sender will soon see RNR
	 * timeouts, so queue the recv worker to refill again shortly.
	 */
	if (rds_conn_up(conn) &&
	    ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
	     rds_ib_ring_empty(&ic->i_recv_ring))) {
		queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
	}
}

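/*
 * Incs and frags are recycled through a two-level cache.  Freed items first
 * go on a per-cpu list; once a cpu has batched RDS_IB_RECYCLE_BATCH_COUNT of
 * them it publishes the whole batch on the shared 'xfer' pointer with
 * xchg/cmpxchg, from where the refill path pulls it over to 'ready'.  The
 * lists are headless: a bare pointer that may be NULL stands in for the list
 * head.
 */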
static void rds_ib_recv_cache_put(struct list_head *new_item,
				  struct rds_ib_refill_cache *cache)
{
	unsigned long flags;
	struct list_head *old, *chpfirst;

	local_irq_save(flags);

	chpfirst = __this_cpu_read(cache->percpu->first);
	if (!chpfirst)
		INIT_LIST_HEAD(new_item);
	else
		list_add_tail(new_item, chpfirst);

	__this_cpu_write(cache->percpu->first, new_item);
	__this_cpu_inc(cache->percpu->count);

	if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
		goto end;

	/*
	 * Push the whole per-cpu batch onto cache->xfer.  If another cpu has
	 * already published a batch there, pull it over onto ours first and
	 * retry until the cmpxchg installs our list.
	 */
	do {
		old = xchg(&cache->xfer, NULL);
		if (old)
			list_splice_entire_tail(old, chpfirst);
		old = cmpxchg(&cache->xfer, NULL, chpfirst);
	} while (old);

	__this_cpu_write(cache->percpu->first, NULL);
	__this_cpu_write(cache->percpu->count, 0);
end:
	local_irq_restore(flags);
}

static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
{
	struct list_head *head = cache->ready;

	if (head) {
		if (!list_empty(head)) {
			cache->ready = head->next;
			list_del_init(head);
		} else
			cache->ready = NULL;
	}

	return head;
}

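/* Copy the data fragments of an incoming message into a userspace iovec. */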
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
{
	struct rds_ib_incoming *ibinc;
	struct rds_page_frag *frag;
	unsigned long to_copy;
	unsigned long frag_off = 0;
	int copied = 0;
	int ret;
	u32 len;

	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
	len = be32_to_cpu(inc->i_hdr.h_len);

	while (iov_iter_count(to) && copied < len) {
		if (frag_off == RDS_FRAG_SIZE) {
			frag = list_entry(frag->f_item.next,
					  struct rds_page_frag, f_item);
			frag_off = 0;
		}
		to_copy = min_t(unsigned long, iov_iter_count(to),
				RDS_FRAG_SIZE - frag_off);
		to_copy = min_t(unsigned long, to_copy, len - copied);

		rds_stats_add(s_copy_to_user, to_copy);
		ret = copy_page_to_iter(sg_page(&frag->f_sg),
					frag->f_sg.offset + frag_off,
					to_copy,
					to);
		if (ret != to_copy)
			return -EFAULT;

		frag_off += to_copy;
		copied += to_copy;
	}

	return copied;
}

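/*
 * Set up the send work request used for standalone ACKs; it sends the single
 * rds_header that lives at ic->i_ack_dma.
 */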
void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
{
	struct ib_send_wr *wr = &ic->i_ack_wr;
	struct ib_sge *sge = &ic->i_ack_sge;

	sge->addr = ic->i_ack_dma;
	sge->length = sizeof(struct rds_header);
	sge->lkey = ic->i_pd->local_dma_lkey;

	wr->sg_list = sge;
	wr->num_sge = 1;
	wr->opcode = IB_WR_SEND;
	wr->wr_id = RDS_IB_ACK_WR_ID;
	wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}

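/*
 * ACK state lives in ic->i_ack_next plus the IB_ACK_REQUESTED and
 * IB_ACK_IN_FLIGHT bits.  When atomic64_t is not available the sequence
 * number is protected by i_ack_lock instead; both variants below provide the
 * same set/get semantics.
 */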
#ifndef KERNEL_HAS_ATOMIC64
void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
{
	unsigned long flags;

	spin_lock_irqsave(&ic->i_ack_lock, flags);
	ic->i_ack_next = seq;
	if (ack_required)
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	spin_unlock_irqrestore(&ic->i_ack_lock, flags);
}

static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
{
	unsigned long flags;
	u64 seq;

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

	spin_lock_irqsave(&ic->i_ack_lock, flags);
	seq = ic->i_ack_next;
	spin_unlock_irqrestore(&ic->i_ack_lock, flags);

	return seq;
}
#else
void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required)
{
	atomic64_set(&ic->i_ack_next, seq);
	if (ack_required) {
		smp_mb__before_atomic();
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	}
}

static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
{
	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	smp_mb__after_atomic();

	return atomic64_read(&ic->i_ack_next);
}
#endif

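/* Build the standalone ACK header and post it on the dedicated ACK WR. */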
static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
{
	struct rds_header *hdr = ic->i_ack;
	u64 seq;
	int ret;

	seq = rds_ib_get_ack(ic);

	rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
	rds_message_populate_header(hdr, 0, 0, 0);
	hdr->h_ack = cpu_to_be64(seq);
	hdr->h_credit = adv_credits;
	rds_message_make_checksum(hdr);
	ic->i_ack_queued = jiffies;

	ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, NULL);
	if (unlikely(ret)) {
		/*
		 * Failed to send.  Release the in-flight slot and force
		 * another ACK attempt.
		 */
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);

		rds_ib_stats_inc(s_ib_ack_send_failure);

		rds_ib_conn_error(ic->conn, "sending ack failed\n");
	} else
		rds_ib_stats_inc(s_ib_ack_sent);
}

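/*
 * Send an ACK now if one was requested.  Only one standalone ACK may be in
 * flight at a time (IB_ACK_IN_FLIGHT); if that slot or a send credit cannot
 * be obtained, the request stays pending and is retried when the in-flight
 * ACK completes or the ACK is piggybacked onto outgoing data.
 */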
void rds_ib_attempt_ack(struct rds_ib_connection *ic)
{
	unsigned int adv_credits;

	if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
		return;

	if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
		rds_ib_stats_inc(s_ib_ack_send_delayed);
		return;
	}

	/* Can we get a send credit? */
	if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
		rds_ib_stats_inc(s_ib_tx_throttle);
		clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
		return;
	}

	clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
	rds_ib_send_ack(ic, adv_credits);
}

/*
 * The previously posted ACK has completed: release the in-flight slot and
 * send any ACK that was requested in the meantime.
 */
void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
{
	clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
	rds_ib_attempt_ack(ic);
}

/*
 * The send path calls this when it can piggyback an ACK onto an outgoing
 * data message, which satisfies any pending ACK request.
 */
u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
{
	if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
		rds_ib_stats_inc(s_ib_ack_send_piggybacked);
	return rds_ib_get_ack(ic);
}

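/*
 * A congestion-map update arrives as an ordinary RDS message whose payload is
 * the remote congestion bitmap.  Copy it fragment by fragment into
 * conn->c_fcong and report which ports became uncongested.
 */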
static void rds_ib_cong_recv(struct rds_connection *conn,
			     struct rds_ib_incoming *ibinc)
{
	struct rds_cong_map *map;
	unsigned int map_off;
	unsigned int map_page;
	struct rds_page_frag *frag;
	unsigned long frag_off;
	unsigned long to_copy;
	unsigned long copied;
	uint64_t uncongested = 0;
	void *addr;

	/* Catch completely corrupt packets. */
	if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
		return;

	map = conn->c_fcong;
	map_page = 0;
	map_off = 0;

	frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
	frag_off = 0;

	copied = 0;

	while (copied < RDS_CONG_MAP_BYTES) {
		uint64_t *src, *dst;
		unsigned int k;

		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */

		addr = kmap_atomic(sg_page(&frag->f_sg));

		src = addr + frag->f_sg.offset + frag_off;
		dst = (void *)map->m_page_addrs[map_page] + map_off;
		for (k = 0; k < to_copy; k += 8) {
			/* Record ports that became uncongested, ie
			 * bits that changed from a 1 to a 0. */
			uncongested |= ~(*src) & *dst;
			*dst++ = *src++;
		}
		kunmap_atomic(addr);

		copied += to_copy;

		map_off += to_copy;
		if (map_off == PAGE_SIZE) {
			map_off = 0;
			map_page++;
		}

		frag_off += to_copy;
		if (frag_off == RDS_FRAG_SIZE) {
			frag = list_entry(frag->f_item.next,
					  struct rds_page_frag, f_item);
			frag_off = 0;
		}
	}

	/* The congestion map is in little-endian order. */
	uncongested = le64_to_cpu(uncongested);

	rds_cong_map_updated(map, uncongested);
}

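/*
 * Process one completed receive: validate the header, account the ACK and
 * credits it carries, and attach the data fragment to the message being
 * reassembled in ic->i_ibinc.  When the final fragment arrives, hand the
 * message up to the generic receive path (or treat it as a congestion
 * update).
 */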
static void rds_ib_process_recv(struct rds_connection *conn,
				struct rds_ib_recv_work *recv, u32 data_len,
				struct rds_ib_ack_state *state)
{
	struct rds_ib_connection *ic = conn->c_transport_data;
	struct rds_ib_incoming *ibinc = ic->i_ibinc;
	struct rds_header *ihdr, *hdr;

	rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
		 data_len);

	if (data_len < sizeof(struct rds_header)) {
		rds_ib_conn_error(conn, "incoming message "
				  "from %pI6c didn't include a "
				  "header, disconnecting and "
				  "reconnecting\n",
				  &conn->c_faddr);
		return;
	}
	data_len -= sizeof(struct rds_header);

	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];

	/* Validate the checksum. */
	if (!rds_message_verify_checksum(ihdr)) {
		rds_ib_conn_error(conn, "incoming message "
				  "from %pI6c has corrupted header - "
				  "forcing a reconnect\n",
				  &conn->c_faddr);
		rds_stats_inc(s_recv_drop_bad_checksum);
		return;
	}

	/* Process the ACK sequence which comes with the header. */
	state->ack_recv = be64_to_cpu(ihdr->h_ack);
	state->ack_recv_valid = 1;

	/* Process the credits update if there was one. */
	if (ihdr->h_credit)
		rds_ib_send_add_credits(conn, ihdr->h_credit);

	if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
		/* This is an ACK-only packet; there is no message to hand up. */
		rds_ib_stats_inc(s_ib_ack_received);

		/*
		 * Normally a frag is attached to an inc and freed along with
		 * it.  An ACK-only packet never gets an inc, so drop the
		 * frag's page reference here instead.
		 */
		rds_ib_frag_free(ic, recv->r_frag);
		recv->r_frag = NULL;
		return;
	}

	/*
	 * If the connection has no inc in progress, this fragment starts a
	 * new message: copy its header into the inc and remember the inc so
	 * that the following fragments can be hung off its list.
	 */
	if (!ibinc) {
		ibinc = recv->r_ibinc;
		recv->r_ibinc = NULL;
		ic->i_ibinc = ibinc;

		hdr = &ibinc->ii_inc.i_hdr;
		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
				local_clock();
		memcpy(hdr, ihdr, sizeof(*hdr));
		ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
		ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] =
				local_clock();

		rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
			 ic->i_recv_data_rem, hdr->h_flags);
	} else {
		hdr = &ibinc->ii_inc.i_hdr;
		/* We can't just use memcmp here; fragments of a
		 * single message may carry different ACKs. */
		if (hdr->h_sequence != ihdr->h_sequence ||
		    hdr->h_len != ihdr->h_len ||
		    hdr->h_sport != ihdr->h_sport ||
		    hdr->h_dport != ihdr->h_dport) {
			rds_ib_conn_error(conn,
					  "fragment header mismatch; forcing reconnect\n");
			return;
		}
	}

	list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
	recv->r_frag = NULL;

	if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
		ic->i_recv_data_rem -= RDS_FRAG_SIZE;
	else {
		ic->i_recv_data_rem = 0;
		ic->i_ibinc = NULL;

		if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
			rds_ib_cong_recv(conn, ibinc);
		} else {
			rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
					  &ibinc->ii_inc, GFP_ATOMIC);
			state->ack_next = be64_to_cpu(hdr->h_sequence);
			state->ack_next_valid = 1;
		}

		/*
		 * Evaluate the ACK_REQUIRED flag *after* the complete frame
		 * has been received and the next-expected sequence updated.
		 */
		if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
			rds_stats_inc(s_recv_ack_required);
			state->ack_required = 1;
		}

		rds_inc_put(&ibinc->ii_inc);
	}
}

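/*
 * Receive completion handler: unmap the oldest posted receive, process it if
 * the work request completed successfully, return the ring slot, and refill
 * from the CQ path if the ring is running low.
 */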
void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
			     struct ib_wc *wc,
			     struct rds_ib_ack_state *state)
{
	struct rds_connection *conn = ic->conn;
	struct rds_ib_recv_work *recv;

	rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
		 (unsigned long long)wc->wr_id, wc->status,
		 ib_wc_status_msg(wc->status), wc->byte_len,
		 be32_to_cpu(wc->ex.imm_data));

	rds_ib_stats_inc(s_ib_rx_cq_event);
	recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
	ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1,
			DMA_FROM_DEVICE);

	if (wc->status == IB_WC_SUCCESS) {
		rds_ib_process_recv(conn, recv, wc->byte_len, state);
	} else {
		/* We expect flush errors as the qp is drained during shutdown. */
		if (rds_conn_up(conn) || rds_conn_connecting(conn))
			rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
					  &conn->c_laddr, &conn->c_faddr,
					  wc->status,
					  ib_wc_status_msg(wc->status));
	}

	/*
	 * rds_ib_process_recv() does not always consume the frag, and it is
	 * not called at all on error, so free whatever is still attached
	 * before returning the ring slot.
	 */
	if (recv->r_frag) {
		rds_ib_frag_free(ic, recv->r_frag);
		recv->r_frag = NULL;
	}
	rds_ib_ring_free(&ic->i_recv_ring, 1);

	/* An empty ring means the sender is about to hit RNR timeouts. */
	if (rds_ib_ring_empty(&ic->i_recv_ring))
		rds_ib_stats_inc(s_ib_rx_ring_empty);

	if (rds_ib_ring_low(&ic->i_recv_ring)) {
		rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
		rds_ib_stats_inc(s_ib_rx_refill_from_cq);
	}
}

int rds_ib_recv_path(struct rds_conn_path *cp)
{
	struct rds_connection *conn = cp->cp_conn;
	struct rds_ib_connection *ic = conn->c_transport_data;

	rdsdebug("conn %p\n", conn);
	if (rds_conn_up(conn)) {
		rds_ib_attempt_ack(ic);
		rds_ib_recv_refill(conn, 0, GFP_KERNEL);
		rds_ib_stats_inc(s_ib_rx_refill_from_thread);
	}

	return 0;
}

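/*
 * Module init/exit: create the slab caches for incs and frags and derive the
 * global receive allocation limit from total memory.
 */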
int rds_ib_recv_init(void)
{
	struct sysinfo si;
	int ret = -ENOMEM;

	/* Default to roughly a third of all available RAM for recv memory. */
	si_meminfo(&si);
	rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;

	rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
						 sizeof(struct rds_ib_incoming),
						 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!rds_ib_incoming_slab)
		goto out;

	rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
					     sizeof(struct rds_page_frag),
					     0, SLAB_HWCACHE_ALIGN, NULL);
	if (!rds_ib_frag_slab) {
		kmem_cache_destroy(rds_ib_incoming_slab);
		rds_ib_incoming_slab = NULL;
	} else
		ret = 0;
out:
	return ret;
}

void rds_ib_recv_exit(void)
{
	kmem_cache_destroy(rds_ib_incoming_slab);
	kmem_cache_destroy(rds_ib_frag_slab);
}