1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17#include "qemu/osdep.h"
18#include "qapi/error.h"
19#include "qemu/cutils.h"
20#include "rdma.h"
21#include "migration.h"
22#include "qemu-file.h"
23#include "ram.h"
24#include "qemu-file-channel.h"
25#include "qemu/error-report.h"
26#include "qemu/main-loop.h"
27#include "qemu/module.h"
28#include "qemu/rcu.h"
29#include "qemu/sockets.h"
30#include "qemu/bitmap.h"
31#include "qemu/coroutine.h"
32#include "exec/memory.h"
33#include <sys/socket.h>
34#include <netdb.h>
35#include <arpa/inet.h>
36#include <rdma/rdma_cma.h>
37#include "trace.h"
38#include "qom/object.h"
39#include <poll.h>
40
41
42
43
/*
 * Report an RDMA error: always print to stderr, and additionally set
 * *errp (via error_setg) if the caller supplied one that is not
 * already populated.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)
51
52#define RDMA_RESOLVE_TIMEOUT_MS 10000
53
54
55#define RDMA_MERGE_MAX (2 * 1024 * 1024)
56#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
57
58#define RDMA_REG_CHUNK_SHIFT 20
59
60
61
62
63
64
65
66#define RDMA_SEND_INCREMENT 32768
67
68
69
70
71#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
72#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
73
74#define RDMA_CONTROL_VERSION_CURRENT 1
75
76
77
78#define RDMA_CAPABILITY_PIN_ALL 0x01
79
80
81
82
83
84static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
85
/*
 * Bail out of the enclosing function with rdma->error_state if the
 * connection has already failed.  The "waiting migration to abort"
 * message is only printed once per context.  NOTE: expects a variable
 * named `rdma` (RDMAContext *) in the caller's scope and does an early
 * return, so it can only be used in functions returning int.
 */
#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting migration" \
                             " to abort!"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)
97
98
99
100
101
102
103
104
105
106
107
108
109
110#define RDMA_WRID_TYPE_SHIFT 0UL
111#define RDMA_WRID_BLOCK_SHIFT 16UL
112#define RDMA_WRID_CHUNK_SHIFT 30UL
113
114#define RDMA_WRID_TYPE_MASK \
115 ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)
116
117#define RDMA_WRID_BLOCK_MASK \
118 (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))
119
120#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
121
122
123
124
125
126
/*
 * Base work-request IDs.  The low bits of a wrid carry one of these
 * values (see RDMA_WRID_TYPE_MASK / qemu_rdma_make_wrid); block and
 * chunk numbers are packed into the higher bits.
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

/*
 * Human-readable names for the wrid types above, for tracing.
 * Indices between the designated entries are NULL.
 */
static const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};
140
141
142
143
144
145
146
147
/*
 * Indices into RDMAContext.wr_data[], one pre-registered control
 * buffer per slot (RDMA_WRID_MAX is the array size).
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};
154
155
156
157
/*
 * Message types exchanged on the control channel
 * (RDMAControlHeader.type); see control_desc() for names.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,
    RDMA_CONTROL_QEMU_FILE,
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,
    RDMA_CONTROL_RAM_BLOCKS_RESULT,
    RDMA_CONTROL_COMPRESS,
    RDMA_CONTROL_REGISTER_REQUEST,
    RDMA_CONTROL_REGISTER_RESULT,
    RDMA_CONTROL_REGISTER_FINISHED,
    RDMA_CONTROL_UNREGISTER_REQUEST,
    RDMA_CONTROL_UNREGISTER_FINISHED,
};
172
173
174
175
176
177
/*
 * A pre-allocated, pre-registered buffer for sending/receiving control
 * messages (registered via qemu_rdma_reg_control()).
 */
typedef struct {
    uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* the raw message bytes */
    struct ibv_mr *control_mr;                /* memory region for the buffer */
    size_t control_len;                       /* length of the current message */
    uint8_t *control_curr;                    /* read cursor into control[] */
} RDMAWorkRequestData;
184
185
186
187
/*
 * Capability negotiation structure exchanged at connection setup;
 * flags is a bitmask of RDMA_CAPABILITY_* values.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;
192
193static void caps_to_network(RDMACapabilities *cap)
194{
195 cap->version = htonl(cap->version);
196 cap->flags = htonl(cap->flags);
197}
198
199static void network_to_caps(RDMACapabilities *cap)
200{
201 cap->version = ntohl(cap->version);
202 cap->flags = ntohl(cap->flags);
203}
204
205
206
207
208
209
210
211
/*
 * Local representation of a RAM block.  A block is carved into chunks
 * of (1 << RDMA_REG_CHUNK_SHIFT) bytes which can be registered with
 * the HCA individually (pmr) or all at once (mr).
 */
typedef struct RDMALocalBlock {
    char *block_name;
    uint8_t *local_host_addr;     /* local virtual address of the block */
    uint64_t remote_host_addr;    /* peer's virtual address of the block */
    uint64_t offset;              /* ram_addr_t offset within guest RAM */
    uint64_t length;
    struct ibv_mr **pmr;          /* per-chunk MRs (lazy, chunk registration) */
    struct ibv_mr *mr;            /* single MR covering the whole block */
    uint32_t *remote_keys;        /* peer rkey per chunk */
    uint32_t remote_rkey;         /* peer rkey for the whole block */
    int index;                    /* position in RDMALocalBlocks.block[] */
    unsigned int src_index;       /* source-side index; ~0U until known */
    bool is_ram_block;
    int nb_chunks;
    unsigned long *transit_bitmap;    /* chunks with writes in flight */
    unsigned long *unregister_bitmap; /* chunks queued for unregistration */
} RDMALocalBlock;
229
230
231
232
233
234
235
236
/*
 * Wire representation of a RAM block as advertised by the destination.
 * Packed and byte-swapped (dest_block_to_network) before transmission.
 */
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;   /* keeps the struct size 8-byte aligned */
} RDMADestBlock;
244
245static const char *control_desc(unsigned int rdma_control)
246{
247 static const char *strs[] = {
248 [RDMA_CONTROL_NONE] = "NONE",
249 [RDMA_CONTROL_ERROR] = "ERROR",
250 [RDMA_CONTROL_READY] = "READY",
251 [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
252 [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
253 [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
254 [RDMA_CONTROL_COMPRESS] = "COMPRESS",
255 [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
256 [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
257 [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
258 [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
259 [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
260 };
261
262 if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
263 return "??BAD CONTROL VALUE??";
264 }
265
266 return strs[rdma_control];
267}
268
/* Host-to-network byte order for a 64-bit value. */
static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } tmp;

    tmp.lv[0] = htonl((uint32_t)(v >> 32));
    tmp.lv[1] = htonl((uint32_t)(v & 0xFFFFFFFFULL));
    return tmp.llv;
}
276
/* Network-to-host byte order for a 64-bit value. */
static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } tmp = { .llv = v };

    return ((uint64_t)ntohl(tmp.lv[0]) << 32) | (uint64_t)ntohl(tmp.lv[1]);
}
283
284static void dest_block_to_network(RDMADestBlock *db)
285{
286 db->remote_host_addr = htonll(db->remote_host_addr);
287 db->offset = htonll(db->offset);
288 db->length = htonll(db->length);
289 db->remote_rkey = htonl(db->remote_rkey);
290}
291
292static void network_to_dest_block(RDMADestBlock *db)
293{
294 db->remote_host_addr = ntohll(db->remote_host_addr);
295 db->offset = ntohll(db->offset);
296 db->length = ntohll(db->length);
297 db->remote_rkey = ntohl(db->remote_rkey);
298}
299
300
301
302
303
304
/*
 * Dynamically grown array of all local RAM blocks (see rdma_add_block
 * and rdma_delete_block).  `init` is set once the initial scan of RAM
 * blocks has completed.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool init;
    RDMALocalBlock *block;
} RDMALocalBlocks;
310
311
312
313
314
315
316
/*
 * Main state for one RDMA migration connection (one per direction;
 * see QIOChannelRDMA.rdmain / rdmaout).
 */
typedef struct RDMAContext {
    char *host;          /* remote host to connect to (source side) */
    int port;
    char *host_port;

    /* Pre-registered buffers for the control channel. */
    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * Set when we have sent a control message and expect a READY from
     * the peer before sending the next one -- TODO confirm against the
     * exchange_send implementation (not visible in this chunk).
     */
    int control_ready_expected;

    /* Number of signaled RDMA writes currently outstanding. */
    int nb_sent;

    /* The write currently being merged (see RDMA_MERGE_MAX). */
    uint64_t current_addr;
    uint64_t current_length;
    /* RAM block index of the current write. */
    int current_index;
    /* Chunk index of the current write. */
    int current_chunk;

    /* Negotiated RDMA_CAPABILITY_PIN_ALL: register all RAM up front. */
    bool pin_all;

    /*
     * Connection manager state: cm_id for the data connection,
     * listen_id on the destination side.
     */
    struct rdma_cm_id *cm_id;
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;
    struct ibv_comp_channel *recv_comp_channel;
    struct ibv_comp_channel *send_comp_channel;
    struct ibv_pd *pd;
    struct ibv_cq *recv_cq;
    struct ibv_cq *send_cq;

    /*
     * Sticky error state: once set, CHECK_ERROR_STATE() makes every
     * entry point fail until migration aborts.
     */
    int error_state;
    int error_reported;
    int received_error;

    /* Local RAM blocks and the peer's view of them. */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock *dest_blocks;

    /* Index of the next RAMBlock received during block registration. */
    unsigned int next_src_index;

    /* Destination side only -- set when migration has actually begun. */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    /* Ring buffer of chunks queued for unregistration. */
    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    /* Maps (ram_addr_t) block offset -> RDMALocalBlock *. */
    GHashTable *blockmap;

    /* Postcopy return path. */
    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;
404
405#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
406OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)
407
408
409
/*
 * QIOChannel wrapper around a pair of RDMA contexts (one per
 * direction) plus the QEMUFile built on top of it.
 */
struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;   /* incoming direction */
    RDMAContext *rdmaout;  /* outgoing direction */
    QEMUFile *file;
    bool blocking;         /* blocking vs non-blocking I/O mode */
};
417
418
419
420
421
/*
 * Header prepended to every control-channel message.  Sent in network
 * byte order (control_to_network).
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* length of the payload that follows */
    uint32_t type;    /* one of RDMA_CONTROL_* */
    uint32_t repeat;  /* number of commands in this message */
    uint32_t padding;
} RDMAControlHeader;
428
429static void control_to_network(RDMAControlHeader *control)
430{
431 control->type = htonl(control->type);
432 control->len = htonl(control->len);
433 control->repeat = htonl(control->repeat);
434}
435
436static void network_to_control(RDMAControlHeader *control)
437{
438 control->type = ntohl(control->type);
439 control->len = ntohl(control->len);
440 control->repeat = ntohl(control->repeat);
441}
442
443
444
445
446
447
448
/*
 * Payload of REGISTER_REQUEST / UNREGISTER_REQUEST control messages.
 * The key union holds an address for registration requests and a
 * chunk number for unregistration requests.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the peer's block */
        uint64_t chunk;         /* chunk to unregister */
    } key;
    uint32_t current_index;     /* RAM block index */
    uint32_t padding;
    uint64_t chunks;            /* how many consecutive chunks */
} RDMARegister;
458
/*
 * Prepare an RDMARegister for transmission: rebase the address from
 * the local block's offset to the destination block's offset (the two
 * sides may lay out their RAM blocks differently), then byte-swap all
 * fields.  NB: the rebasing must happen before htonll(), and
 * current_index must be read before it is swapped.
 */
static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    RDMALocalBlock *local_block;
    local_block = &rdma->local_ram_blocks.block[reg->current_index];

    if (local_block->is_ram_block) {
        /* Only RAM blocks have a meaningful destination offset. */
        reg->key.current_addr -= local_block->offset;
        reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
    }
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}
476
477static void network_to_register(RDMARegister *reg)
478{
479 reg->key.current_addr = ntohll(reg->key.current_addr);
480 reg->current_index = ntohl(reg->current_index);
481 reg->chunks = ntohll(reg->chunks);
482}
483
/*
 * Payload of a COMPRESS control message: describes a run of `length`
 * bytes at `offset` in block `block_idx` that all hold `value`.
 */
typedef struct QEMU_PACKED {
    uint32_t value;      /* the repeated byte value (e.g. 0 for zero pages) */
    uint32_t block_idx;
    uint64_t offset;
    uint64_t length;
} RDMACompress;
490
/*
 * Prepare an RDMACompress for transmission: rebase the offset from the
 * local block's offset to the destination block's offset, then
 * byte-swap.  As in register_to_network(), the rebasing must precede
 * the htonll()/htonl() calls and block_idx must be used before it is
 * swapped.
 */
static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    comp->value = htonl(comp->value);

    comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
    comp->offset += rdma->dest_blocks[comp->block_idx].offset;
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}
504
505static void network_to_compress(RDMACompress *comp)
506{
507 comp->value = ntohl(comp->value);
508 comp->block_idx = ntohl(comp->block_idx);
509 comp->offset = ntohll(comp->offset);
510 comp->length = ntohll(comp->length);
511}
512
513
514
515
516
517
/*
 * Payload of a REGISTER_RESULT control message: the rkey and remote
 * virtual address the requester needs to issue RDMA writes.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;
523
524static void result_to_network(RDMARegisterResult *result)
525{
526 result->rkey = htonl(result->rkey);
527 result->host_addr = htonll(result->host_addr);
528};
529
530static void network_to_result(RDMARegisterResult *result)
531{
532 result->rkey = ntohl(result->rkey);
533 result->host_addr = ntohll(result->host_addr);
534};
535
536const char *print_wrid(int wrid);
537static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
538 uint8_t *data, RDMAControlHeader *resp,
539 int *resp_idx,
540 int (*callback)(RDMAContext *rdma));
541
542static inline uint64_t ram_chunk_index(const uint8_t *start,
543 const uint8_t *host)
544{
545 return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
546}
547
548static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
549 uint64_t i)
550{
551 return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
552 (i << RDMA_REG_CHUNK_SHIFT));
553}
554
555static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
556 uint64_t i)
557{
558 uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
559 (1UL << RDMA_REG_CHUNK_SHIFT);
560
561 if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
562 result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
563 }
564
565 return result;
566}
567
/*
 * Append one RAM block to rdma->local_ram_blocks.
 *
 * The array is grown by allocating a new one and copying the old
 * entries over; because the entries move in memory, every existing
 * blockmap entry (which points INTO the array) must be re-inserted to
 * point at the new storage before the old array is freed.
 *
 * Always returns 0.
 */
static int rdma_add_block(RDMAContext *rdma, const char *block_name,
                          void *host_addr,
                          ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block;
    RDMALocalBlock *old = local->block;

    local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);

    if (local->nb_blocks) {
        int x;

        if (rdma->blockmap) {
            /* Re-point existing hash entries at the new array slots. */
            for (x = 0; x < local->nb_blocks; x++) {
                g_hash_table_remove(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset);
                g_hash_table_insert(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset,
                                    &local->block[x]);
            }
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    /* The new block occupies the last slot. */
    block = &local->block[local->nb_blocks];

    block->block_name = g_strdup(block_name);
    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->src_index = ~0U; /* unknown until the source tells us */
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_new0(uint32_t, block->nb_chunks);

    /* Blocks added after the initial scan are not guest RAM blocks. */
    block->is_ram_block = local->init ? false : true;

    if (rdma->blockmap) {
        g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
    }

    trace_rdma_add_block(block_name, local->nb_blocks,
                         (uintptr_t) block->local_host_addr,
                         block->offset, block->length,
                         (uintptr_t) (block->local_host_addr + block->length),
                         BITS_TO_LONGS(block->nb_chunks) *
                             sizeof(unsigned long) * 8,
                         block->nb_chunks);

    local->nb_blocks++;

    return 0;
}
627
628
629
630
631
632
633static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
634{
635 const char *block_name = qemu_ram_get_idstr(rb);
636 void *host_addr = qemu_ram_get_host_addr(rb);
637 ram_addr_t block_offset = qemu_ram_get_offset(rb);
638 ram_addr_t length = qemu_ram_get_used_length(rb);
639 return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
640}
641
642
643
644
645
646
647static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
648{
649 RDMALocalBlocks *local = &rdma->local_ram_blocks;
650 int ret;
651
652 assert(rdma->blockmap == NULL);
653 memset(local, 0, sizeof *local);
654 ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
655 if (ret) {
656 return ret;
657 }
658 trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
659 rdma->dest_blocks = g_new0(RDMADestBlock,
660 rdma->local_ram_blocks.nb_blocks);
661 local->init = true;
662 return 0;
663}
664
665
666
667
668
/*
 * Remove one block from rdma->local_ram_blocks, releasing its memory
 * registrations and bitmaps, and compacting the array around it.
 *
 * As in rdma_add_block(), the array entries move, so every blockmap
 * entry is removed before the compaction and re-inserted afterwards
 * pointing at the new storage.  The `index` field of every block after
 * the deleted one is decremented to match its new position.
 *
 * Always returns 0.
 */
static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;
    int x;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    if (block->pmr) {
        int j;

        /* Deregister every per-chunk memory region that was created. */
        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    /* Drop all map entries; they point into the array we are replacing. */
    if (rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {

        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        /* Copy the entries before the deleted block... */
        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        /* ...and the ones after it, fixing up their indices. */
        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                   sizeof(RDMALocalBlock) *
                       (local->nb_blocks - (block->index + 1)));
            for (x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                            block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                            BITS_TO_LONGS(block->nb_chunks) *
                                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    /* Rebuild the map against the compacted array. */
    if (local->nb_blocks && rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }

    return 0;
}
758
759
760
761
762
/*
 * Print identifying information about the opened RDMA device to
 * stdout, including whether port 1 runs over Infiniband or Ethernet
 * (RoCE / iWARP).  Purely informational; on query failure it reports
 * an error and prints nothing.
 */
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        error_report("Failed to query port information");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
                who,
                verbs->device->name,
                verbs->device->dev_name,
                verbs->device->dev_path,
                verbs->device->ibdev_path,
                port.link_layer,
                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
                    ? "Ethernet" : "Unknown"));
}
787
788
789
790
791
792
793static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
794{
795 char sgid[33];
796 char dgid[33];
797 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
798 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
799 trace_qemu_rdma_dump_gid(who, sgid, dgid);
800}
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
846{
847
848#ifdef CONFIG_LINUX
849 struct ibv_port_attr port_attr;
850
851
852
853
854
855
856
857
858
859
860 if (!verbs) {
861 int num_devices, x;
862 struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
863 bool roce_found = false;
864 bool ib_found = false;
865
866 for (x = 0; x < num_devices; x++) {
867 verbs = ibv_open_device(dev_list[x]);
868 if (!verbs) {
869 if (errno == EPERM) {
870 continue;
871 } else {
872 return -EINVAL;
873 }
874 }
875
876 if (ibv_query_port(verbs, 1, &port_attr)) {
877 ibv_close_device(verbs);
878 ERROR(errp, "Could not query initial IB port");
879 return -EINVAL;
880 }
881
882 if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
883 ib_found = true;
884 } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
885 roce_found = true;
886 }
887
888 ibv_close_device(verbs);
889
890 }
891
892 if (roce_found) {
893 if (ib_found) {
894 fprintf(stderr, "WARN: migrations may fail:"
895 " IPv6 over RoCE / iWARP in linux"
896 " is broken. But since you appear to have a"
897 " mixed RoCE / IB environment, be sure to only"
898 " migrate over the IB fabric until the kernel "
899 " fixes the bug.\n");
900 } else {
901 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
902 " and your management software has specified '[::]'"
903 ", but IPv6 over RoCE / iWARP is not supported in Linux.");
904 return -ENONET;
905 }
906 }
907
908 return 0;
909 }
910
911
912
913
914
915
916
917
918 if (ibv_query_port(verbs, 1, &port_attr)) {
919 ERROR(errp, "Could not query initial IB port");
920 return -EINVAL;
921 }
922
923 if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
924 ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
925 "(but patches on linux-rdma in progress)");
926 return -ENONET;
927 }
928
929#endif
930
931 return 0;
932}
933
934
935
936
937
938
/*
 * Resolve rdma->host/rdma->port into a connected-manager id with a
 * resolved address and route (source side).
 *
 * On success rdma->channel, rdma->cm_id and rdma->verbs are set and 0
 * is returned.  On failure everything allocated here is torn down via
 * the goto-cleanup labels and a negative errno (or the failing call's
 * return value) is returned.
 */
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    /* Create the CM event channel used for all async CM notifications. */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    /* Create the connection id for the outgoing connection. */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    /*
     * Try each returned address; for IPv6, first verify the kernel can
     * actually do IPv6 over this fabric (see
     * qemu_rdma_broken_ipv6_kernel).
     */
    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        trace_qemu_rdma_resolve_host_trying(rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
                if (ret) {
                    continue;
                }
            }
            goto route;
        }
    }

    rdma_freeaddrinfo(res);
    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    rdma_freeaddrinfo(res);
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    /* Wait for the ADDR_RESOLVED event confirming the resolution. */
    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
                rdma_event_str(cm_event->event));
        error_report("rdma_resolve_addr");
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* Resolve the route to the peer and wait for ROUTE_RESOLVED. */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
                rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}
1051
1052
1053
1054
/*
 * Allocate the protection domain, the send/receive completion
 * channels, and the send/receive completion queues.
 *
 * Returns 0 on success, -1 on failure with everything allocated so far
 * released (note the send CQ cannot exist on the error path -- its
 * creation is the last step).
 */
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    /* allocate the protection domain */
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        error_report("failed to allocate protection domain");
        return -1;
    }

    /* completion channel for receive-side completions */
    rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->recv_comp_channel) {
        error_report("failed to allocate receive completion channel");
        goto err_alloc_pd_cq;
    }

    /*
     * The CQ is sized at three times RDMA_SIGNALED_SEND_MAX -- TODO
     * confirm the rationale for the factor (not visible here).
     */
    rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->recv_comp_channel, 0);
    if (!rdma->recv_cq) {
        error_report("failed to allocate receive completion queue");
        goto err_alloc_pd_cq;
    }

    /* completion channel and queue for send-side completions */
    rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->send_comp_channel) {
        error_report("failed to allocate send completion channel");
        goto err_alloc_pd_cq;
    }

    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->send_comp_channel, 0);
    if (!rdma->send_cq) {
        error_report("failed to allocate send completion queue");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->recv_comp_channel) {
        ibv_destroy_comp_channel(rdma->recv_comp_channel);
    }
    if (rdma->send_comp_channel) {
        ibv_destroy_comp_channel(rdma->send_comp_channel);
    }
    if (rdma->recv_cq) {
        ibv_destroy_cq(rdma->recv_cq);
        rdma->recv_cq = NULL;
    }
    rdma->pd = NULL;
    rdma->recv_comp_channel = NULL;
    rdma->send_comp_channel = NULL;
    return -1;

}
1117
1118
1119
1120
1121static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1122{
1123 struct ibv_qp_init_attr attr = { 0 };
1124 int ret;
1125
1126 attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1127 attr.cap.max_recv_wr = 3;
1128 attr.cap.max_send_sge = 1;
1129 attr.cap.max_recv_sge = 1;
1130 attr.send_cq = rdma->send_cq;
1131 attr.recv_cq = rdma->recv_cq;
1132 attr.qp_type = IBV_QPT_RC;
1133
1134 ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1135 if (ret) {
1136 return -1;
1137 }
1138
1139 rdma->qp = rdma->cm_id->qp;
1140 return 0;
1141}
1142
1143
1144static bool rdma_support_odp(struct ibv_context *dev)
1145{
1146 struct ibv_device_attr_ex attr = {0};
1147 int ret = ibv_query_device_ex(dev, NULL, &attr);
1148 if (ret) {
1149 return false;
1150 }
1151
1152 if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
1153 return true;
1154 }
1155
1156 return false;
1157}
1158
1159
1160
1161
1162
1163
/*
 * Advise the HCA to prefetch pages of an ODP-registered region so the
 * first access does not fault.  Best effort: failures are only traced,
 * never propagated.  Compiled out when ibv_advise_mr() is unavailable.
 *
 * `wr` selects write-prefetch vs read-prefetch advice.
 */
static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
                                         uint32_t len,  uint32_t lkey,
                                         const char *name, bool wr)
{
#ifdef HAVE_IBV_ADVISE_MR
    int ret;
    int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
                 IBV_ADVISE_MR_ADVICE_PREFETCH;
    struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};

    ret = ibv_advise_mr(pd, advice,
                        IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);
    /* Both outcomes are traced; only the message text differs. */
    if (ret) {
        trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
    } else {
        trace_qemu_rdma_advise_mr(name, len, addr, "successed");
    }
#endif
}
1184
/*
 * Register every local RAM block as a single memory region (the
 * "pin-all" strategy).  If plain registration fails with ENOTSUP and
 * the device supports On-Demand Paging, retry with
 * IBV_ACCESS_ON_DEMAND and prefetch the region.
 *
 * Returns 0 when all blocks registered; on failure, deregisters the
 * blocks registered so far and returns -1.
 */
static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;

        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                    local->block[i].local_host_addr,
                    local->block[i].length, access
                    );

        /* ODP fallback: retry registration with on-demand paging. */
        if (!local->block[i].mr &&
            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
                access |= IBV_ACCESS_ON_DEMAND;

                local->block[i].mr =
                    ibv_reg_mr(rdma->pd,
                               local->block[i].local_host_addr,
                               local->block[i].length, access);
                trace_qemu_rdma_register_odp_mr(local->block[i].block_name);

                if (local->block[i].mr) {
                    qemu_rdma_advise_prefetch_mr(rdma->pd,
                                    (uintptr_t)local->block[i].local_host_addr,
                                    local->block[i].length,
                                    local->block[i].mr->lkey,
                                    local->block[i].block_name,
                                    true);
                }
        }

        if (!local->block[i].mr) {
            perror("Failed to register local dest ram block!");
            break;
        }
        rdma->total_registrations++;
    }

    if (i >= local->nb_blocks) {
        return 0;
    }

    /* Unwind the registrations made before the failure. */
    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        local->block[i].mr = NULL;
        rdma->total_registrations--;
    }

    return -1;

}
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1250 uintptr_t block_offset,
1251 uint64_t offset,
1252 uint64_t length,
1253 uint64_t *block_index,
1254 uint64_t *chunk_index)
1255{
1256 uint64_t current_addr = block_offset + offset;
1257 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1258 (void *) block_offset);
1259 assert(block);
1260 assert(current_addr >= block->offset);
1261 assert((current_addr + length) <= (block->offset + block->length));
1262
1263 *block_index = block->index;
1264 *chunk_index = ram_chunk_index(block->local_host_addr,
1265 block->local_host_addr + (current_addr - block->offset));
1266
1267 return 0;
1268}
1269
1270
1271
1272
1273
1274
1275
1276
/*
 * Return the lkey and/or rkey for accessing `chunk` of `block`.
 *
 * If the whole block is already registered (pin-all), the block-wide
 * keys are returned.  Otherwise the chunk is registered on demand (the
 * pmr array is allocated lazily), with remote-write access only when
 * an rkey was requested, and with an ODP fallback when plain
 * registration is unsupported.
 *
 * Returns 0 on success with *lkey/*rkey filled in (when non-NULL),
 * or -1 if the chunk could not be registered.
 */
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
        RDMALocalBlock *block, uintptr_t host_addr,
        uint32_t *lkey, uint32_t *rkey, int chunk,
        uint8_t *chunk_start, uint8_t *chunk_end)
{
    /* Fast path: the whole block is registered. */
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    /* Lazily allocate the per-chunk MR table. */
    if (!block->pmr) {
        block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
    }

    /*
     * Register this chunk if it is not registered yet.  Remote write
     * access is only granted when the caller asked for an rkey.
     */
    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;
        int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
                     0;

        trace_qemu_rdma_register_and_get_keys(len, chunk_start);

        block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
        /* ODP fallback, mirroring qemu_rdma_reg_whole_ram_blocks(). */
        if (!block->pmr[chunk] &&
            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
            access |= IBV_ACCESS_ON_DEMAND;
            /* register ODP mr */
            block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
            trace_qemu_rdma_register_odp_mr(block->block_name);

            if (block->pmr[chunk]) {
                qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
                                            len, block->pmr[chunk]->lkey,
                                            block->block_name, rkey);

            }
        }
    }
    if (!block->pmr[chunk]) {
        perror("Failed to register chunk!");
        fprintf(stderr, "Chunk details: block: %d chunk index %d"
                        " start %" PRIuPTR " end %" PRIuPTR
                        " host %" PRIuPTR
                        " local %" PRIuPTR " registrations: %d\n",
                        block->index, chunk, (uintptr_t)chunk_start,
                        (uintptr_t)chunk_end, host_addr,
                        (uintptr_t)block->local_host_addr,
                        rdma->total_registrations);
        return -1;
    }
    rdma->total_registrations++;

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}
1347
1348
1349
1350
1351
1352static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1353{
1354 rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1355 rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1356 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1357 if (rdma->wr_data[idx].control_mr) {
1358 rdma->total_registrations++;
1359 return 0;
1360 }
1361 error_report("qemu_rdma_reg_control failed");
1362 return -1;
1363}
1364
1365const char *print_wrid(int wrid)
1366{
1367 if (wrid >= RDMA_WRID_RECV_CONTROL) {
1368 return wrid_desc[RDMA_WRID_RECV_CONTROL];
1369 }
1370 return wrid_desc[wrid];
1371}
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
/*
 * Drain the unregistration ring buffer: for each queued chunk,
 * deregister its memory region locally and tell the peer to do the
 * same via an UNREGISTER_REQUEST control message.
 *
 * Chunks whose transit bit is still set have an RDMA write in flight
 * and are skipped (their queue slot is consumed; presumably they are
 * re-queued when the write completes -- that path is not visible in
 * this chunk).
 *
 * Returns 0 when the queue is drained, or a negative value on
 * deregistration / control-channel failure.
 */
static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
{
    while (rdma->unregistrations[rdma->unregister_current]) {
        int ret;
        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
        /* Unpack chunk and block index from the stored wrid. */
        uint64_t chunk =
            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block =
            &(rdma->local_ram_blocks.block[index]);
        RDMARegister reg = { .current_index = index };
        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                 };
        RDMAControlHeader head = { .len = sizeof(RDMARegister),
                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
                                   .repeat = 1,
                                 };

        trace_qemu_rdma_unregister_waiting_proc(chunk,
                                                rdma->unregister_current);

        /* Consume this slot and advance the ring cursor. */
        rdma->unregistrations[rdma->unregister_current] = 0;
        rdma->unregister_current++;

        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
            rdma->unregister_current = 0;
        }

        /*
         * Clear the pending bit before checking for in-flight writes;
         * a chunk with a write still in transit cannot be unregistered
         * yet.
         */
        clear_bit(chunk, block->unregister_bitmap);

        if (test_bit(chunk, block->transit_bitmap)) {
            trace_qemu_rdma_unregister_waiting_inflight(chunk);
            continue;
        }

        trace_qemu_rdma_unregister_waiting_send(chunk);

        ret = ibv_dereg_mr(block->pmr[chunk]);
        block->pmr[chunk] = NULL;
        block->remote_keys[chunk] = 0;

        if (ret != 0) {
            perror("unregistration chunk failed");
            return -ret;
        }
        rdma->total_registrations--;

        /* Ask the peer to drop its registration for the same chunk. */
        reg.key.chunk = chunk;
        register_to_network(rdma, &reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                &resp, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        trace_qemu_rdma_unregister_waiting_complete(chunk);
    }

    return 0;
}
1477
1478static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1479 uint64_t chunk)
1480{
1481 uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1482
1483 result |= (index << RDMA_WRID_BLOCK_SHIFT);
1484 result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1485
1486 return result;
1487}
1488
1489
1490
1491
1492
1493static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1494 uint64_t chunk, uint64_t wr_id)
1495{
1496 if (rdma->unregistrations[rdma->unregister_next] != 0) {
1497 error_report("rdma migration: queue is full");
1498 } else {
1499 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1500
1501 if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1502 trace_qemu_rdma_signal_unregister_append(chunk,
1503 rdma->unregister_next);
1504
1505 rdma->unregistrations[rdma->unregister_next++] =
1506 qemu_rdma_make_wrid(wr_id, index, chunk);
1507
1508 if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1509 rdma->unregister_next = 0;
1510 }
1511 } else {
1512 trace_qemu_rdma_signal_unregister_already(chunk);
1513 }
1514 }
1515}
1516
1517
1518
1519
1520
1521
/*
 * Poll the given completion queue for a single work completion.
 *
 * On success, *wr_id_out receives the raw (packed) work-request id, or
 * RDMA_WRID_NONE when the CQ was empty, and *byte_len (if non-NULL)
 * receives the completion's byte count.  For RDMA-write completions the
 * chunk's transit bit is cleared and nb_sent is decremented.
 *
 * NOTE(review): the return type is uint64_t, yet error paths return -1
 * or a negative ibv_poll_cq() result; callers appear to assign the
 * result to a plain int before testing "< 0", relying on implicit
 * narrowing -- worth confirming and cleaning up (int return).
 */
static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
                               uint64_t *wr_id_out, uint32_t *byte_len)
{
    int ret;
    struct ibv_wc wc;
    uint64_t wr_id;

    ret = ibv_poll_cq(cq, 1, &wc);

    if (!ret) {
        /* Nothing completed yet. */
        *wr_id_out = RDMA_WRID_NONE;
        return 0;
    }

    if (ret < 0) {
        error_report("ibv_poll_cq return %d", ret);
        return ret;
    }

    /* Low bits carry the work-request type. */
    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;

    if (wc.status != IBV_WC_SUCCESS) {
        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
                        wc.status, ibv_wc_status_str(wc.status));
        fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);

        return -1;
    }

    if (rdma->control_ready_expected &&
        (wr_id >= RDMA_WRID_RECV_CONTROL)) {
        /* A READY control message arrived; stop expecting one. */
        trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
                  wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
        rdma->control_ready_expected = 0;
    }

    if (wr_id == RDMA_WRID_RDMA_WRITE) {
        /* Unpack which block/chunk this write belonged to. */
        uint64_t chunk =
            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
                                   index, chunk, block->local_host_addr,
                                   (void *)(uintptr_t)block->remote_host_addr);

        /* The chunk is no longer in flight. */
        clear_bit(chunk, block->transit_bitmap);

        if (rdma->nb_sent > 0) {
            rdma->nb_sent--;
        }

        if (!rdma->pin_all) {
            /*
             * Dynamic registration mode: optionally queue the chunk for
             * unregistration now that its write completed (example code,
             * compiled out by default).
             */
#ifdef RDMA_UNREGISTRATION_EXAMPLE
            qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
#endif
        }
    } else {
        trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
    }

    *wr_id_out = wc.wr_id;
    if (byte_len) {
        *byte_len = wc.byte_len;
    }

    return 0;
}
1597
1598
1599
1600
/*
 * Wait until the completion channel has an event ready, or the
 * migration fails/cancels.
 *
 * On the destination during an active incoming migration this runs in
 * coroutine context and simply yields until the channel fd is readable.
 * Otherwise it polls both the completion channel and the connection
 * manager channel so a peer disconnect or device removal is noticed
 * even while no completions arrive.
 *
 * Returns 0 when the completion channel is readable, -EPIPE on
 * disconnect/cancel/poll failure, or the context's error_state.
 */
static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
                                       struct ibv_comp_channel *comp_channel)
{
    struct rdma_cm_event *cm_event;
    int ret = -1;

    /*
     * Coroutine path: yield instead of blocking so the destination's
     * main loop stays responsive.
     */
    if (rdma->migration_started_on_destination &&
        migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
        yield_until_fd_readable(comp_channel->fd);
    } else {
        /*
         * Polling path: 100ms timeout lets us re-check error_state and
         * the migration status between poll rounds, and also watch the
         * CM channel for disconnect events.
         */
        while (!rdma->error_state && !rdma->received_error) {
            GPollFD pfds[2];
            pfds[0].fd = comp_channel->fd;
            pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[0].revents = 0;

            pfds[1].fd = rdma->channel->fd;
            pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[1].revents = 0;

            /* 100 ms expressed in nanoseconds. */
            switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
            case 2:
            case 1:
                if (pfds[0].revents) {
                    /* Completion channel ready. */
                    return 0;
                }

                if (pfds[1].revents) {
                    /* Connection-manager event: check for disconnect. */
                    ret = rdma_get_cm_event(rdma->channel, &cm_event);
                    if (ret) {
                        error_report("failed to get cm event while wait "
                                     "completion channel");
                        return -EPIPE;
                    }

                    error_report("receive cm event while wait comp channel,"
                                 "cm event is %d", cm_event->event);
                    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
                        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
                        rdma_ack_cm_event(cm_event);
                        return -EPIPE;
                    }
                    rdma_ack_cm_event(cm_event);
                }
                break;

            case 0: /* timeout: loop and re-check error/cancel state */
                break;

            default: /* poll error */
                error_report("%s: poll failed", __func__);
                return -EPIPE;
            }

            if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
                /* Migration cancelled while waiting. */
                return -EPIPE;
            }
        }
    }

    if (rdma->received_error) {
        return -EPIPE;
    }
    return rdma->error_state;
}
1681
1682static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
1683{
1684 return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
1685 rdma->recv_comp_channel;
1686}
1687
1688static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
1689{
1690 return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
1691}
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
/*
 * Block until a completion with the requested work-request id arrives.
 *
 * First the CQ is drained non-blockingly; if the wanted completion is
 * not found, the function arms CQ notification and sleeps on the
 * completion channel, re-draining the CQ after each event.  Other
 * completions encountered along the way are processed (and traced as
 * misses) by qemu_rdma_poll().
 *
 * On error, rdma->error_state is set and the negative error returned.
 * Any consumed CQ events are acknowledged on both exit paths.
 */
static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
                                    uint32_t *byte_len)
{
    int num_cq_events = 0, ret = 0;
    struct ibv_cq *cq;
    void *cq_ctx;
    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
    struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
    struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);

    /* Arm notification before polling, to avoid a missed-event race. */
    if (ibv_req_notify_cq(poll_cq, 0)) {
        return -1;
    }

    /* Fast path: drain whatever is already completed. */
    while (wr_id != wrid_requested) {
        ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
        if (ret < 0) {
            return ret;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            /* CQ empty; fall through to the blocking path. */
            break;
        }
        if (wr_id != wrid_requested) {
            trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
                       wrid_requested, print_wrid(wr_id), wr_id);
        }
    }

    if (wr_id == wrid_requested) {
        return 0;
    }

    /* Slow path: sleep on the completion channel between drains. */
    while (1) {
        ret = qemu_rdma_wait_comp_channel(rdma, ch);
        if (ret) {
            goto err_block_for_wrid;
        }

        ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
        if (ret) {
            perror("ibv_get_cq_event");
            goto err_block_for_wrid;
        }

        /* Events must be acked; batch the acks until we exit. */
        num_cq_events++;

        /* Re-arm before draining again. */
        ret = -ibv_req_notify_cq(cq, 0);
        if (ret) {
            goto err_block_for_wrid;
        }

        while (wr_id != wrid_requested) {
            ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
            if (ret < 0) {
                goto err_block_for_wrid;
            }

            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

            if (wr_id == RDMA_WRID_NONE) {
                break;
            }
            if (wr_id != wrid_requested) {
                trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
                                   wrid_requested, print_wrid(wr_id), wr_id);
            }
        }

        if (wr_id == wrid_requested) {
            goto success_block_for_wrid;
        }
    }

success_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return 0;

err_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }

    rdma->error_state = ret;
    return ret;
}
1796
1797
1798
1799
1800
/*
 * Send a control message (header plus optional payload @buf of
 * head->len bytes) to the peer over the dedicated CONTROL work-request
 * slot, then block until the SEND completes.
 *
 * The header is byte-swapped to network order in place inside the
 * registered buffer, so the memcpy must happen before
 * control_to_network().
 *
 * Returns 0 on success, negative on post/poll failure.
 */
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
                                       RDMAControlHeader *head)
{
    int ret = 0;
    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
    struct ibv_send_wr *bad_wr;
    struct ibv_sge sge = {
                           .addr = (uintptr_t)(wr->control),
                           .length = head->len + sizeof(RDMAControlHeader),
                           .lkey = wr->control_mr->lkey,
                         };
    struct ibv_send_wr send_wr = {
                                   .wr_id = RDMA_WRID_SEND_CONTROL,
                                   .opcode = IBV_WR_SEND,
                                   .send_flags = IBV_SEND_SIGNALED,
                                   .sg_list = &sge,
                                   .num_sge = 1,
                                };

    trace_qemu_rdma_post_send_control(control_desc(head->type));

    /*
     * Stage header + payload into the pre-registered control buffer;
     * the payload must fit after the header.
     */
    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
    memcpy(wr->control, head, sizeof(RDMAControlHeader));
    control_to_network((void *) wr->control);

    if (buf) {
        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
    }

    /* Post the SEND; ibv_post_send() returns a positive errno value. */
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret > 0) {
        error_report("Failed to use post IB SEND for control");
        return -ret;
    }

    /* Wait for the SEND completion before reusing the buffer. */
    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
    if (ret < 0) {
        error_report("rdma migration: send polling control error");
    }

    return ret;
}
1853
1854
1855
1856
1857
1858static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1859{
1860 struct ibv_recv_wr *bad_wr;
1861 struct ibv_sge sge = {
1862 .addr = (uintptr_t)(rdma->wr_data[idx].control),
1863 .length = RDMA_CONTROL_MAX_BUFFER,
1864 .lkey = rdma->wr_data[idx].control_mr->lkey,
1865 };
1866
1867 struct ibv_recv_wr recv_wr = {
1868 .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1869 .sg_list = &sge,
1870 .num_sge = 1,
1871 };
1872
1873
1874 if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1875 return -1;
1876 }
1877
1878 return 0;
1879}
1880
1881
1882
1883
/*
 * Block until a control message arrives in slot @idx, byte-swap it to
 * host order, and validate it.
 *
 * If @expecting is RDMA_CONTROL_NONE any type is accepted; otherwise a
 * mismatched type (including an explicit RDMA_CONTROL_ERROR from the
 * peer, which also sets rdma->received_error) yields -EIO.  The claimed
 * payload length is checked against both the buffer size and the
 * actual number of bytes received.
 *
 * Returns 0 on success, negative on poll error or malformed message.
 */
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
                RDMAControlHeader *head, int expecting, int idx)
{
    uint32_t byte_len;
    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
                                       &byte_len);

    if (ret < 0) {
        error_report("rdma migration: recv polling control error!");
        return ret;
    }

    /* Convert header in the receive buffer to host byte order. */
    network_to_control((void *) rdma->wr_data[idx].control);
    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));

    trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));

    if (expecting == RDMA_CONTROL_NONE) {
        trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
                                             head->type);
    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
        error_report("Was expecting a %s (%d) control message"
                ", but got: %s (%d), length: %d",
                control_desc(expecting), expecting,
                control_desc(head->type), head->type, head->len);
        if (head->type == RDMA_CONTROL_ERROR) {
            /* Remember the peer reported an error; suppresses retries. */
            rdma->received_error = true;
        }
        return -EIO;
    }
    /* Sanity-check the payload length before anyone consumes it. */
    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
        error_report("too long length: %d", head->len);
        return -EINVAL;
    }
    if (sizeof(*head) + head->len != byte_len) {
        error_report("Malformed length: %d byte_len %d", head->len, byte_len);
        return -EINVAL;
    }

    return 0;
}
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1935 RDMAControlHeader *head)
1936{
1937 rdma->wr_data[idx].control_len = head->len;
1938 rdma->wr_data[idx].control_curr =
1939 rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1940}
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
/*
 * Send a control message (@head plus optional payload @data) to the
 * peer, optionally waiting for a typed response.
 *
 * Protocol steps, in order:
 *   1. If a READY from the peer is outstanding, consume it first.
 *   2. If a response is expected, pre-post a RECV for it (DATA slot).
 *   3. Pre-post a RECV for the peer's next READY.
 *   4. Post the actual SEND.
 *   5. If expecting a response: run @callback (if any) before blocking,
 *      then wait for the response and expose its payload via
 *      qemu_rdma_move_header() / *resp_idx.
 *
 * On success, control_ready_expected is set so the next exchange
 * consumes the peer's READY.  Returns 0 or a negative error.
 */
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma))
{
    int ret = 0;

    /* Step 1: drain the peer's pending READY, if one is expected. */
    if (rdma->control_ready_expected) {
        RDMAControlHeader resp;
        ret = qemu_rdma_exchange_get_response(rdma,
                                    &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
        if (ret < 0) {
            return ret;
        }
    }

    /* Step 2: pre-post a RECV for the anticipated response. */
    if (resp) {
        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
        if (ret) {
            error_report("rdma migration: error posting"
                    " extra control recv for anticipated result!");
            return ret;
        }
    }

    /* Step 3: pre-post a RECV for the peer's next READY. */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting first control recv!");
        return ret;
    }

    /* Step 4: send the message itself. */
    ret = qemu_rdma_post_send_control(rdma, data, head);

    if (ret < 0) {
        error_report("Failed to send control buffer!");
        return ret;
    }

    /* Step 5: wait for the typed response, if one was requested. */
    if (resp) {
        if (callback) {
            /* Let the caller do work while the peer processes the send. */
            trace_qemu_rdma_exchange_send_issue_callback();
            ret = callback(rdma);
            if (ret < 0) {
                return ret;
            }
        }

        trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
        ret = qemu_rdma_exchange_get_response(rdma, resp,
                                              resp->type, RDMA_WRID_DATA);

        if (ret < 0) {
            return ret;
        }

        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
        if (resp_idx) {
            *resp_idx = RDMA_WRID_DATA;
        }
        trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
    }

    rdma->control_ready_expected = 1;

    return 0;
}
2037
2038
2039
2040
2041
/*
 * Receive one control message of type @expecting from the peer.
 *
 * First tells the peer we are READY for it (the peer's
 * qemu_rdma_exchange_send() blocks on this), then waits for the
 * message, exposes its payload via qemu_rdma_move_header(), and
 * finally re-posts a RECV so the peer's next message has a buffer.
 *
 * Returns 0 on success or a negative error.
 */
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
                                   int expecting)
{
    RDMAControlHeader ready = {
                                .len = 0,
                                .type = RDMA_CONTROL_READY,
                                .repeat = 1,
                              };
    int ret;

    /* Unblock the sender by announcing readiness. */
    ret = qemu_rdma_post_send_control(rdma, NULL, &ready);

    if (ret < 0) {
        error_report("Failed to send control buffer!");
        return ret;
    }

    /* Wait for the expected message in the READY slot. */
    ret = qemu_rdma_exchange_get_response(rdma, head,
                                          expecting, RDMA_WRID_READY);

    if (ret < 0) {
        return ret;
    }

    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);

    /* Replace the consumed RECV so the next message can arrive. */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting second control recv!");
        return ret;
    }

    return 0;
}
2085
2086
2087
2088
2089
2090
2091
/*
 * Issue a single RDMA WRITE for @length bytes at @current_addr inside
 * RAM block @current_index.
 *
 * Handles on-demand chunk registration (both local lkey and, for
 * dynamic-registration mode, requesting the remote rkey from the
 * destination), detects all-zero buffers and short-circuits them with
 * a COMPRESS control message instead of a write, and retries when the
 * send queue is full.
 *
 * Returns 0 when a write was posted, 1 when the zero-page compress
 * shortcut was taken (no write in flight), negative on error.
 */
static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                               int current_index, uint64_t current_addr,
                               uint64_t length)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;
    int reg_result_idx, ret, count = 0;
    uint64_t chunk, chunks;
    uint8_t *chunk_start, *chunk_end;
    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
    RDMARegister reg;
    RDMARegisterResult *reg_result;
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1,
                             };

retry:
    /* Translate the guest RAM offset to our local mapping. */
    sge.addr = (uintptr_t)(block->local_host_addr +
                            (current_addr - block->offset));
    sge.length = length;

    chunk = ram_chunk_index(block->local_host_addr,
                            (uint8_t *)(uintptr_t)sge.addr);
    chunk_start = ram_chunk_start(block, chunk);

    /*
     * Compute how many additional chunks the transfer spans (an exact
     * multiple of the chunk size ends on a boundary, hence chunks--).
     */
    if (block->is_ram_block) {
        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    } else {
        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    }

    trace_qemu_rdma_write_one_top(chunks + 1,
                                  (chunks + 1) *
                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);

    chunk_end = ram_chunk_end(block, chunk + chunks);

    if (!rdma->pin_all) {
        /* Dynamic mode: opportunistically drain pending unregistrations. */
#ifdef RDMA_UNREGISTRATION_EXAMPLE
        qemu_rdma_unregister_waiting(rdma);
#endif
    }

    /*
     * A previous write to the same chunk may still be in flight; wait
     * for write completions until the chunk's transit bit clears.
     */
    while (test_bit(chunk, block->transit_bitmap)) {
        (void)count;
        trace_qemu_rdma_write_one_block(count++, current_index, chunk,
                sge.addr, length, rdma->nb_sent, block->nb_chunks);

        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);

        if (ret < 0) {
            error_report("Failed to Wait for previous write to complete "
                    "block %d chunk %" PRIu64
                    " current %" PRIu64 " len %" PRIu64 " %d",
                    current_index, chunk, sge.addr, length, rdma->nb_sent);
            return ret;
        }
    }

    if (!rdma->pin_all || !block->is_ram_block) {
        if (!block->remote_keys[chunk]) {
            /*
             * No remote key yet for this chunk.  Before registering,
             * check for an all-zero buffer: those are communicated via
             * a cheap COMPRESS message, never written over the wire.
             */
            if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
                RDMACompress comp = {
                                        .offset = current_addr,
                                        .value = 0,
                                        .block_idx = current_index,
                                        .length = length,
                                    };

                head.len = sizeof(comp);
                head.type = RDMA_CONTROL_COMPRESS;

                trace_qemu_rdma_write_one_zero(chunk, sge.length,
                                               current_index, current_addr);

                compress_to_network(rdma, &comp);
                ret = qemu_rdma_exchange_send(rdma, &head,
                                (uint8_t *) &comp, NULL, NULL, NULL);

                if (ret < 0) {
                    return -EIO;
                }

                acct_update_position(f, sge.length, true);

                /* 1 = handled without posting a write. */
                return 1;
            }

            /* Ask the destination to register the chunk and return rkey. */
            reg.current_index = current_index;
            if (block->is_ram_block) {
                reg.key.current_addr = current_addr;
            } else {
                reg.key.chunk = chunk;
            }
            reg.chunks = chunks;

            trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
                                              current_addr);

            register_to_network(rdma, &reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                    &resp, &reg_result_idx, NULL);
            if (ret < 0) {
                return ret;
            }

            /* Register the chunk locally while the reply is parsed. */
            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_report("cannot get lkey");
                return -EINVAL;
            }

            reg_result = (RDMARegisterResult *)
                    rdma->wr_data[reg_result_idx].control_curr;

            network_to_result(reg_result);

            trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
                                                 reg_result->rkey, chunk);

            /* Cache the remote key for subsequent writes to this chunk. */
            block->remote_keys[chunk] = reg_result->rkey;
            block->remote_host_addr = reg_result->host_addr;
        } else {
            /* Remote key already known: only ensure a local lkey. */
            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_report("cannot get lkey!");
                return -EINVAL;
            }
        }

        send_wr.wr.rdma.rkey = block->remote_keys[chunk];
    } else {
        /* pin-all mode on a RAM block: one rkey covers the whole block. */
        send_wr.wr.rdma.rkey = block->remote_rkey;

        if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                     &sge.lkey, NULL, chunk,
                                                     chunk_start, chunk_end)) {
            error_report("cannot get lkey!");
            return -EINVAL;
        }
    }

    /*
     * Encode type/block/chunk into the wrid so the completion handler
     * can clear the right transit bit.
     */
    send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
                                        current_index, chunk);

    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                  (current_addr - block->offset);

    trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
                                   sge.length);

    /*
     * Post the write.  ENOMEM means the send queue is full: wait for a
     * write completion to free a slot and retry the whole operation.
     */
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret == ENOMEM) {
        trace_qemu_rdma_write_one_queue_full();
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            error_report("rdma migration: failed to make "
                         "room in full send queue! %d", ret);
            return ret;
        }

        goto retry;

    } else if (ret > 0) {
        perror("rdma migration: post rdma write failed");
        return -ret;
    }

    /* Mark the chunk in flight and account the transferred bytes. */
    set_bit(chunk, block->transit_bitmap);
    acct_update_position(f, sge.length, false);
    rdma->total_writes++;

    return 0;
}
2305
2306
2307
2308
2309
2310
2311
2312static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2313{
2314 int ret;
2315
2316 if (!rdma->current_length) {
2317 return 0;
2318 }
2319
2320 ret = qemu_rdma_write_one(f, rdma,
2321 rdma->current_index, rdma->current_addr, rdma->current_length);
2322
2323 if (ret < 0) {
2324 return ret;
2325 }
2326
2327 if (ret == 0) {
2328 rdma->nb_sent++;
2329 trace_qemu_rdma_write_flush(rdma->nb_sent);
2330 }
2331
2332 rdma->current_length = 0;
2333 rdma->current_addr = 0;
2334
2335 return 0;
2336}
2337
2338static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2339 uint64_t offset, uint64_t len)
2340{
2341 RDMALocalBlock *block;
2342 uint8_t *host_addr;
2343 uint8_t *chunk_end;
2344
2345 if (rdma->current_index < 0) {
2346 return 0;
2347 }
2348
2349 if (rdma->current_chunk < 0) {
2350 return 0;
2351 }
2352
2353 block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2354 host_addr = block->local_host_addr + (offset - block->offset);
2355 chunk_end = ram_chunk_end(block, rdma->current_chunk);
2356
2357 if (rdma->current_length == 0) {
2358 return 0;
2359 }
2360
2361
2362
2363
2364 if (offset != (rdma->current_addr + rdma->current_length)) {
2365 return 0;
2366 }
2367
2368 if (offset < block->offset) {
2369 return 0;
2370 }
2371
2372 if ((offset + len) > (block->offset + block->length)) {
2373 return 0;
2374 }
2375
2376 if ((host_addr + len) > chunk_end) {
2377 return 0;
2378 }
2379
2380 return 1;
2381}
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
/*
 * Queue @len bytes at (@block_offset + @offset) for transmission,
 * merging adjacent writes into larger transfers.
 *
 * If the new range cannot be merged with the pending one, the pending
 * transfer is flushed first and a new merge window is started (looking
 * up the owning RAM block and chunk).  Once the merged size reaches
 * RDMA_MERGE_MAX the window is flushed eagerly.
 *
 * Returns 0 on success or a negative error from flush/search.
 */
static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
                           uint64_t block_offset, uint64_t offset,
                           uint64_t len)
{
    uint64_t current_addr = block_offset + offset;
    uint64_t index = rdma->current_index;
    uint64_t chunk = rdma->current_chunk;
    int ret;

    /* Not mergable: flush what we have and open a new window. */
    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
        ret = qemu_rdma_write_flush(f, rdma);
        if (ret) {
            return ret;
        }
        rdma->current_length = 0;
        rdma->current_addr = current_addr;

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, len, &index, &chunk);
        if (ret) {
            error_report("ram block search failed");
            return ret;
        }
        rdma->current_index = index;
        rdma->current_chunk = chunk;
    }

    /* Extend the merge window. */
    rdma->current_length += len;

    /* Flush once the window is large enough to be worth a write. */
    if (rdma->current_length >= RDMA_MERGE_MAX) {
        return qemu_rdma_write_flush(f, rdma);
    }

    return 0;
}
2431
/*
 * Tear down all RDMA state: notify the peer of an early error if
 * applicable, disconnect, deregister control buffers, free RAM-block
 * bookkeeping, and destroy the verbs/CM objects in dependency order
 * (QP before CQs before channels before PD before cm_id).
 *
 * Return-path contexts share the listener and event channel with the
 * primary context, so those are only destroyed by the primary.
 */
static void qemu_rdma_cleanup(RDMAContext *rdma)
{
    int idx;

    if (rdma->cm_id && rdma->connected) {
        /* Tell the peer we failed, unless it already told us first. */
        if ((rdma->error_state ||
             migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
            !rdma->received_error) {
            RDMAControlHeader head = { .len = 0,
                                       .type = RDMA_CONTROL_ERROR,
                                       .repeat = 1,
                                     };
            error_report("Early error. Sending error.");
            qemu_rdma_post_send_control(rdma, NULL, &head);
        }

        rdma_disconnect(rdma->cm_id);
        trace_qemu_rdma_cleanup_disconnect();
        rdma->connected = false;
    }

    if (rdma->channel) {
        /* Stop watching the event channel before tearing things down. */
        qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
    }
    g_free(rdma->dest_blocks);
    rdma->dest_blocks = NULL;

    /* Deregister the per-slot control buffers. */
    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        if (rdma->wr_data[idx].control_mr) {
            rdma->total_registrations--;
            ibv_dereg_mr(rdma->wr_data[idx].control_mr);
        }
        rdma->wr_data[idx].control_mr = NULL;
    }

    /* Release all local RAM-block tracking state. */
    if (rdma->local_ram_blocks.block) {
        while (rdma->local_ram_blocks.nb_blocks) {
            rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
        }
    }

    if (rdma->qp) {
        rdma_destroy_qp(rdma->cm_id);
        rdma->qp = NULL;
    }
    if (rdma->recv_cq) {
        ibv_destroy_cq(rdma->recv_cq);
        rdma->recv_cq = NULL;
    }
    if (rdma->send_cq) {
        ibv_destroy_cq(rdma->send_cq);
        rdma->send_cq = NULL;
    }
    if (rdma->recv_comp_channel) {
        ibv_destroy_comp_channel(rdma->recv_comp_channel);
        rdma->recv_comp_channel = NULL;
    }
    if (rdma->send_comp_channel) {
        ibv_destroy_comp_channel(rdma->send_comp_channel);
        rdma->send_comp_channel = NULL;
    }
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
        rdma->pd = NULL;
    }
    if (rdma->cm_id) {
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
    }

    /* Listener and its channel are owned by the primary context only. */
    if (rdma->listen_id) {
        if (!rdma->is_return_path) {
            rdma_destroy_id(rdma->listen_id);
        }
        rdma->listen_id = NULL;

        if (rdma->channel) {
            if (!rdma->is_return_path) {
                rdma_destroy_event_channel(rdma->channel);
            }
            rdma->channel = NULL;
        }
    }

    if (rdma->channel) {
        rdma_destroy_event_channel(rdma->channel);
        rdma->channel = NULL;
    }
    g_free(rdma->host);
    g_free(rdma->host_port);
    rdma->host = NULL;
    rdma->host_port = NULL;
}
2526
2527
/*
 * Source-side initialisation: resolve the destination address, allocate
 * the protection domain / completion queues / queue pair, enumerate the
 * local RAM blocks (building the offset->block hash map), and register
 * the control buffers.
 *
 * On any failure the partially built state is torn down with
 * qemu_rdma_cleanup() and -1 is returned, with the error propagated to
 * @errp.
 */
static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
{
    int ret, idx;
    Error *local_err = NULL, **temp = &local_err;

    /* Remember the requested pinning mode; may be downgraded later if
     * the destination cannot support pin-all. */
    rdma->pin_all = pin_all;

    ret = qemu_rdma_resolve_host(rdma, temp);
    if (ret) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
                    " limits may be too low. Please check $ ulimit -a # and "
                    "search for 'ulimit -l' in the output");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating qp!");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error initializing ram blocks!");
        goto err_rdma_source_init;
    }

    /* Build offset -> RDMALocalBlock lookup for fast block resolution. */
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
        g_hash_table_insert(rdma->blockmap,
                (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
                &rdma->local_ram_blocks.block[idx]);
    }

    /* Register every control-channel buffer slot. */
    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            ERROR(temp, "rdma migration: error registering %d control!",
                        idx);
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    error_propagate(errp, local_err);
    qemu_rdma_cleanup(rdma);
    return -1;
}
2588
2589static int qemu_get_cm_event_timeout(RDMAContext *rdma,
2590 struct rdma_cm_event **cm_event,
2591 long msec, Error **errp)
2592{
2593 int ret;
2594 struct pollfd poll_fd = {
2595 .fd = rdma->channel->fd,
2596 .events = POLLIN,
2597 .revents = 0
2598 };
2599
2600 do {
2601 ret = poll(&poll_fd, 1, msec);
2602 } while (ret < 0 && errno == EINTR);
2603
2604 if (ret == 0) {
2605 ERROR(errp, "poll cm event timeout");
2606 return -1;
2607 } else if (ret < 0) {
2608 ERROR(errp, "failed to poll cm event, errno=%i", errno);
2609 return -1;
2610 } else if (poll_fd.revents & POLLIN) {
2611 return rdma_get_cm_event(rdma->channel, cm_event);
2612 } else {
2613 ERROR(errp, "no POLLIN event, revent=%x", poll_fd.revents);
2614 return -1;
2615 }
2616}
2617
/*
 * Complete the source-side connection: advertise our capabilities
 * (currently only pin-all) in the private data, rdma_connect() to the
 * destination, wait for ESTABLISHED, and reconcile the capability set
 * with what the destination accepted.
 *
 * A RECV is posted before connecting so the destination's first READY
 * message has a landing buffer.  For the return path a bounded wait is
 * used for the CM event to avoid blocking forever.
 *
 * Returns 0 on success; on failure all state is cleaned up and -1
 * returned with @errp set.
 */
static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
{
    RDMACapabilities cap = {
                                .version = RDMA_CONTROL_VERSION_CURRENT,
                                .flags = 0,
                           };
    struct rdma_conn_param conn_param = { .initiator_depth = 2,
                                          .retry_count = 5,
                                          .private_data = &cap,
                                          .private_data_len = sizeof(cap),
                                        };
    struct rdma_cm_event *cm_event;
    int ret;

    /* Advertise pin-all only if the user asked for it. */
    if (rdma->pin_all) {
        trace_qemu_rdma_connect_pin_all_requested();
        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
    }

    caps_to_network(&cap);

    /* Buffer for the destination's first control message. */
    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        ERROR(errp, "posting second control recv");
        goto err_rdma_source_connect;
    }

    ret = rdma_connect(rdma->cm_id, &conn_param);
    if (ret) {
        perror("rdma_connect");
        ERROR(errp, "connecting to destination!");
        goto err_rdma_source_connect;
    }

    /* Return path must not block indefinitely on the CM event. */
    if (return_path) {
        ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
    } else {
        ret = rdma_get_cm_event(rdma->channel, &cm_event);
    }
    if (ret) {
        perror("rdma_get_cm_event after rdma_connect");
        ERROR(errp, "connecting to destination!");
        goto err_rdma_source_connect;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_source_connect;
    }
    rdma->connected = true;

    /* The destination echoes back the capabilities it accepted. */
    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
    network_to_caps(&cap);

    /* Fall back to dynamic registration if pin-all was refused. */
    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
        ERROR(errp, "Server cannot support pinning all memory. "
                        "Will register memory dynamically.");
        rdma->pin_all = false;
    }

    trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);

    rdma_ack_cm_event(cm_event);

    rdma->control_ready_expected = 1;
    rdma->nb_sent = 0;
    return 0;

err_rdma_source_connect:
    qemu_rdma_cleanup(rdma);
    return -1;
}
2700
/*
 * Destination-side initialisation: create the CM event channel and a
 * listening cm_id, resolve rdma->host:port, and bind to the first
 * address that works (skipping IPv6 addresses on kernels with the
 * known-broken IPv6 RDMA behaviour).
 *
 * On success rdma->listen_id is set and 0 returned.  On failure the
 * channel/id are destroyed, rdma->error_state is set, and the error
 * code returned with @errp populated.
 */
static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
    int ret, idx;
    struct rdma_cm_id *listen_id;
    char ip[40] = "unknown";
    struct rdma_addrinfo *res, *e;
    char port_str[16];
    int reuse = 1;

    /* Start with empty control buffers for every slot. */
    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        rdma->wr_data[idx].control_len = 0;
        rdma->wr_data[idx].control_curr = NULL;
    }

    if (!rdma->host || !rdma->host[0]) {
        ERROR(errp, "RDMA host is not set!");
        rdma->error_state = -EINVAL;
        return -1;
    }

    /* Channel that carries all connection-manager events. */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create rdma event channel");
        rdma->error_state = -EINVAL;
        return -1;
    }

    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create cm_id!");
        goto err_dest_init_create_listen_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_dest_init_bind_addr;
    }

    /* Allow quick rebinding across migration restarts. */
    ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
                          &reuse, sizeof reuse);
    if (ret) {
        ERROR(errp, "Error: could not set REUSEADDR option");
        goto err_dest_init_bind_addr;
    }
    /* Try each resolved address until one binds (and, for IPv6, passes
     * the broken-kernel check). */
    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        trace_qemu_rdma_dest_init_trying(rdma->host, ip);
        ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
        if (ret) {
            continue;
        }
        if (e->ai_family == AF_INET6) {
            ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
            if (ret) {
                continue;
            }
        }
        break;
    }

    rdma_freeaddrinfo(res);
    if (!e) {
        ERROR(errp, "Error: could not rdma_bind_addr!");
        goto err_dest_init_bind_addr;
    }

    rdma->listen_id = listen_id;
    qemu_rdma_dump_gid("dest_init", listen_id);
    return 0;

err_dest_init_bind_addr:
    rdma_destroy_id(listen_id);
err_dest_init_create_listen_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    rdma->error_state = ret;
    return ret;

}
2786
2787static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
2788 RDMAContext *rdma)
2789{
2790 int idx;
2791
2792 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2793 rdma_return_path->wr_data[idx].control_len = 0;
2794 rdma_return_path->wr_data[idx].control_curr = NULL;
2795 }
2796
2797
2798 rdma_return_path->channel = rdma->channel;
2799 rdma_return_path->listen_id = rdma->listen_id;
2800
2801 rdma->return_path = rdma_return_path;
2802 rdma_return_path->return_path = rdma;
2803 rdma_return_path->is_return_path = true;
2804}
2805
2806static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2807{
2808 RDMAContext *rdma = NULL;
2809 InetSocketAddress *addr;
2810
2811 if (host_port) {
2812 rdma = g_new0(RDMAContext, 1);
2813 rdma->current_index = -1;
2814 rdma->current_chunk = -1;
2815
2816 addr = g_new(InetSocketAddress, 1);
2817 if (!inet_parse(addr, host_port, NULL)) {
2818 rdma->port = atoi(addr->port);
2819 rdma->host = g_strdup(addr->host);
2820 rdma->host_port = g_strdup(host_port);
2821 } else {
2822 ERROR(errp, "bad RDMA migration address '%s'", host_port);
2823 g_free(rdma);
2824 rdma = NULL;
2825 }
2826
2827 qapi_free_InetSocketAddress(addr);
2828 }
2829
2830 return rdma;
2831}
2832
2833
2834
2835
2836
2837
2838static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2839 const struct iovec *iov,
2840 size_t niov,
2841 int *fds,
2842 size_t nfds,
2843 Error **errp)
2844{
2845 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2846 QEMUFile *f = rioc->file;
2847 RDMAContext *rdma;
2848 int ret;
2849 ssize_t done = 0;
2850 size_t i;
2851 size_t len = 0;
2852
2853 RCU_READ_LOCK_GUARD();
2854 rdma = qatomic_rcu_read(&rioc->rdmaout);
2855
2856 if (!rdma) {
2857 return -EIO;
2858 }
2859
2860 CHECK_ERROR_STATE();
2861
2862
2863
2864
2865
2866 ret = qemu_rdma_write_flush(f, rdma);
2867 if (ret < 0) {
2868 rdma->error_state = ret;
2869 return ret;
2870 }
2871
2872 for (i = 0; i < niov; i++) {
2873 size_t remaining = iov[i].iov_len;
2874 uint8_t * data = (void *)iov[i].iov_base;
2875 while (remaining) {
2876 RDMAControlHeader head;
2877
2878 len = MIN(remaining, RDMA_SEND_INCREMENT);
2879 remaining -= len;
2880
2881 head.len = len;
2882 head.type = RDMA_CONTROL_QEMU_FILE;
2883
2884 ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2885
2886 if (ret < 0) {
2887 rdma->error_state = ret;
2888 return ret;
2889 }
2890
2891 data += len;
2892 done += len;
2893 }
2894 }
2895
2896 return done;
2897}
2898
2899static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2900 size_t size, int idx)
2901{
2902 size_t len = 0;
2903
2904 if (rdma->wr_data[idx].control_len) {
2905 trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2906
2907 len = MIN(size, rdma->wr_data[idx].control_len);
2908 memcpy(buf, rdma->wr_data[idx].control_curr, len);
2909 rdma->wr_data[idx].control_curr += len;
2910 rdma->wr_data[idx].control_len -= len;
2911 }
2912
2913 return len;
2914}
2915
2916
2917
2918
2919
2920
/*
 * QEMUFile interface to the control channel.
 * Fill the iovec from buffered control data, blocking for the next
 * RDMA_CONTROL_QEMU_FILE message only when nothing has been delivered
 * yet; otherwise return the partial count rather than blocking.
 */
static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
                                      const struct iovec *iov,
                                      size_t niov,
                                      int **fds,
                                      size_t *nfds,
                                      Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    RDMAContext *rdma;
    RDMAControlHeader head;
    int ret = 0;
    ssize_t i;
    size_t done = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    for (i = 0; i < niov; i++) {
        size_t want = iov[i].iov_len;
        uint8_t *data = (void *)iov[i].iov_base;

        /*
         * First drain whatever is left over from the last control
         * message before requesting anything new from the wire.
         */
        ret = qemu_rdma_fill(rdma, data, want, 0);
        done += ret;
        want -= ret;

        /* This iovec is satisfied; move on to the next one. */
        if (want == 0) {
            continue;
        }

        /*
         * Partial progress was already made: return what we have
         * instead of blocking for more data.
         */
        if (done > 0) {
            break;
        }

        /*
         * Nothing delivered yet, so block for the next
         * RDMA_CONTROL_QEMU_FILE message from the peer.
         */
        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);

        if (ret < 0) {
            rdma->error_state = ret;
            return ret;
        }

        /*
         * A new message arrived; try to satisfy the iovec again.
         */
        ret = qemu_rdma_fill(rdma, data, want, 0);
        done += ret;
        want -= ret;

        /* Still short: report EAGAIN-equivalent or the partial count. */
        if (want) {
            if (done == 0) {
                return QIO_CHANNEL_ERR_BLOCK;
            } else {
                break;
            }
        }
    }
    return done;
}
2996
2997
2998
2999
3000static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
3001{
3002 int ret;
3003
3004 if (qemu_rdma_write_flush(f, rdma) < 0) {
3005 return -EIO;
3006 }
3007
3008 while (rdma->nb_sent) {
3009 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
3010 if (ret < 0) {
3011 error_report("rdma migration: complete polling error!");
3012 return -EIO;
3013 }
3014 }
3015
3016 qemu_rdma_unregister_waiting(rdma);
3017
3018 return 0;
3019}
3020
3021
3022static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
3023 bool blocking,
3024 Error **errp)
3025{
3026 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3027
3028 rioc->blocking = blocking;
3029 return 0;
3030}
3031
3032
/* GSource wrapper so a QIOChannelRDMA can be watched from a glib loop. */
typedef struct QIOChannelRDMASource QIOChannelRDMASource;
struct QIOChannelRDMASource {
    GSource parent;          /* must be first: glib embeds the GSource */
    QIOChannelRDMA *rioc;    /* watched channel; a reference is held */
    GIOCondition condition;  /* condition mask requested by the creator */
};
3039
3040static gboolean
3041qio_channel_rdma_source_prepare(GSource *source,
3042 gint *timeout)
3043{
3044 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3045 RDMAContext *rdma;
3046 GIOCondition cond = 0;
3047 *timeout = -1;
3048
3049 RCU_READ_LOCK_GUARD();
3050 if (rsource->condition == G_IO_IN) {
3051 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3052 } else {
3053 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3054 }
3055
3056 if (!rdma) {
3057 error_report("RDMAContext is NULL when prepare Gsource");
3058 return FALSE;
3059 }
3060
3061 if (rdma->wr_data[0].control_len) {
3062 cond |= G_IO_IN;
3063 }
3064 cond |= G_IO_OUT;
3065
3066 return cond & rsource->condition;
3067}
3068
3069static gboolean
3070qio_channel_rdma_source_check(GSource *source)
3071{
3072 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3073 RDMAContext *rdma;
3074 GIOCondition cond = 0;
3075
3076 RCU_READ_LOCK_GUARD();
3077 if (rsource->condition == G_IO_IN) {
3078 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3079 } else {
3080 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3081 }
3082
3083 if (!rdma) {
3084 error_report("RDMAContext is NULL when check Gsource");
3085 return FALSE;
3086 }
3087
3088 if (rdma->wr_data[0].control_len) {
3089 cond |= G_IO_IN;
3090 }
3091 cond |= G_IO_OUT;
3092
3093 return cond & rsource->condition;
3094}
3095
3096static gboolean
3097qio_channel_rdma_source_dispatch(GSource *source,
3098 GSourceFunc callback,
3099 gpointer user_data)
3100{
3101 QIOChannelFunc func = (QIOChannelFunc)callback;
3102 QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3103 RDMAContext *rdma;
3104 GIOCondition cond = 0;
3105
3106 RCU_READ_LOCK_GUARD();
3107 if (rsource->condition == G_IO_IN) {
3108 rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3109 } else {
3110 rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3111 }
3112
3113 if (!rdma) {
3114 error_report("RDMAContext is NULL when dispatch Gsource");
3115 return FALSE;
3116 }
3117
3118 if (rdma->wr_data[0].control_len) {
3119 cond |= G_IO_IN;
3120 }
3121 cond |= G_IO_OUT;
3122
3123 return (*func)(QIO_CHANNEL(rsource->rioc),
3124 (cond & rsource->condition),
3125 user_data);
3126}
3127
3128static void
3129qio_channel_rdma_source_finalize(GSource *source)
3130{
3131 QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
3132
3133 object_unref(OBJECT(ssource->rioc));
3134}
3135
3136GSourceFuncs qio_channel_rdma_source_funcs = {
3137 qio_channel_rdma_source_prepare,
3138 qio_channel_rdma_source_check,
3139 qio_channel_rdma_source_dispatch,
3140 qio_channel_rdma_source_finalize
3141};
3142
3143static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
3144 GIOCondition condition)
3145{
3146 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3147 QIOChannelRDMASource *ssource;
3148 GSource *source;
3149
3150 source = g_source_new(&qio_channel_rdma_source_funcs,
3151 sizeof(QIOChannelRDMASource));
3152 ssource = (QIOChannelRDMASource *)source;
3153
3154 ssource->rioc = rioc;
3155 object_ref(OBJECT(rioc));
3156
3157 ssource->condition = condition;
3158
3159 return source;
3160}
3161
3162static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
3163 AioContext *ctx,
3164 IOHandler *io_read,
3165 IOHandler *io_write,
3166 void *opaque)
3167{
3168 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3169 if (io_read) {
3170 aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
3171 false, io_read, io_write, NULL, NULL, opaque);
3172 aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd,
3173 false, io_read, io_write, NULL, NULL, opaque);
3174 } else {
3175 aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd,
3176 false, io_read, io_write, NULL, NULL, opaque);
3177 aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd,
3178 false, io_read, io_write, NULL, NULL, opaque);
3179 }
3180}
3181
/* Deferred-cleanup bundle: contexts are freed after an RCU grace period. */
struct rdma_close_rcu {
    struct rcu_head rcu;    /* linkage for call_rcu() */
    RDMAContext *rdmain;    /* inbound context to destroy (may be NULL) */
    RDMAContext *rdmaout;   /* outbound context to destroy (may be NULL) */
};
3187
3188
3189static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3190{
3191 if (rcu->rdmain) {
3192 qemu_rdma_cleanup(rcu->rdmain);
3193 }
3194
3195 if (rcu->rdmaout) {
3196 qemu_rdma_cleanup(rcu->rdmaout);
3197 }
3198
3199 g_free(rcu->rdmain);
3200 g_free(rcu->rdmaout);
3201 g_free(rcu);
3202}
3203
3204static int qio_channel_rdma_close(QIOChannel *ioc,
3205 Error **errp)
3206{
3207 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3208 RDMAContext *rdmain, *rdmaout;
3209 struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3210
3211 trace_qemu_rdma_close();
3212
3213 rdmain = rioc->rdmain;
3214 if (rdmain) {
3215 qatomic_rcu_set(&rioc->rdmain, NULL);
3216 }
3217
3218 rdmaout = rioc->rdmaout;
3219 if (rdmaout) {
3220 qatomic_rcu_set(&rioc->rdmaout, NULL);
3221 }
3222
3223 rcu->rdmain = rdmain;
3224 rcu->rdmaout = rdmaout;
3225 call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
3226
3227 return 0;
3228}
3229
3230static int
3231qio_channel_rdma_shutdown(QIOChannel *ioc,
3232 QIOChannelShutdown how,
3233 Error **errp)
3234{
3235 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3236 RDMAContext *rdmain, *rdmaout;
3237
3238 RCU_READ_LOCK_GUARD();
3239
3240 rdmain = qatomic_rcu_read(&rioc->rdmain);
3241 rdmaout = qatomic_rcu_read(&rioc->rdmain);
3242
3243 switch (how) {
3244 case QIO_CHANNEL_SHUTDOWN_READ:
3245 if (rdmain) {
3246 rdmain->error_state = -1;
3247 }
3248 break;
3249 case QIO_CHANNEL_SHUTDOWN_WRITE:
3250 if (rdmaout) {
3251 rdmaout->error_state = -1;
3252 }
3253 break;
3254 case QIO_CHANNEL_SHUTDOWN_BOTH:
3255 default:
3256 if (rdmain) {
3257 rdmain->error_state = -1;
3258 }
3259 if (rdmaout) {
3260 rdmaout->error_state = -1;
3261 }
3262 break;
3263 }
3264
3265 return 0;
3266}
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
3303 ram_addr_t block_offset, ram_addr_t offset,
3304 size_t size, uint64_t *bytes_sent)
3305{
3306 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3307 RDMAContext *rdma;
3308 int ret;
3309
3310 RCU_READ_LOCK_GUARD();
3311 rdma = qatomic_rcu_read(&rioc->rdmaout);
3312
3313 if (!rdma) {
3314 return -EIO;
3315 }
3316
3317 CHECK_ERROR_STATE();
3318
3319 if (migration_in_postcopy()) {
3320 return RAM_SAVE_CONTROL_NOT_SUPP;
3321 }
3322
3323 qemu_fflush(f);
3324
3325 if (size > 0) {
3326
3327
3328
3329
3330
3331 ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
3332 if (ret < 0) {
3333 error_report("rdma migration: write error! %d", ret);
3334 goto err;
3335 }
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345 if (bytes_sent) {
3346 *bytes_sent = 1;
3347 }
3348 } else {
3349 uint64_t index, chunk;
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362 ret = qemu_rdma_search_ram_block(rdma, block_offset,
3363 offset, size, &index, &chunk);
3364
3365 if (ret) {
3366 error_report("ram block search failed");
3367 goto err;
3368 }
3369
3370 qemu_rdma_signal_unregister(rdma, index, chunk, 0);
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380 }
3381
3382
3383
3384
3385
3386
3387
3388
3389 while (1) {
3390 uint64_t wr_id, wr_id_in;
3391 int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
3392 if (ret < 0) {
3393 error_report("rdma migration: polling error! %d", ret);
3394 goto err;
3395 }
3396
3397 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3398
3399 if (wr_id == RDMA_WRID_NONE) {
3400 break;
3401 }
3402 }
3403
3404 while (1) {
3405 uint64_t wr_id, wr_id_in;
3406 int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
3407 if (ret < 0) {
3408 error_report("rdma migration: polling error! %d", ret);
3409 goto err;
3410 }
3411
3412 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3413
3414 if (wr_id == RDMA_WRID_NONE) {
3415 break;
3416 }
3417 }
3418
3419 return RAM_SAVE_CONTROL_DELAYED;
3420err:
3421 rdma->error_state = ret;
3422 return ret;
3423}
3424
3425static void rdma_accept_incoming_migration(void *opaque);
3426
3427static void rdma_cm_poll_handler(void *opaque)
3428{
3429 RDMAContext *rdma = opaque;
3430 int ret;
3431 struct rdma_cm_event *cm_event;
3432 MigrationIncomingState *mis = migration_incoming_get_current();
3433
3434 ret = rdma_get_cm_event(rdma->channel, &cm_event);
3435 if (ret) {
3436 error_report("get_cm_event failed %d", errno);
3437 return;
3438 }
3439
3440 if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
3441 cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3442 if (!rdma->error_state &&
3443 migration_incoming_get_current()->state !=
3444 MIGRATION_STATUS_COMPLETED) {
3445 error_report("receive cm event, cm event is %d", cm_event->event);
3446 rdma->error_state = -EPIPE;
3447 if (rdma->return_path) {
3448 rdma->return_path->error_state = -EPIPE;
3449 }
3450 }
3451 rdma_ack_cm_event(cm_event);
3452
3453 if (mis->migration_incoming_co) {
3454 qemu_coroutine_enter(mis->migration_incoming_co);
3455 }
3456 return;
3457 }
3458 rdma_ack_cm_event(cm_event);
3459}
3460
/*
 * Destination-side accept: wait for the source's CONNECT_REQUEST,
 * negotiate the capability flags carried in the CM private data,
 * set up verbs/PD/CQ/QP and the control buffers, then complete the
 * handshake with rdma_accept() and post the first control receive.
 *
 * Returns 0 on success; on failure returns negative, latches
 * rdma->error_state and tears the context down.
 */
static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
                                            .responder_resources = 2,
                                            .private_data = &cap,
                                            .private_data_len = sizeof(cap),
                                         };
    RDMAContext *rdma_return_path = NULL;
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret = -EINVAL;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * With postcopy enabled, the primary (non-return-path) context also
     * prepares a second context for the return path, sharing this
     * context's event channel and listener.
     */
    if (migrate_postcopy() && !rdma->is_return_path) {
        rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
        if (rdma_return_path == NULL) {
            rdma_ack_cm_event(cm_event);
            goto err_rdma_dest_wait;
        }

        qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        error_report("Unknown source RDMA version: %d, bailing...",
                     cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this destination knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    trace_qemu_rdma_accept_pin_state(rdma->pin_all);

    caps_to_network(&cap);

    trace_qemu_rdma_accept_pin_verbsc(verbs);

    /* Adopt the verbs context from the incoming id, or verify it matches. */
    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        error_report("ibv context not matching %p, %p!", rdma->verbs,
                     verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        error_report("rdma migration: error allocating pd and cq!");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        error_report("rdma migration: error allocating qp!");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        error_report("rdma migration: error initializing ram blocks!");
        goto err_rdma_dest_wait;
    }

    /* Register one control buffer per work-request slot. */
    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            error_report("rdma: error registering %d control", idx);
            goto err_rdma_dest_wait;
        }
    }

    /* Accept the return-path connection next, or poll for CM errors. */
    if (migrate_postcopy() && !rdma->is_return_path) {
        qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                            NULL,
                            (void *)(intptr_t)rdma->return_path);
    } else {
        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
                            NULL, rdma);
    }

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret) {
        error_report("rdma_accept returns %d", ret);
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        error_report("rdma_accept get_cm_event failed %d", ret);
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_accept not event established");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting second control recv");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->error_state = ret;
    qemu_rdma_cleanup(rdma);
    g_free(rdma_return_path);
    return ret;
}
3617
3618static int dest_ram_sort_func(const void *a, const void *b)
3619{
3620 unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3621 unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3622
3623 return (a_index < b_index) ? -1 : (a_index != b_index);
3624}
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
/*
 * Destination-side handler for the dynamic page registration protocol.
 *
 * Loops receiving control messages from the source and servicing them
 * (COMPRESS, RAM_BLOCKS_REQUEST, REGISTER_REQUEST, UNREGISTER_REQUEST)
 * until RDMA_CONTROL_REGISTER_FINISHED arrives or an error occurs.
 * Returns 0 on success or a negative error, which is also latched into
 * rdma->error_state.
 */
static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
    RDMAContext *rdma;
    RDMALocalBlocks *local;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret = 0;
    int idx = 0;
    int count = 0;
    int i = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    local = &rdma->local_ram_blocks;
    do {
        trace_qemu_rdma_registration_handle_wait();

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);

        if (ret < 0) {
            break;
        }

        /* Cap 'repeat' so the static results[] array cannot overflow. */
        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            error_report("rdma: Too many requests in this message (%d)."
                            "Bailing.", head.repeat);
            ret = -EIO;
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            /* Fill a local region with a repeated byte value. */
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            trace_qemu_rdma_registration_handle_compress(comp->length,
                                                         comp->block_idx,
                                                         comp->offset);
            if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
                error_report("rdma: 'compress' bad block index %u (vs %d)",
                             (unsigned int)comp->block_idx,
                             rdma->local_ram_blocks.nb_blocks);
                ret = -EIO;
                goto out;
            }
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);

            ram_handle_compressed(host_addr, comp->value, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            trace_qemu_rdma_registration_handle_finished();
            goto out;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            trace_qemu_rdma_registration_handle_ram_blocks();

            /*
             * Sort the blocks by the source's index so both sides agree
             * on block numbering, then renumber locally.
             */
            qsort(rdma->local_ram_blocks.block,
                  rdma->local_ram_blocks.nb_blocks,
                  sizeof(RDMALocalBlock), dest_ram_sort_func);
            for (i = 0; i < local->nb_blocks; i++) {
                local->block[i].index = i;
            }

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                if (ret) {
                    error_report("rdma migration: error dest "
                                    "registering ram blocks");
                    goto out;
                }
            }

            /*
             * Describe each local block (address, length, and — when
             * pinning everything — its rkey) for the source, converting
             * to network byte order before transmission.
             */
            for (i = 0; i < local->nb_blocks; i++) {
                rdma->dest_blocks[i].remote_host_addr =
                    (uintptr_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->dest_blocks[i].offset = local->block[i].offset;
                rdma->dest_blocks[i].length = local->block[i].length;

                dest_block_to_network(&rdma->dest_blocks[i]);
                trace_qemu_rdma_registration_handle_ram_blocks_loop(
                    local->block[i].block_name,
                    local->block[i].offset,
                    local->block[i].length,
                    local->block[i].local_host_addr,
                    local->block[i].src_index);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMADestBlock);

            /* Ship the block table back to the source. */
            ret = qemu_rdma_post_send_control(rdma,
                                        (uint8_t *) rdma->dest_blocks, &blocks);

            if (ret < 0) {
                error_report("rdma migration: error sending remote info");
                goto out;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            trace_qemu_rdma_registration_handle_register(head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                trace_qemu_rdma_registration_handle_register_loop(count,
                         reg->current_index, reg->key.current_addr, reg->chunks);

                if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
                    error_report("rdma: 'register' bad block index %u (vs %d)",
                                 (unsigned int)reg->current_index,
                                 rdma->local_ram_blocks.nb_blocks);
                    ret = -ENOENT;
                    goto out;
                }
                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    /* RAM blocks are addressed by guest physical address. */
                    if (block->offset > reg->key.current_addr) {
                        error_report("rdma: bad register address for block %s"
                            " offset: %" PRIx64 " current_addr: %" PRIx64,
                            block->block_name, block->offset,
                            reg->key.current_addr);
                        ret = -ERANGE;
                        goto out;
                    }
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    /* Non-RAM blocks are addressed by chunk number. */
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                    /* Check for overflow */
                    if (host_addr < (void *)block->local_host_addr) {
                        error_report("rdma: bad chunk for block %s"
                            " chunk: %" PRIx64,
                            block->block_name, reg->key.chunk);
                        ret = -ERANGE;
                        goto out;
                    }
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                /* avoid "-Waddress-of-packed-member" warning */
                uint32_t tmp_rkey = 0;
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uintptr_t)host_addr, NULL, &tmp_rkey,
                            chunk, chunk_start, chunk_end)) {
                    error_report("cannot get rkey");
                    ret = -EINVAL;
                    goto out;
                }
                reg_result->rkey = tmp_rkey;

                reg_result->host_addr = (uintptr_t)block->local_host_addr;

                trace_qemu_rdma_registration_handle_register_rkey(
                                                           reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp);

            if (ret < 0) {
                error_report("Failed to send control buffer");
                goto out;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            trace_qemu_rdma_registration_handle_unregister(head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                trace_qemu_rdma_registration_handle_unregister_loop(count,
                           reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    perror("rdma unregistration chunk failed");
                    ret = -ret;
                    goto out;
                }

                rdma->total_registrations--;

                trace_qemu_rdma_registration_handle_unregister_success(
                                                       reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);

            if (ret < 0) {
                error_report("Failed to send control buffer");
                goto out;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            error_report("Invalid RESULT message at dest.");
            ret = -EIO;
            goto out;
        default:
            error_report("Unknown control message %s", control_desc(head.type));
            ret = -EIO;
            goto out;
        }
    } while (1);
out:
    if (ret < 0) {
        rdma->error_state = ret;
    }
    return ret;
}
3910
3911
3912
3913
3914
3915
3916
3917
3918static int
3919rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
3920{
3921 RDMAContext *rdma;
3922 int curr;
3923 int found = -1;
3924
3925 RCU_READ_LOCK_GUARD();
3926 rdma = qatomic_rcu_read(&rioc->rdmain);
3927
3928 if (!rdma) {
3929 return -EIO;
3930 }
3931
3932
3933 for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3934 if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3935 found = curr;
3936 break;
3937 }
3938 }
3939
3940 if (found == -1) {
3941 error_report("RAMBlock '%s' not found on destination", name);
3942 return -ENOENT;
3943 }
3944
3945 rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3946 trace_rdma_block_notification_handle(name, rdma->next_src_index);
3947 rdma->next_src_index++;
3948
3949 return 0;
3950}
3951
3952static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
3953{
3954 switch (flags) {
3955 case RAM_CONTROL_BLOCK_REG:
3956 return rdma_block_notification_handle(opaque, data);
3957
3958 case RAM_CONTROL_HOOK:
3959 return qemu_rdma_registration_handle(f, opaque);
3960
3961 default:
3962
3963 abort();
3964 }
3965}
3966
3967static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3968 uint64_t flags, void *data)
3969{
3970 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
3971 RDMAContext *rdma;
3972
3973 RCU_READ_LOCK_GUARD();
3974 rdma = qatomic_rcu_read(&rioc->rdmaout);
3975 if (!rdma) {
3976 return -EIO;
3977 }
3978
3979 CHECK_ERROR_STATE();
3980
3981 if (migration_in_postcopy()) {
3982 return 0;
3983 }
3984
3985 trace_qemu_rdma_registration_start(flags);
3986 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3987 qemu_fflush(f);
3988
3989 return 0;
3990}
3991
3992
3993
3994
3995
/*
 * Source-side hook run after each RAM iteration round.
 *
 * Drains all outstanding RDMA writes; on the first round
 * (RAM_CONTROL_SETUP) it additionally exchanges the RAM block table
 * with the destination and caches the remote addresses/rkeys; finally
 * it tells the destination the registration round is finished.
 */
static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
                                       uint64_t flags, void *data)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
    RDMAContext *rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);
    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    if (migration_in_postcopy()) {
        return 0;
    }

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(f, rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, i, nb_dest_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        trace_qemu_rdma_registration_stop_ram();

        /*
         * Ask the destination for its block table.  When pinning
         * everything, register our own blocks in parallel (the callback
         * runs while waiting for the response) so both sides pin
         * concurrently instead of serially.
         */
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                    &reg_result_idx, rdma->pin_all ?
                    qemu_rdma_reg_whole_ram_blocks : NULL);
        if (ret < 0) {
            fprintf(stderr, "receiving remote info!");
            return ret;
        }

        nb_dest_blocks = resp.len / sizeof(RDMADestBlock);

        /*
         * The protocol requires the RAM block lists on both sides to
         * match exactly (same count, same per-block lengths); a
         * mismatch usually means differing command lines.  Verify the
         * count here and the lengths per block below.
         *
         * NOTE(review): only count and length are checked — names and
         * ordering are assumed to have been aligned by the block
         * notification/sort steps.
         */
        if (local->nb_blocks != nb_dest_blocks) {
            fprintf(stderr, "ram blocks mismatch (Number of blocks %d vs %d) "
                    "Your QEMU command line parameters are probably "
                    "not identical on both the source and destination.",
                    local->nb_blocks, nb_dest_blocks);
            rdma->error_state = -EINVAL;
            return -EINVAL;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->dest_blocks,
            rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (i = 0; i < nb_dest_blocks; i++) {
            network_to_dest_block(&rdma->dest_blocks[i]);

            /* We require that the blocks are identical on both sides */
            if (rdma->dest_blocks[i].length != local->block[i].length) {
                fprintf(stderr, "Block %s/%d has a different length %" PRIu64
                        "vs %" PRIu64, local->block[i].block_name, i,
                        local->block[i].length,
                        rdma->dest_blocks[i].length);
                rdma->error_state = -EINVAL;
                return -EINVAL;
            }
            local->block[i].remote_host_addr =
                    rdma->dest_blocks[i].remote_host_addr;
            local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
        }
    }

    trace_qemu_rdma_registration_stop(flags);

    /* Tell the destination this registration round is over. */
    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);

    if (ret < 0) {
        goto err;
    }

    return 0;
err:
    rdma->error_state = ret;
    return ret;
}
4105
/* QEMUFile hooks installed on the destination (load) side. */
static const QEMUFileHooks rdma_read_hooks = {
    .hook_ram_load = rdma_load_hook,
};

/* QEMUFile hooks installed on the source (save) side. */
static const QEMUFileHooks rdma_write_hooks = {
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate = qemu_rdma_registration_stop,
    .save_page = qemu_rdma_save_page,
};
4115
4116
4117static void qio_channel_rdma_finalize(Object *obj)
4118{
4119 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
4120 if (rioc->rdmain) {
4121 qemu_rdma_cleanup(rioc->rdmain);
4122 g_free(rioc->rdmain);
4123 rioc->rdmain = NULL;
4124 }
4125 if (rioc->rdmaout) {
4126 qemu_rdma_cleanup(rioc->rdmaout);
4127 g_free(rioc->rdmaout);
4128 rioc->rdmaout = NULL;
4129 }
4130}
4131
4132static void qio_channel_rdma_class_init(ObjectClass *klass,
4133 void *class_data G_GNUC_UNUSED)
4134{
4135 QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
4136
4137 ioc_klass->io_writev = qio_channel_rdma_writev;
4138 ioc_klass->io_readv = qio_channel_rdma_readv;
4139 ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
4140 ioc_klass->io_close = qio_channel_rdma_close;
4141 ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
4142 ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
4143 ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
4144}
4145
/* QOM type description for the RDMA-backed QIOChannel. */
static const TypeInfo qio_channel_rdma_info = {
    .parent = TYPE_QIO_CHANNEL,
    .name = TYPE_QIO_CHANNEL_RDMA,
    .instance_size = sizeof(QIOChannelRDMA),
    .instance_finalize = qio_channel_rdma_finalize,
    .class_init = qio_channel_rdma_class_init,
};
4153
/* Register the QIOChannelRDMA type with QOM. */
static void qio_channel_rdma_register_types(void)
{
    type_register_static(&qio_channel_rdma_info);
}

/* Run the registration at module-init time. */
type_init(qio_channel_rdma_register_types);
4160
4161static QEMUFile *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
4162{
4163 QIOChannelRDMA *rioc;
4164
4165 if (qemu_file_mode_is_not_valid(mode)) {
4166 return NULL;
4167 }
4168
4169 rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4170
4171 if (mode[0] == 'w') {
4172 rioc->file = qemu_fopen_channel_output(QIO_CHANNEL(rioc));
4173 rioc->rdmaout = rdma;
4174 rioc->rdmain = rdma->return_path;
4175 qemu_file_set_hooks(rioc->file, &rdma_write_hooks);
4176 } else {
4177 rioc->file = qemu_fopen_channel_input(QIO_CHANNEL(rioc));
4178 rioc->rdmain = rdma;
4179 rioc->rdmaout = rdma->return_path;
4180 qemu_file_set_hooks(rioc->file, &rdma_read_hooks);
4181 }
4182
4183 return rioc->file;
4184}
4185
4186static void rdma_accept_incoming_migration(void *opaque)
4187{
4188 RDMAContext *rdma = opaque;
4189 int ret;
4190 QEMUFile *f;
4191 Error *local_err = NULL;
4192
4193 trace_qemu_rdma_accept_incoming_migration();
4194 ret = qemu_rdma_accept(rdma);
4195
4196 if (ret) {
4197 fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
4198 return;
4199 }
4200
4201 trace_qemu_rdma_accept_incoming_migration_accepted();
4202
4203 if (rdma->is_return_path) {
4204 return;
4205 }
4206
4207 f = qemu_fopen_rdma(rdma, "rb");
4208 if (f == NULL) {
4209 fprintf(stderr, "RDMA ERROR: could not qemu_fopen_rdma\n");
4210 qemu_rdma_cleanup(rdma);
4211 return;
4212 }
4213
4214 rdma->migration_started_on_destination = 1;
4215 migration_fd_process_incoming(f, &local_err);
4216 if (local_err) {
4217 error_reportf_err(local_err, "RDMA ERROR:");
4218 }
4219}
4220
4221void rdma_start_incoming_migration(const char *host_port, Error **errp)
4222{
4223 int ret;
4224 RDMAContext *rdma, *rdma_return_path = NULL;
4225 Error *local_err = NULL;
4226
4227 trace_rdma_start_incoming_migration();
4228
4229
4230 if (ram_block_discard_is_required()) {
4231 error_setg(errp, "RDMA: cannot disable RAM discard");
4232 return;
4233 }
4234
4235 rdma = qemu_rdma_data_init(host_port, &local_err);
4236 if (rdma == NULL) {
4237 goto err;
4238 }
4239
4240 ret = qemu_rdma_dest_init(rdma, &local_err);
4241
4242 if (ret) {
4243 goto err;
4244 }
4245
4246 trace_rdma_start_incoming_migration_after_dest_init();
4247
4248 ret = rdma_listen(rdma->listen_id, 5);
4249
4250 if (ret) {
4251 ERROR(errp, "listening on socket!");
4252 goto cleanup_rdma;
4253 }
4254
4255 trace_rdma_start_incoming_migration_after_rdma_listen();
4256
4257 qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
4258 NULL, (void *)(intptr_t)rdma);
4259 return;
4260
4261cleanup_rdma:
4262 qemu_rdma_cleanup(rdma);
4263err:
4264 error_propagate(errp, local_err);
4265 if (rdma) {
4266 g_free(rdma->host);
4267 g_free(rdma->host_port);
4268 }
4269 g_free(rdma);
4270 g_free(rdma_return_path);
4271}
4272
4273void rdma_start_outgoing_migration(void *opaque,
4274 const char *host_port, Error **errp)
4275{
4276 MigrationState *s = opaque;
4277 RDMAContext *rdma_return_path = NULL;
4278 RDMAContext *rdma;
4279 int ret = 0;
4280
4281
4282 if (ram_block_discard_is_required()) {
4283 error_setg(errp, "RDMA: cannot disable RAM discard");
4284 return;
4285 }
4286
4287 rdma = qemu_rdma_data_init(host_port, errp);
4288 if (rdma == NULL) {
4289 goto err;
4290 }
4291
4292 ret = qemu_rdma_source_init(rdma,
4293 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4294
4295 if (ret) {
4296 goto err;
4297 }
4298
4299 trace_rdma_start_outgoing_migration_after_rdma_source_init();
4300 ret = qemu_rdma_connect(rdma, errp, false);
4301
4302 if (ret) {
4303 goto err;
4304 }
4305
4306
4307 if (migrate_postcopy()) {
4308 rdma_return_path = qemu_rdma_data_init(host_port, errp);
4309
4310 if (rdma_return_path == NULL) {
4311 goto return_path_err;
4312 }
4313
4314 ret = qemu_rdma_source_init(rdma_return_path,
4315 s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL], errp);
4316
4317 if (ret) {
4318 goto return_path_err;
4319 }
4320
4321 ret = qemu_rdma_connect(rdma_return_path, errp, true);
4322
4323 if (ret) {
4324 goto return_path_err;
4325 }
4326
4327 rdma->return_path = rdma_return_path;
4328 rdma_return_path->return_path = rdma;
4329 rdma_return_path->is_return_path = true;
4330 }
4331
4332 trace_rdma_start_outgoing_migration_after_rdma_connect();
4333
4334 s->to_dst_file = qemu_fopen_rdma(rdma, "wb");
4335 migrate_fd_connect(s, NULL);
4336 return;
4337return_path_err:
4338 qemu_rdma_cleanup(rdma);
4339err:
4340 g_free(rdma);
4341 g_free(rdma_return_path);
4342}
4343