/*
 * RDMA protocol and interfaces for QEMU live migration.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "exec/target_page.h"
#include "rdma.h"
#include "migration.h"
#include "migration-stats.h"
#include "qemu-file.h"
#include "ram.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "qemu/rcu.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "qemu/coroutine.h"
#include "exec/memory.h"
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <rdma/rdma_cma.h>
#include "trace.h"
#include "qom/object.h"
#include "options.h"
#include <poll.h>
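/*
 * Report an RDMA error on stderr and, if an Error object was supplied
 * and is still unset, propagate the same message through it.
 */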
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)

#define RDMA_RESOLVE_TIMEOUT_MS 10000

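/*
 * Coalesce adjacent RDMA writes until this many bytes have accumulated,
 * then flush; this also sizes the pool of signalled send work requests.
 */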
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

#define RDMA_REG_CHUNK_SHIFT 20

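/*
 * Increment size used when chunking non-RAM (device state) data through
 * the control channel.
 */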
#define RDMA_SEND_INCREMENT 32768

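/*
 * Maximum size of a control-channel message buffer (and therefore of a
 * single IB SEND).
 */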
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1

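/*
 * Capabilities negotiated between source and destination at
 * connection-setup time.
 */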
#define RDMA_CAPABILITY_PIN_ALL 0x01

static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                error_report("RDMA is in an error state waiting for " \
                             "migration to abort!"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0)
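/*
 * A work request ID is 64 bits wide and split into three fields:
 *
 *   bits  0-15: type of control message (RDMA_WRID_*)
 *   bits 16-29: RAM block index
 *   bits 30-63: chunk number within the RAM block
 *
 * The block and chunk fields are only used for RDMA writes, in order to
 * track their completion and, potentially, their unregistration.
 */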
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
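/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */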
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

static const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};
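/*
 * Work request IDs for IB SEND messages only (not RDMA writes). These
 * also serve as indexes into the per-connection array of control-channel
 * buffers (wr_data[] below).
 */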
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};
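/* SEND/RECV IB control-channel message types. */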
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,
    RDMA_CONTROL_QEMU_FILE,
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,
    RDMA_CONTROL_RAM_BLOCKS_RESULT,
    RDMA_CONTROL_COMPRESS,
    RDMA_CONTROL_REGISTER_REQUEST,
    RDMA_CONTROL_REGISTER_RESULT,
    RDMA_CONTROL_REGISTER_FINISHED,
    RDMA_CONTROL_UNREGISTER_REQUEST,
    RDMA_CONTROL_UNREGISTER_FINISHED,
};
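/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */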
typedef struct {
    uint8_t control[RDMA_CONTROL_MAX_BUFFER];
    struct ibv_mr *control_mr;
    size_t control_len;
    uint8_t *control_curr;
} RDMAWorkRequestData;
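/* Capability negotiation structure exchanged during connection setup. */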
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}
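/*
 * Representation of a RAMBlock from an RDMA perspective. This is kept
 * locally and never transmitted.
 */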
typedef struct RDMALocalBlock {
    char *block_name;
    uint8_t *local_host_addr;
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    struct ibv_mr **pmr;
    struct ibv_mr *mr;
    uint32_t *remote_keys;
    uint32_t remote_rkey;
    int index;
    unsigned int src_index;
    bool is_ram_block;
    int nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;
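/*
 * The destination-side view of a RAM block. Unlike RDMALocalBlock this
 * crosses the wire, hence the packed layout and the byte-order helpers
 * below.
 */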
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMADestBlock;

static const char *control_desc(unsigned int rdma_control)
{
    static const char *strs[] = {
        [RDMA_CONTROL_NONE] = "NONE",
        [RDMA_CONTROL_ERROR] = "ERROR",
        [RDMA_CONTROL_READY] = "READY",
        [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
        [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
        [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
        [RDMA_CONTROL_COMPRESS] = "COMPRESS",
        [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
        [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
        [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
        [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
        [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
    };

    if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
        return "??BAD CONTROL VALUE??";
    }

    return strs[rdma_control];
}

static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}

static void dest_block_to_network(RDMADestBlock *db)
{
    db->remote_host_addr = htonll(db->remote_host_addr);
    db->offset = htonll(db->offset);
    db->length = htonll(db->length);
    db->remote_rkey = htonl(db->remote_rkey);
}

static void network_to_dest_block(RDMADestBlock *db)
{
    db->remote_host_addr = ntohll(db->remote_host_addr);
    db->offset = ntohll(db->offset);
    db->length = ntohll(db->length);
    db->remote_rkey = ntohl(db->remote_rkey);
}
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool init;
    RDMALocalBlock *block;
} RDMALocalBlocks;
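/*
 * Main data structure for RDMA state. Only one copy is allocated per
 * direction at the moment, but this is where one would start in order to
 * support more than one RDMA connection at the same time.
 */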
typedef struct RDMAContext {
    char *host;
    int port;
    char *host_port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

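    /*
     * Used by *_exchange_send() to determine whether the initial "READY"
     * message has already been received from the peer, since another code
     * path may poll() and consume it first.
     */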
    int control_ready_expected;

    int nb_sent;

    uint64_t current_addr;
    uint64_t current_length;

    int current_index;

    int current_chunk;

    bool pin_all;

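    /*
     * InfiniBand-specific state for opening the device and maintaining
     * the connection. Note that cm_id also carries the ibv_context, the
     * event channel and the qp in cm_id->verbs, cm_id->channel and
     * cm_id->qp.
     */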
    struct rdma_cm_id *cm_id;
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;
    struct ibv_comp_channel *recv_comp_channel;
    struct ibv_comp_channel *send_comp_channel;
    struct ibv_pd *pd;
    struct ibv_cq *recv_cq;
    struct ibv_cq *send_cq;

    int error_state;
    int error_reported;
    int received_error;

    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock *dest_blocks;

    unsigned int next_src_index;

    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;

    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;

#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)

struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
    QEMUFile *file;
    bool blocking;
};
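/* Main structure for IB Send/Recv control messages (wire format). */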
typedef struct QEMU_PACKED {
    uint32_t len;
    uint32_t type;
    uint32_t repeat;
    uint32_t padding;
} RDMAControlHeader;

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}
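/*
 * Dynamic page registration request sent to the peer. The key is either a
 * source address (for RAM blocks) or a chunk number (for everything else).
 */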
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;
        uint64_t chunk;
    } key;
    uint32_t current_index;
    uint32_t padding;
    uint64_t chunks;
} RDMARegister;

static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    RDMALocalBlock *local_block;
    local_block = &rdma->local_ram_blocks.block[reg->current_index];

    if (local_block->is_ram_block) {
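        /*
         * current_addr as passed in is an address in the local ram_addr_t
         * space; translate it into the destination's address space.
         */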
        reg->key.current_addr -= local_block->offset;
        reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
    }
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}

typedef struct QEMU_PACKED {
    uint32_t value;
    uint32_t block_idx;
    uint64_t offset;
    uint64_t length;
} RDMACompress;

static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    comp->value = htonl(comp->value);

    comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
    comp->offset += rdma->dest_blocks[comp->block_idx].offset;
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}
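/*
 * The result of the destination's memory registration: an rkey (and the
 * remote host address) that the source needs in order to perform the
 * RDMA write.
 */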
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}

const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));

static inline uint64_t ram_chunk_index(const uint8_t *start,
                                       const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
                                  (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                     uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                      (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}

static int rdma_add_block(RDMAContext *rdma, const char *block_name,
                          void *host_addr,
                          ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block;
    RDMALocalBlock *old = local->block;

    local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);

    if (local->nb_blocks) {
        int x;

        if (rdma->blockmap) {
            for (x = 0; x < local->nb_blocks; x++) {
                g_hash_table_remove(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset);
                g_hash_table_insert(rdma->blockmap,
                                    (void *)(uintptr_t)old[x].offset,
                                    &local->block[x]);
            }
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->block_name = g_strdup(block_name);
    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->src_index = ~0U;
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_new0(uint32_t, block->nb_chunks);

    block->is_ram_block = local->init ? false : true;

    if (rdma->blockmap) {
        g_hash_table_insert(rdma->blockmap,
                            (void *)(uintptr_t)block_offset, block);
    }

    trace_rdma_add_block(block_name, local->nb_blocks,
                         (uintptr_t) block->local_host_addr,
                         block->offset, block->length,
                         (uintptr_t) (block->local_host_addr + block->length),
                         BITS_TO_LONGS(block->nb_chunks) *
                             sizeof(unsigned long) * 8,
                         block->nb_chunks);

    local->nb_blocks++;

    return 0;
}
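/*
 * Memory regions need to be registered with the device and queue pairs
 * set up in advance before the migration starts. This tells us where the
 * RAM blocks are so that we can register them individually.
 */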
static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
{
    const char *block_name = qemu_ram_get_idstr(rb);
    void *host_addr = qemu_ram_get_host_addr(rb);
    ram_addr_t block_offset = qemu_ram_get_offset(rb);
    ram_addr_t length = qemu_ram_get_used_length(rb);
    return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
}
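/*
 * Identify the RAMBlocks and their quantity. They will be referenced by
 * the RDMA functions from this point on.
 */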
static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    int ret;

    assert(rdma->blockmap == NULL);
    memset(local, 0, sizeof *local);
    ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
    if (ret) {
        return ret;
    }
    trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
    rdma->dest_blocks = g_new0(RDMADestBlock,
                               rdma->local_ram_blocks.nb_blocks);
    local->init = true;
    return 0;
}
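/*
 * Tear down an RDMALocalBlock: deregister its memory regions and remove
 * it from the local block array and the offset hash.
 */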
static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;
    int x;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    if (rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {

        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                   sizeof(RDMALocalBlock) *
                       (local->nb_blocks - (block->index + 1)));
            for (x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                            block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                            BITS_TO_LONGS(block->nb_chunks) *
                                sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks && rdma->blockmap) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }

    return 0;
}
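/* Log which RDMA device was opened, along with some of its details. */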
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        error_report("Failed to query port information");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
           who,
           verbs->device->name,
           verbs->device->dev_name,
           verbs->device->dev_path,
           verbs->device->ibdev_path,
           port.link_layer,
           (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
           ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
            ? "Ethernet" : "Unknown"));
}
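/*
 * Trace the source and destination GIDs, useful when debugging RDMA
 * addressing.
 */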
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    trace_qemu_rdma_dump_gid(who, sgid, dgid);
}
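/*
 * As of this writing, IPv6 over RoCE / iWARP is broken in the Linux
 * kernel, so a wildcard '[::]' bind cannot be serviced by RoCE/iWARP
 * devices. If no verbs context was supplied, scan all devices: warn for
 * mixed RoCE/IB environments and fail outright for RoCE-only ones.
 */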
static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
{
#ifdef CONFIG_LINUX
    struct ibv_port_attr port_attr;

    if (!verbs) {
        int num_devices, x;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        for (x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);
            if (!verbs) {
                if (errno == EPERM) {
                    continue;
                } else {
                    return -EINVAL;
                }
            }

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                ERROR(errp, "Could not query initial IB port");
                return -EINVAL;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);
        }

        if (roce_found) {
            if (ib_found) {
                fprintf(stderr, "WARN: migrations may fail:"
                                " IPv6 over RoCE / iWARP in linux"
                                " is broken. But since you appear to have a"
                                " mixed RoCE / IB environment, be sure to only"
                                " migrate over the IB fabric until the kernel "
                                " fixes the bug.\n");
            } else {
                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
                            " and your management software has specified '[::]'"
                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -ENONET;
            }
        }

        return 0;
    }
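    /*
     * A verbs context was supplied, meaning the address already resolved
     * to a specific device; check that device's link layer directly.
     */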
    if (ibv_query_port(verbs, 1, &port_attr)) {
        ERROR(errp, "Could not query initial IB port");
        return -EINVAL;
    }

    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
                    "(but patches on linux-rdma in progress)");
        return -ENONET;
    }

#endif

    return 0;
}
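/*
 * Figure out which RDMA device corresponds with the requested IP hostname.
 * Also create the initial connection manager id and event channel.
 */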
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
                  &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr,
                  ip, sizeof ip);
        trace_qemu_rdma_resolve_host_trying(rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
                if (ret) {
                    continue;
                }
            }
            goto route;
        }
    }

    rdma_freeaddrinfo(res);
    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    rdma_freeaddrinfo(res);
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
              rdma_event_str(cm_event->event));
        error_report("rdma_resolve_addr");
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
              rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}
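/*
 * Create the protection domain, completion channels and completion
 * queues.
 */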
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        error_report("failed to allocate protection domain");
        return -1;
    }

    rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->recv_comp_channel) {
        error_report("failed to allocate receive completion channel");
        goto err_alloc_pd_cq;
    }

    rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->recv_comp_channel, 0);
    if (!rdma->recv_cq) {
        error_report("failed to allocate receive completion queue");
        goto err_alloc_pd_cq;
    }

    rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->send_comp_channel) {
        error_report("failed to allocate send completion channel");
        goto err_alloc_pd_cq;
    }

    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
                                  NULL, rdma->send_comp_channel, 0);
    if (!rdma->send_cq) {
        error_report("failed to allocate send completion queue");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->recv_comp_channel) {
        ibv_destroy_comp_channel(rdma->recv_comp_channel);
    }
    if (rdma->send_comp_channel) {
        ibv_destroy_comp_channel(rdma->send_comp_channel);
    }
    if (rdma->recv_cq) {
        ibv_destroy_cq(rdma->recv_cq);
        rdma->recv_cq = NULL;
    }
    rdma->pd = NULL;
    rdma->recv_comp_channel = NULL;
    rdma->send_comp_channel = NULL;
    return -1;
}
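/* Create the queue pair. */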
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };
    int ret;

    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
    attr.cap.max_recv_wr = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = rdma->send_cq;
    attr.recv_cq = rdma->recv_cq;
    attr.qp_type = IBV_QPT_RC;

    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
    if (ret) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}

static bool rdma_support_odp(struct ibv_context *dev)
{
    struct ibv_device_attr_ex attr = {0};
    int ret = ibv_query_device_ex(dev, NULL, &attr);
    if (ret) {
        return false;
    }

    if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
        return true;
    }

    return false;
}
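/*
 * Use ibv_advise_mr() to prefetch the pages of an on-demand-paging memory
 * region, avoiding long page faults on first access.
 */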
static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
                                         uint32_t len, uint32_t lkey,
                                         const char *name, bool wr)
{
#ifdef HAVE_IBV_ADVISE_MR
    int ret;
    int advice = wr ? IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE :
                 IBV_ADVISE_MR_ADVICE_PREFETCH;
    struct ibv_sge sg_list = {.lkey = lkey, .addr = addr, .length = len};

    ret = ibv_advise_mr(pd, advice,
                        IBV_ADVISE_MR_FLAG_FLUSH, &sg_list, 1);

    if (ret) {
        trace_qemu_rdma_advise_mr(name, len, addr, strerror(errno));
    } else {
        trace_qemu_rdma_advise_mr(name, len, addr, "succeeded");
    }
#endif
}

static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;

        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                       local->block[i].local_host_addr,
                       local->block[i].length, access
                      );

        if (!local->block[i].mr &&
            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
            access |= IBV_ACCESS_ON_DEMAND;

            local->block[i].mr =
                ibv_reg_mr(rdma->pd,
                           local->block[i].local_host_addr,
                           local->block[i].length, access);
            trace_qemu_rdma_register_odp_mr(local->block[i].block_name);

            if (local->block[i].mr) {
                qemu_rdma_advise_prefetch_mr(rdma->pd,
                                (uintptr_t)local->block[i].local_host_addr,
                                local->block[i].length,
                                local->block[i].mr->lkey,
                                local->block[i].block_name,
                                true);
            }
        }

        if (!local->block[i].mr) {
            perror("Failed to register local dest ram block!");
            break;
        }
        rdma->total_registrations++;
    }

    if (i >= local->nb_blocks) {
        return 0;
    }

    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        local->block[i].mr = NULL;
        rdma->total_registrations--;
    }

    return -1;
}
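/*
 * Find the RAM block that corresponds to the page requested to be
 * transmitted by QEMU. Once found, also identify which 'chunk' within
 * that block the page belongs to.
 */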
static int qemu_rdma_search_ram_block(RDMAContext *rdma,
                                      uintptr_t block_offset,
                                      uint64_t offset,
                                      uint64_t length,
                                      uint64_t *block_index,
                                      uint64_t *chunk_index)
{
    uint64_t current_addr = block_offset + offset;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
                                                (void *) block_offset);
    assert(block);
    assert(current_addr >= block->offset);
    assert((current_addr + length) <= (block->offset + block->length));

    *block_index = block->index;
    *chunk_index = ram_chunk_index(block->local_host_addr,
                block->local_host_addr + (current_addr - block->offset));

    return 0;
}
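/*
 * Register a chunk with IB. If the chunk was already registered
 * previously, then skip. Also return the keys associated with the
 * registration needed to perform the actual RDMA operation.
 */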
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
                                           RDMALocalBlock *block,
                                           uintptr_t host_addr,
                                           uint32_t *lkey, uint32_t *rkey,
                                           int chunk,
                                           uint8_t *chunk_start,
                                           uint8_t *chunk_end)
{
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    if (!block->pmr) {
        block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
    }

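    /*
     * If 'rkey' is requested, we're the destination, so grant remote write
     * access; if only 'lkey' is requested, we're the source, so local
     * access is enough.
     */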
    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;
        int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
                     0;

        trace_qemu_rdma_register_and_get_keys(len, chunk_start);

        block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
        if (!block->pmr[chunk] &&
            errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
            access |= IBV_ACCESS_ON_DEMAND;

            block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
            trace_qemu_rdma_register_odp_mr(block->block_name);

            if (block->pmr[chunk]) {
                qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
                                             len, block->pmr[chunk]->lkey,
                                             block->block_name, rkey);
            }
        }
    }
    if (!block->pmr[chunk]) {
        perror("Failed to register chunk!");
        fprintf(stderr, "Chunk details: block: %d chunk index %d"
                        " start %" PRIuPTR " end %" PRIuPTR
                        " host %" PRIuPTR
                        " local %" PRIuPTR " registrations: %d\n",
                block->index, chunk, (uintptr_t)chunk_start,
                (uintptr_t)chunk_end, host_addr,
                (uintptr_t)block->local_host_addr,
                rdma->total_registrations);
        return -1;
    }
    rdma->total_registrations++;

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}
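/*
 * Register (at connection time) the memory used for control channel
 * messages.
 */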
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
{
    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (rdma->wr_data[idx].control_mr) {
        rdma->total_registrations++;
        return 0;
    }
    error_report("qemu_rdma_reg_control failed");
    return -1;
}

const char *print_wrid(int wrid)
{
    if (wrid >= RDMA_WRID_RECV_CONTROL) {
        return wrid_desc[RDMA_WRID_RECV_CONTROL];
    }
    return wrid_desc[wrid];
}
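/*
 * Perform a non-optimized memory unregistration after every transfer,
 * only used when pin-all is not requested. Possible future optimizations
 * include a dedicated unregistration thread, an LRU, or workload hints.
 */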
static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
{
    while (rdma->unregistrations[rdma->unregister_current]) {
        int ret;
        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
        uint64_t chunk =
            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block =
            &(rdma->local_ram_blocks.block[index]);
        RDMARegister reg = { .current_index = index };
        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                 };
        RDMAControlHeader head = { .len = sizeof(RDMARegister),
                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
                                   .repeat = 1,
                                 };

        trace_qemu_rdma_unregister_waiting_proc(chunk,
                                                rdma->unregister_current);

        rdma->unregistrations[rdma->unregister_current] = 0;
        rdma->unregister_current++;

        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
            rdma->unregister_current = 0;
        }

        clear_bit(chunk, block->unregister_bitmap);

        if (test_bit(chunk, block->transit_bitmap)) {
            trace_qemu_rdma_unregister_waiting_inflight(chunk);
            continue;
        }

        trace_qemu_rdma_unregister_waiting_send(chunk);

        ret = ibv_dereg_mr(block->pmr[chunk]);
        block->pmr[chunk] = NULL;
        block->remote_keys[chunk] = 0;

        if (ret != 0) {
            perror("unregistration chunk failed");
            return -ret;
        }
        rdma->total_registrations--;

        reg.key.chunk = chunk;
        register_to_network(rdma, &reg);
        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                      &resp, NULL, NULL);
        if (ret < 0) {
            return ret;
        }

        trace_qemu_rdma_unregister_waiting_complete(chunk);
    }

    return 0;
}

static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
                                    uint64_t chunk)
{
    uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;

    result |= (index << RDMA_WRID_BLOCK_SHIFT);
    result |= (chunk << RDMA_WRID_CHUNK_SHIFT);

    return result;
}
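/*
 * Poll the given completion queue for a single completed work request,
 * decode its work request id, and update bookkeeping for completed
 * RDMA writes.
 */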
static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
                               uint64_t *wr_id_out, uint32_t *byte_len)
{
    int ret;
    struct ibv_wc wc;
    uint64_t wr_id;

    ret = ibv_poll_cq(cq, 1, &wc);

    if (!ret) {
        *wr_id_out = RDMA_WRID_NONE;
        return 0;
    }

    if (ret < 0) {
        error_report("ibv_poll_cq return %d", ret);
        return ret;
    }

    wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;

    if (wc.status != IBV_WC_SUCCESS) {
        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
                wc.status, ibv_wc_status_str(wc.status));
        fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);

        return -1;
    }

    if (rdma->control_ready_expected &&
        (wr_id >= RDMA_WRID_RECV_CONTROL)) {
        trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
                                  wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
                                  rdma->nb_sent);
        rdma->control_ready_expected = 0;
    }

    if (wr_id == RDMA_WRID_RDMA_WRITE) {
        uint64_t chunk =
            (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
        uint64_t index =
            (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);

        trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
                                   index, chunk, block->local_host_addr,
                                   (void *)(uintptr_t)block->remote_host_addr);

        clear_bit(chunk, block->transit_bitmap);

        if (rdma->nb_sent > 0) {
            rdma->nb_sent--;
        }
    } else {
        trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
    }

    *wr_id_out = wc.wr_id;
    if (byte_len) {
        *byte_len = wc.byte_len;
    }

    return 0;
}
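/*
 * Wait for a completion channel to become readable, either by yielding
 * (destination side, running in a coroutine) or by polling it alongside
 * the connection manager fd so disconnects are noticed too.
 */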
static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
                                       struct ibv_comp_channel *comp_channel)
{
    struct rdma_cm_event *cm_event;
    int ret = -1;

    if (rdma->migration_started_on_destination &&
        migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
        yield_until_fd_readable(comp_channel->fd);
    } else {
        while (!rdma->error_state && !rdma->received_error) {
            GPollFD pfds[2];
            pfds[0].fd = comp_channel->fd;
            pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[0].revents = 0;

            pfds[1].fd = rdma->channel->fd;
            pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[1].revents = 0;

            switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
            case 2:
            case 1:
                if (pfds[0].revents) {
                    return 0;
                }

                if (pfds[1].revents) {
                    ret = rdma_get_cm_event(rdma->channel, &cm_event);
                    if (ret) {
                        error_report("failed to get cm event while wait "
                                     "completion channel");
                        return -EPIPE;
                    }

                    error_report("receive cm event while wait comp channel,"
                                 "cm event is %d", cm_event->event);
                    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
                        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
                        rdma_ack_cm_event(cm_event);
                        return -EPIPE;
                    }
                    rdma_ack_cm_event(cm_event);
                }
                break;

            case 0:
                break;

            default:
                error_report("%s: poll failed", __func__);
                return -EPIPE;
            }

            if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
                return -EPIPE;
            }
        }
    }

    if (rdma->received_error) {
        return -EPIPE;
    }
    return rdma->error_state;
}

static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
{
    return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
           rdma->recv_comp_channel;
}

static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
{
    return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
}
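/*
 * Block until the next work request has completed.
 *
 * First poll to see if a work request has already completed,
 * otherwise block.
 *
 * If we encounter completed work requests for IDs other than
 * the one we're interested in, then that's generally an error.
 *
 * The only exception is actual RDMA Write completions. These
 * completions only need to be recorded, but do not actually
 * need further processing.
 */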
static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
                                    uint32_t *byte_len)
{
    int num_cq_events = 0, ret = 0;
    struct ibv_cq *cq;
    void *cq_ctx;
    uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
    struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
    struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);

    if (ibv_req_notify_cq(poll_cq, 0)) {
        return -1;
    }

    while (wr_id != wrid_requested) {
        ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
        if (ret < 0) {
            return ret;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
        if (wr_id != wrid_requested) {
            trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
                                                wrid_requested,
                                                print_wrid(wr_id), wr_id);
        }
    }

    if (wr_id == wrid_requested) {
        return 0;
    }

    while (1) {
        ret = qemu_rdma_wait_comp_channel(rdma, ch);
        if (ret) {
            goto err_block_for_wrid;
        }

        ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
        if (ret) {
            perror("ibv_get_cq_event");
            goto err_block_for_wrid;
        }

        num_cq_events++;

        ret = -ibv_req_notify_cq(cq, 0);
        if (ret) {
            goto err_block_for_wrid;
        }

        while (wr_id != wrid_requested) {
            ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
            if (ret < 0) {
                goto err_block_for_wrid;
            }

            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

            if (wr_id == RDMA_WRID_NONE) {
                break;
            }
            if (wr_id != wrid_requested) {
                trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
                                                    wrid_requested,
                                                    print_wrid(wr_id), wr_id);
            }
        }

        if (wr_id == wrid_requested) {
            goto success_block_for_wrid;
        }
    }

success_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }
    return 0;

err_block_for_wrid:
    if (num_cq_events) {
        ibv_ack_cq_events(cq, num_cq_events);
    }

    rdma->error_state = ret;
    return ret;
}
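/*
 * Post a SEND work request on the control channel and block until it
 * completes.
 */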
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
                                       RDMAControlHeader *head)
{
    int ret = 0;
    RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
    struct ibv_send_wr *bad_wr;
    struct ibv_sge sge = {
        .addr = (uintptr_t)(wr->control),
        .length = head->len + sizeof(RDMAControlHeader),
        .lkey = wr->control_mr->lkey,
    };
    struct ibv_send_wr send_wr = {
        .wr_id = RDMA_WRID_SEND_CONTROL,
        .opcode = IBV_WR_SEND,
        .send_flags = IBV_SEND_SIGNALED,
        .sg_list = &sge,
        .num_sge = 1,
    };

    trace_qemu_rdma_post_send_control(control_desc(head->type));

    assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
    memcpy(wr->control, head, sizeof(RDMAControlHeader));
    control_to_network((void *) wr->control);

    if (buf) {
        memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
    }

    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret > 0) {
        error_report("Failed to use post IB SEND for control");
        return -ret;
    }

    ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
    if (ret < 0) {
        error_report("rdma migration: send polling control error");
    }

    return ret;
}
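/*
 * Post a RECV work request in anticipation of a control message from
 * the peer.
 */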
static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
{
    struct ibv_recv_wr *bad_wr;
    struct ibv_sge sge = {
        .addr = (uintptr_t)(rdma->wr_data[idx].control),
        .length = RDMA_CONTROL_MAX_BUFFER,
        .lkey = rdma->wr_data[idx].control_mr->lkey,
    };

    struct ibv_recv_wr recv_wr = {
        .wr_id = RDMA_WRID_RECV_CONTROL + idx,
        .sg_list = &sge,
        .num_sge = 1,
    };

    if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
        return -1;
    }

    return 0;
}
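/*
 * Block until a control message of the expected type arrives, then
 * validate its length against the completed byte count.
 */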
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
                RDMAControlHeader *head, int expecting, int idx)
{
    uint32_t byte_len;
    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
                                       &byte_len);

    if (ret < 0) {
        error_report("rdma migration: recv polling control error!");
        return ret;
    }

    network_to_control((void *) rdma->wr_data[idx].control);
    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));

    trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));

    if (expecting == RDMA_CONTROL_NONE) {
        trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
                                                   head->type);
    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
        error_report("Was expecting a %s (%d) control message"
                     ", but got: %s (%d), length: %d",
                     control_desc(expecting), expecting,
                     control_desc(head->type), head->type, head->len);
        if (head->type == RDMA_CONTROL_ERROR) {
            rdma->received_error = true;
        }
        return -EIO;
    }
    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
        error_report("too long length: %d", head->len);
        return -EINVAL;
    }
    if (sizeof(*head) + head->len != byte_len) {
        error_report("Malformed length: %d byte_len %d", head->len, byte_len);
        return -EINVAL;
    }

    return 0;
}
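/*
 * When a RECV work request has completed, the buffer holds the header
 * followed by the data; advance control_curr past the header to the
 * data portion of the message.
 */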
static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
                                  RDMAControlHeader *head)
{
    rdma->wr_data[idx].control_len = head->len;
    rdma->wr_data[idx].control_curr =
        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
}
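/*
 * This is an 'atomic' high-level operation to deliver a single, unified
 * control-channel message.
 *
 * Additionally, if the user is expecting some kind of reply to this
 * message, they can request a 'resp' response message be filled in by
 * posting an additional work request on behalf of the user and waiting
 * for an additional completion.
 *
 * The extra (optional) response is used during registration to
 * piggy-back the result on the acknowledgement instead of requiring an
 * additional message exchange.
 */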
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma))
{
    int ret = 0;

    if (rdma->control_ready_expected) {
        RDMAControlHeader resp;
        ret = qemu_rdma_exchange_get_response(rdma,
                                    &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
        if (ret < 0) {
            return ret;
        }
    }

    if (resp) {
        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
        if (ret) {
            error_report("rdma migration: error posting"
                         " extra control recv for anticipated result!");
            return ret;
        }
    }

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting first control recv!");
        return ret;
    }

    ret = qemu_rdma_post_send_control(rdma, data, head);

    if (ret < 0) {
        error_report("Failed to send control buffer!");
        return ret;
    }

    if (resp) {
        if (callback) {
            trace_qemu_rdma_exchange_send_issue_callback();
            ret = callback(rdma);
            if (ret < 0) {
                return ret;
            }
        }

        trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
        ret = qemu_rdma_exchange_get_response(rdma, resp,
                                              resp->type, RDMA_WRID_DATA);

        if (ret < 0) {
            return ret;
        }

        qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
        if (resp_idx) {
            *resp_idx = RDMA_WRID_DATA;
        }
        trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
    }

    rdma->control_ready_expected = 1;

    return 0;
}
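/*
 * This is an 'atomic' high-level operation to receive a single, unified
 * control-channel message: signal READY to the peer, wait for the
 * message, then re-post a RECV for the next one.
 */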
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
                                   int expecting)
{
    RDMAControlHeader ready = {
        .len = 0,
        .type = RDMA_CONTROL_READY,
        .repeat = 1,
    };
    int ret;

    ret = qemu_rdma_post_send_control(rdma, NULL, &ready);

    if (ret < 0) {
        error_report("Failed to send control buffer!");
        return ret;
    }

    ret = qemu_rdma_exchange_get_response(rdma, head,
                                          expecting, RDMA_WRID_READY);

    if (ret < 0) {
        return ret;
    }

    qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting second control recv!");
        return ret;
    }

    return 0;
}
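/*
 * Write an actual chunk of memory using RDMA.
 *
 * If we're using dynamic registration on the destination side, we have
 * to send a registration command first.
 */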
static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
                               int current_index, uint64_t current_addr,
                               uint64_t length)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;
    int reg_result_idx, ret, count = 0;
    uint64_t chunk, chunks;
    uint8_t *chunk_start, *chunk_end;
    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
    RDMARegister reg;
    RDMARegisterResult *reg_result;
    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
    RDMAControlHeader head = { .len = sizeof(RDMARegister),
                               .type = RDMA_CONTROL_REGISTER_REQUEST,
                               .repeat = 1,
                             };

retry:
    sge.addr = (uintptr_t)(block->local_host_addr +
                           (current_addr - block->offset));
    sge.length = length;

    chunk = ram_chunk_index(block->local_host_addr,
                            (uint8_t *)(uintptr_t)sge.addr);
    chunk_start = ram_chunk_start(block, chunk);

    if (block->is_ram_block) {
        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    } else {
        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);

        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
            chunks--;
        }
    }

    trace_qemu_rdma_write_one_top(chunks + 1,
                                  (chunks + 1) *
                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);

    chunk_end = ram_chunk_end(block, chunk + chunks);

    while (test_bit(chunk, block->transit_bitmap)) {
        (void)count;
        trace_qemu_rdma_write_one_block(count++, current_index, chunk,
                                        sge.addr, length, rdma->nb_sent,
                                        block->nb_chunks);

        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);

        if (ret < 0) {
            error_report("Failed to Wait for previous write to complete "
                         "block %d chunk %" PRIu64
                         " current %" PRIu64 " len %" PRIu64 " %d",
                         current_index, chunk, sge.addr, length, rdma->nb_sent);
            return ret;
        }
    }

    if (!rdma->pin_all || !block->is_ram_block) {
        if (!block->remote_keys[chunk]) {
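            /*
             * This chunk has not yet been registered, so first check
             * whether the entire chunk is zero. If so, tell the other side
             * to memset() + madvise() the whole chunk without any RDMA.
             */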
            if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
                RDMACompress comp = {
                    .offset = current_addr,
                    .value = 0,
                    .block_idx = current_index,
                    .length = length,
                };

                head.len = sizeof(comp);
                head.type = RDMA_CONTROL_COMPRESS;

                trace_qemu_rdma_write_one_zero(chunk, sge.length,
                                               current_index, current_addr);

                compress_to_network(rdma, &comp);
                ret = qemu_rdma_exchange_send(rdma, &head,
                                (uint8_t *) &comp, NULL, NULL, NULL);

                if (ret < 0) {
                    return -EIO;
                }

                stat64_add(&mig_stats.zero_pages,
                           sge.length / qemu_target_page_size());

                return 1;
            }

            reg.current_index = current_index;
            if (block->is_ram_block) {
                reg.key.current_addr = current_addr;
            } else {
                reg.key.chunk = chunk;
            }
            reg.chunks = chunks;

            trace_qemu_rdma_write_one_sendreg(chunk, sge.length,
                                              current_index, current_addr);

            register_to_network(rdma, &reg);
            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
                                          &resp, &reg_result_idx, NULL);
            if (ret < 0) {
                return ret;
            }

            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_report("cannot get lkey");
                return -EINVAL;
            }

            reg_result = (RDMARegisterResult *)
                    rdma->wr_data[reg_result_idx].control_curr;

            network_to_result(reg_result);

            trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
                                                 reg_result->rkey, chunk);

            block->remote_keys[chunk] = reg_result->rkey;
            block->remote_host_addr = reg_result->host_addr;
        } else {
            if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                                &sge.lkey, NULL, chunk,
                                                chunk_start, chunk_end)) {
                error_report("cannot get lkey!");
                return -EINVAL;
            }
        }

        send_wr.wr.rdma.rkey = block->remote_keys[chunk];
    } else {
        send_wr.wr.rdma.rkey = block->remote_rkey;

        if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
                                            &sge.lkey, NULL, chunk,
                                            chunk_start, chunk_end)) {
            error_report("cannot get lkey!");
            return -EINVAL;
        }
    }

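    /*
     * Encode the RAM block index and chunk within this wrid. At completion
     * time this tells us which transit bitmap bit to clear.
     */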
    send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
                                        current_index, chunk);

    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
                                  (current_addr - block->offset);

    trace_qemu_rdma_write_one_post(chunk, sge.addr,
                                   send_wr.wr.rdma.remote_addr, sge.length);

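    /*
     * ibv_post_send() does not return negative error numbers; a full send
     * queue shows up as ENOMEM, in which case we wait for a completion to
     * drain the queue and retry.
     */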
    ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);

    if (ret == ENOMEM) {
        trace_qemu_rdma_write_one_queue_full();
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            error_report("rdma migration: failed to make "
                         "room in full send queue! %d", ret);
            return ret;
        }

        goto retry;

    } else if (ret > 0) {
        perror("rdma migration: post rdma write failed");
        return -ret;
    }

    set_bit(chunk, block->transit_bitmap);
    stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size());
    ram_transferred_add(sge.length);
    qemu_file_credit_transfer(f, sge.length);
    rdma->total_writes++;

    return 0;
}
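/*
 * Push out any unwritten RDMA operations.
 *
 * We support sending out multiple chunks at the same time.
 * Not all of them need to get signaled in the completion queue.
 */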
static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (!rdma->current_length) {
        return 0;
    }

    ret = qemu_rdma_write_one(f, rdma,
            rdma->current_index, rdma->current_addr, rdma->current_length);

    if (ret < 0) {
        return ret;
    }

    if (ret == 0) {
        rdma->nb_sent++;
        trace_qemu_rdma_write_flush(rdma->nb_sent);
    }

    rdma->current_length = 0;
    rdma->current_addr = 0;

    return 0;
}

static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
                                            uint64_t offset, uint64_t len)
{
    RDMALocalBlock *block;
    uint8_t *host_addr;
    uint8_t *chunk_end;

    if (rdma->current_index < 0) {
        return 0;
    }

    if (rdma->current_chunk < 0) {
        return 0;
    }

    block = &(rdma->local_ram_blocks.block[rdma->current_index]);
    host_addr = block->local_host_addr + (offset - block->offset);
    chunk_end = ram_chunk_end(block, rdma->current_chunk);

    if (rdma->current_length == 0) {
        return 0;
    }

    if (offset != (rdma->current_addr + rdma->current_length)) {
        return 0;
    }

    if (offset < block->offset) {
        return 0;
    }

    if ((offset + len) > (block->offset + block->length)) {
        return 0;
    }

    if ((host_addr + len) > chunk_end) {
        return 0;
    }

    return 1;
}
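/*
 * We're not actually writing here, but doing three things:
 *
 * 1. Identify the chunk the buffer belongs to.
 * 2. If the chunk is full or the buffer doesn't belong to the current
 *    chunk, then start a new chunk and flush() the old chunk.
 * 3. To keep the hardware busy, we also group chunks into batches and
 *    only require that a batch gets acknowledged in the completion queue
 *    instead of each individual chunk.
 */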
static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
                           uint64_t block_offset, uint64_t offset,
                           uint64_t len)
{
    uint64_t current_addr = block_offset + offset;
    uint64_t index = rdma->current_index;
    uint64_t chunk = rdma->current_chunk;
    int ret;

    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
        ret = qemu_rdma_write_flush(f, rdma);
        if (ret) {
            return ret;
        }
        rdma->current_length = 0;
        rdma->current_addr = current_addr;

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, len, &index, &chunk);
        if (ret) {
            error_report("ram block search failed");
            return ret;
        }
        rdma->current_index = index;
        rdma->current_chunk = chunk;
    }

    rdma->current_length += len;

    if (rdma->current_length >= RDMA_MERGE_MAX) {
        return qemu_rdma_write_flush(f, rdma);
    }

    return 0;
}

static void qemu_rdma_cleanup(RDMAContext *rdma)
{
    int idx;

    if (rdma->cm_id && rdma->connected) {
        if ((rdma->error_state ||
             migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
            !rdma->received_error) {
            RDMAControlHeader head = { .len = 0,
                                       .type = RDMA_CONTROL_ERROR,
                                       .repeat = 1,
                                     };
            error_report("Early error. Sending error.");
            qemu_rdma_post_send_control(rdma, NULL, &head);
        }

        rdma_disconnect(rdma->cm_id);
        trace_qemu_rdma_cleanup_disconnect();
        rdma->connected = false;
    }

    if (rdma->channel) {
        qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
    }
    g_free(rdma->dest_blocks);
    rdma->dest_blocks = NULL;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        if (rdma->wr_data[idx].control_mr) {
            rdma->total_registrations--;
            ibv_dereg_mr(rdma->wr_data[idx].control_mr);
        }
        rdma->wr_data[idx].control_mr = NULL;
    }

    if (rdma->local_ram_blocks.block) {
        while (rdma->local_ram_blocks.nb_blocks) {
            rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
        }
    }

    if (rdma->qp) {
        rdma_destroy_qp(rdma->cm_id);
        rdma->qp = NULL;
    }
    if (rdma->recv_cq) {
        ibv_destroy_cq(rdma->recv_cq);
        rdma->recv_cq = NULL;
    }
    if (rdma->send_cq) {
        ibv_destroy_cq(rdma->send_cq);
        rdma->send_cq = NULL;
    }
    if (rdma->recv_comp_channel) {
        ibv_destroy_comp_channel(rdma->recv_comp_channel);
        rdma->recv_comp_channel = NULL;
    }
    if (rdma->send_comp_channel) {
        ibv_destroy_comp_channel(rdma->send_comp_channel);
        rdma->send_comp_channel = NULL;
    }
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
        rdma->pd = NULL;
    }
    if (rdma->cm_id) {
        rdma_destroy_id(rdma->cm_id);
        rdma->cm_id = NULL;
    }

    if (rdma->listen_id) {
        if (!rdma->is_return_path) {
            rdma_destroy_id(rdma->listen_id);
        }
        rdma->listen_id = NULL;

        if (rdma->channel) {
            if (!rdma->is_return_path) {
                rdma_destroy_event_channel(rdma->channel);
            }
            rdma->channel = NULL;
        }
    }

    if (rdma->channel) {
        rdma_destroy_event_channel(rdma->channel);
        rdma->channel = NULL;
    }
    g_free(rdma->host);
    g_free(rdma->host_port);
    rdma->host = NULL;
    rdma->host_port = NULL;
}
2462
2463
2464static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2465{
2466 int ret, idx;
2467 Error *local_err = NULL, **temp = &local_err;
2468
2469
2470
2471
2472
    rdma->pin_all = pin_all;

    ret = qemu_rdma_resolve_host(rdma, temp);
    if (ret) {
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
                    " limits may be too low. Please check $ ulimit -a # and "
                    "search for 'ulimit -l' in the output");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error allocating qp!");
        goto err_rdma_source_init;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        ERROR(temp, "rdma migration: error initializing ram blocks!");
        goto err_rdma_source_init;
    }

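    /* Build the hash that maps from offset to RAMBlock */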
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
        g_hash_table_insert(rdma->blockmap,
                (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
                &rdma->local_ram_blocks.block[idx]);
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            ERROR(temp, "rdma migration: error registering %d control!",
                        idx);
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    error_propagate(errp, local_err);
    qemu_rdma_cleanup(rdma);
    return -1;
}

static int qemu_get_cm_event_timeout(RDMAContext *rdma,
                                     struct rdma_cm_event **cm_event,
                                     long msec, Error **errp)
{
    int ret;
    struct pollfd poll_fd = {
                                .fd = rdma->channel->fd,
                                .events = POLLIN,
                                .revents = 0
                            };

    do {
        ret = poll(&poll_fd, 1, msec);
    } while (ret < 0 && errno == EINTR);

    if (ret == 0) {
        ERROR(errp, "poll cm event timeout");
        return -1;
    } else if (ret < 0) {
        ERROR(errp, "failed to poll cm event, errno=%i", errno);
        return -1;
    } else if (poll_fd.revents & POLLIN) {
        return rdma_get_cm_event(rdma->channel, cm_event);
    } else {
        ERROR(errp, "no POLLIN event, revent=%x", poll_fd.revents);
        return -1;
    }
}

static int qemu_rdma_connect(RDMAContext *rdma, Error **errp, bool return_path)
{
    RDMACapabilities cap = {
                                .version = RDMA_CONTROL_VERSION_CURRENT,
                                .flags = 0,
                           };
    struct rdma_conn_param conn_param = { .initiator_depth = 2,
                                          .retry_count = 5,
                                          .private_data = &cap,
                                          .private_data_len = sizeof(cap),
                                        };
    struct rdma_cm_event *cm_event;
    int ret;

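    /*
     * Only negotiate the capability with destination if the user
     * on the source first requested the capability.
     */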
    if (rdma->pin_all) {
        trace_qemu_rdma_connect_pin_all_requested();
        cap.flags |= RDMA_CAPABILITY_PIN_ALL;
    }

    caps_to_network(&cap);

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        ERROR(errp, "posting second control recv");
        goto err_rdma_source_connect;
    }

    ret = rdma_connect(rdma->cm_id, &conn_param);
    if (ret) {
        perror("rdma_connect");
        ERROR(errp, "connecting to destination!");
        goto err_rdma_source_connect;
    }

    if (return_path) {
        ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
    } else {
        ret = rdma_get_cm_event(rdma->channel, &cm_event);
    }
    if (ret) {
        perror("rdma_get_cm_event after rdma_connect");
        ERROR(errp, "connecting to destination!");
        goto err_rdma_source_connect;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
        ERROR(errp, "connecting to destination!");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_source_connect;
    }
    rdma->connected = true;

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
    network_to_caps(&cap);

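    /*
     * Verify that the *requested* capabilities are supported by the
     * destination and disable them otherwise.
     */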
    if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
        ERROR(errp, "Server cannot support pinning all memory. "
                    "Will register memory dynamically.");
        rdma->pin_all = false;
    }

    trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);

    rdma_ack_cm_event(cm_event);

    rdma->control_ready_expected = 1;
    rdma->nb_sent = 0;
    return 0;

err_rdma_source_connect:
    qemu_rdma_cleanup(rdma);
    return -1;
}

static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
    int ret, idx;
    struct rdma_cm_id *listen_id;
    char ip[40] = "unknown";
    struct rdma_addrinfo *res, *e;
    char port_str[16];
    int reuse = 1;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        rdma->wr_data[idx].control_len = 0;
        rdma->wr_data[idx].control_curr = NULL;
    }

    if (!rdma->host || !rdma->host[0]) {
        ERROR(errp, "RDMA host is not set!");
        rdma->error_state = -EINVAL;
        return -1;
    }

    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create rdma event channel");
        rdma->error_state = -EINVAL;
        return -1;
    }

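    /* create CM id to listen on */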
    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create cm_id!");
        goto err_dest_init_create_listen_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_dest_init_bind_addr;
    }

    ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
                          &reuse, sizeof reuse);
    if (ret) {
        ERROR(errp, "Error: could not set REUSEADDR option");
        goto err_dest_init_bind_addr;
    }
    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        trace_qemu_rdma_dest_init_trying(rdma->host, ip);
        ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
        if (ret) {
            continue;
        }
        if (e->ai_family == AF_INET6) {
            ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
            if (ret) {
                continue;
            }
        }
        break;
    }

    rdma_freeaddrinfo(res);
    if (!e) {
        ERROR(errp, "Error: could not rdma_bind_addr!");
        goto err_dest_init_bind_addr;
    }

    rdma->listen_id = listen_id;
    qemu_rdma_dump_gid("dest_init", listen_id);
    return 0;

err_dest_init_bind_addr:
    rdma_destroy_id(listen_id);
err_dest_init_create_listen_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    rdma->error_state = ret;
    return ret;
}

static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
                                            RDMAContext *rdma)
{
    int idx;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        rdma_return_path->wr_data[idx].control_len = 0;
        rdma_return_path->wr_data[idx].control_curr = NULL;
    }

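    /* the CM channel and CM id are shared with the main connection */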
    rdma_return_path->channel = rdma->channel;
    rdma_return_path->listen_id = rdma->listen_id;

    rdma->return_path = rdma_return_path;
    rdma_return_path->return_path = rdma;
    rdma_return_path->is_return_path = true;
}

static void *qemu_rdma_data_init(const char *host_port, Error **errp)
{
    RDMAContext *rdma = NULL;
    InetSocketAddress *addr;

    if (host_port) {
        rdma = g_new0(RDMAContext, 1);
        rdma->current_index = -1;
        rdma->current_chunk = -1;

        addr = g_new(InetSocketAddress, 1);
        if (!inet_parse(addr, host_port, NULL)) {
            rdma->port = atoi(addr->port);
            rdma->host = g_strdup(addr->host);
            rdma->host_port = g_strdup(host_port);
        } else {
            ERROR(errp, "bad RDMA migration address '%s'", host_port);
            g_free(rdma);
            rdma = NULL;
        }

        qapi_free_InetSocketAddress(addr);
    }

    return rdma;
}
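
/*
 * QEMUFile interface to the control channel.
 * SEND messages for control only.
 * VM's ram is handled with regular RDMA messages.
 */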
static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
                                       const struct iovec *iov,
                                       size_t niov,
                                       int *fds,
                                       size_t nfds,
                                       int flags,
                                       Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    QEMUFile *f = rioc->file;
    RDMAContext *rdma;
    int ret;
    ssize_t done = 0;
    size_t i;
    size_t len = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);

    if (!rdma) {
        error_setg(errp, "RDMA control channel output is not set");
        return -1;
    }

    CHECK_ERROR_STATE();

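    /*
     * Push out any writes that
     * we're queued up for VM's ram.
     */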
    ret = qemu_rdma_write_flush(f, rdma);
    if (ret < 0) {
        rdma->error_state = ret;
        error_setg(errp, "qemu_rdma_write_flush returned %d", ret);
        return -1;
    }

    for (i = 0; i < niov; i++) {
        size_t remaining = iov[i].iov_len;
        uint8_t *data = (void *)iov[i].iov_base;
        while (remaining) {
            RDMAControlHeader head;

            len = MIN(remaining, RDMA_SEND_INCREMENT);
            remaining -= len;

            head.len = len;
            head.type = RDMA_CONTROL_QEMU_FILE;

            ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);

            if (ret < 0) {
                rdma->error_state = ret;
                error_setg(errp, "qemu_rdma_exchange_send returned %d", ret);
                return -1;
            }

            data += len;
            done += len;
        }
    }

    return done;
}

static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             size_t size, int idx)
{
    size_t len = 0;

    if (rdma->wr_data[idx].control_len) {
        trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);

        len = MIN(size, rdma->wr_data[idx].control_len);
        memcpy(buf, rdma->wr_data[idx].control_curr, len);
        rdma->wr_data[idx].control_curr += len;
        rdma->wr_data[idx].control_len -= len;
    }

    return len;
}
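
/*
 * QEMUFile interface to the control channel.
 * RDMA links don't use bytestreams, so we have to
 * return bytes to QEMUFile opportunistically.
 */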
static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
                                      const struct iovec *iov,
                                      size_t niov,
                                      int **fds,
                                      size_t *nfds,
                                      int flags,
                                      Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    RDMAContext *rdma;
    RDMAControlHeader head;
    int ret = 0;
    ssize_t i;
    size_t done = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        error_setg(errp, "RDMA control channel input is not set");
        return -1;
    }

    CHECK_ERROR_STATE();

    for (i = 0; i < niov; i++) {
        size_t want = iov[i].iov_len;
        uint8_t *data = (void *)iov[i].iov_base;

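        /*
         * First, we hold on to the last SEND message we
         * were given and dish out the bytes until we run
         * out of bytes.
         */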
        ret = qemu_rdma_fill(rdma, data, want, 0);
        done += ret;
        want -= ret;

        /* Got what we needed, so go to next iovec */
        if (want == 0) {
            continue;
        }

        /* If we got any data so far, then don't wait
         * for more, just return what we have */
        if (done > 0) {
            break;
        }

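        /* We've got nothing at all, so lets wait
         * for more to arrive
         */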
        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);

        if (ret < 0) {
            rdma->error_state = ret;
            error_setg(errp, "qemu_rdma_exchange_recv returned %d", ret);
            return -1;
        }

        /*
         * SEND was received with new bytes, now try again.
         */
        ret = qemu_rdma_fill(rdma, data, want, 0);
        done += ret;
        want -= ret;

        /* Still didn't get enough, so lets just return */
        if (want) {
            if (done == 0) {
                return QIO_CHANNEL_ERR_BLOCK;
            } else {
                break;
            }
        }
    }
    return done;
}
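
/*
 * Block until all the outstanding chunks have been delivered by the hardware.
 */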
static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (qemu_rdma_write_flush(f, rdma) < 0) {
        return -EIO;
    }

    while (rdma->nb_sent) {
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            error_report("rdma migration: complete polling error!");
            return -EIO;
        }
    }

    qemu_rdma_unregister_waiting(rdma);

    return 0;
}

static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
                                         bool blocking,
                                         Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    /* XXX we should make readv/writev actually honour this :-) */
    rioc->blocking = blocking;
    return 0;
}

typedef struct QIOChannelRDMASource QIOChannelRDMASource;
struct QIOChannelRDMASource {
    GSource parent;
    QIOChannelRDMA *rioc;
    GIOCondition condition;
};

static gboolean
qio_channel_rdma_source_prepare(GSource *source,
                                gint *timeout)
{
    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
    RDMAContext *rdma;
    GIOCondition cond = 0;
    *timeout = -1;

    RCU_READ_LOCK_GUARD();
    if (rsource->condition == G_IO_IN) {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
    } else {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
    }

    if (!rdma) {
        error_report("RDMAContext is NULL when preparing GSource");
        return FALSE;
    }

    if (rdma->wr_data[0].control_len) {
        cond |= G_IO_IN;
    }
    cond |= G_IO_OUT;

    return cond & rsource->condition;
}

static gboolean
qio_channel_rdma_source_check(GSource *source)
{
    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
    RDMAContext *rdma;
    GIOCondition cond = 0;

    RCU_READ_LOCK_GUARD();
    if (rsource->condition == G_IO_IN) {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
    } else {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
    }

    if (!rdma) {
        error_report("RDMAContext is NULL when checking GSource");
        return FALSE;
    }

    if (rdma->wr_data[0].control_len) {
        cond |= G_IO_IN;
    }
    cond |= G_IO_OUT;

    return cond & rsource->condition;
}

static gboolean
qio_channel_rdma_source_dispatch(GSource *source,
                                 GSourceFunc callback,
                                 gpointer user_data)
{
    QIOChannelFunc func = (QIOChannelFunc)callback;
    QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
    RDMAContext *rdma;
    GIOCondition cond = 0;

    RCU_READ_LOCK_GUARD();
    if (rsource->condition == G_IO_IN) {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
    } else {
        rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
    }

    if (!rdma) {
        error_report("RDMAContext is NULL when dispatching GSource");
        return FALSE;
    }

    if (rdma->wr_data[0].control_len) {
        cond |= G_IO_IN;
    }
    cond |= G_IO_OUT;

    return (*func)(QIO_CHANNEL(rsource->rioc),
                   (cond & rsource->condition),
                   user_data);
}

static void
qio_channel_rdma_source_finalize(GSource *source)
{
    QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;

    object_unref(OBJECT(ssource->rioc));
}

GSourceFuncs qio_channel_rdma_source_funcs = {
    qio_channel_rdma_source_prepare,
    qio_channel_rdma_source_check,
    qio_channel_rdma_source_dispatch,
    qio_channel_rdma_source_finalize
};

static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
                                              GIOCondition condition)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    QIOChannelRDMASource *ssource;
    GSource *source;

    source = g_source_new(&qio_channel_rdma_source_funcs,
                          sizeof(QIOChannelRDMASource));
    ssource = (QIOChannelRDMASource *)source;

    ssource->rioc = rioc;
    object_ref(OBJECT(rioc));

    ssource->condition = condition;

    return source;
}

static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
                                                AioContext *ctx,
                                                IOHandler *io_read,
                                                IOHandler *io_write,
                                                void *opaque)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    if (io_read) {
        aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd, io_read,
                           io_write, NULL, NULL, opaque);
        aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd, io_read,
                           io_write, NULL, NULL, opaque);
    } else {
        aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd, io_read,
                           io_write, NULL, NULL, opaque);
        aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd, io_read,
                           io_write, NULL, NULL, opaque);
    }
}

struct rdma_close_rcu {
    struct rcu_head rcu;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
};

/* callback from qio_channel_rdma_close via call_rcu */
static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
{
    if (rcu->rdmain) {
        qemu_rdma_cleanup(rcu->rdmain);
    }

    if (rcu->rdmaout) {
        qemu_rdma_cleanup(rcu->rdmaout);
    }

    g_free(rcu->rdmain);
    g_free(rcu->rdmaout);
    g_free(rcu);
}

static int qio_channel_rdma_close(QIOChannel *ioc,
                                  Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    RDMAContext *rdmain, *rdmaout;
    struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);

    trace_qemu_rdma_close();

    rdmain = rioc->rdmain;
    if (rdmain) {
        qatomic_rcu_set(&rioc->rdmain, NULL);
    }

    rdmaout = rioc->rdmaout;
    if (rdmaout) {
        qatomic_rcu_set(&rioc->rdmaout, NULL);
    }

    rcu->rdmain = rdmain;
    rcu->rdmaout = rdmaout;
    call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);

    return 0;
}

static int
qio_channel_rdma_shutdown(QIOChannel *ioc,
                          QIOChannelShutdown how,
                          Error **errp)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
    RDMAContext *rdmain, *rdmaout;

    RCU_READ_LOCK_GUARD();

    rdmain = qatomic_rcu_read(&rioc->rdmain);
    rdmaout = qatomic_rcu_read(&rioc->rdmaout);

    switch (how) {
    case QIO_CHANNEL_SHUTDOWN_READ:
        if (rdmain) {
            rdmain->error_state = -1;
        }
        break;
    case QIO_CHANNEL_SHUTDOWN_WRITE:
        if (rdmaout) {
            rdmaout->error_state = -1;
        }
        break;
    case QIO_CHANNEL_SHUTDOWN_BOTH:
    default:
        if (rdmain) {
            rdmain->error_state = -1;
        }
        if (rdmaout) {
            rdmaout->error_state = -1;
        }
        break;
    }

    return 0;
}
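
/*
 * Save a page of guest RAM via RDMA.
 *
 * The write is asynchronous: the page is only queued onto the current
 * chunk here, and the actual transfer (including any dynamic chunk
 * registration) happens when the chunk is flushed. We therefore return
 * RAM_SAVE_CONTROL_DELAYED on success.
 */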
static size_t qemu_rdma_save_page(QEMUFile *f,
                                  ram_addr_t block_offset, ram_addr_t offset,
                                  size_t size, uint64_t *bytes_sent)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RDMAContext *rdma;
    int ret;

    if (migration_in_postcopy()) {
        return RAM_SAVE_CONTROL_NOT_SUPP;
    }

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);

    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    qemu_fflush(f);

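    /*
     * Add this page to the current 'chunk'. If the chunk
     * is full, or the page doesn't belong to the current chunk,
     * an actual RDMA write will occur and a new chunk will be formed.
     */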
    ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
    if (ret < 0) {
        error_report("rdma migration: write error! %d", ret);
        goto err;
    }

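    /*
     * We always report 1 byte as a sentinel: the RDMA protocol is
     * completely asynchronous, so the real byte count is not known
     * until the queued chunk is actually written out.
     */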
    if (bytes_sent) {
        *bytes_sent = 1;
    }

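    /*
     * Drain the Completion Queues if possible, but do not block,
     * just poll.
     *
     * If nothing to poll, the end of the iteration will do this
     * again to make sure we don't overflow the request queue.
     */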
    while (1) {
        uint64_t wr_id, wr_id_in;
        int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
        if (ret < 0) {
            error_report("rdma migration: polling error! %d", ret);
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    while (1) {
        uint64_t wr_id, wr_id_in;
        int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
        if (ret < 0) {
            error_report("rdma migration: polling error! %d", ret);
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    return RAM_SAVE_CONTROL_DELAYED;
err:
    rdma->error_state = ret;
    return ret;
}

static void rdma_accept_incoming_migration(void *opaque);

static void rdma_cm_poll_handler(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    struct rdma_cm_event *cm_event;
    MigrationIncomingState *mis = migration_incoming_get_current();

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        error_report("get_cm_event failed %d", errno);
        return;
    }

    if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
        cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
        if (!rdma->error_state &&
            migration_incoming_get_current()->state !=
              MIGRATION_STATUS_COMPLETED) {
            error_report("receive cm event, cm event is %d", cm_event->event);
            rdma->error_state = -EPIPE;
            if (rdma->return_path) {
                rdma->return_path->error_state = -EPIPE;
            }
        }
        rdma_ack_cm_event(cm_event);
        if (mis->loadvm_co) {
            qemu_coroutine_enter(mis->loadvm_co);
        }
        return;
    }
    rdma_ack_cm_event(cm_event);
}

static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
                                            .responder_resources = 2,
                                            .private_data = &cap,
                                            .private_data_len = sizeof(cap),
                                         };
    RDMAContext *rdma_return_path = NULL;
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret = -EINVAL;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

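    /*
     * Initialize the RDMAContext for the return path (used by postcopy)
     * once the first connection request has arrived.
     */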
    if ((migrate_postcopy() || migrate_return_path())
        && !rdma->is_return_path) {
        rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
        if (rdma_return_path == NULL) {
            rdma_ack_cm_event(cm_event);
            goto err_rdma_dest_wait;
        }

        qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        error_report("Unknown source RDMA version: %d, bailing...",
                     cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as additional capabilities are negotiated.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    trace_qemu_rdma_accept_pin_state(rdma->pin_all);

    caps_to_network(&cap);

    trace_qemu_rdma_accept_pin_verbsc(verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        error_report("ibv context not matching %p, %p!", rdma->verbs,
                     verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        error_report("rdma migration: error allocating pd and cq!");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        error_report("rdma migration: error allocating qp!");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        error_report("rdma migration: error initializing ram blocks!");
        goto err_rdma_dest_wait;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            error_report("rdma: error registering %d control", idx);
            goto err_rdma_dest_wait;
        }
    }

    /* Accept the second connection request for the return path */
    if ((migrate_postcopy() || migrate_return_path())
        && !rdma->is_return_path) {
        qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                            NULL,
                            (void *)(intptr_t)rdma->return_path);
    } else {
        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
                            NULL, rdma);
    }

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret) {
        error_report("rdma_accept returns %d", ret);
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        error_report("rdma_accept get_cm_event failed %d", ret);
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_accept not event established");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        error_report("rdma migration: error posting second control recv");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->error_state = ret;
    qemu_rdma_cleanup(rdma);
    g_free(rdma_return_path);
    return ret;
}

static int dest_ram_sort_func(const void *a, const void *b)
{
    unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
    unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;

    return (a_index < b_index) ? -1 : (a_index != b_index);
}
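
/*
 * During each iteration of the migration, we listen for instructions
 * by the source VM to perform dynamic page registrations before they
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */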
static int qemu_rdma_registration_handle(QEMUFile *f)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RDMAContext *rdma;
    RDMALocalBlocks *local;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret = 0;
    int idx = 0;
    int count = 0;
    int i = 0;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    local = &rdma->local_ram_blocks;
    do {
        trace_qemu_rdma_registration_handle_wait();

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);

        if (ret < 0) {
            break;
        }

        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            error_report("rdma: Too many requests in this message (%d). "
                         "Bailing.", head.repeat);
            ret = -EIO;
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            trace_qemu_rdma_registration_handle_compress(comp->length,
                                                         comp->block_idx,
                                                         comp->offset);
            if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
                error_report("rdma: 'compress' bad block index %u (vs %d)",
                             (unsigned int)comp->block_idx,
                             rdma->local_ram_blocks.nb_blocks);
                ret = -EIO;
                goto out;
            }
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);

            ram_handle_compressed(host_addr, comp->value, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            trace_qemu_rdma_registration_handle_finished();
            goto out;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            trace_qemu_rdma_registration_handle_ram_blocks();

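            /* Sort our local RAM Block list so it's the same as the source,
             * we can do this since we've filled in a src_index in the list
             * as we received the RAMBlock list earlier.
             */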
            qsort(rdma->local_ram_blocks.block,
                  rdma->local_ram_blocks.nb_blocks,
                  sizeof(RDMALocalBlock), dest_ram_sort_func);
            for (i = 0; i < local->nb_blocks; i++) {
                local->block[i].index = i;
            }

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                if (ret) {
                    error_report("rdma migration: error dest "
                                 "registering ram blocks");
                    goto out;
                }
            }

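            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */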
            for (i = 0; i < local->nb_blocks; i++) {
                rdma->dest_blocks[i].remote_host_addr =
                    (uintptr_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->dest_blocks[i].offset = local->block[i].offset;
                rdma->dest_blocks[i].length = local->block[i].length;

                dest_block_to_network(&rdma->dest_blocks[i]);
                trace_qemu_rdma_registration_handle_ram_blocks_loop(
                    local->block[i].block_name,
                    local->block[i].offset,
                    local->block[i].length,
                    local->block[i].local_host_addr,
                    local->block[i].src_index);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMADestBlock);

            ret = qemu_rdma_post_send_control(rdma,
                                        (uint8_t *) rdma->dest_blocks, &blocks);

            if (ret < 0) {
                error_report("rdma migration: error sending remote info");
                goto out;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            trace_qemu_rdma_registration_handle_register(head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                trace_qemu_rdma_registration_handle_register_loop(count,
                         reg->current_index, reg->key.current_addr, reg->chunks);

                if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
                    error_report("rdma: 'register' bad block index %u (vs %d)",
                                 (unsigned int)reg->current_index,
                                 rdma->local_ram_blocks.nb_blocks);
                    ret = -ENOENT;
                    goto out;
                }
                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    if (block->offset > reg->key.current_addr) {
                        error_report("rdma: bad register address for block %s"
                            " offset: %" PRIx64 " current_addr: %" PRIx64,
                            block->block_name, block->offset,
                            reg->key.current_addr);
                        ret = -ERANGE;
                        goto out;
                    }
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                    /* Check for particularly bad chunk value */
                    if (host_addr < (void *)block->local_host_addr) {
                        error_report("rdma: bad chunk for block %s"
                                     " chunk: %" PRIx64,
                                     block->block_name, reg->key.chunk);
                        ret = -ERANGE;
                        goto out;
                    }
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                /* avoid "-Waddress-of-packed-member" warning */
                uint32_t tmp_rkey = 0;
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uintptr_t)host_addr, NULL, &tmp_rkey,
                            chunk, chunk_start, chunk_end)) {
                    error_report("cannot get rkey");
                    ret = -EINVAL;
                    goto out;
                }
                reg_result->rkey = tmp_rkey;

                reg_result->host_addr = (uintptr_t)block->local_host_addr;

                trace_qemu_rdma_registration_handle_register_rkey(
                                                           reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp);

            if (ret < 0) {
                error_report("Failed to send control buffer");
                goto out;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            trace_qemu_rdma_registration_handle_unregister(head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                trace_qemu_rdma_registration_handle_unregister_loop(count,
                           reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    perror("rdma unregistration chunk failed");
                    ret = -ret;
                    goto out;
                }

                rdma->total_registrations--;

                trace_qemu_rdma_registration_handle_unregister_success(
                                                       reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);

            if (ret < 0) {
                error_report("Failed to send control buffer");
                goto out;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            error_report("Invalid RESULT message at dest.");
            ret = -EIO;
            goto out;
        default:
            error_report("Unknown control message %s", control_desc(head.type));
            ret = -EIO;
            goto out;
        }
    } while (1);

out:
    if (ret < 0) {
        rdma->error_state = ret;
    }
    return ret;
}
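
/* Destination:
 * Called via a ram_control_load_hook during the initial RAM load section which
 * lists the RAMBlocks by name.  This lets us know the order of the RAMBlocks
 * on the source.
 * We've already built our local RAMBlock list, but not yet sent the list to
 * the source.
 */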
static int
rdma_block_notification_handle(QEMUFile *f, const char *name)
{
    RDMAContext *rdma;
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    int curr;
    int found = -1;

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmain);

    if (!rdma) {
        return -EIO;
    }

    /* Find the matching RAMBlock in our local list */
    for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
        if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
            found = curr;
            break;
        }
    }

    if (found == -1) {
        error_report("RAMBlock '%s' not found on destination", name);
        return -ENOENT;
    }

    rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
    trace_rdma_block_notification_handle(name, rdma->next_src_index);
    rdma->next_src_index++;

    return 0;
}

static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data)
{
    switch (flags) {
    case RAM_CONTROL_BLOCK_REG:
        return rdma_block_notification_handle(f, data);

    case RAM_CONTROL_HOOK:
        return qemu_rdma_registration_handle(f);

    default:
        /* Shouldn't be called with any other values */
        abort();
    }
}

static int qemu_rdma_registration_start(QEMUFile *f,
                                        uint64_t flags, void *data)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RDMAContext *rdma;

    if (migration_in_postcopy()) {
        return 0;
    }

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);
    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    trace_qemu_rdma_registration_start(flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    qemu_fflush(f);

    return 0;
}
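
/*
 * Inform dest that dynamic registrations are done for now.
 * First, flush writes, if any.
 */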
static int qemu_rdma_registration_stop(QEMUFile *f,
                                       uint64_t flags, void *data)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RDMAContext *rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret = 0;

    if (migration_in_postcopy()) {
        return 0;
    }

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);
    if (!rdma) {
        return -EIO;
    }

    CHECK_ERROR_STATE();

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(f, rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, i, nb_dest_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        trace_qemu_rdma_registration_stop_ram();

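        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this serially takes a really
         * long time, so we have to 'interleave' the pinning locally
         * with the control messages by performing the pinning on this
         * side before we receive the control response from the other
         * side that the pinning has completed.
         */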
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                    &reg_result_idx, rdma->pin_all ?
                    qemu_rdma_reg_whole_ram_blocks : NULL);
        if (ret < 0) {
            fprintf(stderr, "receiving remote info!\n");
            return ret;
        }

        nb_dest_blocks = resp.len / sizeof(RDMADestBlock);

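        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One key to represent the virtual address of the entire ram block.
         *    (dynamic chunk registration disabled - pin everything with one rkey.)
         * 2. Multiple keys that represent individual chunks within a ram block.
         *    (dynamic chunk registration enabled - pin individual chunks.)
         *
         * Once the capability is successfully negotiated, the destination
         * transmits the keys to use (or sends them later) including the
         * virtual addresses and then propagates the remote ram block
         * descriptions to its local copy.
         */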
        if (local->nb_blocks != nb_dest_blocks) {
            fprintf(stderr, "ram blocks mismatch (Number of blocks %d vs %d). "
                    "Your QEMU command line parameters are probably "
                    "not identical on both the source and destination.\n",
                    local->nb_blocks, nb_dest_blocks);
            rdma->error_state = -EINVAL;
            return -EINVAL;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->dest_blocks,
            rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (i = 0; i < nb_dest_blocks; i++) {
            network_to_dest_block(&rdma->dest_blocks[i]);

            /* We require that the blocks are in the same order */
            if (rdma->dest_blocks[i].length != local->block[i].length) {
                fprintf(stderr, "Block %s/%d has a different length %" PRIu64
                        " vs %" PRIu64 "\n", local->block[i].block_name, i,
                        local->block[i].length,
                        rdma->dest_blocks[i].length);
                rdma->error_state = -EINVAL;
                return -EINVAL;
            }
            local->block[i].remote_host_addr =
                    rdma->dest_blocks[i].remote_host_addr;
            local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
        }
    }

    trace_qemu_rdma_registration_stop(flags);

    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);

    if (ret < 0) {
        goto err;
    }

    return 0;

err:
    rdma->error_state = ret;
    return ret;
}

static const QEMUFileHooks rdma_read_hooks = {
    .hook_ram_load = rdma_load_hook,
};

static const QEMUFileHooks rdma_write_hooks = {
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate = qemu_rdma_registration_stop,
    .save_page = qemu_rdma_save_page,
};


static void qio_channel_rdma_finalize(Object *obj)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
    if (rioc->rdmain) {
        qemu_rdma_cleanup(rioc->rdmain);
        g_free(rioc->rdmain);
        rioc->rdmain = NULL;
    }
    if (rioc->rdmaout) {
        qemu_rdma_cleanup(rioc->rdmaout);
        g_free(rioc->rdmaout);
        rioc->rdmaout = NULL;
    }
}

static void qio_channel_rdma_class_init(ObjectClass *klass,
                                        void *class_data G_GNUC_UNUSED)
{
    QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);

    ioc_klass->io_writev = qio_channel_rdma_writev;
    ioc_klass->io_readv = qio_channel_rdma_readv;
    ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
    ioc_klass->io_close = qio_channel_rdma_close;
    ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
    ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
    ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
}

static const TypeInfo qio_channel_rdma_info = {
    .parent = TYPE_QIO_CHANNEL,
    .name = TYPE_QIO_CHANNEL_RDMA,
    .instance_size = sizeof(QIOChannelRDMA),
    .instance_finalize = qio_channel_rdma_finalize,
    .class_init = qio_channel_rdma_class_init,
};

static void qio_channel_rdma_register_types(void)
{
    type_register_static(&qio_channel_rdma_info);
}

type_init(qio_channel_rdma_register_types);

static QEMUFile *rdma_new_input(RDMAContext *rdma)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));

    rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
    rioc->rdmain = rdma;
    rioc->rdmaout = rdma->return_path;
    qemu_file_set_hooks(rioc->file, &rdma_read_hooks);

    return rioc->file;
}

static QEMUFile *rdma_new_output(RDMAContext *rdma)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));

    rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
    rioc->rdmaout = rdma;
    rioc->rdmain = rdma->return_path;
    qemu_file_set_hooks(rioc->file, &rdma_write_hooks);

    return rioc->file;
}

static void rdma_accept_incoming_migration(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    QEMUFile *f;
    Error *local_err = NULL;

    trace_qemu_rdma_accept_incoming_migration();
    ret = qemu_rdma_accept(rdma);

    if (ret) {
        fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
        return;
    }

    trace_qemu_rdma_accept_incoming_migration_accepted();

    if (rdma->is_return_path) {
        return;
    }

    f = rdma_new_input(rdma);
    if (f == NULL) {
        fprintf(stderr, "RDMA ERROR: could not open RDMA for input\n");
        qemu_rdma_cleanup(rdma);
        return;
    }

    rdma->migration_started_on_destination = 1;
    migration_fd_process_incoming(f, &local_err);
    if (local_err) {
        error_reportf_err(local_err, "RDMA ERROR: ");
    }
}

void rdma_start_incoming_migration(const char *host_port, Error **errp)
{
    int ret;
    RDMAContext *rdma;
    Error *local_err = NULL;

    trace_rdma_start_incoming_migration();

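    /* Avoid ram_block_discard_disable(), cannot change during migration. */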
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, &local_err);
    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_dest_init(rdma, &local_err);

    if (ret) {
        goto err;
    }

    trace_rdma_start_incoming_migration_after_dest_init();

    ret = rdma_listen(rdma->listen_id, 5);

    if (ret) {
        ERROR(errp, "listening on socket!");
        goto cleanup_rdma;
    }

    trace_rdma_start_incoming_migration_after_rdma_listen();

    qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                        NULL, (void *)(intptr_t)rdma);
    return;

cleanup_rdma:
    qemu_rdma_cleanup(rdma);
err:
    error_propagate(errp, local_err);
    if (rdma) {
        g_free(rdma->host);
        g_free(rdma->host_port);
    }
    g_free(rdma);
}

void rdma_start_outgoing_migration(void *opaque,
                                   const char *host_port, Error **errp)
{
    MigrationState *s = opaque;
    RDMAContext *rdma_return_path = NULL;
    RDMAContext *rdma;
    int ret = 0;

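    /* Avoid ram_block_discard_disable(), cannot change during migration. */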
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, errp);
    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp);

    if (ret) {
        goto err;
    }

    trace_rdma_start_outgoing_migration_after_rdma_source_init();
    ret = qemu_rdma_connect(rdma, errp, false);

    if (ret) {
        goto err;
    }

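    /* RDMA postcopy needs a separate queue pair for the return path */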
    if (migrate_postcopy() || migrate_return_path()) {
        rdma_return_path = qemu_rdma_data_init(host_port, errp);

        if (rdma_return_path == NULL) {
            goto return_path_err;
        }

        ret = qemu_rdma_source_init(rdma_return_path,
                                    migrate_rdma_pin_all(), errp);

        if (ret) {
            goto return_path_err;
        }

        ret = qemu_rdma_connect(rdma_return_path, errp, true);

        if (ret) {
            goto return_path_err;
        }

        rdma->return_path = rdma_return_path;
        rdma_return_path->return_path = rdma;
        rdma_return_path->is_return_path = true;
    }

    trace_rdma_start_outgoing_migration_after_rdma_connect();

    s->to_dst_file = rdma_new_output(rdma);
    migrate_fd_connect(s, NULL);
    return;

return_path_err:
    qemu_rdma_cleanup(rdma);
err:
    g_free(rdma);
    g_free(rdma_return_path);
}