1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "block/nbd.h"
20#include "sysemu/block-backend.h"
21
22#include "qemu/coroutine.h"
23
24#include <errno.h>
25#include <string.h>
26#ifndef _WIN32
27#include <sys/ioctl.h>
28#endif
29#if defined(__sun__) || defined(__HAIKU__)
30#include <sys/ioccom.h>
31#endif
32#include <ctype.h>
33#include <inttypes.h>
34
35#ifdef __linux__
36#include <linux/fs.h>
37#endif
38
39#include "qemu/sockets.h"
40#include "qemu/queue.h"
41#include "qemu/main-loop.h"
42
43
44
/*
 * Diagnostic helpers.
 *
 * LOG() always writes to stderr, prefixing the message with the source
 * file, function and line.  TRACE() forwards to LOG() only when DEBUG_NBD
 * is defined; otherwise it expands to an empty statement.
 * `msg` must be a string literal because it is concatenated at compile time.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while (0)

#ifndef DEBUG_NBD
#define TRACE(msg, ...) \
    do { } while (0)
#else
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while (0)
#endif
58
59
60
61
62
63
64
/*
 * Sizes of the fixed request and reply headers, spelled as the sum of their
 * fields in wire order.
 */
#define NBD_REQUEST_SIZE (4 + 4 + 8 + 8 + 4) /* magic, type, handle, from, len */
#define NBD_REPLY_SIZE (4 + 4 + 8) /* magic, error, handle */
/* Magic values that open every request and reply header. */
#define NBD_REQUEST_MAGIC 0x25609513
#define NBD_REPLY_MAGIC 0x67446698
/* Handshake magics; the bytes of NBD_OPTS_MAGIC spell "IHAVEOPT" in ASCII. */
#define NBD_OPTS_MAGIC 0x49484156454F5054LL
#define NBD_CLIENT_MAGIC 0x0000420281861253LL
/* Magic that opens each option reply written by nbd_send_rep*(). */
#define NBD_REP_MAGIC 0x3e889045565a9LL

/* ioctl request numbers issued on the NBD device file descriptor. */
#define NBD_SET_SOCK _IO(0xab, 0)
#define NBD_SET_BLKSIZE _IO(0xab, 1)
#define NBD_SET_SIZE _IO(0xab, 2)
#define NBD_DO_IT _IO(0xab, 3)
#define NBD_CLEAR_SOCK _IO(0xab, 4)
#define NBD_CLEAR_QUE _IO(0xab, 5)
#define NBD_PRINT_DEBUG _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS _IO(0xab, 7)
#define NBD_DISCONNECT _IO(0xab, 8)
#define NBD_SET_TIMEOUT _IO(0xab, 9)
#define NBD_SET_FLAGS _IO(0xab, 10)

/* Option codes dispatched by nbd_receive_options(). */
#define NBD_OPT_EXPORT_NAME (1)
#define NBD_OPT_ABORT (2)
#define NBD_OPT_LIST (3)
88
89
90
91
92
/*
 * Error codes as they appear in the `error` field of a reply header.
 * Host errno values are translated to and from this fixed set so that the
 * value on the wire does not depend on the host's errno numbering.
 */
#define NBD_SUCCESS 0
#define NBD_EPERM 1
#define NBD_EIO 5
#define NBD_ENOMEM 12
#define NBD_EINVAL 22
#define NBD_ENOSPC 28

/*
 * Translate a host errno (positive, or 0 for success) into the NBD wire
 * error code.  Anything without a dedicated wire code becomes NBD_EINVAL.
 */
static int system_errno_to_nbd_errno(int err)
{
    switch (err) {
    case 0:
        return NBD_SUCCESS;
    case EPERM:
    case EROFS:
        /*
         * Writes to a read-only export are reported with EROFS by the
         * server; report them as "not permitted" instead of letting them
         * fall through to the generic EINVAL.
         */
        return NBD_EPERM;
    case EIO:
        return NBD_EIO;
    case ENOMEM:
        return NBD_ENOMEM;
#ifdef EDQUOT
    case EDQUOT:
#endif
    case EFBIG:
    case ENOSPC:
        /* All "out of room" conditions share one wire code. */
        return NBD_ENOSPC;
    case EINVAL:
    default:
        return NBD_EINVAL;
    }
}

/*
 * Translate an NBD wire error code into a host errno (positive, or 0 for
 * success).  Unknown wire codes become EINVAL.
 */
static int nbd_errno_to_system_errno(int err)
{
    switch (err) {
    case NBD_SUCCESS:
        return 0;
    case NBD_EPERM:
        return EPERM;
    case NBD_EIO:
        return EIO;
    case NBD_ENOMEM:
        return ENOMEM;
    case NBD_ENOSPC:
        return ENOSPC;
    case NBD_EINVAL:
    default:
        return EINVAL;
    }
}
141
142
143
typedef struct NBDRequest NBDRequest;

/* One in-flight request of a client; created by nbd_request_get(). */
struct NBDRequest {
    QSIMPLEQ_ENTRY(NBDRequest) entry; /* queue linkage */
    NBDClient *client;                /* owning client; a reference is held */
    uint8_t *data;                    /* payload buffer, released with qemu_vfree() */
};
151
/* A block backend made available to NBD clients. */
struct NBDExport {
    int refcount;                  /* managed by nbd_export_get()/nbd_export_put() */
    void (*close)(NBDExport *exp); /* optional; called when the last reference is dropped */

    BlockBackend *blk;             /* backend serving the data; referenced with blk_ref() */
    char *name;                    /* g_strdup()ed name, NULL while not in `exports` */
    off_t dev_offset;              /* added to the request offset before accessing blk */
    off_t size;                    /* exported length, rounded down to a sector multiple */
    uint32_t nbdflags;             /* flags advertised during negotiation */
    QTAILQ_HEAD(, NBDClient) clients; /* clients attached to this export */
    QTAILQ_ENTRY(NBDExport) next;     /* linkage in the `exports` list */

    AioContext *ctx;               /* context used for the clients' fd handlers; NULL while detached */
};
166
/* Named exports; entries are linked and unlinked by nbd_export_set_name(). */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
168
/* State of one connection served by this process. */
struct NBDClient {
    int refcount;                     /* managed by nbd_client_get()/nbd_client_put() */
    void (*close)(NBDClient *client); /* optional; called once from client_close() */

    NBDExport *exp;                   /* export in use; may be set during option negotiation */
    int sock;                         /* connection socket, set to -1 once closed */

    Coroutine *recv_coroutine;        /* coroutine currently receiving a request, or NULL */

    CoMutex send_lock;                /* serializes replies on the socket */
    Coroutine *send_coroutine;        /* coroutine currently sending a reply, or NULL */

    bool can_read;                    /* whether the read handler should be installed */

    QTAILQ_ENTRY(NBDClient) next;     /* linkage in exp->clients */
    int nb_requests;                  /* requests currently in flight */
    bool closing;                     /* set by client_close(); no new requests are served */
};
187
188
189
/* Forward declarations: these helpers are defined after their first use. */
static void nbd_set_handlers(NBDClient *client);
static void nbd_unset_handlers(NBDClient *client);
static void nbd_update_can_read(NBDClient *client);
193
/*
 * Transfer exactly `size` bytes between `buffer` and `fd`.
 *
 * do_read selects the direction (true: receive, false: send).  Inside a
 * coroutine the work is delegated to qemu_co_recv()/qemu_co_send().
 * Otherwise the transfer is retried until everything has moved, the peer
 * reports a zero-length transfer, or an unrecoverable error occurs.
 *
 * Returns the number of bytes transferred (short only when a transfer of
 * zero bytes ended the loop), or a negative errno value.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    size_t done = 0;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        uint8_t *cursor = (uint8_t *)buffer + done;
        size_t remaining = size - done;
        ssize_t n;
        int err;

        if (do_read) {
            n = qemu_recv(fd, cursor, remaining, 0);
        } else {
            n = send(fd, cursor, remaining, 0);
        }

        if (n > 0) {
            done += n;
            continue;
        }

        if (n == 0) {
            /* nothing more will be transferred */
            break;
        }

        err = socket_error();

        /* interrupted calls are always retried */
        if (err == EINTR) {
            continue;
        }

        /* once part of the data has moved, would-block is retried too */
        if (done > 0 && (err == EAGAIN || err == EWOULDBLOCK)) {
            continue;
        }

        /* unrecoverable */
        return -err;
    }

    return done;
}
238
/*
 * Receive `size` bytes from `fd` into `buffer` via nbd_wr_sync().
 *
 * The result is passed through unchanged: a byte count, or a negative
 * errno value.  Unlike write_sync(), -EAGAIN is not retried here.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    const bool receiving = true;

    return nbd_wr_sync(fd, buffer, size, receiving);
}
248
/*
 * Read and discard `size` bytes from `fd`.
 *
 * Data is consumed through a scratch buffer of at most 64 KiB.
 *
 * Returns the number of bytes discarded, which equals `size` on success and
 * is smaller when the peer stopped sending early, or a negative errno value
 * if reading failed.  Callers compare the result against `size`.
 */
static ssize_t drop_sync(int fd, size_t size)
{
    const size_t chunk = MIN(65536, size);
    size_t remaining = size;
    ssize_t dropped = 0;
    uint8_t *buffer = g_malloc(chunk);

    while (remaining > 0) {
        ssize_t ret = read_sync(fd, buffer, MIN(chunk, remaining));

        if (ret < 0) {
            dropped = ret;
            break;
        }
        if (ret == 0) {
            /*
             * No more data will arrive.  Without this check the loop would
             * never terminate, because `remaining` would stop shrinking.
             */
            break;
        }

        assert((size_t)ret <= remaining);
        remaining -= ret;
        dropped += ret;
    }

    g_free(buffer);
    return dropped;
}
268
/*
 * Send `size` bytes from `buffer` to `fd` via nbd_wr_sync().
 *
 * A result of -EAGAIN is retried until the call makes progress or fails
 * differently.  Returns the byte count reported by nbd_wr_sync(), or a
 * negative errno value.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    /* ssize_t, not int: keep the full width of nbd_wr_sync()'s result */
    ssize_t ret;

    do {
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);

    return ret;
}
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306static int nbd_send_rep(int csock, uint32_t type, uint32_t opt)
307{
308 uint64_t magic;
309 uint32_t len;
310
311 magic = cpu_to_be64(NBD_REP_MAGIC);
312 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
313 LOG("write failed (rep magic)");
314 return -EINVAL;
315 }
316 opt = cpu_to_be32(opt);
317 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
318 LOG("write failed (rep opt)");
319 return -EINVAL;
320 }
321 type = cpu_to_be32(type);
322 if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
323 LOG("write failed (rep type)");
324 return -EINVAL;
325 }
326 len = cpu_to_be32(0);
327 if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
328 LOG("write failed (rep data length)");
329 return -EINVAL;
330 }
331 return 0;
332}
333
334static int nbd_send_rep_list(int csock, NBDExport *exp)
335{
336 uint64_t magic, name_len;
337 uint32_t opt, type, len;
338
339 name_len = strlen(exp->name);
340 magic = cpu_to_be64(NBD_REP_MAGIC);
341 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
342 LOG("write failed (magic)");
343 return -EINVAL;
344 }
345 opt = cpu_to_be32(NBD_OPT_LIST);
346 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
347 LOG("write failed (opt)");
348 return -EINVAL;
349 }
350 type = cpu_to_be32(NBD_REP_SERVER);
351 if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
352 LOG("write failed (reply type)");
353 return -EINVAL;
354 }
355 len = cpu_to_be32(name_len + sizeof(len));
356 if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
357 LOG("write failed (length)");
358 return -EINVAL;
359 }
360 len = cpu_to_be32(name_len);
361 if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
362 LOG("write failed (length)");
363 return -EINVAL;
364 }
365 if (write_sync(csock, exp->name, name_len) != name_len) {
366 LOG("write failed (buffer)");
367 return -EINVAL;
368 }
369 return 0;
370}
371
372static int nbd_handle_list(NBDClient *client, uint32_t length)
373{
374 int csock;
375 NBDExport *exp;
376
377 csock = client->sock;
378 if (length) {
379 if (drop_sync(csock, length) != length) {
380 return -EIO;
381 }
382 return nbd_send_rep(csock, NBD_REP_ERR_INVALID, NBD_OPT_LIST);
383 }
384
385
386 QTAILQ_FOREACH(exp, &exports, next) {
387 if (nbd_send_rep_list(csock, exp)) {
388 return -EINVAL;
389 }
390 }
391
392 return nbd_send_rep(csock, NBD_REP_ACK, NBD_OPT_LIST);
393}
394
395static int nbd_handle_export_name(NBDClient *client, uint32_t length)
396{
397 int rc = -EINVAL, csock = client->sock;
398 char name[256];
399
400
401
402
403 TRACE("Checking length");
404 if (length > 255) {
405 LOG("Bad length received");
406 goto fail;
407 }
408 if (read_sync(csock, name, length) != length) {
409 LOG("read failed");
410 goto fail;
411 }
412 name[length] = '\0';
413
414 client->exp = nbd_export_find(name);
415 if (!client->exp) {
416 LOG("export not found");
417 goto fail;
418 }
419
420 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
421 nbd_export_get(client->exp);
422 rc = 0;
423fail:
424 return rc;
425}
426
427static int nbd_receive_options(NBDClient *client)
428{
429 int csock = client->sock;
430 uint32_t flags;
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446 if (read_sync(csock, &flags, sizeof(flags)) != sizeof(flags)) {
447 LOG("read failed");
448 return -EIO;
449 }
450 TRACE("Checking client flags");
451 be32_to_cpus(&flags);
452 if (flags != 0 && flags != NBD_FLAG_C_FIXED_NEWSTYLE) {
453 LOG("Bad client flags received");
454 return -EIO;
455 }
456
457 while (1) {
458 int ret;
459 uint32_t tmp, length;
460 uint64_t magic;
461
462 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
463 LOG("read failed");
464 return -EINVAL;
465 }
466 TRACE("Checking opts magic");
467 if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
468 LOG("Bad magic received");
469 return -EINVAL;
470 }
471
472 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
473 LOG("read failed");
474 return -EINVAL;
475 }
476
477 if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
478 LOG("read failed");
479 return -EINVAL;
480 }
481 length = be32_to_cpu(length);
482
483 TRACE("Checking option");
484 switch (be32_to_cpu(tmp)) {
485 case NBD_OPT_LIST:
486 ret = nbd_handle_list(client, length);
487 if (ret < 0) {
488 return ret;
489 }
490 break;
491
492 case NBD_OPT_ABORT:
493 return -EINVAL;
494
495 case NBD_OPT_EXPORT_NAME:
496 return nbd_handle_export_name(client, length);
497
498 default:
499 tmp = be32_to_cpu(tmp);
500 LOG("Unsupported option 0x%x", tmp);
501 nbd_send_rep(client->sock, NBD_REP_ERR_UNSUP, tmp);
502 return -EINVAL;
503 }
504 }
505}
506
507static int nbd_send_negotiate(NBDClient *client)
508{
509 int csock = client->sock;
510 char buf[8 + 8 + 8 + 128];
511 int rc;
512 const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
513 NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534 qemu_set_block(csock);
535 rc = -EINVAL;
536
537 TRACE("Beginning negotiation.");
538 memset(buf, 0, sizeof(buf));
539 memcpy(buf, "NBDMAGIC", 8);
540 if (client->exp) {
541 assert ((client->exp->nbdflags & ~65535) == 0);
542 cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
543 cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
544 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
545 } else {
546 cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);
547 cpu_to_be16w((uint16_t *)(buf + 16), NBD_FLAG_FIXED_NEWSTYLE);
548 }
549
550 if (client->exp) {
551 if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
552 LOG("write failed");
553 goto fail;
554 }
555 } else {
556 if (write_sync(csock, buf, 18) != 18) {
557 LOG("write failed");
558 goto fail;
559 }
560 rc = nbd_receive_options(client);
561 if (rc != 0) {
562 LOG("option negotiation failed");
563 goto fail;
564 }
565
566 assert ((client->exp->nbdflags & ~65535) == 0);
567 cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
568 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
569 if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
570 LOG("write failed");
571 goto fail;
572 }
573 }
574
575 TRACE("Negotiation succeeded.");
576 rc = 0;
577fail:
578 qemu_set_nonblock(csock);
579 return rc;
580}
581
582int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
583 off_t *size, Error **errp)
584{
585 char buf[256];
586 uint64_t magic, s;
587 uint16_t tmp;
588 int rc;
589
590 TRACE("Receiving negotiation.");
591
592 rc = -EINVAL;
593
594 if (read_sync(csock, buf, 8) != 8) {
595 error_setg(errp, "Failed to read data");
596 goto fail;
597 }
598
599 buf[8] = '\0';
600 if (strlen(buf) == 0) {
601 error_setg(errp, "Server connection closed unexpectedly");
602 goto fail;
603 }
604
605 TRACE("Magic is %c%c%c%c%c%c%c%c",
606 qemu_isprint(buf[0]) ? buf[0] : '.',
607 qemu_isprint(buf[1]) ? buf[1] : '.',
608 qemu_isprint(buf[2]) ? buf[2] : '.',
609 qemu_isprint(buf[3]) ? buf[3] : '.',
610 qemu_isprint(buf[4]) ? buf[4] : '.',
611 qemu_isprint(buf[5]) ? buf[5] : '.',
612 qemu_isprint(buf[6]) ? buf[6] : '.',
613 qemu_isprint(buf[7]) ? buf[7] : '.');
614
615 if (memcmp(buf, "NBDMAGIC", 8) != 0) {
616 error_setg(errp, "Invalid magic received");
617 goto fail;
618 }
619
620 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
621 error_setg(errp, "Failed to read magic");
622 goto fail;
623 }
624 magic = be64_to_cpu(magic);
625 TRACE("Magic is 0x%" PRIx64, magic);
626
627 if (name) {
628 uint32_t reserved = 0;
629 uint32_t opt;
630 uint32_t namesize;
631
632 TRACE("Checking magic (opts_magic)");
633 if (magic != NBD_OPTS_MAGIC) {
634 if (magic == NBD_CLIENT_MAGIC) {
635 error_setg(errp, "Server does not support export names");
636 } else {
637 error_setg(errp, "Bad magic received");
638 }
639 goto fail;
640 }
641 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
642 error_setg(errp, "Failed to read server flags");
643 goto fail;
644 }
645 *flags = be16_to_cpu(tmp) << 16;
646
647 if (write_sync(csock, &reserved, sizeof(reserved)) !=
648 sizeof(reserved)) {
649 error_setg(errp, "Failed to read reserved field");
650 goto fail;
651 }
652
653 magic = cpu_to_be64(magic);
654 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
655 error_setg(errp, "Failed to send export name magic");
656 goto fail;
657 }
658 opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
659 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
660 error_setg(errp, "Failed to send export name option number");
661 goto fail;
662 }
663 namesize = cpu_to_be32(strlen(name));
664 if (write_sync(csock, &namesize, sizeof(namesize)) !=
665 sizeof(namesize)) {
666 error_setg(errp, "Failed to send export name length");
667 goto fail;
668 }
669 if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
670 error_setg(errp, "Failed to send export name");
671 goto fail;
672 }
673 } else {
674 TRACE("Checking magic (cli_magic)");
675
676 if (magic != NBD_CLIENT_MAGIC) {
677 if (magic == NBD_OPTS_MAGIC) {
678 error_setg(errp, "Server requires an export name");
679 } else {
680 error_setg(errp, "Bad magic received");
681 }
682 goto fail;
683 }
684 }
685
686 if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
687 error_setg(errp, "Failed to read export length");
688 goto fail;
689 }
690 *size = be64_to_cpu(s);
691 TRACE("Size is %" PRIu64, *size);
692
693 if (!name) {
694 if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
695 error_setg(errp, "Failed to read export flags");
696 goto fail;
697 }
698 *flags = be32_to_cpup(flags);
699 } else {
700 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
701 error_setg(errp, "Failed to read export flags");
702 goto fail;
703 }
704 *flags |= be16_to_cpu(tmp);
705 }
706 if (read_sync(csock, &buf, 124) != 124) {
707 error_setg(errp, "Failed to read reserved block");
708 goto fail;
709 }
710 rc = 0;
711
712fail:
713 return rc;
714}
715
716#ifdef __linux__
717int nbd_init(int fd, int csock, uint32_t flags, off_t size)
718{
719 TRACE("Setting NBD socket");
720
721 if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
722 int serrno = errno;
723 LOG("Failed to set NBD socket");
724 return -serrno;
725 }
726
727 TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
728
729 if (ioctl(fd, NBD_SET_BLKSIZE, (size_t)BDRV_SECTOR_SIZE) < 0) {
730 int serrno = errno;
731 LOG("Failed setting NBD block size");
732 return -serrno;
733 }
734
735 TRACE("Setting size to %zd block(s)", (size_t)(size / BDRV_SECTOR_SIZE));
736
737 if (ioctl(fd, NBD_SET_SIZE_BLOCKS, (size_t)(size / BDRV_SECTOR_SIZE)) < 0) {
738 int serrno = errno;
739 LOG("Failed setting size (in blocks)");
740 return -serrno;
741 }
742
743 if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
744 if (errno == ENOTTY) {
745 int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
746 TRACE("Setting readonly attribute");
747
748 if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
749 int serrno = errno;
750 LOG("Failed setting read-only attribute");
751 return -serrno;
752 }
753 } else {
754 int serrno = errno;
755 LOG("Failed setting flags");
756 return -serrno;
757 }
758 }
759
760 TRACE("Negotiation ended");
761
762 return 0;
763}
764
765int nbd_disconnect(int fd)
766{
767 ioctl(fd, NBD_CLEAR_QUE);
768 ioctl(fd, NBD_DISCONNECT);
769 ioctl(fd, NBD_CLEAR_SOCK);
770 return 0;
771}
772
773int nbd_client(int fd)
774{
775 int ret;
776 int serrno;
777
778 TRACE("Doing NBD loop");
779
780 ret = ioctl(fd, NBD_DO_IT);
781 if (ret < 0 && errno == EPIPE) {
782
783
784
785
786 ret = 0;
787 }
788 serrno = errno;
789
790 TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
791
792 TRACE("Clearing NBD queue");
793 ioctl(fd, NBD_CLEAR_QUE);
794
795 TRACE("Clearing NBD socket");
796 ioctl(fd, NBD_CLEAR_SOCK);
797
798 errno = serrno;
799 return ret;
800}
801#else
/* Stub for hosts other than Linux: always fails with -ENOTSUP. */
int nbd_init(int fd, int csock, uint32_t flags, off_t size)
{
    (void)fd;
    (void)csock;
    (void)flags;
    (void)size;

    return -ENOTSUP;
}
806
/* Stub for hosts other than Linux: always fails with -ENOTSUP. */
int nbd_disconnect(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
811
/* Stub for hosts other than Linux: always fails with -ENOTSUP. */
int nbd_client(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
816#endif
817
818ssize_t nbd_send_request(int csock, struct nbd_request *request)
819{
820 uint8_t buf[NBD_REQUEST_SIZE];
821 ssize_t ret;
822
823 cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
824 cpu_to_be32w((uint32_t*)(buf + 4), request->type);
825 cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
826 cpu_to_be64w((uint64_t*)(buf + 16), request->from);
827 cpu_to_be32w((uint32_t*)(buf + 24), request->len);
828
829 TRACE("Sending request to client: "
830 "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
831 request->from, request->len, request->handle, request->type);
832
833 ret = write_sync(csock, buf, sizeof(buf));
834 if (ret < 0) {
835 return ret;
836 }
837
838 if (ret != sizeof(buf)) {
839 LOG("writing to socket failed");
840 return -EINVAL;
841 }
842 return 0;
843}
844
845static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
846{
847 uint8_t buf[NBD_REQUEST_SIZE];
848 uint32_t magic;
849 ssize_t ret;
850
851 ret = read_sync(csock, buf, sizeof(buf));
852 if (ret < 0) {
853 return ret;
854 }
855
856 if (ret != sizeof(buf)) {
857 LOG("read failed");
858 return -EINVAL;
859 }
860
861
862
863
864
865
866
867
868
869 magic = be32_to_cpup((uint32_t*)buf);
870 request->type = be32_to_cpup((uint32_t*)(buf + 4));
871 request->handle = be64_to_cpup((uint64_t*)(buf + 8));
872 request->from = be64_to_cpup((uint64_t*)(buf + 16));
873 request->len = be32_to_cpup((uint32_t*)(buf + 24));
874
875 TRACE("Got request: "
876 "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
877 magic, request->type, request->from, request->len);
878
879 if (magic != NBD_REQUEST_MAGIC) {
880 LOG("invalid magic (got 0x%x)", magic);
881 return -EINVAL;
882 }
883 return 0;
884}
885
886ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
887{
888 uint8_t buf[NBD_REPLY_SIZE];
889 uint32_t magic;
890 ssize_t ret;
891
892 ret = read_sync(csock, buf, sizeof(buf));
893 if (ret < 0) {
894 return ret;
895 }
896
897 if (ret != sizeof(buf)) {
898 LOG("read failed");
899 return -EINVAL;
900 }
901
902
903
904
905
906
907
908 magic = be32_to_cpup((uint32_t*)buf);
909 reply->error = be32_to_cpup((uint32_t*)(buf + 4));
910 reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
911
912 reply->error = nbd_errno_to_system_errno(reply->error);
913
914 TRACE("Got reply: "
915 "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
916 magic, reply->error, reply->handle);
917
918 if (magic != NBD_REPLY_MAGIC) {
919 LOG("invalid magic (got 0x%x)", magic);
920 return -EINVAL;
921 }
922 return 0;
923}
924
925static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
926{
927 uint8_t buf[NBD_REPLY_SIZE];
928 ssize_t ret;
929
930 reply->error = system_errno_to_nbd_errno(reply->error);
931
932
933
934
935
936
937 cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
938 cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
939 cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
940
941 TRACE("Sending response to client");
942
943 ret = write_sync(csock, buf, sizeof(buf));
944 if (ret < 0) {
945 return ret;
946 }
947
948 if (ret != sizeof(buf)) {
949 LOG("writing to socket failed");
950 return -EINVAL;
951 }
952 return 0;
953}
954
/*
 * Upper bound on requests in flight per client; once reached,
 * nbd_update_can_read() stops installing the read handler.
 */
#define MAX_NBD_REQUESTS 16
956
957void nbd_client_get(NBDClient *client)
958{
959 client->refcount++;
960}
961
962void nbd_client_put(NBDClient *client)
963{
964 if (--client->refcount == 0) {
965
966
967
968 assert(client->closing);
969
970 nbd_unset_handlers(client);
971 close(client->sock);
972 client->sock = -1;
973 if (client->exp) {
974 QTAILQ_REMOVE(&client->exp->clients, client, next);
975 nbd_export_put(client->exp);
976 }
977 g_free(client);
978 }
979}
980
981static void client_close(NBDClient *client)
982{
983 if (client->closing) {
984 return;
985 }
986
987 client->closing = true;
988
989
990
991
992 shutdown(client->sock, 2);
993
994
995 if (client->close) {
996 client->close(client);
997 }
998}
999
1000static NBDRequest *nbd_request_get(NBDClient *client)
1001{
1002 NBDRequest *req;
1003
1004 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
1005 client->nb_requests++;
1006 nbd_update_can_read(client);
1007
1008 req = g_new0(NBDRequest, 1);
1009 nbd_client_get(client);
1010 req->client = client;
1011 return req;
1012}
1013
1014static void nbd_request_put(NBDRequest *req)
1015{
1016 NBDClient *client = req->client;
1017
1018 if (req->data) {
1019 qemu_vfree(req->data);
1020 }
1021 g_free(req);
1022
1023 client->nb_requests--;
1024 nbd_update_can_read(client);
1025 nbd_client_put(client);
1026}
1027
1028static void blk_aio_attached(AioContext *ctx, void *opaque)
1029{
1030 NBDExport *exp = opaque;
1031 NBDClient *client;
1032
1033 TRACE("Export %s: Attaching clients to AIO context %p\n", exp->name, ctx);
1034
1035 exp->ctx = ctx;
1036
1037 QTAILQ_FOREACH(client, &exp->clients, next) {
1038 nbd_set_handlers(client);
1039 }
1040}
1041
1042static void blk_aio_detach(void *opaque)
1043{
1044 NBDExport *exp = opaque;
1045 NBDClient *client;
1046
1047 TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
1048
1049 QTAILQ_FOREACH(client, &exp->clients, next) {
1050 nbd_unset_handlers(client);
1051 }
1052
1053 exp->ctx = NULL;
1054}
1055
1056NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
1057 uint32_t nbdflags, void (*close)(NBDExport *),
1058 Error **errp)
1059{
1060 NBDExport *exp = g_malloc0(sizeof(NBDExport));
1061 exp->refcount = 1;
1062 QTAILQ_INIT(&exp->clients);
1063 exp->blk = blk;
1064 exp->dev_offset = dev_offset;
1065 exp->nbdflags = nbdflags;
1066 exp->size = size < 0 ? blk_getlength(blk) : size;
1067 if (exp->size < 0) {
1068 error_setg_errno(errp, -exp->size,
1069 "Failed to determine the NBD export's length");
1070 goto fail;
1071 }
1072 exp->size -= exp->size % BDRV_SECTOR_SIZE;
1073
1074 exp->close = close;
1075 exp->ctx = blk_get_aio_context(blk);
1076 blk_ref(blk);
1077 blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
1078
1079
1080
1081
1082
1083 blk_invalidate_cache(blk, NULL);
1084 return exp;
1085
1086fail:
1087 g_free(exp);
1088 return NULL;
1089}
1090
1091NBDExport *nbd_export_find(const char *name)
1092{
1093 NBDExport *exp;
1094 QTAILQ_FOREACH(exp, &exports, next) {
1095 if (strcmp(name, exp->name) == 0) {
1096 return exp;
1097 }
1098 }
1099
1100 return NULL;
1101}
1102
1103void nbd_export_set_name(NBDExport *exp, const char *name)
1104{
1105 if (exp->name == name) {
1106 return;
1107 }
1108
1109 nbd_export_get(exp);
1110 if (exp->name != NULL) {
1111 g_free(exp->name);
1112 exp->name = NULL;
1113 QTAILQ_REMOVE(&exports, exp, next);
1114 nbd_export_put(exp);
1115 }
1116 if (name != NULL) {
1117 nbd_export_get(exp);
1118 exp->name = g_strdup(name);
1119 QTAILQ_INSERT_TAIL(&exports, exp, next);
1120 }
1121 nbd_export_put(exp);
1122}
1123
1124void nbd_export_close(NBDExport *exp)
1125{
1126 NBDClient *client, *next;
1127
1128 nbd_export_get(exp);
1129 QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
1130 client_close(client);
1131 }
1132 nbd_export_set_name(exp, NULL);
1133 nbd_export_put(exp);
1134}
1135
1136void nbd_export_get(NBDExport *exp)
1137{
1138 assert(exp->refcount > 0);
1139 exp->refcount++;
1140}
1141
1142void nbd_export_put(NBDExport *exp)
1143{
1144 assert(exp->refcount > 0);
1145 if (exp->refcount == 1) {
1146 nbd_export_close(exp);
1147 }
1148
1149 if (--exp->refcount == 0) {
1150 assert(exp->name == NULL);
1151
1152 if (exp->close) {
1153 exp->close(exp);
1154 }
1155
1156 if (exp->blk) {
1157 blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
1158 blk_aio_detach, exp);
1159 blk_unref(exp->blk);
1160 exp->blk = NULL;
1161 }
1162
1163 g_free(exp);
1164 }
1165}
1166
1167BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
1168{
1169 return exp->blk;
1170}
1171
1172void nbd_export_close_all(void)
1173{
1174 NBDExport *exp, *next;
1175
1176 QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
1177 nbd_export_close(exp);
1178 }
1179}
1180
1181static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
1182 int len)
1183{
1184 NBDClient *client = req->client;
1185 int csock = client->sock;
1186 ssize_t rc, ret;
1187
1188 qemu_co_mutex_lock(&client->send_lock);
1189 client->send_coroutine = qemu_coroutine_self();
1190 nbd_set_handlers(client);
1191
1192 if (!len) {
1193 rc = nbd_send_reply(csock, reply);
1194 } else {
1195 socket_set_cork(csock, 1);
1196 rc = nbd_send_reply(csock, reply);
1197 if (rc >= 0) {
1198 ret = qemu_co_send(csock, req->data, len);
1199 if (ret != len) {
1200 rc = -EIO;
1201 }
1202 }
1203 socket_set_cork(csock, 0);
1204 }
1205
1206 client->send_coroutine = NULL;
1207 nbd_set_handlers(client);
1208 qemu_co_mutex_unlock(&client->send_lock);
1209 return rc;
1210}
1211
1212static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
1213{
1214 NBDClient *client = req->client;
1215 int csock = client->sock;
1216 uint32_t command;
1217 ssize_t rc;
1218
1219 client->recv_coroutine = qemu_coroutine_self();
1220 nbd_update_can_read(client);
1221
1222 rc = nbd_receive_request(csock, request);
1223 if (rc < 0) {
1224 if (rc != -EAGAIN) {
1225 rc = -EIO;
1226 }
1227 goto out;
1228 }
1229
1230 if (request->len > NBD_MAX_BUFFER_SIZE) {
1231 LOG("len (%u) is larger than max len (%u)",
1232 request->len, NBD_MAX_BUFFER_SIZE);
1233 rc = -EINVAL;
1234 goto out;
1235 }
1236
1237 if ((request->from + request->len) < request->from) {
1238 LOG("integer overflow detected! "
1239 "you're probably being attacked");
1240 rc = -EINVAL;
1241 goto out;
1242 }
1243
1244 TRACE("Decoding type");
1245
1246 command = request->type & NBD_CMD_MASK_COMMAND;
1247 if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
1248 req->data = blk_blockalign(client->exp->blk, request->len);
1249 }
1250 if (command == NBD_CMD_WRITE) {
1251 TRACE("Reading %u byte(s)", request->len);
1252
1253 if (qemu_co_recv(csock, req->data, request->len) != request->len) {
1254 LOG("reading from socket failed");
1255 rc = -EIO;
1256 goto out;
1257 }
1258 }
1259 rc = 0;
1260
1261out:
1262 client->recv_coroutine = NULL;
1263 nbd_update_can_read(client);
1264
1265 return rc;
1266}
1267
1268static void nbd_trip(void *opaque)
1269{
1270 NBDClient *client = opaque;
1271 NBDExport *exp = client->exp;
1272 NBDRequest *req;
1273 struct nbd_request request;
1274 struct nbd_reply reply;
1275 ssize_t ret;
1276 uint32_t command;
1277
1278 TRACE("Reading request.");
1279 if (client->closing) {
1280 return;
1281 }
1282
1283 req = nbd_request_get(client);
1284 ret = nbd_co_receive_request(req, &request);
1285 if (ret == -EAGAIN) {
1286 goto done;
1287 }
1288 if (ret == -EIO) {
1289 goto out;
1290 }
1291
1292 reply.handle = request.handle;
1293 reply.error = 0;
1294
1295 if (ret < 0) {
1296 reply.error = -ret;
1297 goto error_reply;
1298 }
1299 command = request.type & NBD_CMD_MASK_COMMAND;
1300 if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
1301 LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
1302 ", Offset: %" PRIu64 "\n",
1303 request.from, request.len,
1304 (uint64_t)exp->size, (uint64_t)exp->dev_offset);
1305 LOG("requested operation past EOF--bad client?");
1306 goto invalid_request;
1307 }
1308
1309 if (client->closing) {
1310
1311
1312
1313
1314 goto done;
1315 }
1316
1317 switch (command) {
1318 case NBD_CMD_READ:
1319 TRACE("Request type is READ");
1320
1321 if (request.type & NBD_CMD_FLAG_FUA) {
1322 ret = blk_co_flush(exp->blk);
1323 if (ret < 0) {
1324 LOG("flush failed");
1325 reply.error = -ret;
1326 goto error_reply;
1327 }
1328 }
1329
1330 ret = blk_read(exp->blk,
1331 (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
1332 req->data, request.len / BDRV_SECTOR_SIZE);
1333 if (ret < 0) {
1334 LOG("reading from file failed");
1335 reply.error = -ret;
1336 goto error_reply;
1337 }
1338
1339 TRACE("Read %u byte(s)", request.len);
1340 if (nbd_co_send_reply(req, &reply, request.len) < 0)
1341 goto out;
1342 break;
1343 case NBD_CMD_WRITE:
1344 TRACE("Request type is WRITE");
1345
1346 if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
1347 TRACE("Server is read-only, return error");
1348 reply.error = EROFS;
1349 goto error_reply;
1350 }
1351
1352 TRACE("Writing to device");
1353
1354 ret = blk_write(exp->blk,
1355 (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
1356 req->data, request.len / BDRV_SECTOR_SIZE);
1357 if (ret < 0) {
1358 LOG("writing to file failed");
1359 reply.error = -ret;
1360 goto error_reply;
1361 }
1362
1363 if (request.type & NBD_CMD_FLAG_FUA) {
1364 ret = blk_co_flush(exp->blk);
1365 if (ret < 0) {
1366 LOG("flush failed");
1367 reply.error = -ret;
1368 goto error_reply;
1369 }
1370 }
1371
1372 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1373 goto out;
1374 }
1375 break;
1376 case NBD_CMD_DISC:
1377 TRACE("Request type is DISCONNECT");
1378 errno = 0;
1379 goto out;
1380 case NBD_CMD_FLUSH:
1381 TRACE("Request type is FLUSH");
1382
1383 ret = blk_co_flush(exp->blk);
1384 if (ret < 0) {
1385 LOG("flush failed");
1386 reply.error = -ret;
1387 }
1388 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1389 goto out;
1390 }
1391 break;
1392 case NBD_CMD_TRIM:
1393 TRACE("Request type is TRIM");
1394 ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
1395 / BDRV_SECTOR_SIZE,
1396 request.len / BDRV_SECTOR_SIZE);
1397 if (ret < 0) {
1398 LOG("discard failed");
1399 reply.error = -ret;
1400 }
1401 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1402 goto out;
1403 }
1404 break;
1405 default:
1406 LOG("invalid request type (%u) received", request.type);
1407 invalid_request:
1408 reply.error = EINVAL;
1409 error_reply:
1410 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1411 goto out;
1412 }
1413 break;
1414 }
1415
1416 TRACE("Request/Reply complete");
1417
1418done:
1419 nbd_request_put(req);
1420 return;
1421
1422out:
1423 nbd_request_put(req);
1424 client_close(client);
1425}
1426
1427static void nbd_read(void *opaque)
1428{
1429 NBDClient *client = opaque;
1430
1431 if (client->recv_coroutine) {
1432 qemu_coroutine_enter(client->recv_coroutine, NULL);
1433 } else {
1434 qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1435 }
1436}
1437
1438static void nbd_restart_write(void *opaque)
1439{
1440 NBDClient *client = opaque;
1441
1442 qemu_coroutine_enter(client->send_coroutine, NULL);
1443}
1444
1445static void nbd_set_handlers(NBDClient *client)
1446{
1447 if (client->exp && client->exp->ctx) {
1448 aio_set_fd_handler(client->exp->ctx, client->sock,
1449 true,
1450 client->can_read ? nbd_read : NULL,
1451 client->send_coroutine ? nbd_restart_write : NULL,
1452 client);
1453 }
1454}
1455
1456static void nbd_unset_handlers(NBDClient *client)
1457{
1458 if (client->exp && client->exp->ctx) {
1459 aio_set_fd_handler(client->exp->ctx, client->sock,
1460 true, NULL, NULL, NULL);
1461 }
1462}
1463
1464static void nbd_update_can_read(NBDClient *client)
1465{
1466 bool can_read = client->recv_coroutine ||
1467 client->nb_requests < MAX_NBD_REQUESTS;
1468
1469 if (can_read != client->can_read) {
1470 client->can_read = can_read;
1471 nbd_set_handlers(client);
1472
1473
1474
1475 }
1476}
1477
1478NBDClient *nbd_client_new(NBDExport *exp, int csock,
1479 void (*close)(NBDClient *))
1480{
1481 NBDClient *client;
1482 client = g_malloc0(sizeof(NBDClient));
1483 client->refcount = 1;
1484 client->exp = exp;
1485 client->sock = csock;
1486 client->can_read = true;
1487 if (nbd_send_negotiate(client)) {
1488 g_free(client);
1489 return NULL;
1490 }
1491 client->close = close;
1492 qemu_co_mutex_init(&client->send_lock);
1493 nbd_set_handlers(client);
1494
1495 if (exp) {
1496 QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1497 nbd_export_get(exp);
1498 }
1499 return client;
1500}
1501