1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include "block/nbd.h"
20#include "sysemu/block-backend.h"
21
22#include "block/coroutine.h"
23
24#include <errno.h>
25#include <string.h>
26#ifndef _WIN32
27#include <sys/ioctl.h>
28#endif
29#if defined(__sun__) || defined(__HAIKU__)
30#include <sys/ioccom.h>
31#endif
32#include <ctype.h>
33#include <inttypes.h>
34
35#ifdef __linux__
36#include <linux/fs.h>
37#endif
38
39#include "qemu/sockets.h"
40#include "qemu/queue.h"
41#include "qemu/main-loop.h"
42
43
44
/*
 * Diagnostic helpers.
 *
 * LOG() always prints to stderr, prefixing the message with the source
 * file, the enclosing function and the line number, and appending a
 * newline.
 *
 * TRACE() forwards to LOG() only when DEBUG_NBD is defined at build time;
 * otherwise it expands to an empty statement and its arguments are never
 * evaluated.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while(0)

#ifndef DEBUG_NBD
#define TRACE(msg, ...) \
    do { } while (0)
#else
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while(0)
#endif
58
59
60
61
62
63
64
/*
 * Sizes of the fixed request/reply headers as serialized in this file:
 *   request = magic(4) + type(4) + handle(8) + from(8) + len(4)
 *   reply   = magic(4) + error(4) + handle(8)
 */
#define NBD_REQUEST_SIZE (4 + 4 + 8 + 8 + 4)
#define NBD_REPLY_SIZE (4 + 4 + 8)
/* Magic numbers placed at the start of requests, replies and negotiation. */
#define NBD_REQUEST_MAGIC 0x25609513
#define NBD_REPLY_MAGIC 0x67446698
#define NBD_OPTS_MAGIC 0x49484156454F5054LL
#define NBD_CLIENT_MAGIC 0x0000420281861253LL
#define NBD_REP_MAGIC 0x3e889045565a9LL

/* ioctl request codes issued on the NBD device file descriptor. */
#define NBD_SET_SOCK _IO(0xab, 0)
#define NBD_SET_BLKSIZE _IO(0xab, 1)
#define NBD_SET_SIZE _IO(0xab, 2)
#define NBD_DO_IT _IO(0xab, 3)
#define NBD_CLEAR_SOCK _IO(0xab, 4)
#define NBD_CLEAR_QUE _IO(0xab, 5)
#define NBD_PRINT_DEBUG _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS _IO(0xab, 7)
#define NBD_DISCONNECT _IO(0xab, 8)
#define NBD_SET_TIMEOUT _IO(0xab, 9)
#define NBD_SET_FLAGS _IO(0xab, 10)

/* Option codes exchanged during option negotiation. */
#define NBD_OPT_EXPORT_NAME (1)
#define NBD_OPT_ABORT (2)
#define NBD_OPT_LIST (3)
88
89
90
91typedef struct NBDRequest NBDRequest;
92
93struct NBDRequest {
94 QSIMPLEQ_ENTRY(NBDRequest) entry;
95 NBDClient *client;
96 uint8_t *data;
97};
98
/* A block backend exported over NBD, shared by all clients attached to it. */
struct NBDExport {
    int refcount;                   /* managed by nbd_export_get()/nbd_export_put() */
    void (*close)(NBDExport *exp);  /* optional callback run when the last reference goes */

    BlockBackend *blk;              /* backend being served; NULL once the export is closed */
    char *name;                     /* heap copy of the export name; NULL while unnamed */
    off_t dev_offset;               /* added to every request offset before touching blk */
    off_t size;                     /* export length in bytes, rounded down to a whole sector */
    uint32_t nbdflags;              /* flags advertised to clients during negotiation */
    QTAILQ_HEAD(, NBDClient) clients; /* clients currently attached to this export */
    QTAILQ_ENTRY(NBDExport) next;   /* link in the global 'exports' list (named exports only) */

    AioContext *ctx;                /* context the client fd handlers are registered in */
};

/* All exports that currently have a name. */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
115
/* Server-side state of one connected NBD client. */
struct NBDClient {
    int refcount;                       /* managed by nbd_client_get()/nbd_client_put() */
    void (*close)(NBDClient *client);   /* optional callback invoked from client_close() */

    NBDExport *exp;                     /* export served to this client; may be NULL before negotiation */
    int sock;                           /* connected socket; set to -1 when released */

    Coroutine *recv_coroutine;          /* coroutine currently receiving a request, or NULL */

    CoMutex send_lock;                  /* serializes replies on the socket */
    Coroutine *send_coroutine;          /* coroutine currently sending a reply, or NULL */

    bool can_read;                      /* whether the read handler is installed on the socket */

    QTAILQ_ENTRY(NBDClient) next;       /* link in exp->clients */
    int nb_requests;                    /* number of requests currently allocated */
    bool closing;                       /* set once client_close() has started */
};
134
135
136
/* Forward declarations: these helpers are defined after their first use. */
static void nbd_set_handlers(NBDClient *client);
static void nbd_unset_handlers(NBDClient *client);
static void nbd_update_can_read(NBDClient *client);
140
/*
 * Transfer up to @size bytes between socket @fd and @buffer.
 *
 * @do_read selects the direction: true receives into @buffer, false sends
 * from it.  When called from a coroutine the work is delegated to the
 * coroutine socket helpers.  Otherwise the function loops until @size
 * bytes have been moved, the peer closes the connection (short count
 * returned), or an error occurs.
 *
 * EINTR is always retried.  EAGAIN/EWOULDBLOCK is retried only once some
 * bytes have already been transferred, so a caller sees -EAGAIN only when
 * nothing was moved at all.
 *
 * Returns the number of bytes transferred, or a negative errno value.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    size_t done = 0;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        ssize_t n;
        int err;

        if (do_read) {
            n = qemu_recv(fd, buffer + done, size - done, 0);
        } else {
            n = send(fd, buffer + done, size - done, 0);
        }

        /* A zero-length transfer means the peer is gone: report what we have. */
        if (n == 0) {
            break;
        }

        if (n > 0) {
            done += n;
            continue;
        }

        err = socket_error();
        if (err == EINTR) {
            continue;
        }
        /* Mid-transfer "try again" must not surface as a partial result. */
        if (done > 0 && (err == EAGAIN || err == EWOULDBLOCK)) {
            continue;
        }

        return -err;
    }

    return done;
}
185
/*
 * Receive up to @size bytes from @fd into @buffer.
 *
 * Thin wrapper around nbd_wr_sync() in read mode; returns whatever that
 * function returns (byte count, possibly short, or a negative errno).
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    ssize_t received = nbd_wr_sync(fd, buffer, size, true);

    return received;
}
195
/*
 * Read and discard @size bytes from @fd.
 *
 * Data is consumed in chunks of at most 64 KiB through a scratch buffer.
 *
 * Returns the number of bytes actually discarded, or a negative errno value
 * if a read fails.  The count is smaller than @size when the peer closes
 * the connection early; callers compare the result against @size to detect
 * that.  (Previously a zero-length read left the loop spinning forever.)
 */
static ssize_t drop_sync(int fd, size_t size)
{
    ssize_t dropped = 0;
    uint8_t *buffer = g_malloc(MIN(65536, size));

    while (size > 0) {
        ssize_t ret = read_sync(fd, buffer, MIN(65536, size));

        if (ret < 0) {
            dropped = ret;
            break;
        }
        if (ret == 0) {
            /* End of stream: stop instead of retrying indefinitely. */
            break;
        }

        assert((size_t)ret <= size);
        size -= ret;
        dropped += ret;
    }

    g_free(buffer);
    return dropped;
}
215
/*
 * Send @size bytes from @buffer to @fd.
 *
 * The transfer is retried for as long as nbd_wr_sync() reports -EAGAIN
 * (i.e. nothing could be sent yet).  Returns the byte count reported by
 * nbd_wr_sync() or a negative errno value.
 *
 * The result is kept in an ssize_t so that it is not narrowed on its way
 * back to the caller.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    ssize_t ret;

    do {
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);

    return ret;
}
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253static int nbd_send_rep(int csock, uint32_t type, uint32_t opt)
254{
255 uint64_t magic;
256 uint32_t len;
257
258 magic = cpu_to_be64(NBD_REP_MAGIC);
259 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
260 LOG("write failed (rep magic)");
261 return -EINVAL;
262 }
263 opt = cpu_to_be32(opt);
264 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
265 LOG("write failed (rep opt)");
266 return -EINVAL;
267 }
268 type = cpu_to_be32(type);
269 if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
270 LOG("write failed (rep type)");
271 return -EINVAL;
272 }
273 len = cpu_to_be32(0);
274 if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
275 LOG("write failed (rep data length)");
276 return -EINVAL;
277 }
278 return 0;
279}
280
281static int nbd_send_rep_list(int csock, NBDExport *exp)
282{
283 uint64_t magic, name_len;
284 uint32_t opt, type, len;
285
286 name_len = strlen(exp->name);
287 magic = cpu_to_be64(NBD_REP_MAGIC);
288 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
289 LOG("write failed (magic)");
290 return -EINVAL;
291 }
292 opt = cpu_to_be32(NBD_OPT_LIST);
293 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
294 LOG("write failed (opt)");
295 return -EINVAL;
296 }
297 type = cpu_to_be32(NBD_REP_SERVER);
298 if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
299 LOG("write failed (reply type)");
300 return -EINVAL;
301 }
302 len = cpu_to_be32(name_len + sizeof(len));
303 if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
304 LOG("write failed (length)");
305 return -EINVAL;
306 }
307 len = cpu_to_be32(name_len);
308 if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
309 LOG("write failed (length)");
310 return -EINVAL;
311 }
312 if (write_sync(csock, exp->name, name_len) != name_len) {
313 LOG("write failed (buffer)");
314 return -EINVAL;
315 }
316 return 0;
317}
318
319static int nbd_handle_list(NBDClient *client, uint32_t length)
320{
321 int csock;
322 NBDExport *exp;
323
324 csock = client->sock;
325 if (length) {
326 if (drop_sync(csock, length) != length) {
327 return -EIO;
328 }
329 return nbd_send_rep(csock, NBD_REP_ERR_INVALID, NBD_OPT_LIST);
330 }
331
332
333 QTAILQ_FOREACH(exp, &exports, next) {
334 if (nbd_send_rep_list(csock, exp)) {
335 return -EINVAL;
336 }
337 }
338
339 return nbd_send_rep(csock, NBD_REP_ACK, NBD_OPT_LIST);
340}
341
342static int nbd_handle_export_name(NBDClient *client, uint32_t length)
343{
344 int rc = -EINVAL, csock = client->sock;
345 char name[256];
346
347
348
349
350 TRACE("Checking length");
351 if (length > 255) {
352 LOG("Bad length received");
353 goto fail;
354 }
355 if (read_sync(csock, name, length) != length) {
356 LOG("read failed");
357 goto fail;
358 }
359 name[length] = '\0';
360
361 client->exp = nbd_export_find(name);
362 if (!client->exp) {
363 LOG("export not found");
364 goto fail;
365 }
366
367 QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
368 nbd_export_get(client->exp);
369 rc = 0;
370fail:
371 return rc;
372}
373
374static int nbd_receive_options(NBDClient *client)
375{
376 int csock = client->sock;
377 uint32_t flags;
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393 if (read_sync(csock, &flags, sizeof(flags)) != sizeof(flags)) {
394 LOG("read failed");
395 return -EIO;
396 }
397 TRACE("Checking client flags");
398 be32_to_cpus(&flags);
399 if (flags != 0 && flags != NBD_FLAG_C_FIXED_NEWSTYLE) {
400 LOG("Bad client flags received");
401 return -EIO;
402 }
403
404 while (1) {
405 int ret;
406 uint32_t tmp, length;
407 uint64_t magic;
408
409 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
410 LOG("read failed");
411 return -EINVAL;
412 }
413 TRACE("Checking opts magic");
414 if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
415 LOG("Bad magic received");
416 return -EINVAL;
417 }
418
419 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
420 LOG("read failed");
421 return -EINVAL;
422 }
423
424 if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
425 LOG("read failed");
426 return -EINVAL;
427 }
428 length = be32_to_cpu(length);
429
430 TRACE("Checking option");
431 switch (be32_to_cpu(tmp)) {
432 case NBD_OPT_LIST:
433 ret = nbd_handle_list(client, length);
434 if (ret < 0) {
435 return ret;
436 }
437 break;
438
439 case NBD_OPT_ABORT:
440 return -EINVAL;
441
442 case NBD_OPT_EXPORT_NAME:
443 return nbd_handle_export_name(client, length);
444
445 default:
446 tmp = be32_to_cpu(tmp);
447 LOG("Unsupported option 0x%x", tmp);
448 nbd_send_rep(client->sock, NBD_REP_ERR_UNSUP, tmp);
449 return -EINVAL;
450 }
451 }
452}
453
454static int nbd_send_negotiate(NBDClient *client)
455{
456 int csock = client->sock;
457 char buf[8 + 8 + 8 + 128];
458 int rc;
459 const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
460 NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481 qemu_set_block(csock);
482 rc = -EINVAL;
483
484 TRACE("Beginning negotiation.");
485 memset(buf, 0, sizeof(buf));
486 memcpy(buf, "NBDMAGIC", 8);
487 if (client->exp) {
488 assert ((client->exp->nbdflags & ~65535) == 0);
489 cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
490 cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
491 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
492 } else {
493 cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);
494 cpu_to_be16w((uint16_t *)(buf + 16), NBD_FLAG_FIXED_NEWSTYLE);
495 }
496
497 if (client->exp) {
498 if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
499 LOG("write failed");
500 goto fail;
501 }
502 } else {
503 if (write_sync(csock, buf, 18) != 18) {
504 LOG("write failed");
505 goto fail;
506 }
507 rc = nbd_receive_options(client);
508 if (rc != 0) {
509 LOG("option negotiation failed");
510 goto fail;
511 }
512
513 assert ((client->exp->nbdflags & ~65535) == 0);
514 cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
515 cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
516 if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
517 LOG("write failed");
518 goto fail;
519 }
520 }
521
522 TRACE("Negotiation succeeded.");
523 rc = 0;
524fail:
525 qemu_set_nonblock(csock);
526 return rc;
527}
528
529int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
530 off_t *size, Error **errp)
531{
532 char buf[256];
533 uint64_t magic, s;
534 uint16_t tmp;
535 int rc;
536
537 TRACE("Receiving negotiation.");
538
539 rc = -EINVAL;
540
541 if (read_sync(csock, buf, 8) != 8) {
542 error_setg(errp, "Failed to read data");
543 goto fail;
544 }
545
546 buf[8] = '\0';
547 if (strlen(buf) == 0) {
548 error_setg(errp, "Server connection closed unexpectedly");
549 goto fail;
550 }
551
552 TRACE("Magic is %c%c%c%c%c%c%c%c",
553 qemu_isprint(buf[0]) ? buf[0] : '.',
554 qemu_isprint(buf[1]) ? buf[1] : '.',
555 qemu_isprint(buf[2]) ? buf[2] : '.',
556 qemu_isprint(buf[3]) ? buf[3] : '.',
557 qemu_isprint(buf[4]) ? buf[4] : '.',
558 qemu_isprint(buf[5]) ? buf[5] : '.',
559 qemu_isprint(buf[6]) ? buf[6] : '.',
560 qemu_isprint(buf[7]) ? buf[7] : '.');
561
562 if (memcmp(buf, "NBDMAGIC", 8) != 0) {
563 error_setg(errp, "Invalid magic received");
564 goto fail;
565 }
566
567 if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
568 error_setg(errp, "Failed to read magic");
569 goto fail;
570 }
571 magic = be64_to_cpu(magic);
572 TRACE("Magic is 0x%" PRIx64, magic);
573
574 if (name) {
575 uint32_t reserved = 0;
576 uint32_t opt;
577 uint32_t namesize;
578
579 TRACE("Checking magic (opts_magic)");
580 if (magic != NBD_OPTS_MAGIC) {
581 if (magic == NBD_CLIENT_MAGIC) {
582 error_setg(errp, "Server does not support export names");
583 } else {
584 error_setg(errp, "Bad magic received");
585 }
586 goto fail;
587 }
588 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
589 error_setg(errp, "Failed to read server flags");
590 goto fail;
591 }
592 *flags = be16_to_cpu(tmp) << 16;
593
594 if (write_sync(csock, &reserved, sizeof(reserved)) !=
595 sizeof(reserved)) {
596 error_setg(errp, "Failed to read reserved field");
597 goto fail;
598 }
599
600 magic = cpu_to_be64(magic);
601 if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
602 error_setg(errp, "Failed to send export name magic");
603 goto fail;
604 }
605 opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
606 if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
607 error_setg(errp, "Failed to send export name option number");
608 goto fail;
609 }
610 namesize = cpu_to_be32(strlen(name));
611 if (write_sync(csock, &namesize, sizeof(namesize)) !=
612 sizeof(namesize)) {
613 error_setg(errp, "Failed to send export name length");
614 goto fail;
615 }
616 if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
617 error_setg(errp, "Failed to send export name");
618 goto fail;
619 }
620 } else {
621 TRACE("Checking magic (cli_magic)");
622
623 if (magic != NBD_CLIENT_MAGIC) {
624 if (magic == NBD_OPTS_MAGIC) {
625 error_setg(errp, "Server requires an export name");
626 } else {
627 error_setg(errp, "Bad magic received");
628 }
629 goto fail;
630 }
631 }
632
633 if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
634 error_setg(errp, "Failed to read export length");
635 goto fail;
636 }
637 *size = be64_to_cpu(s);
638 TRACE("Size is %" PRIu64, *size);
639
640 if (!name) {
641 if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
642 error_setg(errp, "Failed to read export flags");
643 goto fail;
644 }
645 *flags = be32_to_cpup(flags);
646 } else {
647 if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
648 error_setg(errp, "Failed to read export flags");
649 goto fail;
650 }
651 *flags |= be16_to_cpu(tmp);
652 }
653 if (read_sync(csock, &buf, 124) != 124) {
654 error_setg(errp, "Failed to read reserved block");
655 goto fail;
656 }
657 rc = 0;
658
659fail:
660 return rc;
661}
662
663#ifdef __linux__
664int nbd_init(int fd, int csock, uint32_t flags, off_t size)
665{
666 TRACE("Setting NBD socket");
667
668 if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
669 int serrno = errno;
670 LOG("Failed to set NBD socket");
671 return -serrno;
672 }
673
674 TRACE("Setting block size to %lu", (unsigned long)BDRV_SECTOR_SIZE);
675
676 if (ioctl(fd, NBD_SET_BLKSIZE, (size_t)BDRV_SECTOR_SIZE) < 0) {
677 int serrno = errno;
678 LOG("Failed setting NBD block size");
679 return -serrno;
680 }
681
682 TRACE("Setting size to %zd block(s)", (size_t)(size / BDRV_SECTOR_SIZE));
683
684 if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / (size_t)BDRV_SECTOR_SIZE) < 0) {
685 int serrno = errno;
686 LOG("Failed setting size (in blocks)");
687 return -serrno;
688 }
689
690 if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
691 if (errno == ENOTTY) {
692 int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
693 TRACE("Setting readonly attribute");
694
695 if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
696 int serrno = errno;
697 LOG("Failed setting read-only attribute");
698 return -serrno;
699 }
700 } else {
701 int serrno = errno;
702 LOG("Failed setting flags");
703 return -serrno;
704 }
705 }
706
707 TRACE("Negotiation ended");
708
709 return 0;
710}
711
712int nbd_disconnect(int fd)
713{
714 ioctl(fd, NBD_CLEAR_QUE);
715 ioctl(fd, NBD_DISCONNECT);
716 ioctl(fd, NBD_CLEAR_SOCK);
717 return 0;
718}
719
720int nbd_client(int fd)
721{
722 int ret;
723 int serrno;
724
725 TRACE("Doing NBD loop");
726
727 ret = ioctl(fd, NBD_DO_IT);
728 if (ret < 0 && errno == EPIPE) {
729
730
731
732
733 ret = 0;
734 }
735 serrno = errno;
736
737 TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
738
739 TRACE("Clearing NBD queue");
740 ioctl(fd, NBD_CLEAR_QUE);
741
742 TRACE("Clearing NBD socket");
743 ioctl(fd, NBD_CLEAR_SOCK);
744
745 errno = serrno;
746 return ret;
747}
748#else
/*
 * Kernel NBD devices are only driven on Linux; on every other host this
 * entry point ignores its arguments and reports -ENOTSUP.
 */
int nbd_init(int fd, int csock, uint32_t flags, off_t size)
{
    (void)fd;
    (void)csock;
    (void)flags;
    (void)size;

    return -ENOTSUP;
}
753
/* Not available on this host: ignores @fd and reports -ENOTSUP. */
int nbd_disconnect(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
758
/* Not available on this host: ignores @fd and reports -ENOTSUP. */
int nbd_client(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
763#endif
764
765ssize_t nbd_send_request(int csock, struct nbd_request *request)
766{
767 uint8_t buf[NBD_REQUEST_SIZE];
768 ssize_t ret;
769
770 cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
771 cpu_to_be32w((uint32_t*)(buf + 4), request->type);
772 cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
773 cpu_to_be64w((uint64_t*)(buf + 16), request->from);
774 cpu_to_be32w((uint32_t*)(buf + 24), request->len);
775
776 TRACE("Sending request to client: "
777 "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
778 request->from, request->len, request->handle, request->type);
779
780 ret = write_sync(csock, buf, sizeof(buf));
781 if (ret < 0) {
782 return ret;
783 }
784
785 if (ret != sizeof(buf)) {
786 LOG("writing to socket failed");
787 return -EINVAL;
788 }
789 return 0;
790}
791
792static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
793{
794 uint8_t buf[NBD_REQUEST_SIZE];
795 uint32_t magic;
796 ssize_t ret;
797
798 ret = read_sync(csock, buf, sizeof(buf));
799 if (ret < 0) {
800 return ret;
801 }
802
803 if (ret != sizeof(buf)) {
804 LOG("read failed");
805 return -EINVAL;
806 }
807
808
809
810
811
812
813
814
815
816 magic = be32_to_cpup((uint32_t*)buf);
817 request->type = be32_to_cpup((uint32_t*)(buf + 4));
818 request->handle = be64_to_cpup((uint64_t*)(buf + 8));
819 request->from = be64_to_cpup((uint64_t*)(buf + 16));
820 request->len = be32_to_cpup((uint32_t*)(buf + 24));
821
822 TRACE("Got request: "
823 "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
824 magic, request->type, request->from, request->len);
825
826 if (magic != NBD_REQUEST_MAGIC) {
827 LOG("invalid magic (got 0x%x)", magic);
828 return -EINVAL;
829 }
830 return 0;
831}
832
833ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
834{
835 uint8_t buf[NBD_REPLY_SIZE];
836 uint32_t magic;
837 ssize_t ret;
838
839 ret = read_sync(csock, buf, sizeof(buf));
840 if (ret < 0) {
841 return ret;
842 }
843
844 if (ret != sizeof(buf)) {
845 LOG("read failed");
846 return -EINVAL;
847 }
848
849
850
851
852
853
854
855 magic = be32_to_cpup((uint32_t*)buf);
856 reply->error = be32_to_cpup((uint32_t*)(buf + 4));
857 reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
858
859 TRACE("Got reply: "
860 "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
861 magic, reply->error, reply->handle);
862
863 if (magic != NBD_REPLY_MAGIC) {
864 LOG("invalid magic (got 0x%x)", magic);
865 return -EINVAL;
866 }
867 return 0;
868}
869
870static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
871{
872 uint8_t buf[NBD_REPLY_SIZE];
873 ssize_t ret;
874
875
876
877
878
879
880 cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
881 cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
882 cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
883
884 TRACE("Sending response to client");
885
886 ret = write_sync(csock, buf, sizeof(buf));
887 if (ret < 0) {
888 return ret;
889 }
890
891 if (ret != sizeof(buf)) {
892 LOG("writing to socket failed");
893 return -EINVAL;
894 }
895 return 0;
896}
897
/* Upper bound on the number of requests a single client may have allocated at once. */
#define MAX_NBD_REQUESTS 16
899
900void nbd_client_get(NBDClient *client)
901{
902 client->refcount++;
903}
904
905void nbd_client_put(NBDClient *client)
906{
907 if (--client->refcount == 0) {
908
909
910
911 assert(client->closing);
912
913 nbd_unset_handlers(client);
914 close(client->sock);
915 client->sock = -1;
916 if (client->exp) {
917 QTAILQ_REMOVE(&client->exp->clients, client, next);
918 nbd_export_put(client->exp);
919 }
920 g_free(client);
921 }
922}
923
924static void client_close(NBDClient *client)
925{
926 if (client->closing) {
927 return;
928 }
929
930 client->closing = true;
931
932
933
934
935 shutdown(client->sock, 2);
936
937
938 if (client->close) {
939 client->close(client);
940 }
941}
942
943static NBDRequest *nbd_request_get(NBDClient *client)
944{
945 NBDRequest *req;
946
947 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
948 client->nb_requests++;
949 nbd_update_can_read(client);
950
951 req = g_slice_new0(NBDRequest);
952 nbd_client_get(client);
953 req->client = client;
954 return req;
955}
956
957static void nbd_request_put(NBDRequest *req)
958{
959 NBDClient *client = req->client;
960
961 if (req->data) {
962 qemu_vfree(req->data);
963 }
964 g_slice_free(NBDRequest, req);
965
966 client->nb_requests--;
967 nbd_update_can_read(client);
968 nbd_client_put(client);
969}
970
971static void blk_aio_attached(AioContext *ctx, void *opaque)
972{
973 NBDExport *exp = opaque;
974 NBDClient *client;
975
976 TRACE("Export %s: Attaching clients to AIO context %p\n", exp->name, ctx);
977
978 exp->ctx = ctx;
979
980 QTAILQ_FOREACH(client, &exp->clients, next) {
981 nbd_set_handlers(client);
982 }
983}
984
985static void blk_aio_detach(void *opaque)
986{
987 NBDExport *exp = opaque;
988 NBDClient *client;
989
990 TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
991
992 QTAILQ_FOREACH(client, &exp->clients, next) {
993 nbd_unset_handlers(client);
994 }
995
996 exp->ctx = NULL;
997}
998
999NBDExport *nbd_export_new(BlockBackend *blk, off_t dev_offset, off_t size,
1000 uint32_t nbdflags, void (*close)(NBDExport *),
1001 Error **errp)
1002{
1003 NBDExport *exp = g_malloc0(sizeof(NBDExport));
1004 exp->refcount = 1;
1005 QTAILQ_INIT(&exp->clients);
1006 exp->blk = blk;
1007 exp->dev_offset = dev_offset;
1008 exp->nbdflags = nbdflags;
1009 exp->size = size < 0 ? blk_getlength(blk) : size;
1010 if (exp->size < 0) {
1011 error_setg_errno(errp, -exp->size,
1012 "Failed to determine the NBD export's length");
1013 goto fail;
1014 }
1015 exp->size -= exp->size % BDRV_SECTOR_SIZE;
1016
1017 exp->close = close;
1018 exp->ctx = blk_get_aio_context(blk);
1019 blk_ref(blk);
1020 blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
1021
1022
1023
1024
1025
1026 blk_invalidate_cache(blk, NULL);
1027 return exp;
1028
1029fail:
1030 g_free(exp);
1031 return NULL;
1032}
1033
1034NBDExport *nbd_export_find(const char *name)
1035{
1036 NBDExport *exp;
1037 QTAILQ_FOREACH(exp, &exports, next) {
1038 if (strcmp(name, exp->name) == 0) {
1039 return exp;
1040 }
1041 }
1042
1043 return NULL;
1044}
1045
1046void nbd_export_set_name(NBDExport *exp, const char *name)
1047{
1048 if (exp->name == name) {
1049 return;
1050 }
1051
1052 nbd_export_get(exp);
1053 if (exp->name != NULL) {
1054 g_free(exp->name);
1055 exp->name = NULL;
1056 QTAILQ_REMOVE(&exports, exp, next);
1057 nbd_export_put(exp);
1058 }
1059 if (name != NULL) {
1060 nbd_export_get(exp);
1061 exp->name = g_strdup(name);
1062 QTAILQ_INSERT_TAIL(&exports, exp, next);
1063 }
1064 nbd_export_put(exp);
1065}
1066
1067void nbd_export_close(NBDExport *exp)
1068{
1069 NBDClient *client, *next;
1070
1071 nbd_export_get(exp);
1072 QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
1073 client_close(client);
1074 }
1075 nbd_export_set_name(exp, NULL);
1076 nbd_export_put(exp);
1077 if (exp->blk) {
1078 blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
1079 blk_aio_detach, exp);
1080 blk_unref(exp->blk);
1081 exp->blk = NULL;
1082 }
1083}
1084
1085void nbd_export_get(NBDExport *exp)
1086{
1087 assert(exp->refcount > 0);
1088 exp->refcount++;
1089}
1090
1091void nbd_export_put(NBDExport *exp)
1092{
1093 assert(exp->refcount > 0);
1094 if (exp->refcount == 1) {
1095 nbd_export_close(exp);
1096 }
1097
1098 if (--exp->refcount == 0) {
1099 assert(exp->name == NULL);
1100
1101 if (exp->close) {
1102 exp->close(exp);
1103 }
1104
1105 g_free(exp);
1106 }
1107}
1108
1109BlockBackend *nbd_export_get_blockdev(NBDExport *exp)
1110{
1111 return exp->blk;
1112}
1113
1114void nbd_export_close_all(void)
1115{
1116 NBDExport *exp, *next;
1117
1118 QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
1119 nbd_export_close(exp);
1120 }
1121}
1122
1123static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
1124 int len)
1125{
1126 NBDClient *client = req->client;
1127 int csock = client->sock;
1128 ssize_t rc, ret;
1129
1130 qemu_co_mutex_lock(&client->send_lock);
1131 client->send_coroutine = qemu_coroutine_self();
1132 nbd_set_handlers(client);
1133
1134 if (!len) {
1135 rc = nbd_send_reply(csock, reply);
1136 } else {
1137 socket_set_cork(csock, 1);
1138 rc = nbd_send_reply(csock, reply);
1139 if (rc >= 0) {
1140 ret = qemu_co_send(csock, req->data, len);
1141 if (ret != len) {
1142 rc = -EIO;
1143 }
1144 }
1145 socket_set_cork(csock, 0);
1146 }
1147
1148 client->send_coroutine = NULL;
1149 nbd_set_handlers(client);
1150 qemu_co_mutex_unlock(&client->send_lock);
1151 return rc;
1152}
1153
1154static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
1155{
1156 NBDClient *client = req->client;
1157 int csock = client->sock;
1158 uint32_t command;
1159 ssize_t rc;
1160
1161 client->recv_coroutine = qemu_coroutine_self();
1162 nbd_update_can_read(client);
1163
1164 rc = nbd_receive_request(csock, request);
1165 if (rc < 0) {
1166 if (rc != -EAGAIN) {
1167 rc = -EIO;
1168 }
1169 goto out;
1170 }
1171
1172 if (request->len > NBD_MAX_BUFFER_SIZE) {
1173 LOG("len (%u) is larger than max len (%u)",
1174 request->len, NBD_MAX_BUFFER_SIZE);
1175 rc = -EINVAL;
1176 goto out;
1177 }
1178
1179 if ((request->from + request->len) < request->from) {
1180 LOG("integer overflow detected! "
1181 "you're probably being attacked");
1182 rc = -EINVAL;
1183 goto out;
1184 }
1185
1186 TRACE("Decoding type");
1187
1188 command = request->type & NBD_CMD_MASK_COMMAND;
1189 if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
1190 req->data = blk_blockalign(client->exp->blk, request->len);
1191 }
1192 if (command == NBD_CMD_WRITE) {
1193 TRACE("Reading %u byte(s)", request->len);
1194
1195 if (qemu_co_recv(csock, req->data, request->len) != request->len) {
1196 LOG("reading from socket failed");
1197 rc = -EIO;
1198 goto out;
1199 }
1200 }
1201 rc = 0;
1202
1203out:
1204 client->recv_coroutine = NULL;
1205 nbd_update_can_read(client);
1206
1207 return rc;
1208}
1209
1210static void nbd_trip(void *opaque)
1211{
1212 NBDClient *client = opaque;
1213 NBDExport *exp = client->exp;
1214 NBDRequest *req;
1215 struct nbd_request request;
1216 struct nbd_reply reply;
1217 ssize_t ret;
1218 uint32_t command;
1219
1220 TRACE("Reading request.");
1221 if (client->closing) {
1222 return;
1223 }
1224
1225 req = nbd_request_get(client);
1226 ret = nbd_co_receive_request(req, &request);
1227 if (ret == -EAGAIN) {
1228 goto done;
1229 }
1230 if (ret == -EIO) {
1231 goto out;
1232 }
1233
1234 reply.handle = request.handle;
1235 reply.error = 0;
1236
1237 if (ret < 0) {
1238 reply.error = -ret;
1239 goto error_reply;
1240 }
1241 command = request.type & NBD_CMD_MASK_COMMAND;
1242 if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
1243 LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
1244 ", Offset: %" PRIu64 "\n",
1245 request.from, request.len,
1246 (uint64_t)exp->size, (uint64_t)exp->dev_offset);
1247 LOG("requested operation past EOF--bad client?");
1248 goto invalid_request;
1249 }
1250
1251 switch (command) {
1252 case NBD_CMD_READ:
1253 TRACE("Request type is READ");
1254
1255 if (request.type & NBD_CMD_FLAG_FUA) {
1256 ret = blk_co_flush(exp->blk);
1257 if (ret < 0) {
1258 LOG("flush failed");
1259 reply.error = -ret;
1260 goto error_reply;
1261 }
1262 }
1263
1264 ret = blk_read(exp->blk,
1265 (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
1266 req->data, request.len / BDRV_SECTOR_SIZE);
1267 if (ret < 0) {
1268 LOG("reading from file failed");
1269 reply.error = -ret;
1270 goto error_reply;
1271 }
1272
1273 TRACE("Read %u byte(s)", request.len);
1274 if (nbd_co_send_reply(req, &reply, request.len) < 0)
1275 goto out;
1276 break;
1277 case NBD_CMD_WRITE:
1278 TRACE("Request type is WRITE");
1279
1280 if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
1281 TRACE("Server is read-only, return error");
1282 reply.error = EROFS;
1283 goto error_reply;
1284 }
1285
1286 TRACE("Writing to device");
1287
1288 ret = blk_write(exp->blk,
1289 (request.from + exp->dev_offset) / BDRV_SECTOR_SIZE,
1290 req->data, request.len / BDRV_SECTOR_SIZE);
1291 if (ret < 0) {
1292 LOG("writing to file failed");
1293 reply.error = -ret;
1294 goto error_reply;
1295 }
1296
1297 if (request.type & NBD_CMD_FLAG_FUA) {
1298 ret = blk_co_flush(exp->blk);
1299 if (ret < 0) {
1300 LOG("flush failed");
1301 reply.error = -ret;
1302 goto error_reply;
1303 }
1304 }
1305
1306 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1307 goto out;
1308 }
1309 break;
1310 case NBD_CMD_DISC:
1311 TRACE("Request type is DISCONNECT");
1312 errno = 0;
1313 goto out;
1314 case NBD_CMD_FLUSH:
1315 TRACE("Request type is FLUSH");
1316
1317 ret = blk_co_flush(exp->blk);
1318 if (ret < 0) {
1319 LOG("flush failed");
1320 reply.error = -ret;
1321 }
1322 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1323 goto out;
1324 }
1325 break;
1326 case NBD_CMD_TRIM:
1327 TRACE("Request type is TRIM");
1328 ret = blk_co_discard(exp->blk, (request.from + exp->dev_offset)
1329 / BDRV_SECTOR_SIZE,
1330 request.len / BDRV_SECTOR_SIZE);
1331 if (ret < 0) {
1332 LOG("discard failed");
1333 reply.error = -ret;
1334 }
1335 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1336 goto out;
1337 }
1338 break;
1339 default:
1340 LOG("invalid request type (%u) received", request.type);
1341 invalid_request:
1342 reply.error = EINVAL;
1343 error_reply:
1344 if (nbd_co_send_reply(req, &reply, 0) < 0) {
1345 goto out;
1346 }
1347 break;
1348 }
1349
1350 TRACE("Request/Reply complete");
1351
1352done:
1353 nbd_request_put(req);
1354 return;
1355
1356out:
1357 nbd_request_put(req);
1358 client_close(client);
1359}
1360
1361static void nbd_read(void *opaque)
1362{
1363 NBDClient *client = opaque;
1364
1365 if (client->recv_coroutine) {
1366 qemu_coroutine_enter(client->recv_coroutine, NULL);
1367 } else {
1368 qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1369 }
1370}
1371
1372static void nbd_restart_write(void *opaque)
1373{
1374 NBDClient *client = opaque;
1375
1376 qemu_coroutine_enter(client->send_coroutine, NULL);
1377}
1378
1379static void nbd_set_handlers(NBDClient *client)
1380{
1381 if (client->exp && client->exp->ctx) {
1382 aio_set_fd_handler(client->exp->ctx, client->sock,
1383 client->can_read ? nbd_read : NULL,
1384 client->send_coroutine ? nbd_restart_write : NULL,
1385 client);
1386 }
1387}
1388
1389static void nbd_unset_handlers(NBDClient *client)
1390{
1391 if (client->exp && client->exp->ctx) {
1392 aio_set_fd_handler(client->exp->ctx, client->sock, NULL, NULL, NULL);
1393 }
1394}
1395
1396static void nbd_update_can_read(NBDClient *client)
1397{
1398 bool can_read = client->recv_coroutine ||
1399 client->nb_requests < MAX_NBD_REQUESTS;
1400
1401 if (can_read != client->can_read) {
1402 client->can_read = can_read;
1403 nbd_set_handlers(client);
1404
1405
1406
1407 }
1408}
1409
1410NBDClient *nbd_client_new(NBDExport *exp, int csock,
1411 void (*close)(NBDClient *))
1412{
1413 NBDClient *client;
1414 client = g_malloc0(sizeof(NBDClient));
1415 client->refcount = 1;
1416 client->exp = exp;
1417 client->sock = csock;
1418 client->can_read = true;
1419 if (nbd_send_negotiate(client)) {
1420 g_free(client);
1421 return NULL;
1422 }
1423 client->close = close;
1424 qemu_co_mutex_init(&client->send_lock);
1425 nbd_set_handlers(client);
1426
1427 if (exp) {
1428 QTAILQ_INSERT_TAIL(&exp->clients, client, next);
1429 nbd_export_get(exp);
1430 }
1431 return client;
1432}
1433