/*
 * Block layer I/O functions
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "qemu/osdep.h"
#include "trace.h"
#include "sysemu/block-backend.h"
#include "block/aio-wait.h"
#include "block/blockjob.h"
#include "block/blockjob_int.h"
#include "block/block_int.h"
#include "qemu/cutils.h"
#include "qapi/error.h"
#include "qemu/error-report.h"

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);
void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_begin) {
            c->role->drained_begin(c);
        }
    }
}

void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore)
{
    BdrvChild *c, *next;

    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
        if (c == ignore) {
            continue;
        }
        if (c->role->drained_end) {
            c->role->drained_end(c);
        }
    }
}
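
/* Note: the two helpers above notify every parent through its BdrvChildRole
 * callbacks, skipping only @ignore.  A parent that needs to quiesce its own
 * request sources implements the hooks, e.g. (sketch only, not a real
 * driver from this tree):
 *
 *     static void my_parent_drained_begin(BdrvChild *c)
 *     {
 *         MyParent *p = c->opaque;
 *         p->paused = true;          // stop submitting new requests
 *     }
 *
 * and installs it as .drained_begin/.drained_end in its BdrvChildRole.
 */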

static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
{
    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
                                 src->opt_mem_alignment);
    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
                                 src->min_mem_alignment);
    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
}

void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BlockDriver *drv = bs->drv;
    Error *local_err = NULL;

    memset(&bs->bl, 0, sizeof(bs->bl));

    if (!drv) {
        return;
    }

    /* Default alignment based on whether driver has byte interface */
    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;

    /* Take some limits from the children as a default */
    if (bs->file) {
        bdrv_refresh_limits(bs->file->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
    } else {
        bs->bl.min_mem_alignment = 512;
        bs->bl.opt_mem_alignment = getpagesize();

        /* Safe default since most protocols use readv()/writev()/etc */
        bs->bl.max_iov = IOV_MAX;
    }

    if (bs->backing) {
        bdrv_refresh_limits(bs->backing->bs, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
    }

    /* Then let the driver override it */
    if (drv->bdrv_refresh_limits) {
        drv->bdrv_refresh_limits(bs, errp);
    }
}

/* The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    atomic_inc(&bs->copy_on_read);
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    int old = atomic_fetch_dec(&bs->copy_on_read);
    assert(old >= 1);
}
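
/* Usage sketch (illustrative only): a job that wants copy-on-read semantics
 * for the duration of its work brackets it with the pair above:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads via bdrv_co_preadv() now populate bs from its backing file ...
 *     bdrv_disable_copy_on_read(bs);
 *
 * Because the flag is a counter, nested users compose safely.
 */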

typedef struct {
    Coroutine *co;
    BlockDriverState *bs;
    bool done;
    bool begin;
    bool recursive;
    BdrvChild *parent;
} BdrvCoDrainData;

static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    BlockDriverState *bs = data->bs;

    if (data->begin) {
        bs->drv->bdrv_co_drain_begin(bs);
    } else {
        bs->drv->bdrv_co_drain_end(bs);
    }

    /* Set data->done before reading bs->wakeup.  */
    atomic_mb_set(&data->done, true);
    bdrv_wakeup(bs);
}

/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
static void bdrv_drain_invoke(BlockDriverState *bs, bool begin, bool recursive)
{
    BdrvChild *child, *tmp;
    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
            (!begin && !bs->drv->bdrv_co_drain_end)) {
        return;
    }

    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);

    if (recursive) {
        QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
            bdrv_drain_invoke(child->bs, begin, true);
        }
    }
}

static bool bdrv_drain_recurse(BlockDriverState *bs)
{
    BdrvChild *child, *tmp;
    bool waited;

    /* Wait for drained requests to finish */
    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
        BlockDriverState *bs = child->bs;
        bool in_main_loop =
            qemu_get_current_aio_context() == qemu_get_aio_context();
        assert(bs->refcnt > 0);
        if (in_main_loop) {
            /* In case the recursive bdrv_drain_recurse processes a
             * block_job_defer_to_main_loop BH and modifies the graph,
             * let's hold a reference to bs until we are done.
             *
             * IOThread doesn't have such a BH, and it is not safe to call
             * bdrv_unref without BQL, so skip doing it there.
             */
            bdrv_ref(bs);
        }
        waited |= bdrv_drain_recurse(bs);
        if (in_main_loop) {
            bdrv_unref(bs);
        }
    }

    return waited;
}

static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                                  BdrvChild *parent);
static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                                BdrvChild *parent);

static void bdrv_co_drain_bh_cb(void *opaque)
{
    BdrvCoDrainData *data = opaque;
    Coroutine *co = data->co;
    BlockDriverState *bs = data->bs;

    bdrv_dec_in_flight(bs);
    if (data->begin) {
        bdrv_do_drained_begin(bs, data->recursive, data->parent);
    } else {
        bdrv_do_drained_end(bs, data->recursive, data->parent);
    }

    data->done = true;
    aio_co_wake(co);
}

static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
                                                bool begin, bool recursive,
                                                BdrvChild *parent)
{
    BdrvCoDrainData data;

    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
     * other coroutines run if they were queued by aio_co_enter(). */

    assert(qemu_in_coroutine());
    data = (BdrvCoDrainData) {
        .co = qemu_coroutine_self(),
        .bs = bs,
        .done = false,
        .begin = begin,
        .recursive = recursive,
        .parent = parent,
    };
    bdrv_inc_in_flight(bs);
    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
                            bdrv_co_drain_bh_cb, &data);

    qemu_coroutine_yield();
    /* If we are resumed from some other event (such as an aio completion or a
     * timer callback), it is a bug in the caller that should be fixed. */
    assert(data.done);
}

void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
                           BdrvChild *parent)
{
    BdrvChild *child, *next;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, true, recursive, parent);
        return;
    }

    /* Stop things in parent-to-child order */
    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
        aio_disable_external(bdrv_get_aio_context(bs));
    }

    bdrv_parent_drained_begin(bs, parent);
    bdrv_drain_invoke(bs, true, false);
    bdrv_drain_recurse(bs);

    if (recursive) {
        bs->recursive_quiesce_counter++;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_begin(child->bs, true, child);
        }
    }
}

void bdrv_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, false, NULL);
}

void bdrv_subtree_drained_begin(BlockDriverState *bs)
{
    bdrv_do_drained_begin(bs, true, NULL);
}

void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
                         BdrvChild *parent)
{
    BdrvChild *child, *next;
    int old_quiesce_counter;

    if (qemu_in_coroutine()) {
        bdrv_co_yield_to_drain(bs, false, recursive, parent);
        return;
    }
    assert(bs->quiesce_counter > 0);
    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);

    /* Re-enable things in child-to-parent order */
    bdrv_drain_invoke(bs, false, false);
    bdrv_parent_drained_end(bs, parent);
    if (old_quiesce_counter == 1) {
        aio_enable_external(bdrv_get_aio_context(bs));
    }

    if (recursive) {
        bs->recursive_quiesce_counter--;
        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
            bdrv_do_drained_end(child->bs, true, child);
        }
    }
}

void bdrv_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, false, NULL);
}

void bdrv_subtree_drained_end(BlockDriverState *bs)
{
    bdrv_do_drained_end(bs, true, NULL);
}
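
/* Usage sketch (illustrative only): drained sections bracket graph or state
 * manipulations that must not race with in-flight I/O:
 *
 *     bdrv_drained_begin(bs);        // or bdrv_subtree_drained_begin()
 *     ... reconfigure bs, its parents or children ...
 *     bdrv_drained_end(bs);
 *
 * Sections nest, since the quiesce counters are reference counts.
 */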

void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
{
    int i;

    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_begin(child->bs, true, child);
    }
}

void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
{
    int i;

    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
        bdrv_do_drained_end(child->bs, true, child);
    }
}

/*
 * Wait for pending requests to complete on a single BlockDriverState subtree,
 * and suspend block driver's internal I/O until next request arrives.
 *
 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 * AioContext.
 *
 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 * not depend on events in other AioContexts.  In that case, use
 * bdrv_drain_all() instead.
 */
void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
{
    assert(qemu_in_coroutine());
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

void bdrv_drain(BlockDriverState *bs)
{
    bdrv_drained_begin(bs);
    bdrv_drained_end(bs);
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 *
 * This pauses all block jobs and disables external clients. It must
 * be paired with bdrv_drain_all_end().
 *
 * NOTE: no new block jobs or BlockDriverStates can be created between
 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 */
void bdrv_drain_all_begin(void)
{
    /* Always run first iteration so any pending completion BHs run */
    bool waited = true;
    BlockDriverState *bs;
    BdrvNextIterator it;
    GSList *aio_ctxs = NULL, *ctx;

    /* BDRV_POLL_WHILE() for a node can only be called from its own I/O thread
     * or the main loop AioContext. We potentially use BDRV_POLL_WHILE() on
     * nodes in several different AioContexts, so make sure we're in the main
     * context. */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Stop things in parent-to-child order */
        aio_context_acquire(aio_context);
        aio_disable_external(aio_context);
        bdrv_parent_drained_begin(bs, NULL);
        bdrv_drain_invoke(bs, true, true);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
        }
    }

    /* Note that completion of an asynchronous I/O operation can trigger any
     * number of other I/O operations on other devices---for example a
     * coroutine can submit an I/O request to another device in response to
     * request completion.  Therefore we must keep looping until there was no
     * more activity rather than simply draining each device independently.
     */
    while (waited) {
        waited = false;

        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
            AioContext *aio_context = ctx->data;

            aio_context_acquire(aio_context);
            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
                if (aio_context == bdrv_get_aio_context(bs)) {
                    waited |= bdrv_drain_recurse(bs);
                }
            }
            aio_context_release(aio_context);
        }
    }

    g_slist_free(aio_ctxs);
}

void bdrv_drain_all_end(void)
{
    BlockDriverState *bs;
    BdrvNextIterator it;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);

        /* Re-enable things in child-to-parent order */
        aio_context_acquire(aio_context);
        bdrv_drain_invoke(bs, false, true);
        bdrv_parent_drained_end(bs, NULL);
        aio_enable_external(aio_context);
        aio_context_release(aio_context);
    }
}

void bdrv_drain_all(void)
{
    bdrv_drain_all_begin();
    bdrv_drain_all_end();
}
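
/* Usage sketch (illustrative only): a caller that must see a globally
 * quiescent block layer while it runs:
 *
 *     bdrv_drain_all_begin();
 *     ... inspect or reconfigure multiple BlockDriverStates ...
 *     bdrv_drain_all_end();
 *
 * bdrv_drain_all() above is exactly this pair with nothing in between.
 */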

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    if (req->serialising) {
        atomic_dec(&req->bs->serialising_in_flight);
    }

    qemu_co_mutex_lock(&req->bs->reqs_lock);
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
    qemu_co_mutex_unlock(&req->bs->reqs_lock);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t offset,
                                  unsigned int bytes,
                                  enum BdrvTrackedRequestType type)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .type = type,
        .co = qemu_coroutine_self(),
        .serialising = false,
        .overlap_offset = offset,
        .overlap_bytes = bytes,
    };

    qemu_co_queue_init(&req->wait_queue);

    qemu_co_mutex_lock(&bs->reqs_lock);
    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
    qemu_co_mutex_unlock(&bs->reqs_lock);
}

static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
{
    int64_t overlap_offset = req->offset & ~(align - 1);
    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
                               - overlap_offset;

    if (!req->serialising) {
        atomic_inc(&req->bs->serialising_in_flight);
        req->serialising = true;
    }

    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
}

/**
 * Round a region to cluster boundaries
 */
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_offset = offset;
        *cluster_bytes = bytes;
    } else {
        int64_t c = bdi.cluster_size;
        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
    }
}
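
/* Worked example (illustrative only): with bdi.cluster_size = 65536,
 * offset = 70000 and bytes = 1000, the region rounds down to
 * *cluster_offset = 65536 and up to *cluster_bytes = 65536, i.e. the one
 * whole cluster that covers guest bytes [70000, 71000).
 */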

static int bdrv_get_cluster_size(BlockDriverState *bs)
{
    BlockDriverInfo bdi;
    int ret;

    ret = bdrv_get_info(bs, &bdi);
    if (ret < 0 || bdi.cluster_size == 0) {
        return bs->bl.request_alignment;
    } else {
        return bdi.cluster_size;
    }
}

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t offset, unsigned int bytes)
{
    /*        aaaa   bbbb */
    if (offset >= req->overlap_offset + req->overlap_bytes) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->overlap_offset >= offset + bytes) {
        return false;
    }
    return true;
}

void bdrv_inc_in_flight(BlockDriverState *bs)
{
    atomic_inc(&bs->in_flight);
}

void bdrv_wakeup(BlockDriverState *bs)
{
    aio_wait_kick(bdrv_get_aio_wait(bs));
}

void bdrv_dec_in_flight(BlockDriverState *bs)
{
    atomic_dec(&bs->in_flight);
    bdrv_wakeup(bs);
}

static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
{
    BlockDriverState *bs = self->bs;
    BdrvTrackedRequest *req;
    bool retry;
    bool waited = false;

    if (!atomic_read(&bs->serialising_in_flight)) {
        return false;
    }

    do {
        retry = false;
        qemu_co_mutex_lock(&bs->reqs_lock);
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (req == self || (!req->serialising && !self->serialising)) {
                continue;
            }
            if (tracked_request_overlaps(req, self->overlap_offset,
                                         self->overlap_bytes))
            {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                /* If the request is already (indirectly) waiting for us, or
                 * will wait for us as soon as it wakes up, then just go on
                 * (instead of producing a deadlock in the former case). */
                if (!req->waiting_for) {
                    self->waiting_for = req;
                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
                    self->waiting_for = NULL;
                    retry = true;
                    waited = true;
                    break;
                }
            }
        }
        qemu_co_mutex_unlock(&bs->reqs_lock);
    } while (retry);

    return waited;
}
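
/* Scenario sketch (illustrative only): a serialising copy-on-read request R1
 * covers cluster [0, 64K).  A guest write R2 to [4K, 8K) overlaps R1's
 * range, so wait_serialising_requests(R2) parks R2 on R1's wait_queue until
 * tracked_request_end(R1) restarts it; the do/while then rescans, since new
 * overlapping requests may have appeared while R2 slept.
 */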

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
        return -EIO;
    }

    if (!bdrv_is_inserted(bs)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    return 0;
}

typedef struct RwCo {
    BdrvChild *child;
    int64_t offset;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
    BdrvRequestFlags flags;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
                                   rwco->qiov->size, rwco->qiov,
                                   rwco->flags);
    } else {
        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
                                    rwco->qiov->size, rwco->qiov,
                                    rwco->flags);
    }
}

/*
 * Process a vectored synchronous request using coroutines
 */
static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
                        QEMUIOVector *qiov, bool is_write,
                        BdrvRequestFlags flags)
{
    Coroutine *co;
    RwCo rwco = {
        .child = child,
        .offset = offset,
        .qiov = qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
        .flags = flags,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
        bdrv_coroutine_enter(child->bs, co);
        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
    }
    return rwco.ret;
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };

    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
                        &qiov, is_write, flags);
}

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BdrvChild *child, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
}

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BdrvChild *child, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
}

int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int bytes, BdrvRequestFlags flags)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = NULL,
        .iov_len = bytes,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_prwv_co(child, offset, &qiov, true,
                        BDRV_REQ_ZERO_WRITE | flags);
}
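
/* Usage sketch (illustrative only): punch a zeroed, preferably sparse region:
 *
 *     ret = bdrv_pwrite_zeroes(child, 0, 65536, BDRV_REQ_MAY_UNMAP);
 *     if (ret < 0) {
 *         ... fall back or report the error ...
 *     }
 *
 * The NULL iov_base is fine because BDRV_REQ_ZERO_WRITE requests carry no
 * payload; only the length of the qiov matters.
 */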

/*
 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 * The operation is sped up by checking the block status and only writing
 * zeroes to the device if they currently do not return zeroes. Optional
 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 * BDRV_REQ_FUA).
 *
 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 */
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
{
    int ret;
    int64_t target_size, bytes, offset = 0;
    BlockDriverState *bs = child->bs;

    target_size = bdrv_getlength(bs);
    if (target_size < 0) {
        return target_size;
    }

    for (;;) {
        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
        if (bytes <= 0) {
            return 0;
        }
        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
        if (ret < 0) {
            error_report("error getting block status at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        if (ret & BDRV_BLOCK_ZERO) {
            offset += bytes;
            continue;
        }
        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
        if (ret < 0) {
            error_report("error writing zeroes at offset %" PRId64 ": %s",
                         offset, strerror(-ret));
            return ret;
        }
        offset += bytes;
    }
}

int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_preadv(child, offset, &qiov);
}

int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
{
    int ret;

    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
    if (ret < 0) {
        return ret;
    }

    return qiov->size;
}

int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = bytes,
    };

    if (bytes < 0) {
        return -EINVAL;
    }

    qemu_iovec_init_external(&qiov, &iov, 1);
    return bdrv_pwritev(child, offset, &qiov);
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(child, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    ret = bdrv_flush(child->bs);
    if (ret < 0) {
        return ret;
    }

    return 0;
}
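
/* Usage sketch (illustrative only): callers use this barrier when an on-disk
 * metadata update must be durable before a dependent update starts, e.g.
 * writing a new table entry before reusing the cluster it freed:
 *
 *     ret = bdrv_pwrite_sync(bs->file, table_offset, &entry, sizeof(entry));
 *
 * table_offset and entry are hypothetical names, not from this file.
 */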

typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;
    int ret;
} CoroutineIOCompletion;

static void bdrv_co_io_em_complete(void *opaque, int ret)
{
    CoroutineIOCompletion *co = opaque;

    co->ret = ret;
    aio_co_wake(co->coroutine);
}

static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
                                           uint64_t offset, uint64_t bytes,
                                           QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_preadv) {
        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_readv) {
        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                      bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            return -EIO;
        } else {
            qemu_coroutine_yield();
            return co.ret;
        }
    }
}

static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov, int flags)
{
    BlockDriver *drv = bs->drv;
    int64_t sector_num;
    unsigned int nb_sectors;
    int ret;

    assert(!(flags & ~BDRV_REQ_MASK));

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (drv->bdrv_co_pwritev) {
        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
                                   flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
        goto emulate_flags;
    }

    sector_num = offset >> BDRV_SECTOR_BITS;
    nb_sectors = bytes >> BDRV_SECTOR_BITS;

    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);

    if (drv->bdrv_co_writev_flags) {
        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
                                        flags & bs->supported_write_flags);
        flags &= ~bs->supported_write_flags;
    } else if (drv->bdrv_co_writev) {
        assert(!bs->supported_write_flags);
        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
    } else {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                       bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    }

emulate_flags:
    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
        /* The driver did not handle FUA itself; emulate it with a flush */
        ret = bdrv_co_flush(bs);
    }

    return ret;
}

static int coroutine_fn
bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                               uint64_t bytes, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (!drv->bdrv_co_pwritev_compressed) {
        return -ENOTSUP;
    }

    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
}

static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
{
    BlockDriverState *bs = child->bs;

    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    BlockDriver *drv = bs->drv;
    struct iovec iov;
    QEMUIOVector local_qiov;
    int64_t cluster_offset;
    int64_t cluster_bytes;
    size_t skip_bytes;
    int ret;
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
                                    BDRV_REQUEST_MAX_BYTES);
    unsigned int progress = 0;

    if (!drv) {
        return -ENOMEDIUM;
    }

    /* FIXME We cannot require callers to have write permissions when all they
     * are doing is a read request. If we did things right, write permissions
     * would be obtained anyway, but internally by the copy-on-read code. As
     * long as it is implemented here rather than in a separate filter driver,
     * the copy-on-read code doesn't have its own BdrvChild, however, for which
     * it can't do this check.
     *
     * assert(child->perm & BLK_PERM_WRITE); */

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.  Note that this value may exceed
     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
     * means we'll end up copying more than requested (which is always fine).
     */
    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
    skip_bytes = offset - cluster_offset;

    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
                                   cluster_offset, cluster_bytes);

    bounce_buffer = qemu_try_blockalign(bs,
                                        MIN(MIN(max_transfer, cluster_bytes),
                                            MAX_BOUNCE_BUFFER));
    if (bounce_buffer == NULL) {
        ret = -ENOMEM;
        goto err;
    }

    while (cluster_bytes) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, cluster_offset,
                                MIN(cluster_bytes, max_transfer), &pnum);
        if (ret < 0) {
            /* Safe to treat errors in querying allocation as if we have
             * copied the entire chunk: copying through the bounce buffer
             * is correct (if wasteful) even for allocated clusters. */
            pnum = MIN(cluster_bytes, max_transfer);
        }

        assert(skip_bytes < pnum);

        if (ret <= 0) {
            /* Must copy-on-read; use the bounce buffer */
            iov.iov_base = bounce_buffer;
            iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
            qemu_iovec_init_external(&local_qiov, &iov, 1);

            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
                                     &local_qiov, 0);
            if (ret < 0) {
                goto err;
            }

            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
            if (drv->bdrv_co_pwrite_zeroes &&
                buffer_is_zero(bounce_buffer, pnum)) {
                /* FIXME: Should we (perhaps conditionally) be setting
                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
                 * of that data? */
                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
            } else {
                /* Write the just-read data back into the image so future
                 * reads of this cluster are served locally; the payload
                 * is unchanged. */
                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
                                          &local_qiov, 0);
            }

            if (ret < 0) {
                /* It might be okay to ignore write errors for guest
                 * requests.  If this is a deliberate copy-on-read
                 * then we don't want to ignore the error.  Simply
                 * report it in all cases.
                 */
                goto err;
            }

            qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
                                pnum - skip_bytes);
        } else {
            /* Read directly into the destination */
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
            ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
                                     &local_qiov, 0);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                goto err;
            }
        }

        cluster_offset += pnum;
        cluster_bytes -= pnum;
        progress += pnum - skip_bytes;
        skip_bytes = 0;
    }
    ret = 0;

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Forwards an already correctly aligned request to the BlockDriver. This
 * handles copy on read, zeroing after EOF, and fragmentation of large
 * reads; any other features must be implemented by the caller.
 */
static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    int64_t total_bytes, max_bytes;
    int ret = 0;
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    /* TODO: We would need a per-BDS .supported_read_flags and
     * potential fallback support, if we ever implement any read flags
     * to pass through to drivers.  For now, there aren't any
     * passthrough flags.  */
    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));

    /* Handle Copy on Read and associated serialisation */
    if (flags & BDRV_REQ_COPY_ON_READ) {
        /* If we touch the same cluster it counts as an overlap.  This
         * guarantees that allocating writes will be serialized and not race
         * with each other for the same cluster.  For example, in copy-on-read
         * it ensures that the CoR read and write operations are atomic and
         * guest writes cannot interleave between them. */
        mark_request_serialising(req, bdrv_get_cluster_size(bs));
    }

    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
        wait_serialising_requests(req);
    }

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int64_t pnum;

        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != bytes) {
            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
            goto out;
        }
    }

    /* Forward the request to the BlockDriver, possibly fragmenting it */
    total_bytes = bdrv_getlength(bs);
    if (total_bytes < 0) {
        ret = total_bytes;
        goto out;
    }

    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
    if (bytes <= max_bytes && bytes <= max_transfer) {
        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
        goto out;
    }

    while (bytes_remaining) {
        int num;

        if (max_bytes) {
            QEMUIOVector local_qiov;

            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            assert(num);
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
                                     num, &local_qiov, 0);
            max_bytes -= num;
            qemu_iovec_destroy(&local_qiov);
        } else {
            num = bytes_remaining;
            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
                                    bytes_remaining);
        }
        if (ret < 0) {
            goto out;
        }
        bytes_remaining -= num;
    }

out:
    return ret < 0 ? ret : 0;
}

/*
 * Handle a read request in coroutine context
 */
int coroutine_fn bdrv_co_preadv(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;

    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);

    if (!drv) {
        return -ENOMEDIUM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* Don't do copy-on-read if we read data before write operation */
    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }

    /* Align read if necessary by padding qiov */
    if (offset & (align - 1)) {
        head_buf = qemu_blockalign(bs, align);
        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);
    }

    if ((offset + bytes) & (align - 1)) {
        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }
        tail_buf = qemu_blockalign(bs, align);
        qemu_iovec_add(&local_qiov, tail_buf,
                       align - ((offset + bytes) & (align - 1)));

        bytes = ROUND_UP(bytes, align);
    }

    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
                              use_local_qiov ? &local_qiov : qiov,
                              flags);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
        qemu_vfree(head_buf);
        qemu_vfree(tail_buf);
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
                          nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
                               int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
}

static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    QEMUIOVector qiov;
    struct iovec iov = {0};
    int ret = 0;
    bool need_flush = false;
    int head = 0;
    int tail = 0;

    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
                        bs->bl.request_alignment);
    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);

    if (!drv) {
        return -ENOMEDIUM;
    }

    assert(alignment % bs->bl.request_alignment == 0);
    head = offset % alignment;
    tail = (offset + bytes) % alignment;
    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
    assert(max_write_zeroes >= bs->bl.request_alignment);

    while (bytes > 0 && !ret) {
        int num = bytes;

        /* Align request.  Block drivers can expect the "bulk" of the request
         * to be aligned, and that unaligned requests do not cross cluster
         * boundaries.
         */
        if (head) {
            /* Make a small request up to the first aligned sector. For
             * convenience, limit this request to max_transfer even if
             * we don't need to fall back to writes.  */
            num = MIN(MIN(bytes, max_transfer), alignment - head);
            head = (head + num) % alignment;
            assert(num < max_write_zeroes);
        } else if (tail && num > alignment) {
            /* Shorten the request to the last aligned sector.  */
            num -= tail;
        }

        /* limit request size */
        if (num > max_write_zeroes) {
            num = max_write_zeroes;
        }

        ret = -ENOTSUP;
        /* First try the efficient write zeroes operation */
        if (drv->bdrv_co_pwrite_zeroes) {
            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
                                             flags & bs->supported_zero_flags);
            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
                need_flush = true;
            }
        } else {
            assert(!bs->supported_zero_flags);
        }

        if (ret == -ENOTSUP) {
            /* Fall back to bounce buffer if write zeroes is unsupported */
            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;

            if ((flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* No need for bdrv_driver_pwritev() to do a fallback
                 * flush on each chunk; use just one at the end */
                write_flags &= ~BDRV_REQ_FUA;
                need_flush = true;
            }
            num = MIN(num, max_transfer);
            iov.iov_len = num;
            if (iov.iov_base == NULL) {
                iov.iov_base = qemu_try_blockalign(bs, num);
                if (iov.iov_base == NULL) {
                    ret = -ENOMEM;
                    goto fail;
                }
                memset(iov.iov_base, 0, num);
            }
            qemu_iovec_init_external(&qiov, &iov, 1);

            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);

            /* Keep the bounce buffer around if it is big enough for all
             * future requests.
             */
            if (num < max_transfer) {
                qemu_vfree(iov.iov_base);
                iov.iov_base = NULL;
            }
        }

        offset += num;
        bytes -= num;
    }

fail:
    if (ret == 0 && need_flush) {
        ret = bdrv_co_flush(bs);
    }
    qemu_vfree(iov.iov_base);
    return ret;
}
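
/* Worked example (illustrative only): with alignment = 4096, offset = 1000
 * and bytes = 12000, the loop above issues three chunks: an unaligned head
 * [1000, 4096), an aligned body [4096, 12288), and an unaligned tail
 * [12288, 13000), so drivers only ever see sub-alignment requests at the
 * very edges of the range.
 */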

/*
 * Forwards an already correctly aligned write request to the BlockDriver,
 * after possibly fragmenting it.
 */
static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
    int64_t align, QEMUIOVector *qiov, int flags)
{
    BlockDriverState *bs = child->bs;
    BlockDriver *drv = bs->drv;
    bool waited;
    int ret;

    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
    uint64_t bytes_remaining = bytes;
    int max_transfer;

    if (!drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    assert(is_power_of_2(align));
    assert((offset & (align - 1)) == 0);
    assert((bytes & (align - 1)) == 0);
    assert(!qiov || bytes == qiov->size);
    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
    assert(!(flags & ~BDRV_REQ_MASK));
    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
                                   align);

    waited = wait_serialising_requests(req);
    assert(!waited || !req->serialising);
    assert(req->overlap_offset <= offset);
    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
    assert(child->perm & BLK_PERM_WRITE);
    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);

    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
        qemu_iovec_is_zero(qiov)) {
        flags |= BDRV_REQ_ZERO_WRITE;
        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
    }

    if (ret < 0) {
        /* Do nothing, write notifier decided to fail this request */
    } else if (flags & BDRV_REQ_ZERO_WRITE) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
    } else if (bytes <= max_transfer) {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
    } else {
        bdrv_debug_event(bs, BLKDBG_PWRITEV);
        while (bytes_remaining) {
            int num = MIN(bytes_remaining, max_transfer);
            QEMUIOVector local_qiov;
            int local_flags = flags;

            assert(num);
            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
                /* If FUA is going to be emulated by flush, we only
                 * need to flush on the last iteration */
                local_flags &= ~BDRV_REQ_FUA;
            }
            qemu_iovec_init(&local_qiov, qiov->niov);
            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);

            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
                                      num, &local_qiov, local_flags);
            qemu_iovec_destroy(&local_qiov);
            if (ret < 0) {
                break;
            }
            bytes_remaining -= num;
        }
    }
    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, offset, bytes);

    stat64_max(&bs->wr_highest_offset, offset + bytes);

    if (ret >= 0) {
        bs->total_sectors = MAX(bs->total_sectors, end_sector);
        ret = 0;
    }

    return ret;
}

static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
                                                int64_t offset,
                                                unsigned int bytes,
                                                BdrvRequestFlags flags,
                                                BdrvTrackedRequest *req)
{
    BlockDriverState *bs = child->bs;
    uint8_t *buf = NULL;
    QEMUIOVector local_qiov;
    struct iovec iov;
    uint64_t align = bs->bl.request_alignment;
    unsigned int head_padding_bytes, tail_padding_bytes;
    int ret = 0;

    head_padding_bytes = offset & (align - 1);
    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);

    assert(flags & BDRV_REQ_ZERO_WRITE);
    if (head_padding_bytes || tail_padding_bytes) {
        buf = qemu_blockalign(bs, align);
        iov = (struct iovec) {
            .iov_base = buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&local_qiov, &iov, 1);
    }
    if (head_padding_bytes) {
        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);

        /* RMW the unaligned part before head. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        memset(buf + head_padding_bytes, 0, zero_bytes);
        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
                                   align, &local_qiov,
                                   flags & ~BDRV_REQ_ZERO_WRITE);
        if (ret < 0) {
            goto fail;
        }
        offset += zero_bytes;
        bytes -= zero_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes >= align) {
        /* Write the aligned part in the middle. */
        uint64_t aligned_bytes = bytes & ~(align - 1);
        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
                                   NULL, flags);
        if (ret < 0) {
            goto fail;
        }
        bytes -= aligned_bytes;
        offset += aligned_bytes;
    }

    assert(!bytes || (offset & (align - 1)) == 0);
    if (bytes) {
        assert(align == tail_padding_bytes + bytes);
        /* RMW the unaligned part after tail. */
        mark_request_serialising(req, align);
        wait_serialising_requests(req);
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, req, offset, align,
                                  align, &local_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        memset(buf, 0, bytes);
        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
    }
fail:
    qemu_vfree(buf);
    return ret;

}

/*
 * Handle a write request in coroutine context
 */
int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriverState *bs = child->bs;
    BdrvTrackedRequest req;
    uint64_t align = bs->bl.request_alignment;
    uint8_t *head_buf = NULL;
    uint8_t *tail_buf = NULL;
    QEMUIOVector local_qiov;
    bool use_local_qiov = false;
    int ret;

    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /*
     * Align write if necessary by performing a read-modify-write cycle.
     * Pad qiov with the read parts and be sure to have a tracked request not
     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }

    if (offset & (align - 1)) {
        QEMUIOVector head_qiov;
        struct iovec head_iov;

        mark_request_serialising(&req, align);
        wait_serialising_requests(&req);

        head_buf = qemu_blockalign(bs, align);
        head_iov = (struct iovec) {
            .iov_base = head_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&head_qiov, &head_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
                                  align, &head_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);

        qemu_iovec_init(&local_qiov, qiov->niov + 2);
        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
        use_local_qiov = true;

        bytes += offset & (align - 1);
        offset = offset & ~(align - 1);

        /* We have read the tail already if the request is smaller
         * than one aligned block.
         */
        if (bytes < align) {
            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
            bytes = align;
        }
    }

    if ((offset + bytes) & (align - 1)) {
        QEMUIOVector tail_qiov;
        struct iovec tail_iov;
        size_t tail_bytes;
        bool waited;

        mark_request_serialising(&req, align);
        waited = wait_serialising_requests(&req);
        assert(!waited || !use_local_qiov);

        tail_buf = qemu_blockalign(bs, align);
        tail_iov = (struct iovec) {
            .iov_base = tail_buf,
            .iov_len = align,
        };
        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);

        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
                                  align, align, &tail_qiov, 0);
        if (ret < 0) {
            goto fail;
        }
        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);

        if (!use_local_qiov) {
            qemu_iovec_init(&local_qiov, qiov->niov + 1);
            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
            use_local_qiov = true;
        }

        tail_bytes = (offset + bytes) & (align - 1);
        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);

        bytes = ROUND_UP(bytes, align);
    }

    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);

fail:

    if (use_local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(head_buf);
    qemu_vfree(tail_buf);
out:
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return -EINVAL;
    }

    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
}

int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
                                int nb_sectors, QEMUIOVector *qiov)
{
    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int bytes, BdrvRequestFlags flags)
{
    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);

    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
        flags &= ~BDRV_REQ_MAY_UNMAP;
    }

    return bdrv_co_pwritev(child, offset, bytes, NULL,
                           BDRV_REQ_ZERO_WRITE | flags);
}
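
/* Note: BDRV_REQ_MAY_UNMAP is stripped above when the image was opened
 * without BDRV_O_UNMAP, so callers degrade to plain zero writes instead of
 * discarding, and never free blocks the user asked to keep allocated.
 */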

/*
 * Flush all BlockDriverStates.  Returns 0 on success, or the first error
 * encountered while still attempting to flush the remaining states.
 */
int bdrv_flush_all(void)
{
    BdrvNextIterator it;
    BlockDriverState *bs = NULL;
    int result = 0;

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        AioContext *aio_context = bdrv_get_aio_context(bs);
        int ret;

        aio_context_acquire(aio_context);
        ret = bdrv_flush(bs);
        if (ret < 0 && !result) {
            result = ret;
        }
        aio_context_release(aio_context);
    }

    return result;
}

typedef struct BdrvCoBlockStatusData {
    BlockDriverState *bs;
    BlockDriverState *base;
    bool want_zero;
    int64_t offset;
    int64_t bytes;
    int64_t *pnum;
    int64_t *map;
    BlockDriverState **file;
    int ret;
    bool done;
} BdrvCoBlockStatusData;

int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
                                                bool want_zero,
                                                int64_t offset,
                                                int64_t bytes,
                                                int64_t *pnum,
                                                int64_t *map,
                                                BlockDriverState **file)
{
    assert(bs->file && bs->file->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->file->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}

int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    assert(bs->backing && bs->backing->bs);
    *pnum = bytes;
    *map = offset;
    *file = bs->backing->bs;
    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
}

/*
 * Returns the allocation status of the specified sectors.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping
 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
 * _ZERO where possible; otherwise, the result favors larger 'pnum',
 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n;
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
                                        aligned_bytes, pnum, &local_map,
                                        &local_file);
    if (ret < 0) {
        *pnum = 0;
        goto out;
    }

    /*
     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to the original request.
     */
    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
           align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero) {
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    if (want_zero && local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int64_t file_pnum;
        int ret2;

        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}

static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
                                                   BlockDriverState *base,
                                                   bool want_zero,
                                                   int64_t offset,
                                                   int64_t bytes,
                                                   int64_t *pnum,
                                                   int64_t *map,
                                                   BlockDriverState **file)
{
    BlockDriverState *p;
    int ret = 0;
    bool first = true;

    assert(bs != base);
    for (p = bs; p != base; p = backing_bs(p)) {
        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                   file);
        if (ret < 0) {
            break;
        }
        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
            /*
             * Reading beyond the end of the file continues to read
             * zeroes, but we can only widen the result to the
             * unallocated length we learned from an earlier
             * iteration.
             */
            *pnum = bytes;
        }
        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
            break;
        }
        /* [offset, pnum] unallocated on this layer, which could be only
         * the first part of [offset, bytes].  */
        bytes = MIN(bytes, *pnum);
        first = false;
    }
    return ret;
}

/* Coroutine wrapper for bdrv_block_status_above() */
static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
{
    BdrvCoBlockStatusData *data = opaque;

    data->ret = bdrv_co_block_status_above(data->bs, data->base,
                                           data->want_zero,
                                           data->offset, data->bytes,
                                           data->pnum, data->map, data->file);
    data->done = true;
}

/*
 * Synchronous wrapper around bdrv_co_block_status_above().
 *
 * See bdrv_co_block_status_above() for details.
 */
static int bdrv_common_block_status_above(BlockDriverState *bs,
                                          BlockDriverState *base,
                                          bool want_zero, int64_t offset,
                                          int64_t bytes, int64_t *pnum,
                                          int64_t *map,
                                          BlockDriverState **file)
{
    Coroutine *co;
    BdrvCoBlockStatusData data = {
        .bs = bs,
        .base = base,
        .want_zero = want_zero,
        .offset = offset,
        .bytes = bytes,
        .pnum = pnum,
        .map = map,
        .file = file,
        .done = false,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_block_status_above_co_entry(&data);
    } else {
        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, !data.done);
    }
    return data.ret;
}

int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file)
{
    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
                                          pnum, map, file);
}

int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    return bdrv_block_status_above(bs, backing_bs(bs),
                                   offset, bytes, pnum, map, file);
}
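
/* Usage sketch (illustrative only): walk an image by advancing through the
 * extents that bdrv_block_status() reports:
 *
 *     int64_t offset = 0, bytes, pnum;
 *     while ((bytes = bdrv_getlength(bs) - offset) > 0) {
 *         int ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
 *         if (ret < 0) {
 *             break;
 *         }
 *         ... inspect ret's flags for [offset, offset + pnum) ...
 *         offset += pnum;
 *     }
 */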

int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int ret;
    int64_t dummy;

    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
                                         bytes, pnum ? pnum : &dummy, NULL,
                                         NULL);
    if (ret < 0) {
        return ret;
    }
    return !!(ret & BDRV_BLOCK_ALLOCATED);
}

/*
 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
 *
 * Return true if (a prefix of) the given range is allocated in any image
 * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
 * offset is allocated in any image of the chain.  Return false otherwise,
 * or negative errno on failure.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are known to be in the same
 * allocated/unallocated state.  Note that a subsequent call starting
 * at 'offset + *pnum' may return the same allocation status (in other
 * words, the result is not necessarily the maximum possible range);
 * but 'pnum' will only be 0 when end of file is reached.
 */
int bdrv_is_allocated_above(BlockDriverState *top,
                            BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum)
{
    BlockDriverState *intermediate;
    int ret;
    int64_t n = bytes;

    intermediate = top;
    while (intermediate && intermediate != base) {
        int64_t pnum_inter;
        int64_t size_inter;

        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
        if (ret < 0) {
            return ret;
        }
        if (ret) {
            *pnum = pnum_inter;
            return 1;
        }

        size_inter = bdrv_getlength(intermediate);
        if (size_inter < 0) {
            return size_inter;
        }
        if (n > pnum_inter &&
            (intermediate == top || offset + pnum_inter < size_inter)) {
            n = pnum_inter;
        }

        intermediate = backing_bs(intermediate);
    }

    *pnum = n;
    return 0;
}
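
/* Example (illustrative only): in the chain base <- mid <- top, calling
 * bdrv_is_allocated_above(top, base, offset, bytes, &pnum) returns 1 as soon
 * as either top or mid allocates the byte at offset, and 0 if only base
 * (which is excluded) or nobody does; *pnum then bounds how far that answer
 * is known to extend.
 */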

typedef struct BdrvVmstateCo {
    BlockDriverState *bs;
    QEMUIOVector *qiov;
    int64_t pos;
    bool is_read;
    int ret;
} BdrvVmstateCo;

static int coroutine_fn
bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                   bool is_read)
{
    BlockDriver *drv = bs->drv;
    int ret = -ENOTSUP;

    bdrv_inc_in_flight(bs);

    if (!drv) {
        ret = -ENOMEDIUM;
    } else if (drv->bdrv_load_vmstate) {
        if (is_read) {
            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
        } else {
            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
        }
    } else if (bs->file) {
        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
    }

    bdrv_dec_in_flight(bs);
    return ret;
}

static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
{
    BdrvVmstateCo *co = opaque;
    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
}

static inline int
bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                bool is_read)
{
    if (qemu_in_coroutine()) {
        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
    } else {
        BdrvVmstateCo data = {
            .bs = bs,
            .qiov = qiov,
            .pos = pos,
            .is_read = is_read,
            .ret = -EINPROGRESS,
        };
        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);

        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
        return data.ret;
    }
}

int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *) buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);

    ret = bdrv_writev_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, false);
}

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = buf,
        .iov_len = size,
    };
    int ret;

    qemu_iovec_init_external(&qiov, &iov, 1);
    ret = bdrv_readv_vmstate(bs, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return size;
}

int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
{
    return bdrv_rw_vmstate(bs, qiov, pos, true);
}

/**************************************************************/
/* async I/Os */

void bdrv_aio_cancel(BlockAIOCB *acb)
{
    qemu_aio_ref(acb);
    bdrv_aio_cancel_async(acb);
    while (acb->refcnt > 1) {
        if (acb->aiocb_info->get_aio_context) {
            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
        } else if (acb->bs) {
            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
             * assert that we're not using an I/O thread.  Thread-safe
             * code should use bdrv_aio_cancel_async exclusively.
             */
            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
            aio_poll(bdrv_get_aio_context(acb->bs), true);
        } else {
            abort();
        }
    }
    qemu_aio_unref(acb);
}

/* Async version of aio cancel. The caller is not blocked if the acb implements
 * cancel_async, otherwise we do nothing and let the request normally complete.
 */
void bdrv_aio_cancel_async(BlockAIOCB *acb)
{
    if (acb->aiocb_info->cancel_async) {
        acb->aiocb_info->cancel_async(acb);
    }
}

/**************************************************************/
/* Coroutine block device emulation */

typedef struct FlushCo {
    BlockDriverState *bs;
    int ret;
} FlushCo;

static void coroutine_fn bdrv_flush_co_entry(void *opaque)
{
    FlushCo *rwco = opaque;

    rwco->ret = bdrv_co_flush(rwco->bs);
}

int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work in
         * this case either.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}
2510
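/*
 * Synchronous wrapper around bdrv_co_flush().  If called from coroutine
 * context the flush runs directly; otherwise a coroutine is spawned and
 * the caller polls the AioContext of @bs until it finishes.  The same
 * pattern is used by the other synchronous wrappers in this file.
 */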
int bdrv_flush(BlockDriverState *bs)
{
    Coroutine *co;
    FlushCo flush_co = {
        .bs = bs,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_flush_co_entry(&flush_co);
    } else {
        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
    }

    return flush_co.ret;
}

typedef struct DiscardCo {
    BlockDriverState *bs;
    int64_t offset;
    int bytes;
    int ret;
} DiscardCo;
static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
    DiscardCo *rwco = opaque;

    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
}

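/*
 * Discard (unmap) @bytes starting at @offset.  Requests are fragmented
 * so that misaligned head and tail pieces are issued separately from
 * the aligned middle; drivers that cannot discard an unaligned piece
 * simply return -ENOTSUP for it, which is ignored below.
 *
 * Worked example (hypothetical limits): with pdiscard_alignment = 4096,
 * request_alignment = 512, offset = 2048 and bytes = 10240, align is
 * 4096, head is 2048 and tail is (2048 + 10240) % 4096 = 0, so the loop
 * first issues the 2048-byte head piece and then the remaining 8192
 * aligned bytes.
 */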
int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }

        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}

int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
{
    Coroutine *co;
    DiscardCo rwco = {
        .bs = bs,
        .offset = offset,
        .bytes = bytes,
        .ret = NOT_DONE,
    };

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_pdiscard_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
        bdrv_coroutine_enter(bs, co);
        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
    }

    return rwco.ret;
}

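/*
 * Forward an ioctl to the driver, preferring the coroutine callback
 * (bdrv_co_ioctl) and falling back to the AIO-based one.  Returns
 * -ENOTSUP if the driver implements neither.
 */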
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
{
    BlockDriver *drv = bs->drv;
    CoroutineIOCompletion co = {
        .coroutine = qemu_coroutine_self(),
    };
    BlockAIOCB *acb;

    bdrv_inc_in_flight(bs);
    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
        co.ret = -ENOTSUP;
        goto out;
    }

    if (drv->bdrv_co_ioctl) {
        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
    } else {
        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
        if (!acb) {
            co.ret = -ENOTSUP;
            goto out;
        }
        qemu_coroutine_yield();
    }
out:
    bdrv_dec_in_flight(bs);
    return co.ret;
}

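/*
 * Buffer allocation helpers: qemu_blockalign() allocates memory aligned
 * to the optimal alignment for @bs (aborting on failure), the *0
 * variants additionally zero the buffer, and the try_* variants return
 * NULL instead of aborting when allocation fails.
 */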
void *qemu_blockalign(BlockDriverState *bs, size_t size)
{
    return qemu_memalign(bdrv_opt_mem_align(bs), size);
}

void *qemu_blockalign0(BlockDriverState *bs, size_t size)
{
    return memset(qemu_blockalign(bs, size), 0, size);
}

void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
{
    size_t align = bdrv_opt_mem_align(bs);

    /* Ensure that NULL is never returned on success */
    assert(align > 0);
    if (size == 0) {
        size = align;
    }

    return qemu_try_memalign(align, size);
}

void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
{
    void *mem = qemu_try_blockalign(bs, size);

    if (mem) {
        memset(mem, 0, size);
    }

    return mem;
}

/*
 * Check if all memory in this vector is sector aligned.
 */
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
{
    int i;
    size_t alignment = bdrv_min_mem_align(bs);

    for (i = 0; i < qiov->niov; i++) {
        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
            return false;
        }
        if (qiov->iov[i].iov_len % alignment) {
            return false;
        }
    }

    return true;
}

void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}

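/*
 * I/O plugging lets drivers batch request submissions: plug/unplug
 * recurse over all children and use the bs->io_plugged counter so that
 * the driver callbacks fire only on the outermost plug (0 -> 1) and the
 * matching outermost unplug (1 -> 0).
 */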
void bdrv_io_plug(BlockDriverState *bs)
{
    BdrvChild *child;

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_plug(child->bs);
    }

    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_plug) {
            drv->bdrv_io_plug(bs);
        }
    }
}

void bdrv_io_unplug(BlockDriverState *bs)
{
    BdrvChild *child;

    assert(bs->io_plugged);
    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
        BlockDriver *drv = bs->drv;
        if (drv && drv->bdrv_io_unplug) {
            drv->bdrv_io_unplug(bs);
        }
    }

    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_io_unplug(child->bs);
    }
}

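/*
 * Register/unregister a host buffer with every driver in the graph that
 * supports it (e.g. so the buffer can be pre-mapped for faster I/O);
 * the calls simply recurse over all children.
 */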
void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_register_buf) {
        bs->drv->bdrv_register_buf(bs, host, size);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_register_buf(child->bs, host, size);
    }
}

void bdrv_unregister_buf(BlockDriverState *bs, void *host)
{
    BdrvChild *child;

    if (bs->drv && bs->drv->bdrv_unregister_buf) {
        bs->drv->bdrv_unregister_buf(bs, host);
    }
    QLIST_FOREACH(child, &bs->children, next) {
        bdrv_unregister_buf(child->bs, host);
    }
}