1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16#include "qemu/osdep.h"
17#include "qemu-common.h"
18#include "block/block.h"
19#include "qemu/rcu_queue.h"
20#include "qemu/sockets.h"
21#include "qemu/cutils.h"
22#include "trace.h"
23#ifdef CONFIG_EPOLL_CREATE1
24#include <sys/epoll.h>
25#endif
26
27struct AioHandler
28{
29 GPollFD pfd;
30 IOHandler *io_read;
31 IOHandler *io_write;
32 AioPollFn *io_poll;
33 IOHandler *io_poll_begin;
34 IOHandler *io_poll_end;
35 int deleted;
36 void *opaque;
37 bool is_external;
38 QLIST_ENTRY(AioHandler) node;
39};
40
41#ifdef CONFIG_EPOLL_CREATE1
42
43
/*
 * Number of polled fds at or above which aio_epoll_check_poll() tries to
 * switch the context over to epoll.
 */
#define EPOLL_ENABLE_THRESHOLD 64
45
46static void aio_epoll_disable(AioContext *ctx)
47{
48 ctx->epoll_enabled = false;
49 if (!ctx->epoll_available) {
50 return;
51 }
52 ctx->epoll_available = false;
53 close(ctx->epollfd);
54}
55
56static inline int epoll_events_from_pfd(int pfd_events)
57{
58 return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
59 (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
60 (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
61 (pfd_events & G_IO_ERR ? EPOLLERR : 0);
62}
63
64static bool aio_epoll_try_enable(AioContext *ctx)
65{
66 AioHandler *node;
67 struct epoll_event event;
68
69 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
70 int r;
71 if (node->deleted || !node->pfd.events) {
72 continue;
73 }
74 event.events = epoll_events_from_pfd(node->pfd.events);
75 event.data.ptr = node;
76 r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
77 if (r) {
78 return false;
79 }
80 }
81 ctx->epoll_enabled = true;
82 return true;
83}
84
85static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
86{
87 struct epoll_event event;
88 int r;
89 int ctl;
90
91 if (!ctx->epoll_enabled) {
92 return;
93 }
94 if (!node->pfd.events) {
95 ctl = EPOLL_CTL_DEL;
96 } else {
97 event.data.ptr = node;
98 event.events = epoll_events_from_pfd(node->pfd.events);
99 ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
100 }
101
102 r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
103 if (r) {
104 aio_epoll_disable(ctx);
105 }
106}
107
108static int aio_epoll(AioContext *ctx, GPollFD *pfds,
109 unsigned npfd, int64_t timeout)
110{
111 AioHandler *node;
112 int i, ret = 0;
113 struct epoll_event events[128];
114
115 assert(npfd == 1);
116 assert(pfds[0].fd == ctx->epollfd);
117 if (timeout > 0) {
118 ret = qemu_poll_ns(pfds, npfd, timeout);
119 }
120 if (timeout <= 0 || ret > 0) {
121 ret = epoll_wait(ctx->epollfd, events,
122 ARRAY_SIZE(events),
123 timeout);
124 if (ret <= 0) {
125 goto out;
126 }
127 for (i = 0; i < ret; i++) {
128 int ev = events[i].events;
129 node = events[i].data.ptr;
130 node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
131 (ev & EPOLLOUT ? G_IO_OUT : 0) |
132 (ev & EPOLLHUP ? G_IO_HUP : 0) |
133 (ev & EPOLLERR ? G_IO_ERR : 0);
134 }
135 }
136out:
137 return ret;
138}
139
140static bool aio_epoll_enabled(AioContext *ctx)
141{
142
143 return !aio_external_disabled(ctx) && ctx->epoll_enabled;
144}
145
146static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
147 unsigned npfd, int64_t timeout)
148{
149 if (!ctx->epoll_available) {
150 return false;
151 }
152 if (aio_epoll_enabled(ctx)) {
153 return true;
154 }
155 if (npfd >= EPOLL_ENABLE_THRESHOLD) {
156 if (aio_epoll_try_enable(ctx)) {
157 return true;
158 } else {
159 aio_epoll_disable(ctx);
160 }
161 }
162 return false;
163}
164
165#else
166
167static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
168{
169}
170
171static int aio_epoll(AioContext *ctx, GPollFD *pfds,
172 unsigned npfd, int64_t timeout)
173{
174 assert(false);
175}
176
177static bool aio_epoll_enabled(AioContext *ctx)
178{
179 return false;
180}
181
182static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
183 unsigned npfd, int64_t timeout)
184{
185 return false;
186}
187
188#endif
189
190static AioHandler *find_aio_handler(AioContext *ctx, int fd)
191{
192 AioHandler *node;
193
194 QLIST_FOREACH(node, &ctx->aio_handlers, node) {
195 if (node->pfd.fd == fd)
196 if (!node->deleted)
197 return node;
198 }
199
200 return NULL;
201}
202
203static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
204{
205
206
207
208
209
210 if (!g_source_is_destroyed(&ctx->source)) {
211 g_source_remove_poll(&ctx->source, &node->pfd);
212 }
213
214
215 if (qemu_lockcnt_count(&ctx->list_lock)) {
216 node->deleted = 1;
217 node->pfd.revents = 0;
218 return false;
219 }
220
221
222
223
224 QLIST_REMOVE(node, node);
225 return true;
226}
227
228void aio_set_fd_handler(AioContext *ctx,
229 int fd,
230 bool is_external,
231 IOHandler *io_read,
232 IOHandler *io_write,
233 AioPollFn *io_poll,
234 void *opaque)
235{
236 AioHandler *node;
237 AioHandler *new_node = NULL;
238 bool is_new = false;
239 bool deleted = false;
240 int poll_disable_change;
241
242 qemu_lockcnt_lock(&ctx->list_lock);
243
244 node = find_aio_handler(ctx, fd);
245
246
247 if (!io_read && !io_write && !io_poll) {
248 if (node == NULL) {
249 qemu_lockcnt_unlock(&ctx->list_lock);
250 return;
251 }
252
253 node->pfd.events = 0;
254
255 poll_disable_change = -!node->io_poll;
256 } else {
257 poll_disable_change = !io_poll - (node && !node->io_poll);
258 if (node == NULL) {
259 is_new = true;
260 }
261
262 new_node = g_new0(AioHandler, 1);
263
264
265 new_node->io_read = io_read;
266 new_node->io_write = io_write;
267 new_node->io_poll = io_poll;
268 new_node->opaque = opaque;
269 new_node->is_external = is_external;
270
271 if (is_new) {
272 new_node->pfd.fd = fd;
273 } else {
274 new_node->pfd = node->pfd;
275 }
276 g_source_add_poll(&ctx->source, &new_node->pfd);
277
278 new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
279 new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
280
281 QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
282 }
283 if (node) {
284 deleted = aio_remove_fd_handler(ctx, node);
285 }
286
287
288
289
290
291
292
293 atomic_set(&ctx->poll_disable_cnt,
294 atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
295
296 if (new_node) {
297 aio_epoll_update(ctx, new_node, is_new);
298 } else if (node) {
299
300 aio_epoll_update(ctx, node, false);
301 }
302 qemu_lockcnt_unlock(&ctx->list_lock);
303 aio_notify(ctx);
304
305 if (deleted) {
306 g_free(node);
307 }
308}
309
310void aio_set_fd_poll(AioContext *ctx, int fd,
311 IOHandler *io_poll_begin,
312 IOHandler *io_poll_end)
313{
314 AioHandler *node = find_aio_handler(ctx, fd);
315
316 if (!node) {
317 return;
318 }
319
320 node->io_poll_begin = io_poll_begin;
321 node->io_poll_end = io_poll_end;
322}
323
324void aio_set_event_notifier(AioContext *ctx,
325 EventNotifier *notifier,
326 bool is_external,
327 EventNotifierHandler *io_read,
328 AioPollFn *io_poll)
329{
330 aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
331 (IOHandler *)io_read, NULL, io_poll, notifier);
332}
333
334void aio_set_event_notifier_poll(AioContext *ctx,
335 EventNotifier *notifier,
336 EventNotifierHandler *io_poll_begin,
337 EventNotifierHandler *io_poll_end)
338{
339 aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
340 (IOHandler *)io_poll_begin,
341 (IOHandler *)io_poll_end);
342}
343
344static void poll_set_started(AioContext *ctx, bool started)
345{
346 AioHandler *node;
347
348 if (started == ctx->poll_started) {
349 return;
350 }
351
352 ctx->poll_started = started;
353
354 qemu_lockcnt_inc(&ctx->list_lock);
355 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
356 IOHandler *fn;
357
358 if (node->deleted) {
359 continue;
360 }
361
362 if (started) {
363 fn = node->io_poll_begin;
364 } else {
365 fn = node->io_poll_end;
366 }
367
368 if (fn) {
369 fn(node->opaque);
370 }
371 }
372 qemu_lockcnt_dec(&ctx->list_lock);
373}
374
375
376bool aio_prepare(AioContext *ctx)
377{
378
379 poll_set_started(ctx, false);
380
381 return false;
382}
383
384bool aio_pending(AioContext *ctx)
385{
386 AioHandler *node;
387 bool result = false;
388
389
390
391
392
393 qemu_lockcnt_inc(&ctx->list_lock);
394
395 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
396 int revents;
397
398 revents = node->pfd.revents & node->pfd.events;
399 if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
400 aio_node_check(ctx, node->is_external)) {
401 result = true;
402 break;
403 }
404 if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
405 aio_node_check(ctx, node->is_external)) {
406 result = true;
407 break;
408 }
409 }
410 qemu_lockcnt_dec(&ctx->list_lock);
411
412 return result;
413}
414
415static bool aio_dispatch_handlers(AioContext *ctx)
416{
417 AioHandler *node, *tmp;
418 bool progress = false;
419
420 QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
421 int revents;
422
423 revents = node->pfd.revents & node->pfd.events;
424 node->pfd.revents = 0;
425
426 if (!node->deleted &&
427 (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
428 aio_node_check(ctx, node->is_external) &&
429 node->io_read) {
430 node->io_read(node->opaque);
431
432
433 if (node->opaque != &ctx->notifier) {
434 progress = true;
435 }
436 }
437 if (!node->deleted &&
438 (revents & (G_IO_OUT | G_IO_ERR)) &&
439 aio_node_check(ctx, node->is_external) &&
440 node->io_write) {
441 node->io_write(node->opaque);
442 progress = true;
443 }
444
445 if (node->deleted) {
446 if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
447 QLIST_REMOVE(node, node);
448 g_free(node);
449 qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
450 }
451 }
452 }
453
454 return progress;
455}
456
457void aio_dispatch(AioContext *ctx)
458{
459 qemu_lockcnt_inc(&ctx->list_lock);
460 aio_bh_poll(ctx);
461 aio_dispatch_handlers(ctx);
462 qemu_lockcnt_dec(&ctx->list_lock);
463
464 timerlistgroup_run_timers(&ctx->tlg);
465}
466
467
468
469
470
471
472
473
474
475
476
477
478static __thread GPollFD *pollfds;
479static __thread AioHandler **nodes;
480static __thread unsigned npfd, nalloc;
481static __thread Notifier pollfds_cleanup_notifier;
482
483static void pollfds_cleanup(Notifier *n, void *unused)
484{
485 g_assert(npfd == 0);
486 g_free(pollfds);
487 g_free(nodes);
488 nalloc = 0;
489}
490
491static void add_pollfd(AioHandler *node)
492{
493 if (npfd == nalloc) {
494 if (nalloc == 0) {
495 pollfds_cleanup_notifier.notify = pollfds_cleanup;
496 qemu_thread_atexit_add(&pollfds_cleanup_notifier);
497 nalloc = 8;
498 } else {
499 g_assert(nalloc <= INT_MAX);
500 nalloc *= 2;
501 }
502 pollfds = g_renew(GPollFD, pollfds, nalloc);
503 nodes = g_renew(AioHandler *, nodes, nalloc);
504 }
505 nodes[npfd] = node;
506 pollfds[npfd] = (GPollFD) {
507 .fd = node->pfd.fd,
508 .events = node->pfd.events,
509 };
510 npfd++;
511}
512
513static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
514{
515 bool progress = false;
516 AioHandler *node;
517
518 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
519 if (!node->deleted && node->io_poll &&
520 aio_node_check(ctx, node->is_external) &&
521 node->io_poll(node->opaque)) {
522 *timeout = 0;
523 if (node->opaque != &ctx->notifier) {
524 progress = true;
525 }
526 }
527
528
529 }
530
531 return progress;
532}
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
548{
549 bool progress;
550 int64_t start_time, elapsed_time;
551
552 assert(ctx->notify_me);
553 assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
554
555 trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
556
557 start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
558 do {
559 progress = run_poll_handlers_once(ctx, timeout);
560 elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
561 } while (!progress && elapsed_time < max_ns
562 && !atomic_read(&ctx->poll_disable_cnt));
563
564
565
566
567 if (*timeout != -1) {
568 *timeout -= MIN(*timeout, elapsed_time);
569 }
570
571 trace_run_poll_handlers_end(ctx, progress, *timeout);
572 return progress;
573}
574
575
576
577
578
579
580
581
582
583
584
585
586static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
587{
588
589 int64_t max_ns = MIN((uint64_t)*timeout, (uint64_t)ctx->poll_ns);
590
591 if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
592 poll_set_started(ctx, true);
593
594 if (run_poll_handlers(ctx, max_ns, timeout)) {
595 return true;
596 }
597 }
598
599 poll_set_started(ctx, false);
600
601
602
603
604 return run_poll_handlers_once(ctx, timeout);
605}
606
607bool aio_poll(AioContext *ctx, bool blocking)
608{
609 AioHandler *node;
610 int i;
611 int ret = 0;
612 bool progress;
613 int64_t timeout;
614 int64_t start = 0;
615
616 assert(in_aio_context_home_thread(ctx));
617
618
619
620
621
622
623
624
625 if (blocking) {
626 atomic_add(&ctx->notify_me, 2);
627 }
628
629 qemu_lockcnt_inc(&ctx->list_lock);
630
631 if (ctx->poll_max_ns) {
632 start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
633 }
634
635 timeout = blocking ? aio_compute_timeout(ctx) : 0;
636 progress = try_poll_mode(ctx, &timeout);
637 assert(!(timeout && progress));
638
639
640
641
642 if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
643 assert(npfd == 0);
644
645
646
647 if (!aio_epoll_enabled(ctx)) {
648 QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
649 if (!node->deleted && node->pfd.events
650 && aio_node_check(ctx, node->is_external)) {
651 add_pollfd(node);
652 }
653 }
654 }
655
656
657 if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
658 AioHandler epoll_handler;
659
660 epoll_handler.pfd.fd = ctx->epollfd;
661 epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
662 npfd = 0;
663 add_pollfd(&epoll_handler);
664 ret = aio_epoll(ctx, pollfds, npfd, timeout);
665 } else {
666 ret = qemu_poll_ns(pollfds, npfd, timeout);
667 }
668 }
669
670 if (blocking) {
671 atomic_sub(&ctx->notify_me, 2);
672 aio_notify_accept(ctx);
673 }
674
675
676 if (ctx->poll_max_ns) {
677 int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
678
679 if (block_ns <= ctx->poll_ns) {
680
681 } else if (block_ns > ctx->poll_max_ns) {
682
683 int64_t old = ctx->poll_ns;
684
685 if (ctx->poll_shrink) {
686 ctx->poll_ns /= ctx->poll_shrink;
687 } else {
688 ctx->poll_ns = 0;
689 }
690
691 trace_poll_shrink(ctx, old, ctx->poll_ns);
692 } else if (ctx->poll_ns < ctx->poll_max_ns &&
693 block_ns < ctx->poll_max_ns) {
694
695 int64_t old = ctx->poll_ns;
696 int64_t grow = ctx->poll_grow;
697
698 if (grow == 0) {
699 grow = 2;
700 }
701
702 if (ctx->poll_ns) {
703 ctx->poll_ns *= grow;
704 } else {
705 ctx->poll_ns = 4000;
706 }
707
708 if (ctx->poll_ns > ctx->poll_max_ns) {
709 ctx->poll_ns = ctx->poll_max_ns;
710 }
711
712 trace_poll_grow(ctx, old, ctx->poll_ns);
713 }
714 }
715
716
717 if (ret > 0) {
718 for (i = 0; i < npfd; i++) {
719 nodes[i]->pfd.revents = pollfds[i].revents;
720 }
721 }
722
723 npfd = 0;
724
725 progress |= aio_bh_poll(ctx);
726
727 if (ret > 0) {
728 progress |= aio_dispatch_handlers(ctx);
729 }
730
731 qemu_lockcnt_dec(&ctx->list_lock);
732
733 progress |= timerlistgroup_run_timers(&ctx->tlg);
734
735 return progress;
736}
737
738void aio_context_setup(AioContext *ctx)
739{
740#ifdef CONFIG_EPOLL_CREATE1
741 assert(!ctx->epollfd);
742 ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
743 if (ctx->epollfd == -1) {
744 fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
745 ctx->epoll_available = false;
746 } else {
747 ctx->epoll_available = true;
748 }
749#endif
750}
751
752void aio_context_destroy(AioContext *ctx)
753{
754#ifdef CONFIG_EPOLL_CREATE1
755 aio_epoll_disable(ctx);
756#endif
757}
758
759void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
760 int64_t grow, int64_t shrink, Error **errp)
761{
762
763
764
765 ctx->poll_max_ns = max_ns;
766 ctx->poll_ns = 0;
767 ctx->poll_grow = grow;
768 ctx->poll_shrink = shrink;
769
770 aio_notify(ctx);
771}
772