// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side:
 *
 * After the application reads the CQ ring tail, it must use an appropriate
 * smp_rmb() to pair with the smp_wmb() the kernel uses before writing the
 * tail (using smp_load_acquire to read the tail will do). It also needs a
 * smp_mb() before updating CQ head (ordering the entry load(s) with the head
 * store), pairing with an implicit barrier through a control-dependency in
 * io_get_cqe (smp_store_release to store head will do). Failure to do so
 * could lead to reading invalid CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before writing
 * the SQ tail (ordering SQ entry stores with the tail store), which pairs
 * with smp_load_acquire in io_get_sqring (smp_store_release to store the
 * tail will do). And it needs a barrier ordering the SQ head load before
 * writing new SQ entries (smp_load_acquire to read head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application needs
 * to check the SQ flags for IORING_SQ_NEED_WAKEUP *after* updating the SQ
 * tail; a full memory barrier smp_mb() is needed between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could modify, the kernel must not reload.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#include <linux/splice.h>
#include <linux/task_work.h>
#include <linux/pagemap.h>
#include <linux/io_uring.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/security.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "internal.h"
#include "io-wq.h"
#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8

/* only define max */
#define IORING_MAX_FIXED_FILES	(1U << 15)
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)

#define IORING_MAX_REG_BUFFERS	(1U << 14)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | IOSQE_IO_DRAIN)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
			    REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
			    REQ_F_ASYNC_DATA)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;
};
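
/*
 * Added illustration (editor's note, not from the original source): head and
 * tail are free-running u32 counters that are only masked when indexing into
 * the ring. A userspace consumer loop looks roughly like (names illustrative):
 *
 *	unsigned head = *ring_head;
 *	while (head != smp_load_acquire(ring_tail)) {
 *		entry = &ring[head & ring_mask];
 *		consume(entry);
 *		head++;
 *	}
 *	smp_store_release(ring_head, head);
 */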

/*
 * This data is shared with the application through the mmap at offsets
 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 *
 * The offsets to the member fields are published through struct
 * io_sqring_offsets when calling io_uring_setup.
 */
struct io_rings {
	/*
	 * Head and tail offsets into the ring; the offsets need to be
	 * masked to get valid indices.
	 *
	 * The kernel controls head of the sq ring and the tail of the cq ring,
	 * and the application controls tail of the sq ring and the head of the
	 * cq ring.
	 */
	struct io_uring sq, cq;
	/*
	 * Bitmasks to apply to head and tail offsets (constant, equals
	 * ring_entries - 1)
	 */
	u32 sq_ring_mask, cq_ring_mask;
	/* Ring sizes (constant, power of 2) */
	u32 sq_ring_entries, cq_ring_entries;
	/*
	 * Number of invalid entries dropped by the kernel due to
	 * invalid index stored in array
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * After a new SQ head value was read by the application this
	 * counter includes all submissions that were dropped reaching
	 * the new SQ head (and possibly more).
	 */
	u32 sq_dropped;
	/*
	 * Runtime SQ flags
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application.
	 *
	 * The application needs a full memory barrier before checking
	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
	 */
	u32 sq_flags;
	/*
	 * Runtime CQ flags
	 *
	 * Written by the application, shouldn't be modified by the
	 * kernel.
	 */
	u32 cq_flags;
	/*
	 * Number of completion events lost because the queue was full;
	 * this should be avoided by the application by making sure
	 * there are not more requests pending than there is space in
	 * the completion queue.
	 *
	 * Written by the kernel, shouldn't be modified by the
	 * application (i.e. get number of "new events" by comparing to
	 * cached value).
	 *
	 * As completion events come in out of order this counter is not
	 * ordered with any other data.
	 */
	u32 cq_overflow;
	/*
	 * Ring buffer of completion events.
	 *
	 * The kernel writes completion events fresh every time they are
	 * produced, so the application is allowed to modify pending
	 * entries.
	 */
	struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};

enum io_uring_cmd_flags {
	IO_URING_F_COMPLETE_DEFER = 1,
	IO_URING_F_UNLOCKED = 2,
	/* int's last bit, sign checks are usually faster than a bit test */
	IO_URING_F_NONBLOCK = INT_MIN,
};
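
/*
 * Added note (editor's commentary): IO_URING_F_NONBLOCK is deliberately
 * INT_MIN, i.e. the sign bit, so "issue_flags & IO_URING_F_NONBLOCK" can be
 * replaced by a cheap sign check such as "(int) issue_flags < 0"; the other
 * flags live in the low bits and are tested normally.
 */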

struct io_mapped_ubuf {
	u64 ubuf;
	u64 ubuf_end;
	unsigned int nr_bvecs;
	unsigned long acct_pages;
	struct bio_vec bvec[];
};
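
/*
 * Added note (editor's commentary): one io_mapped_ubuf describes a buffer
 * registered via io_uring_register(2) with IORING_REGISTER_BUFFERS. Roughly:
 * [ubuf, ubuf_end) is the original userspace range, bvec[] holds the
 * nr_bvecs pinned pages backing it, and acct_pages is how many pages were
 * charged against the memlock limit for accounting.
 */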

struct io_ring_ctx;

struct io_overflow_cqe {
	struct io_uring_cqe cqe;
	struct list_head list;
};

struct io_fixed_file {
	/* file * with additional FFS_* flags */
	unsigned long file_ptr;
};

struct io_rsrc_put {
	struct list_head list;
	u64 tag;
	union {
		void *rsrc;
		struct file *file;
		struct io_mapped_ubuf *buf;
	};
};

struct io_file_table {
	struct io_fixed_file *files;
};

struct io_rsrc_node {
	struct percpu_ref refs;
	struct list_head node;
	struct list_head rsrc_list;
	struct io_rsrc_data *rsrc_data;
	struct llist_node llist;
	bool done;
};

typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);

struct io_rsrc_data {
	struct io_ring_ctx *ctx;

	u64 **tags;
	unsigned int nr;
	rsrc_put_fn *do_put;
	atomic_t refs;
	struct completion done;
	bool quiesce;
};

struct io_buffer {
	struct list_head list;
	__u64 addr;
	__u32 len;
	__u16 bid;
};

struct io_restriction {
	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
	u8 sqe_flags_allowed;
	u8 sqe_flags_required;
	bool registered;
};

enum {
	IO_SQ_THREAD_SHOULD_STOP = 0,
	IO_SQ_THREAD_SHOULD_PARK,
};

struct io_sq_data {
	refcount_t refs;
	atomic_t park_pending;
	struct mutex lock;

	/* ctx's that are using this sqd */
	struct list_head ctx_list;

	struct task_struct *thread;
	struct wait_queue_head wait;

	unsigned sq_thread_idle;
	int sq_cpu;
	pid_t task_pid;
	pid_t task_tgid;

	unsigned long state;
	struct completion exited;
};

#define IO_COMPL_BATCH 32
#define IO_REQ_CACHE_SIZE 32
#define IO_REQ_ALLOC_BATCH 8

struct io_submit_link {
	struct io_kiocb *head;
	struct io_kiocb *last;
};

struct io_submit_state {
	/* inline/task_work completion list, under ->uring_lock */
	struct io_wq_work_node free_list;
	/* batch completion logic */
	struct io_wq_work_list compl_reqs;
	struct io_submit_link link;

	bool plug_started;
	bool need_plug;
	unsigned short submit_nr;
	struct blk_plug plug;
};

struct io_ring_ctx {
	/* const or read-mostly hot data */
	struct {
		struct percpu_ref refs;

		struct io_rings *rings;
		unsigned int flags;
		unsigned int compat: 1;
		unsigned int drain_next: 1;
		unsigned int eventfd_async: 1;
		unsigned int restricted: 1;
		unsigned int off_timeout_used: 1;
		unsigned int drain_active: 1;
	} ____cacheline_aligned_in_smp;

	/* submission data */
	struct {
		struct mutex uring_lock;

		/*
		 * Ring buffer of indices into array of io_uring_sqe, which is
		 * mmapped by the application using the IORING_OFF_SQES offset.
		 *
		 * This indirection could e.g. be used to assign fixed
		 * io_uring_sqe entries to operations and only submit them to
		 * the queue when needed.
		 *
		 * The kernel modifies neither the indices array nor the
		 * entries array.
		 */
		u32 *sq_array;
		struct io_uring_sqe *sq_sqes;
		unsigned cached_sq_head;
		unsigned sq_entries;
		struct list_head defer_list;

		/*
		 * Fixed resources fast path, should be accessed only under
		 * uring_lock, and updated through io_uring_register(2)
		 */
		struct io_rsrc_node *rsrc_node;
		int rsrc_cached_refs;
		struct io_file_table file_table;
		unsigned nr_user_files;
		unsigned nr_user_bufs;
		struct io_mapped_ubuf **user_bufs;

		struct io_submit_state submit_state;
		struct list_head timeout_list;
		struct list_head ltimeout_list;
		struct list_head cq_overflow_list;
		struct xarray io_buffers;
		struct xarray personalities;
		u32 pers_next;
		unsigned sq_thread_idle;
	} ____cacheline_aligned_in_smp;

	/* IRQ completion list, under ->completion_lock */
	struct io_wq_work_list locked_free_list;
	unsigned int locked_free_nr;

	const struct cred *sq_creds;
	struct io_sq_data *sq_data;

	struct wait_queue_head sqo_sq_wait;
	struct list_head sqd_list;

	unsigned long check_cq_overflow;

	struct {
		unsigned cached_cq_tail;
		unsigned cq_entries;
		struct eventfd_ctx *cq_ev_fd;
		struct wait_queue_head cq_wait;
		unsigned cq_extra;
		atomic_t cq_timeouts;
		unsigned cq_last_tm_flush;
	} ____cacheline_aligned_in_smp;

	struct {
		spinlock_t completion_lock;

		spinlock_t timeout_lock;

		/*
		 * ->iopoll_list is protected by the ctx->uring_lock for
		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
		 * For SQPOLL, only the single threaded io_sq_thread() will
		 * manipulate the list, hence no extra locking is needed
		 * there.
		 */
		struct io_wq_work_list iopoll_list;
		struct hlist_head *cancel_hash;
		unsigned cancel_hash_bits;
		bool poll_multi_queue;
	} ____cacheline_aligned_in_smp;

	struct io_restriction restrictions;

	/* slow path rsrc auxiliary data, used by update/register */
	struct {
		struct io_rsrc_node *rsrc_backup_node;
		struct io_mapped_ubuf *dummy_ubuf;
		struct io_rsrc_data *file_data;
		struct io_rsrc_data *buf_data;

		struct delayed_work rsrc_put_work;
		struct llist_head rsrc_put_llist;
		struct list_head rsrc_ref_list;
		spinlock_t rsrc_ref_lock;
	};

	/* Keep this last, we don't need it for the fast path */
	struct {
#if defined(CONFIG_UNIX)
		struct socket *ring_sock;
#endif
		/* hashed buffered write serialization */
		struct io_wq_hash *hash_map;

		/* Only used for accounting purposes */
		struct user_struct *user;
		struct mm_struct *mm_account;

		/* ctx exit and cancelation */
		struct llist_head fallback_llist;
		struct delayed_work fallback_work;
		struct work_struct exit_work;
		struct list_head tctx_list;
		struct completion ref_comp;
		u32 iowq_limits[2];
		bool iowq_limits_set;
	};
};

struct io_uring_task {
	/* submission side */
	int cached_refs;
	struct xarray xa;
	struct wait_queue_head wait;
	const struct io_ring_ctx *last;
	struct io_wq *io_wq;
	struct percpu_counter inflight;
	atomic_t inflight_tracked;
	atomic_t in_idle;

	spinlock_t task_lock;
	struct io_wq_work_list task_list;
	struct callback_head task_work;
	bool task_running;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file *file;
	struct wait_queue_head *head;
	__poll_t events;
	bool done;
	bool canceled;
	struct wait_queue_entry wait;
};

struct io_poll_update {
	struct file *file;
	u64 old_user_data;
	u64 new_user_data;
	__poll_t events;
	bool update_events;
	bool update_user_data;
};

struct io_close {
	struct file *file;
	int fd;
	u32 file_slot;
};

struct io_timeout_data {
	struct io_kiocb *req;
	struct hrtimer timer;
	struct timespec64 ts;
	enum hrtimer_mode mode;
	u32 flags;
};

struct io_accept {
	struct file *file;
	struct sockaddr __user *addr;
	int __user *addr_len;
	int flags;
	u32 file_slot;
	unsigned long nofile;
};

struct io_sync {
	struct file *file;
	loff_t len;
	loff_t off;
	int flags;
	int mode;
};

struct io_cancel {
	struct file *file;
	u64 addr;
};

struct io_timeout {
	struct file *file;
	u32 off;
	u32 target_seq;
	struct list_head list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb *head;
	/* for linked completions */
	struct io_kiocb *prev;
};

struct io_timeout_rem {
	struct file *file;
	u64 addr;

	/* timeout update */
	struct timespec64 ts;
	u32 flags;
	bool ltimeout;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb kiocb;
	u64 addr;
	u64 len;
};

struct io_connect {
	struct file *file;
	struct sockaddr __user *addr;
	int addr_len;
};

struct io_sr_msg {
	struct file *file;
	union {
		struct compat_msghdr __user *umsg_compat;
		struct user_msghdr __user *umsg;
		void __user *buf;
	};
	int msg_flags;
	int bgid;
	size_t len;
};

struct io_open {
	struct file *file;
	int dfd;
	u32 file_slot;
	struct filename *filename;
	struct open_how how;
	unsigned long nofile;
};

struct io_rsrc_update {
	struct file *file;
	u64 arg;
	u32 nr_args;
	u32 offset;
};

struct io_fadvise {
	struct file *file;
	u64 offset;
	u32 len;
	u32 advice;
};

struct io_madvise {
	struct file *file;
	u64 addr;
	u32 len;
	u32 advice;
};

struct io_epoll {
	struct file *file;
	int epfd;
	int op;
	int fd;
	struct epoll_event event;
};

struct io_splice {
	struct file *file_out;
	struct file *file_in;
	loff_t off_out;
	loff_t off_in;
	u64 len;
	unsigned int flags;
};

struct io_provide_buf {
	struct file *file;
	__u64 addr;
	__u32 len;
	__u32 bgid;
	__u16 nbufs;
	__u16 bid;
};

struct io_statx {
	struct file *file;
	int dfd;
	unsigned int mask;
	unsigned int flags;
	const char __user *filename;
	struct statx __user *buffer;
};

struct io_shutdown {
	struct file *file;
	int how;
};

struct io_rename {
	struct file *file;
	int old_dfd;
	int new_dfd;
	struct filename *oldpath;
	struct filename *newpath;
	int flags;
};

struct io_unlink {
	struct file *file;
	int dfd;
	int flags;
	struct filename *filename;
};

struct io_mkdir {
	struct file *file;
	int dfd;
	umode_t mode;
	struct filename *filename;
};

struct io_symlink {
	struct file *file;
	int new_dfd;
	struct filename *oldpath;
	struct filename *newpath;
};

struct io_hardlink {
	struct file *file;
	int old_dfd;
	int new_dfd;
	struct filename *oldpath;
	struct filename *newpath;
	int flags;
};

struct io_async_connect {
	struct sockaddr_storage address;
};

struct io_async_msghdr {
	struct iovec fast_iov[UIO_FASTIOV];
	/* points to an allocated iov, if NULL we use fast_iov instead */
	struct iovec *free_iov;
	struct sockaddr __user *uaddr;
	struct msghdr msg;
	struct sockaddr_storage addr;
};

struct io_rw_state {
	struct iov_iter iter;
	struct iov_iter_state iter_state;
	struct iovec fast_iov[UIO_FASTIOV];
};

struct io_async_rw {
	struct io_rw_state s;
	const struct iovec *free_iovec;
	size_t bytes_done;
	struct wait_page_queue wpq;
};

enum {
	REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
	REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
	REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
	REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
	REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
	REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,

	/* first byte is taken by user flags, shift it to not overlap */
	REQ_F_FAIL_BIT = 8,
	REQ_F_INFLIGHT_BIT,
	REQ_F_CUR_POS_BIT,
	REQ_F_NOWAIT_BIT,
	REQ_F_LINK_TIMEOUT_BIT,
	REQ_F_NEED_CLEANUP_BIT,
	REQ_F_POLLED_BIT,
	REQ_F_BUFFER_SELECTED_BIT,
	REQ_F_COMPLETE_INLINE_BIT,
	REQ_F_REISSUE_BIT,
	REQ_F_CREDS_BIT,
	REQ_F_REFCOUNT_BIT,
	REQ_F_ARM_LTIMEOUT_BIT,
	REQ_F_ASYNC_DATA_BIT,
	/* keep async read/write and isreg together and in order */
	REQ_F_SUPPORT_NOWAIT_BIT,
	REQ_F_ISREG_BIT,

	/* not a real bit, just to check we're not overflowing the space */
	__REQ_F_LAST_BIT,
};

enum {
	/* ctx owns file */
	REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
	/* drain existing IO first */
	REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
	/* linked sqes */
	REQ_F_LINK = BIT(REQ_F_LINK_BIT),
	/* doesn't sever on completion < 0 */
	REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
	/* IOSQE_ASYNC */
	REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
	/* IOSQE_BUFFER_SELECT */
	REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),

	/* fail rest of links */
	REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
	/* on inflight list, should be cancelled and waited on exit reliably */
	REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
	/* read/write uses file position */
	REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
	/* must not punt to workers */
	REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
	/* has or had linked timeout */
	REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
	/* needs cleanup */
	REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
	/* already went through poll handler */
	REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
	/* buffer already selected */
	REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
	/* completion is deferred through io_comp_state */
	REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
	/* caller should reissue async */
	REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
	/* supports async reads/writes */
	REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
	/* regular file */
	REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
	/* has creds assigned */
	REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
	/* skip refcounting if not set */
	REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
	/* there is a linked timeout that has to be armed */
	REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
	/* ->async_data allocated */
	REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
};

struct async_poll {
	struct io_poll_iocb poll;
	struct io_poll_iocb *double_poll;
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);

struct io_task_work {
	union {
		struct io_wq_work_node node;
		struct llist_node fallback_node;
	};
	io_req_tw_func_t func;
};

enum {
	IORING_RSRC_FILE = 0,
	IORING_RSRC_BUFFER = 1,
};

/*
 * NOTE! Each of the iocb union members has the file pointer
 * as the first entry in their struct definition. So you can
 * access the file pointer through any of the sub-structs,
 * or directly as just 'file' in this struct.
 */
struct io_kiocb {
	union {
		struct file *file;
		struct io_rw rw;
		struct io_poll_iocb poll;
		struct io_poll_update poll_update;
		struct io_accept accept;
		struct io_sync sync;
		struct io_cancel cancel;
		struct io_timeout timeout;
		struct io_timeout_rem timeout_rem;
		struct io_connect connect;
		struct io_sr_msg sr_msg;
		struct io_open open;
		struct io_close close;
		struct io_rsrc_update rsrc_update;
		struct io_fadvise fadvise;
		struct io_madvise madvise;
		struct io_epoll epoll;
		struct io_splice splice;
		struct io_provide_buf pbuf;
		struct io_statx statx;
		struct io_shutdown shutdown;
		struct io_rename rename;
		struct io_unlink unlink;
		struct io_mkdir mkdir;
		struct io_symlink symlink;
		struct io_hardlink hardlink;
	};

	u8 opcode;
	/* polled IO has completed */
	u8 iopoll_completed;
	u16 buf_index;
	unsigned int flags;

	u64 user_data;
	s32 result;
	u32 cflags;

	struct io_ring_ctx *ctx;
	struct task_struct *task;

	struct percpu_ref *fixed_rsrc_refs;
	/* store used ubuf, so we can prevent reloading */
	struct io_mapped_ubuf *imu;

	/* used by request caches, completion batching and iopoll */
	struct io_wq_work_node comp_list;
	atomic_t refs;
	struct io_kiocb *link;
	struct io_task_work io_task_work;
	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
	struct hlist_node hash_node;
	/* internal polling, see IORING_FEAT_FAST_POLL */
	struct async_poll *apoll;
	/* opcode allocated if it needs to store data for async defer */
	void *async_data;
	struct io_wq_work work;
	/* custom credentials, valid IFF REQ_F_CREDS is set */
	const struct cred *creds;
	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
	struct io_buffer *kbuf;
};

struct io_tctx_node {
	struct list_head ctx_node;
	struct task_struct *task;
	struct io_ring_ctx *ctx;
};

struct io_defer_entry {
	struct list_head list;
	struct io_kiocb *req;
	u32 seq;
};

struct io_op_def {
	/* needs req->file assigned */
	unsigned needs_file : 1;
	/* should block plug */
	unsigned plug : 1;
	/* hash wq insertion if file is a regular file */
	unsigned hash_reg_file : 1;
	/* unbound wq insertion if file is a non-regular file */
	unsigned unbound_nonreg_file : 1;
	/* set if opcode supports polled "wait" */
	unsigned pollin : 1;
	unsigned pollout : 1;
	/* op supports buffer selection */
	unsigned buffer_select : 1;
	/* do prep async if is going to be punted */
	unsigned needs_async_setup : 1;
	/* opcode is not supported by this kernel */
	unsigned not_supported : 1;
	/* skip auditing */
	unsigned audit_skip : 1;
	/* size of async data needed, if any */
	unsigned short async_size;
};

static const struct io_op_def io_op_defs[] = {
	[IORING_OP_NOP] = {},
	[IORING_OP_READV] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
		.buffer_select = 1,
		.needs_async_setup = 1,
		.plug = 1,
		.audit_skip = 1,
		.async_size = sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITEV] = {
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
		.needs_async_setup = 1,
		.plug = 1,
		.audit_skip = 1,
		.async_size = sizeof(struct io_async_rw),
	},
	[IORING_OP_FSYNC] = {
		.needs_file = 1,
		.audit_skip = 1,
	},
	[IORING_OP_READ_FIXED] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
		.plug = 1,
		.audit_skip = 1,
		.async_size = sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE_FIXED] = {
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
		.plug = 1,
		.audit_skip = 1,
		.async_size = sizeof(struct io_async_rw),
	},
	[IORING_OP_POLL_ADD] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.audit_skip = 1,
	},
	[IORING_OP_POLL_REMOVE] = {
		.audit_skip = 1,
	},
	[IORING_OP_SYNC_FILE_RANGE] = {
		.needs_file = 1,
		.audit_skip = 1,
	},
	[IORING_OP_SENDMSG] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
		.needs_async_setup = 1,
		.async_size = sizeof(struct io_async_msghdr),
	},
	[IORING_OP_RECVMSG] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
		.buffer_select = 1,
		.needs_async_setup = 1,
		.async_size = sizeof(struct io_async_msghdr),
	},
	[IORING_OP_TIMEOUT] = {
		.audit_skip = 1,
		.async_size = sizeof(struct io_timeout_data),
	},
	[IORING_OP_TIMEOUT_REMOVE] = {
		/* used by timeout updates' prep() */
		.audit_skip = 1,
	},
	[IORING_OP_ACCEPT] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
	},
	[IORING_OP_ASYNC_CANCEL] = {
		.audit_skip = 1,
	},
	[IORING_OP_LINK_TIMEOUT] = {
		.audit_skip = 1,
		.async_size = sizeof(struct io_timeout_data),
	},
	[IORING_OP_CONNECT] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
		.needs_async_setup = 1,
		.async_size = sizeof(struct io_async_connect),
	},
	[IORING_OP_FALLOCATE] = {
		.needs_file = 1,
	},
	[IORING_OP_OPENAT] = {},
	[IORING_OP_CLOSE] = {},
	[IORING_OP_FILES_UPDATE] = {
		.audit_skip = 1,
	},
	[IORING_OP_STATX] = {
		.audit_skip = 1,
	},
	[IORING_OP_READ] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
		.buffer_select = 1,
		.plug = 1,
		.audit_skip = 1,
		.async_size = sizeof(struct io_async_rw),
	},
	[IORING_OP_WRITE] = {
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
		.plug = 1,
		.audit_skip = 1,
		.async_size = sizeof(struct io_async_rw),
	},
	[IORING_OP_FADVISE] = {
		.needs_file = 1,
		.audit_skip = 1,
	},
	[IORING_OP_MADVISE] = {},
	[IORING_OP_SEND] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollout = 1,
		.audit_skip = 1,
	},
	[IORING_OP_RECV] = {
		.needs_file = 1,
		.unbound_nonreg_file = 1,
		.pollin = 1,
		.buffer_select = 1,
		.audit_skip = 1,
	},
	[IORING_OP_OPENAT2] = {
	},
	[IORING_OP_EPOLL_CTL] = {
		.unbound_nonreg_file = 1,
		.audit_skip = 1,
	},
	[IORING_OP_SPLICE] = {
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
		.audit_skip = 1,
	},
	[IORING_OP_PROVIDE_BUFFERS] = {
		.audit_skip = 1,
	},
	[IORING_OP_REMOVE_BUFFERS] = {
		.audit_skip = 1,
	},
	[IORING_OP_TEE] = {
		.needs_file = 1,
		.hash_reg_file = 1,
		.unbound_nonreg_file = 1,
		.audit_skip = 1,
	},
	[IORING_OP_SHUTDOWN] = {
		.needs_file = 1,
	},
	[IORING_OP_RENAMEAT] = {},
	[IORING_OP_UNLINKAT] = {},
	[IORING_OP_MKDIRAT] = {},
	[IORING_OP_SYMLINKAT] = {},
	[IORING_OP_LINKAT] = {},
};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)

static bool io_disarm_next(struct io_kiocb *req);
static void io_uring_del_tctx_node(unsigned long index);
static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);
static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);

static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
				 s32 res, u32 cflags);
static void io_put_req(struct io_kiocb *req);
static void io_put_req_deferred(struct io_kiocb *req);
static void io_dismantle_req(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
				     struct io_uring_rsrc_update2 *up,
				     unsigned nr_args);
static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx,
				struct io_kiocb *req, int fd, bool fixed);
static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work);

static void io_req_task_queue(struct io_kiocb *req);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static int io_req_prep_async(struct io_kiocb *req);

static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
				 unsigned int issue_flags, u32 slot_index);
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);

static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);

static struct kmem_cache *req_cachep;

static const struct file_operations io_uring_fops;
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
	if (file->f_op == &io_uring_fops) {
		struct io_ring_ctx *ctx = file->private_data;

		return ctx->ring_sock->sk;
	}
#endif
	return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);

static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
	if (!*locked) {
		mutex_lock(&ctx->uring_lock);
		*locked = true;
	}
}

#define io_for_each_link(pos, head) \
	for (pos = (head); pos; pos = pos->link)

/*
 * Shamelessly stolen from the mm implementation of page reference checking,
 * see commit f958d7b528b1 for details.
 */
#define req_ref_zero_or_close_to_overflow(req)	\
	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)

static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	return atomic_inc_not_zero(&req->refs);
}

static inline bool req_ref_put_and_test(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_REFCOUNT)))
		return true;

	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	return atomic_dec_and_test(&req->refs);
}

static inline void req_ref_put(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	WARN_ON_ONCE(req_ref_put_and_test(req));
}

static inline void req_ref_get(struct io_kiocb *req)
{
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
	atomic_inc(&req->refs);
}

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs))
		__io_submit_flush_completions(ctx);
}

static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
{
	if (!(req->flags & REQ_F_REFCOUNT)) {
		req->flags |= REQ_F_REFCOUNT;
		atomic_set(&req->refs, nr);
	}
}

static inline void io_req_set_refcount(struct io_kiocb *req)
{
	__io_req_set_refcount(req, 1);
}

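/*
 * Added note (editor's commentary): requests are not reference counted unless
 * something actually needs a reference (e.g. linked timeouts or poll arming).
 * io_req_set_refcount() flips a request into refcounted mode on first use;
 * req_ref_put_and_test() above treats non-refcounted requests as always being
 * the last reference.
 */
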
#define IO_RSRC_REF_BATCH	100

static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
					  struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct percpu_ref *ref = req->fixed_rsrc_refs;

	if (ref) {
		if (ref == &ctx->rsrc_node->refs)
			ctx->rsrc_cached_refs++;
		else
			percpu_ref_put(ref);
	}
}

static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	if (req->fixed_rsrc_refs)
		percpu_ref_put(req->fixed_rsrc_refs);
}

static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	if (ctx->rsrc_cached_refs) {
		percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
		ctx->rsrc_cached_refs = 0;
	}
}

static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}
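
/*
 * Added note (editor's commentary): instead of taking one percpu ref per
 * request on the current rsrc node, the ctx pre-charges IO_RSRC_REF_BATCH
 * refs and hands them out via ctx->rsrc_cached_refs under ->uring_lock,
 * refilling (above) when the cache dips below zero and returning the surplus
 * in io_rsrc_refs_drop(). That turns per-request get/put pairs into roughly
 * one batched percpu_ref operation per hundred requests.
 */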

static inline void io_req_set_rsrc_node(struct io_kiocb *req,
					struct io_ring_ctx *ctx)
{
	if (!req->fixed_rsrc_refs) {
		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
		ctx->rsrc_cached_refs--;
		if (unlikely(ctx->rsrc_cached_refs < 0))
			io_rsrc_refs_refill(ctx);
	}
}

static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
{
	bool got = percpu_ref_tryget(ref);

	/* already at zero, wait for ->release() */
	if (!got)
		wait_for_completion(compl);
	percpu_ref_resurrect(ref);
	if (got)
		percpu_ref_put(ref);
}

static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
			  bool cancel_all)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_kiocb *req;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			       bool cancel_all)
{
	bool matched;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
}

static inline bool req_has_async_data(struct io_kiocb *req)
{
	return req->flags & REQ_F_ASYNC_DATA;
}

static inline void req_set_fail(struct io_kiocb *req)
{
	req->flags |= REQ_F_FAIL;
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	req->result = res;
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
	return !req->timeout.off;
}

static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	bool locked = false;

	percpu_ref_get(&ctx->refs);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
		req->io_task_work.func(req, &locked);

	if (locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
	}
	percpu_ref_put(&ctx->refs);
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread.
	 */
	hash_bits = ilog2(p->cq_entries);
	hash_bits -= 5;
	if (hash_bits <= 0)
		hash_bits = 1;
	ctx->cancel_hash_bits = hash_bits;
	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
				   GFP_KERNEL);
	if (!ctx->cancel_hash)
		goto err;
	__hash_init(ctx->cancel_hash, 1U << hash_bits);

	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
	if (!ctx->dummy_ubuf)
		goto err;
	/* set invalid range, so io_import_fixed() fails meeting it */
	ctx->dummy_ubuf->ubuf = -1UL;

	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	spin_lock_init(&ctx->rsrc_ref_lock);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
	init_llist_head(&ctx->rsrc_put_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	return ctx;
err:
	kfree(ctx->dummy_ubuf);
	kfree(ctx->cancel_hash);
	kfree(ctx);
	return NULL;
}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}
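
/*
 * Added note (editor's commentary, hedged): ->cq_extra roughly tracks CQEs
 * posted without a matching submission, and is decremented here when a
 * completion is lost to overflow, so the drain sequencing arithmetic in
 * req_need_defer() below still lines up with the real CQ tail.
 */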

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

#define FFS_NOWAIT	0x1UL
#define FFS_ISREG	0x2UL
#define FFS_MASK	~(FFS_NOWAIT|FFS_ISREG)

static inline bool io_req_ffs_set(struct io_kiocb *req)
{
	return req->flags & REQ_F_FIXED_FILE;
}

static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&current->io_uring->inflight_tracked);
	}
}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->flags & REQ_F_ISREG) {
		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}

	switch (req->opcode) {
	case IORING_OP_SPLICE:
	case IORING_OP_TEE:
		if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
			req->work.flags |= IO_WQ_WORK_UNBOUND;
		break;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

static inline void io_req_add_compl_list(struct io_kiocb *req)
{
	struct io_submit_state *state = &req->ctx->submit_state;

	wq_list_add_tail(&req->comp_list, &state->compl_reqs);
}

static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
					&req->work, req->flags);
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static void io_kill_timeout(struct io_kiocb *req, int status)
	__must_hold(&req->ctx->completion_lock)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_timeout_data *io = req->async_data;

	if (hrtimer_try_to_cancel(&io->timer) != -1) {
		if (status)
			req_set_fail(req);
		atomic_set(&req->ctx->cq_timeouts,
			   atomic_read(&req->ctx->cq_timeouts) + 1);
		list_del_init(&req->timeout.list);
		io_cqring_fill_event(req->ctx, req->user_data, status, 0);
		io_put_req_deferred(req);
	}
}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}

static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
	__must_hold(&ctx->completion_lock)
{
	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);

	spin_lock_irq(&ctx->timeout_lock);
	while (!list_empty(&ctx->timeout_list)) {
		u32 events_needed, events_got;
		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
						struct io_kiocb, timeout.list);

		if (io_is_timeout_noseq(req))
			break;

		/*
		 * Since seq can easily wrap around over time, subtract
		 * the last seq at which timeouts were flushed before comparing.
		 * Assuming not more than 2^31-1 events have happened since,
		 * these subtractions won't have wrapped, so we can check if
		 * target is in [last_seq, current_seq] by comparing the two.
		 */
		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
		events_got = seq - ctx->cq_last_tm_flush;
		if (events_got < events_needed)
			break;

		list_del_init(&req->timeout.list);
		io_kill_timeout(req, 0);
	}
	ctx->cq_last_tm_flush = seq;
	spin_unlock_irq(&ctx->timeout_lock);
}

static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->off_timeout_used)
		io_flush_timeouts(ctx);
	if (ctx->drain_active)
		io_queue_deferred(ctx);
}

static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
	if (unlikely(ctx->off_timeout_used || ctx->drain_active))
		__io_commit_cqring_flush(ctx);

	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
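
/*
 * Added illustration (editor's note, not from the original source): the
 * smp_store_release() of cq.tail above pairs with the acquire load userspace
 * performs on the tail, roughly:
 *
 *	tail = smp_load_acquire(cq_tail);	(userspace)
 *	cqe = &cqes[head & mask];
 *
 * so every CQE store made before io_commit_cqring() is guaranteed visible by
 * the time the new tail value is observed.
 */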

static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}
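
/*
 * Added worked example (editor's note): unsigned u32 arithmetic makes the
 * fullness check safe across wraparound. With sq_entries == 8, a tail of
 * 0x00000005 and a cached head of 0xfffffffd give tail - head == 8 ==
 * sq_entries, so the ring is correctly seen as full even though tail < head
 * numerically.
 */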

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned tail, mask = ctx->cq_entries - 1;

	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return NULL;

	tail = ctx->cached_cq_tail++;
	return &rings->cqes[tail & mask];
}
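
/*
 * Added note (editor's commentary): cq_entries is a power of two, so
 * "tail & mask" with mask == cq_entries - 1 is the cheap equivalent of
 * "tail % cq_entries"; e.g. with 256 entries, tail 260 maps to slot
 * 260 & 255 == 4.
 */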

static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
	if (likely(!ctx->cq_ev_fd))
		return false;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		return false;
	return !ctx->eventfd_async || io_wq_current_is_worker();
}

/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and number of CQEs posted to the CQ ring.
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	/*
	 * wake_up_all() may seem excessive, but io_wake_function() and
	 * io_should_wake() handle the termination of the loop and only
	 * wake as many waiters as we need to.
	 */
	if (wq_has_sleeper(&ctx->cq_wait))
		wake_up_all(&ctx->cq_wait);
	if (io_should_trigger_evfd(ctx))
		eventfd_signal(ctx->cq_ev_fd, 1);
}

static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
	/* see waitqueue_active() comment */
	smp_mb();

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (waitqueue_active(&ctx->cq_wait))
			wake_up_all(&ctx->cq_wait);
	}
	if (io_should_trigger_evfd(ctx))
		eventfd_signal(ctx->cq_ev_fd, 1);
}

/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	bool all_flushed, posted;

	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;

	posted = false;
	spin_lock(&ctx->completion_lock);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
		else
			io_account_cq_overflow(ctx);

		posted = true;
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		clear_bit(0, &ctx->check_cq_overflow);
		WRITE_ONCE(ctx->rings->sq_flags,
			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
	}

	if (posted)
		io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
	return all_flushed;
}

static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	bool ret = true;

	if (test_bit(0, &ctx->check_cq_overflow)) {
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
		ret = __io_cqring_overflow_flush(ctx, false);
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}

	return ret;
}

/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
	struct io_uring_task *tctx = task->io_uring;

	if (likely(task == current)) {
		tctx->cached_refs += nr;
	} else {
		percpu_counter_sub(&tctx->inflight, nr);
		if (unlikely(atomic_read(&tctx->in_idle)))
			wake_up(&tctx->wait);
		put_task_struct_many(task, nr);
	}
}

static void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx = current->io_uring;

	tctx->cached_refs -= nr;
	if (unlikely(tctx->cached_refs < 0))
		io_task_refs_refill(tctx);
}
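
/*
 * Added note (editor's commentary): same batching idea as the rsrc refs
 * above, applied to task references. The submitting task pre-charges
 * IO_TCTX_REFS_CACHE_NR refs into tctx->cached_refs, and each submitted
 * request only decrements the cache, avoiding a percpu_counter and refcount
 * operation per request; io_put_task() returns refs to the same cache.
 */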

static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags)
{
	struct io_overflow_cqe *ocqe;

	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel
		 * mode, or cannot allocate an overflow entry, then we need to
		 * drop it on the floor.
		 */
		io_account_cq_overflow(ctx);
		return false;
	}
	if (list_empty(&ctx->cq_overflow_list)) {
		set_bit(0, &ctx->check_cq_overflow);
		WRITE_ONCE(ctx->rings->sq_flags,
			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);

	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}

static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
					  s32 res, u32 cflags)
{
	struct io_uring_cqe *cqe;

	trace_io_uring_complete(ctx, user_data, res, cflags);

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	cqe = io_get_cqe(ctx);
	if (likely(cqe)) {
		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);
		return true;
	}
	return io_cqring_event_overflow(ctx, user_data, res, cflags);
}

/* not as hot to bloat with inlining */
static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
					  s32 res, u32 cflags)
{
	return __io_cqring_fill_event(ctx, user_data, res, cflags);
}

static void io_req_complete_post(struct io_kiocb *req, s32 res,
				 u32 cflags)
{
	struct io_ring_ctx *ctx = req->ctx;

	spin_lock(&ctx->completion_lock);
	__io_cqring_fill_event(ctx, req->user_data, res, cflags);
	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_req_put_rsrc(req, ctx);
		io_dismantle_req(req);
		io_put_task(req->task, 1);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);
}

static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
					 u32 cflags)
{
	req->result = res;
	req->cflags = cflags;
	req->flags |= REQ_F_COMPLETE_INLINE;
}

static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
				     s32 res, u32 cflags)
{
	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
		io_req_complete_state(req, res, cflags);
	else
		io_req_complete_post(req, res, cflags);
}

static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
	__io_req_complete(req, 0, res, 0);
}

static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
	req_set_fail(req);
	io_req_complete_post(req, res, 0);
}

static void io_req_complete_fail_submit(struct io_kiocb *req)
{
	/*
	 * We don't submit, fail them all, for that replace hardlinks with
	 * normal links. Extra REQ_F_LINK is tolerated.
	 */
	req->flags &= ~REQ_F_HARDLINK;
	req->flags |= REQ_F_LINK;
	io_req_complete_failed(req, req->result);
}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	req->result = 0;
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

/* Returns true IFF there are requests in the cache */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;

	/*
	 * If we have more than a batch's worth of requests in our irq side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
		io_flush_cached_locked_reqs(ctx, state);
	return !!state->free_list.next;
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *reqs[IO_REQ_ALLOC_BATCH];
	struct io_kiocb *req;
	int ret, i;

	if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
		return true;

	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);

	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
	 */
	if (unlikely(ret <= 0)) {
		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
		if (!reqs[0])
			return false;
		ret = 1;
	}

	percpu_ref_get_many(&ctx->refs, ret);
	for (i = 0; i < ret; i++) {
		req = reqs[i];

		io_preinit_req(req, ctx);
		wq_stack_add_head(&req->comp_list, &state->free_list);
	}
	return true;
}

static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
{
	if (unlikely(!ctx->submit_state.free_list.next))
		return __io_alloc_req_refill(ctx);
	return true;
}

static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
{
	struct io_wq_work_node *node;

	node = wq_stack_extract(&ctx->submit_state.free_list);
	return container_of(node, struct io_kiocb, comp_list);
}

static inline void io_put_file(struct file *file)
{
	if (file)
		fput(file);
}

static inline void io_dismantle_req(struct io_kiocb *req)
{
	unsigned int flags = req->flags;

	if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
		io_clean_op(req);
	if (!(flags & REQ_F_FIXED_FILE))
		io_put_file(req->file);
}

static __cold void __io_free_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_req_put_rsrc(req, ctx);
	io_dismantle_req(req);
	io_put_task(req->task, 1);

	spin_lock(&ctx->completion_lock);
	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
	ctx->locked_free_nr++;
	spin_unlock(&ctx->completion_lock);
}

static inline void io_remove_next_linked(struct io_kiocb *req)
{
	struct io_kiocb *nxt = req->link;

	req->link = nxt->link;
	nxt->link = NULL;
}

static bool io_kill_linked_timeout(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
	__must_hold(&req->ctx->timeout_lock)
{
	struct io_kiocb *link = req->link;

	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
		struct io_timeout_data *io = link->async_data;

		io_remove_next_linked(req);
		link->timeout.head = NULL;
		if (hrtimer_try_to_cancel(&io->timer) != -1) {
			list_del(&link->timeout.list);
			io_cqring_fill_event(link->ctx, link->user_data,
					     -ECANCELED, 0);
			io_put_req_deferred(link);
			return true;
		}
	}
	return false;
}

static void io_fail_links(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_kiocb *nxt, *link = req->link;

	req->link = NULL;
	while (link) {
		long res = -ECANCELED;

		if (link->flags & REQ_F_FAIL)
			res = link->result;

		nxt = link->link;
		link->link = NULL;

		trace_io_uring_fail_link(req, link);
		io_cqring_fill_event(link->ctx, link->user_data, res, 0);
		io_put_req_deferred(link);
		link = nxt;
	}
}

static bool io_disarm_next(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	bool posted = false;

	if (req->flags & REQ_F_ARM_LTIMEOUT) {
		struct io_kiocb *link = req->link;

		req->flags &= ~REQ_F_ARM_LTIMEOUT;
		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
			io_remove_next_linked(req);
			io_cqring_fill_event(link->ctx, link->user_data,
					     -ECANCELED, 0);
			io_put_req_deferred(link);
			posted = true;
		}
	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		posted = io_kill_linked_timeout(req);
		spin_unlock_irq(&ctx->timeout_lock);
	}
	if (unlikely((req->flags & REQ_F_FAIL) &&
		     !(req->flags & REQ_F_HARDLINK))) {
		posted |= (req->link != NULL);
		io_fail_links(req);
	}
	return posted;
}

static void __io_req_find_next_prep(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool posted;

	spin_lock(&ctx->completion_lock);
	posted = io_disarm_next(req);
	if (posted)
		io_commit_cqring(req->ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt;

	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
		return NULL;
	/*
	 * If LINK is set, we have dependent requests in this chain. If we
	 * didn't fail this request, queue the first one up, moving any other
	 * dependencies to the next request. In case of failure, fail the rest
	 * of the chain.
	 */
	if (unlikely(req->flags & IO_DISARM_MASK))
		__io_req_find_next_prep(req);
	nxt = req->link;
	req->link = NULL;
	return nxt;
}

static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
	if (!ctx)
		return;
	if (*locked) {
		io_submit_flush_completions(ctx);
		mutex_unlock(&ctx->uring_lock);
		*locked = false;
	}
	percpu_ref_put(&ctx->refs);
}

static void tctx_task_work(struct callback_head *cb)
{
	bool locked = false;
	struct io_ring_ctx *ctx = NULL;
	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
						  task_work);

	while (1) {
		struct io_wq_work_node *node;

		if (!tctx->task_list.first && locked)
			io_submit_flush_completions(ctx);

		spin_lock_irq(&tctx->task_lock);
		node = tctx->task_list.first;
		INIT_WQ_LIST(&tctx->task_list);
		if (!node)
			tctx->task_running = false;
		spin_unlock_irq(&tctx->task_lock);
		if (!node)
			break;

		do {
			struct io_wq_work_node *next = node->next;
			struct io_kiocb *req = container_of(node, struct io_kiocb,
							    io_task_work.node);

			if (req->ctx != ctx) {
				ctx_flush_and_put(ctx, &locked);
				ctx = req->ctx;
				/* if not contended, grab and improve batching */
				locked = mutex_trylock(&ctx->uring_lock);
				percpu_ref_get(&ctx->refs);
			}
			req->io_task_work.func(req, &locked);
			node = next;
		} while (node);

		cond_resched();
	}

	ctx_flush_and_put(ctx, &locked);
}

static void io_req_task_work_add(struct io_kiocb *req)
{
	struct task_struct *tsk = req->task;
	struct io_uring_task *tctx = tsk->io_uring;
	enum task_work_notify_mode notify;
	struct io_wq_work_node *node;
	unsigned long flags;
	bool running;

	WARN_ON_ONCE(!tctx);

	spin_lock_irqsave(&tctx->task_lock, flags);
	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
	running = tctx->task_running;
	if (!running)
		tctx->task_running = true;
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	/* task_work already pending, we're done */
	if (running)
		return;

	/*
	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
	 * processing task_work. There's no reliable way to tell if TWA_RESUME
	 * will do the job.
	 */
	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
	if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
		if (notify == TWA_NONE)
			wake_up_process(tsk);
		return;
	}

	spin_lock_irqsave(&tctx->task_lock, flags);
	tctx->task_running = false;
	node = tctx->task_list.first;
	INIT_WQ_LIST(&tctx->task_list);
	spin_unlock_irqrestore(&tctx->task_lock, flags);

	while (node) {
		req = container_of(node, struct io_kiocb, io_task_work.node);
		node = node->next;
		if (llist_add(&req->io_task_work.fallback_node,
			      &req->ctx->fallback_llist))
			schedule_delayed_work(&req->ctx->fallback_work, 1);
	}
}
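
/*
 * Added note (editor's commentary): if task_work_add() fails the target task
 * is exiting, so the queued entries are drained above into the ctx's
 * fallback_llist and executed from the fallback delayed work
 * (io_fallback_req_func) instead, guaranteeing the callbacks still run and
 * the requests are not leaked.
 */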

static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;

	/* not needed for normal modes, but SQPOLL depends on it */
	io_tw_lock(ctx, locked);
	io_req_complete_failed(req, req->result);
}

static void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;

	io_tw_lock(ctx, locked);
	/* req->task == current here, checking PF_EXITING is safe */
	if (likely(!(req->task->flags & PF_EXITING)))
		__io_queue_sqe(req);
	else
		io_req_complete_failed(req, -EFAULT);
}

static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
	req->result = ret;
	req->io_task_work.func = io_req_task_cancel;
	io_req_task_work_add(req);
}

static void io_req_task_queue(struct io_kiocb *req)
{
	req->io_task_work.func = io_req_task_submit;
	io_req_task_work_add(req);
}

static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_async_work;
	io_req_task_work_add(req);
}

static inline void io_queue_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt = io_req_find_next(req);

	if (nxt)
		io_req_task_queue(nxt);
}

static void io_free_req(struct io_kiocb *req)
{
	io_queue_next(req);
	__io_free_req(req);
}

static void io_free_req_work(struct io_kiocb *req, bool *locked)
{
	io_free_req(req);
}

static void io_free_batch_list(struct io_ring_ctx *ctx,
			       struct io_wq_work_node *node)
	__must_hold(&ctx->uring_lock)
{
	struct task_struct *task = NULL;
	int task_refs = 0;

	do {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

		if (unlikely(req->flags & REQ_F_REFCOUNT)) {
			node = req->comp_list.next;
			if (!req_ref_put_and_test(req))
				continue;
		}

		io_req_put_rsrc_locked(req, ctx);
		io_queue_next(req);
		io_dismantle_req(req);

		if (req->task != task) {
			if (task)
				io_put_task(task, task_refs);
			task = req->task;
			task_refs = 0;
		}
		task_refs++;
		node = req->comp_list.next;
		wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
	} while (node);

	if (task)
		io_put_task(task, task_refs);
}

static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_wq_work_node *node, *prev;
	struct io_submit_state *state = &ctx->submit_state;

	spin_lock(&ctx->completion_lock);
	wq_list_for_each(node, prev, &state->compl_reqs) {
		struct io_kiocb *req = container_of(node, struct io_kiocb,
						    comp_list);

		__io_cqring_fill_event(ctx, req->user_data, req->result,
				       req->cflags);
	}
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_ev_posted(ctx);

	io_free_batch_list(ctx, state->compl_reqs.first);
	INIT_WQ_LIST(&state->compl_reqs);
}

/*
 * Drop reference to request, return next in chain (if there is one) if this
 * was the last reference to this request.
 */
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
	struct io_kiocb *nxt = NULL;

	if (req_ref_put_and_test(req)) {
		nxt = io_req_find_next(req);
		__io_free_req(req);
	}
	return nxt;
}

static inline void io_put_req(struct io_kiocb *req)
{
	if (req_ref_put_and_test(req))
		io_free_req(req);
}

static inline void io_put_req_deferred(struct io_kiocb *req)
{
	if (req_ref_put_and_test(req)) {
		req->io_task_work.func = io_free_req_work;
		io_req_task_work_add(req);
	}
}

static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
	/* See comment at the top of this file */
	smp_rmb();
	return __io_cqring_events(ctx);
}

static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/* make sure SQ entry isn't read before tail */
	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
2465
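/*
 * Completion cflags for provided buffers encode the buffer ID in the upper
 * bits and set IORING_CQE_F_BUFFER so userspace knows a buffer was consumed.
 * For example, with IORING_CQE_BUFFER_SHIFT == 16, buffer ID 3 yields
 * cflags == (3 << 16) | IORING_CQE_F_BUFFER, and userspace recovers the ID
 * with cqe->flags >> IORING_CQE_BUFFER_SHIFT.
 */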
static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
{
	unsigned int cflags;

	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
	cflags |= IORING_CQE_F_BUFFER;
	req->flags &= ~REQ_F_BUFFER_SELECTED;
	kfree(kbuf);
	return cflags;
}

static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
		return 0;
	return io_put_kbuf(req, req->kbuf);
}

static inline bool io_run_task_work(void)
{
	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
		__set_current_state(TASK_RUNNING);
		tracehook_notify_signal();
		return true;
	}

	return false;
}

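/*
 * Polled (IORING_SETUP_IOPOLL) completion reaping happens in two passes
 * below: first each request's ->iopoll() is driven until something
 * completes, then completed entries are turned into CQEs strictly in
 * submission order, so a still-inflight request blocks CQE posting for
 * everything queued behind it on the iopoll list.
 */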
static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = BLK_POLL_NOSLEEP;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct kiocb *kiocb = &req->rw.kiocb;
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		__io_cqring_fill_event(ctx, req->user_data, req->result,
				       io_put_rw_kbuf(req));
		nr_events++;
	}

	if (unlikely(!nr_events))
		return 0;

	io_commit_cqring(ctx);
	io_cqring_ev_posted_iopoll(ctx);
	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);
	io_free_batch_list(ctx, pos);
	return nr_events;
}

/*
 * We can't just wait for polled events to come to us, we have to actively
 * find and complete them.
 */
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_IOPOLL))
		return;

	mutex_lock(&ctx->uring_lock);
	while (!wq_list_empty(&ctx->iopoll_list)) {
		/* let it sleep and repeat later if can't complete a request */
		if (io_do_iopoll(ctx, true) == 0)
			break;
		/*
		 * Ensure we allow local-to-the-cpu processing to take place,
		 * in this case we need to ensure that we reap all events.
		 * Also let task_work, etc. to progress by releasing the mutex.
		 */
		if (need_resched()) {
			mutex_unlock(&ctx->uring_lock);
			cond_resched();
			mutex_lock(&ctx->uring_lock);
		}
	}
	mutex_unlock(&ctx->uring_lock);
}

static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	/*
	 * Don't enter poll loop if we already have events pending.
	 * If we do, we can potentially be spinning for commands that
	 * already triggered a CQE (eg in error).
	 */
	if (test_bit(0, &ctx->check_cq_overflow))
		__io_cqring_overflow_flush(ctx, false);
	if (io_cqring_events(ctx))
		goto out;
	do {
		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * lock.
		 */
		if (wq_list_empty(&ctx->iopoll_list)) {
			u32 tail = ctx->cached_cq_tail;

			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);

			/* some requests don't go through iopoll_list */
			if (tail != ctx->cached_cq_tail ||
			    wq_list_empty(&ctx->iopoll_list))
				break;
		}
		ret = io_do_iopoll(ctx, !min);
		if (ret < 0)
			break;
		nr_events += ret;
		ret = 0;
	} while (nr_events < min && !need_resched());
out:
	mutex_unlock(&ctx->uring_lock);
	return ret;
}

static void kiocb_end_write(struct io_kiocb *req)
{
	/*
	 * Tell lockdep we inherited freeze protection from submission
	 * thread.
	 */
	if (req->flags & REQ_F_ISREG) {
		struct super_block *sb = file_inode(req->file)->i_sb;

		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
		sb_end_write(sb);
	}
}

#ifdef CONFIG_BLOCK
static bool io_resubmit_prep(struct io_kiocb *req)
{
	struct io_async_rw *rw = req->async_data;

	if (!req_has_async_data(req))
		return !io_req_prep_async(req);
	iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
	return true;
}

static bool io_rw_should_reissue(struct io_kiocb *req)
{
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_ring_ctx *ctx = req->ctx;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
	/*
	 * Play it safe and assume not safe to re-import and reissue if we're
	 * not in the original thread group (or in task context).
	 */
	if (!same_thread_group(req->task, current) || !in_task())
		return false;
	return true;
}
#else
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
#endif

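/*
 * The completion helpers below decide what to do when a rw completion
 * returns an unexpected result. A short -EAGAIN (or -EOPNOTSUPP) from the
 * lower layers can be replayed via REQ_F_REISSUE if io_rw_should_reissue()
 * deems it safe; anything else fails the request with the result recorded
 * for the CQE.
 */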
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
	if (req->rw.kiocb.ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (unlikely(res != req->result)) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE;
			return true;
		}
		req_set_fail(req);
		req->result = res;
	}
	return false;
}

static void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
	unsigned int cflags = io_put_rw_kbuf(req);
	int res = req->result;

	if (*locked) {
		io_req_complete_state(req, res, cflags);
		io_req_add_compl_list(req);
	} else {
		io_req_complete_post(req, res, cflags);
	}
}

static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
			     unsigned int issue_flags)
{
	if (__io_complete_rw_common(req, res))
		return;
	__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
}

static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (__io_complete_rw_common(req, res))
		return;
	req->result = res;
	req->io_task_work.func = io_req_task_complete;
	io_req_task_work_add(req);
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);

	if (kiocb->ki_flags & IOCB_WRITE)
		kiocb_end_write(req);
	if (unlikely(res != req->result)) {
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE;
			return;
		}
		req->result = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}

/*
 * After the iocb has been issued, it's safe to be found on the poll list.
 * Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from a io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	/* workqueue context doesn't hold uring_lock, grab it now */
	if (unlikely(needs_lock))
		mutex_lock(&ctx->uring_lock);

	/*
	 * Track whether we have multiple files in our lists. This will impact
	 * how we do polling eventually, not spinning if we're on potentially
	 * different devices.
	 */
	if (wq_list_empty(&ctx->iopoll_list)) {
		ctx->poll_multi_queue = false;
	} else if (!ctx->poll_multi_queue) {
		struct io_kiocb *list_req;

		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
					comp_list);
		if (list_req->file != req->file)
			ctx->poll_multi_queue = true;
	}

	/*
	 * For fast devices, IO may have already completed. If it has, add
	 * it to the front so we find it first.
	 */
	if (READ_ONCE(req->iopoll_completed))
		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
	else
		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);

	if (unlikely(needs_lock)) {
		/*
		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
		 * in sq thread task context or in io worker task context. If
		 * current task context is sq thread, we don't need to check
		 * whether we should wake up the sq thread.
		 */
		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
		    wq_has_sleeper(&ctx->sq_data->wait))
			wake_up(&ctx->sq_data->wait);

		mutex_unlock(&ctx->uring_lock);
	}
}

static bool io_bdev_nowait(struct block_device *bdev)
{
	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
}

/*
 * If we tracked the file through the SCM inflight mechanism, we could support
 * any file. For now, just ensure that anything potentially problematic is done
 * inline.
 */
static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
	if (S_ISBLK(mode)) {
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
			return true;
		return false;
	}
	if (S_ISSOCK(mode))
		return true;
	if (S_ISREG(mode)) {
		if (IS_ENABLED(CONFIG_BLOCK) &&
		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
		    file->f_op != &io_uring_fops)
			return true;
		return false;
	}

	/* any ->read/->write should understand O_NONBLOCK */
	if (file->f_flags & O_NONBLOCK)
		return true;
	return file->f_mode & FMODE_NOWAIT;
}

/*
 * Compute the per-file FFS_* flags that get cached in req->flags, so the
 * issue paths don't have to redo these checks for every request on the
 * same file.
 */
static unsigned int io_file_get_flags(struct file *file)
{
	umode_t mode = file_inode(file)->i_mode;
	unsigned int res = 0;

	if (S_ISREG(mode))
		res |= FFS_ISREG;
	if (__io_file_supports_nowait(file, mode))
		res |= FFS_NOWAIT;
	return res;
}

static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
	return req->flags & REQ_F_SUPPORT_NOWAIT;
}

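/*
 * io_prep_rw() below translates the shared SQE fields into a kiocb.
 * Roughly: sqe->off becomes ki_pos (-1 selects the current file position
 * for non-stream files), sqe->rw_flags maps the RWF_* flags, sqe->ioprio is
 * validated against the task's capabilities, and addr/len/buf_index are
 * stashed for the iovec import done at issue time.
 */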
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct kiocb *kiocb = &req->rw.kiocb;
	struct file *file = req->file;
	unsigned ioprio;
	int ret;

	if (!io_req_ffs_set(req))
		req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;

	kiocb->ki_pos = READ_ONCE(sqe->off);
	if (kiocb->ki_pos == -1) {
		if (!(file->f_mode & FMODE_STREAM)) {
			req->flags |= REQ_F_CUR_POS;
			kiocb->ki_pos = file->f_pos;
		} else {
			kiocb->ki_pos = 0;
		}
	}
	kiocb->ki_flags = iocb_flags(file);
	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
	if (unlikely(ret))
		return ret;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		kiocb->ki_ioprio = ioprio;
	} else {
		kiocb->ki_ioprio = get_current_ioprio();
	}

	req->imu = NULL;
	req->rw.addr = READ_ONCE(sqe->addr);
	req->rw.len = READ_ONCE(sqe->len);
	req->buf_index = READ_ONCE(sqe->buf_index);
	return 0;
}

static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * We can't just restart the syscall, since previously
		 * submitted sqes may already be in progress. Just fail this
		 * IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		kiocb->ki_complete(kiocb, ret);
	}
}

static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
		       unsigned int issue_flags)
{
	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		if (ret < 0)
			ret = io->bytes_done;
		else
			ret += io->bytes_done;
	}

	if (req->flags & REQ_F_CUR_POS)
		req->file->f_pos = kiocb->ki_pos;
	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
		__io_complete_rw(req, ret, 0, issue_flags);
	else
		io_rw_done(kiocb, ret);

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req)) {
			io_req_task_queue_reissue(req);
		} else {
			unsigned int cflags = io_put_rw_kbuf(req);
			struct io_ring_ctx *ctx = req->ctx;

			req_set_fail(req);
			if (issue_flags & IO_URING_F_UNLOCKED) {
				mutex_lock(&ctx->uring_lock);
				__io_req_complete(req, issue_flags, ret, cflags);
				mutex_unlock(&ctx->uring_lock);
			} else {
				__io_req_complete(req, issue_flags, ret, cflags);
			}
		}
	}
}

static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
			     struct io_mapped_ubuf *imu)
{
	size_t len = req->rw.len;
	u64 buf_end, buf_addr = req->rw.addr;
	size_t offset;

	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
		return -EFAULT;
	/* not inside the mapped region */
	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
		return -EFAULT;

	/*
	 * May not be a start of buffer, set size appropriately
	 * and advance us to the beginning.
	 */
	offset = buf_addr - imu->ubuf;
	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);

	if (offset) {
		/*
		 * Don't use iov_iter_advance() here, as it's really slow for
		 * using the latter parts of a big fixed buffer - it iterates
		 * over each segment manually. We can cheat a bit here, because
		 * we know that:
		 *
		 * 1) it's a BVEC iter, we set it up
		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
		 *    first and last bvec
		 *
		 * So just find our index, and adjust the iterator afterwards.
		 * If the offset is within the first bvec (or the whole first
		 * bvec), just use iov_iter_advance(). This makes it easier
		 * since we can just skip the first segment, which may not
		 * be PAGE_SIZE aligned.
		 */
		const struct bio_vec *bvec = imu->bvec;

		if (offset <= bvec->bv_len) {
			iov_iter_advance(iter, offset);
		} else {
			unsigned long seg_skip;

			/* skip first vec */
			offset -= bvec->bv_len;
			seg_skip = 1 + (offset >> PAGE_SHIFT);

			iter->bvec = bvec + seg_skip;
			iter->nr_segs -= seg_skip;
			iter->count -= bvec->bv_len + offset;
			iter->iov_offset = offset & ~PAGE_MASK;
		}
	}

	return 0;
}

static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
{
	struct io_mapped_ubuf *imu = req->imu;
	u16 index, buf_index = req->buf_index;

	if (likely(!imu)) {
		struct io_ring_ctx *ctx = req->ctx;

		if (unlikely(buf_index >= ctx->nr_user_bufs))
			return -EFAULT;
		io_req_set_rsrc_node(req, ctx);
		index = array_index_nospec(buf_index, ctx->nr_user_bufs);
		imu = READ_ONCE(ctx->user_bufs[index]);
		req->imu = imu;
	}
	return __io_import_fixed(req, rw, iter, imu);
}

static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
{
	if (needs_lock)
		mutex_unlock(&ctx->uring_lock);
}

static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
{
	/*
	 * "Normal" inline submissions always hold the uring_lock, since we
	 * grab it from the system call. Same is true for the SQPOLL offload.
	 * The only exception is when we've detached the request and issue it
	 * from an async worker thread, grab the lock for that case.
	 */
	if (needs_lock)
		mutex_lock(&ctx->uring_lock);
}

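/*
 * Provided-buffer selection: buffers are registered per buffer group ID
 * (bgid) via IORING_OP_PROVIDE_BUFFERS, and a request opts in with
 * IOSQE_BUFFER_SELECT plus sqe->buf_group. A hedged userspace sketch using
 * liburing (names are from liburing, not from this file):
 *
 *	io_uring_prep_provide_buffers(sqe, base, buf_len, nbufs, bgid, bid);
 *	...
 *	io_uring_prep_read(sqe, fd, NULL, buf_len, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = bgid;
 *
 * On completion the chosen buffer ID comes back in cqe->flags (see
 * io_put_kbuf() above). io_buffer_select() below is the kernel side: it
 * pops one io_buffer off the bgid list, or returns -ENOBUFS if the group
 * is empty.
 */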
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
					  int bgid, unsigned int issue_flags)
{
	struct io_buffer *kbuf = req->kbuf;
	struct io_buffer *head;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	if (req->flags & REQ_F_BUFFER_SELECTED)
		return kbuf;

	io_ring_submit_lock(req->ctx, needs_lock);

	lockdep_assert_held(&req->ctx->uring_lock);

	head = xa_load(&req->ctx->io_buffers, bgid);
	if (head) {
		if (!list_empty(&head->list)) {
			kbuf = list_last_entry(&head->list, struct io_buffer,
					       list);
			list_del(&kbuf->list);
		} else {
			kbuf = head;
			xa_erase(&req->ctx->io_buffers, bgid);
		}
		if (*len > kbuf->len)
			*len = kbuf->len;
		req->flags |= REQ_F_BUFFER_SELECTED;
		req->kbuf = kbuf;
	} else {
		kbuf = ERR_PTR(-ENOBUFS);
	}

	io_ring_submit_unlock(req->ctx, needs_lock);
	return kbuf;
}

static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
					unsigned int issue_flags)
{
	struct io_buffer *kbuf;
	u16 bgid;

	bgid = req->buf_index;
	kbuf = io_buffer_select(req, len, bgid, issue_flags);
	if (IS_ERR(kbuf))
		return kbuf;
	return u64_to_user_ptr(kbuf->addr);
}

#ifdef CONFIG_COMPAT
static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
				unsigned int issue_flags)
{
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;
	void __user *buf;
	ssize_t len;

	uiov = u64_to_user_ptr(req->rw.addr);
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	len = clen;
	buf = io_rw_buffer_select(req, &len, issue_flags);
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	iov[0].iov_base = buf;
	iov[0].iov_len = (compat_size_t) len;
	return 0;
}
#endif

static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
				      unsigned int issue_flags)
{
	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
	void __user *buf;
	ssize_t len;

	if (copy_from_user(iov, uiov, sizeof(*uiov)))
		return -EFAULT;

	len = iov[0].iov_len;
	if (len < 0)
		return -EINVAL;
	buf = io_rw_buffer_select(req, &len, issue_flags);
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	iov[0].iov_base = buf;
	iov[0].iov_len = len;
	return 0;
}

static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
				    unsigned int issue_flags)
{
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		struct io_buffer *kbuf = req->kbuf;

		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
		iov[0].iov_len = kbuf->len;
		return 0;
	}
	if (req->rw.len != 1)
		return -EINVAL;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return io_compat_import(req, iov, issue_flags);
#endif

	return __io_iov_buffer_select(req, iov, issue_flags);
}

static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
				       struct io_rw_state *s,
				       unsigned int issue_flags)
{
	struct iov_iter *iter = &s->iter;
	u8 opcode = req->opcode;
	struct iovec *iovec;
	void __user *buf;
	size_t sqe_len;
	ssize_t ret;

	BUILD_BUG_ON(ERR_PTR(0) != NULL);

	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
		return ERR_PTR(io_import_fixed(req, rw, iter));

	/* buffer index only valid with fixed read/write, or buffer select */
	if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
		return ERR_PTR(-EINVAL);

	buf = u64_to_user_ptr(req->rw.addr);
	sqe_len = req->rw.len;

	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
		if (req->flags & REQ_F_BUFFER_SELECT) {
			buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
			if (IS_ERR(buf))
				return ERR_CAST(buf);
			req->rw.len = sqe_len;
		}

		ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
		return ERR_PTR(ret);
	}

	iovec = s->fast_iov;
	if (req->flags & REQ_F_BUFFER_SELECT) {
		ret = io_iov_buffer_select(req, iovec, issue_flags);
		if (!ret)
			iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
		return ERR_PTR(ret);
	}

	ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
			     req->ctx->compat);
	if (unlikely(ret < 0))
		return ERR_PTR(ret);
	return iovec;
}

static inline int io_import_iovec(int rw, struct io_kiocb *req,
				  struct iovec **iovec, struct io_rw_state *s,
				  unsigned int issue_flags)
{
	*iovec = __io_import_iovec(rw, req, s, issue_flags);
	if (unlikely(IS_ERR(*iovec)))
		return PTR_ERR(*iovec);

	iov_iter_save_state(&s->iter, &s->iter_state);
	return 0;
}

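/*
 * The import helpers above always snapshot the iterator with
 * iov_iter_save_state() right after a successful import; retry paths then
 * use iov_iter_restore() to rewind before reissuing, since ->read_iter()
 * and ->write_iter() may leave the iterator in an unspecified state on
 * error or partial progress.
 */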
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 */
static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
{
	struct kiocb *kiocb = &req->rw.kiocb;
	struct file *file = req->file;
	ssize_t ret = 0;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;

	while (iov_iter_count(iter)) {
		struct iovec iovec;
		ssize_t nr;

		if (!iov_iter_is_bvec(iter)) {
			iovec = iov_iter_iovec(iter);
		} else {
			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
			iovec.iov_len = req->rw.len;
		}

		if (rw == READ) {
			nr = file->f_op->read(file, iovec.iov_base,
					      iovec.iov_len, io_kiocb_ppos(kiocb));
		} else {
			nr = file->f_op->write(file, iovec.iov_base,
					       iovec.iov_len, io_kiocb_ppos(kiocb));
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			req->rw.len -= nr;
			req->rw.addr += nr;
		}
		ret += nr;
		if (nr != iovec.iov_len)
			break;
	}

	return ret;
}

static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *rw = req->async_data;

	memcpy(&rw->s.iter, iter, sizeof(*iter));
	rw->free_iovec = iovec;
	rw->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		rw->s.iter.iov = rw->s.fast_iov;
		if (iter->iov != fast_iov) {
			iov_off = iter->iov - fast_iov;
			rw->s.iter.iov += iov_off;
		}
		if (rw->s.fast_iov != fast_iov)
			memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}

static inline bool io_alloc_async_data(struct io_kiocb *req)
{
	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
	if (req->async_data) {
		req->flags |= REQ_F_ASYNC_DATA;
		return false;
	}
	return true;
}

static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     struct io_rw_state *s, bool force)
{
	if (!force && !io_op_defs[req->opcode].needs_async_setup)
		return 0;
	if (!req_has_async_data(req)) {
		struct io_async_rw *iorw;

		if (io_alloc_async_data(req)) {
			kfree(iovec);
			return -ENOMEM;
		}

		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
		iorw = req->async_data;
		/* we've copied and mapped the iter, ensure state is saved */
		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
	}
	return 0;
}

static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
	struct io_async_rw *iorw = req->async_data;
	struct iovec *iov;
	int ret;

	/* submission path, ->uring_lock should already be taken */
	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
	if (unlikely(ret < 0))
		return ret;

	iorw->bytes_done = 0;
	iorw->free_iovec = iov;
	if (iov)
		req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (unlikely(!(req->file->f_mode & FMODE_READ)))
		return -EBADF;
	return io_prep_rw(req, sqe);
}

/*
 * This is our waitqueue callback handler, registered through the page wait
 * queue when we initially tried to do the IO with the iocb armed with our
 * waitqueue. This gets called when the page is unlocked, which we generally
 * expect to happen when the page IO is completed and the page is now
 * uptodate. If the wake matches our request, disarm IOCB_WAITQ and queue
 * the request for a task_work based retry of the buffered read.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wpq, key))
		return 0;

	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);
	io_req_task_queue(req);
	return 1;
}

/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or we
 * get a callback when the page is unlocked and can retry the IO directly
 * from task context.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct io_async_rw *rw = req->async_data;
	struct wait_page_queue *wait = &rw->wpq;
	struct kiocb *kiocb = &req->rw.kiocb;

	/* never retry for NOWAIT, we just complete with -EAGAIN */
	if (req->flags & REQ_F_NOWAIT)
		return false;

	/* Only for buffered IO */
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
		return false;

	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
		return false;

	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
	kiocb->ki_flags &= ~IOCB_NOWAIT;
	kiocb->ki_waitq = wait;
	return true;
}

static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
{
	if (likely(req->file->f_op->read_iter))
		return call_read_iter(req->file, &req->rw.kiocb, iter);
	else if (req->file->f_op->read)
		return loop_rw_iter(READ, req, iter);
	else
		return -EINVAL;
}

static bool need_read_all(struct io_kiocb *req)
{
	return req->flags & REQ_F_ISREG ||
		S_ISBLK(file_inode(req->file)->i_mode);
}

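/*
 * io_read() below implements the buffered-read retry dance: try a nowait
 * read first; on a short read for a regular file or block device, copy the
 * iovec into async data and either arm a page-unlock callback
 * (io_rw_should_retry()) or punt to io-wq with -EAGAIN, accumulating
 * partial progress in rw->bytes_done across attempts.
 */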
static int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &req->rw.kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_async_rw *rw;
	ssize_t ret, ret2;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		rw = req->async_data;
		s = &rw->s;
		/*
		 * We come here from an earlier attempt, restore our state to
		 * match in case it doesn't. It's cheap enough that we don't
		 * need to make this conditional.
		 */
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	req->result = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req))) {
			ret = io_setup_async_rw(req, iovec, s, true);
			return ret ?: -EAGAIN;
		}
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(req, &s->iter);

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		goto out_free;
	} else if (ret == req->result || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}

	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&s->iter, &s->iter_state);

	ret2 = io_setup_async_rw(req, iovec, s, true);
	if (ret2)
		return ret2;

	iovec = NULL;
	rw = req->async_data;
	s = &rw->s;
	/*
	 * Now use our persistent iterator and state, if we aren't already.
	 * We've restored and mapped the iter to match.
	 */

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&s->iter, ret);
		if (!iov_iter_count(&s->iter))
			break;
		rw->bytes_done += ret;
		iov_iter_save_state(&s->iter, &s->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(req, &s->iter);
		if (ret == -EIOCBQUEUED)
			return 0;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&s->iter, &s->iter_state);
	} while (ret > 0);
done:
	kiocb_done(kiocb, ret, issue_flags);
out_free:
	/* it's faster to check here than delegate to kfree */
	if (iovec)
		kfree(iovec);
	return 0;
}

static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
		return -EBADF;
	req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
	return io_prep_rw(req, sqe);
}

static int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &req->rw.kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	ssize_t ret, ret2;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		struct io_async_rw *rw = req->async_data;

		s = &rw->s;
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	req->result = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req)))
			goto copy_iov;

		/* file path doesn't support NOWAIT for non-direct_IO */
		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
		    (req->flags & REQ_F_ISREG))
			goto copy_iov;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
	if (unlikely(ret))
		goto out_free;

	/*
	 * Open-code file_start_write here to grab freeze protection,
	 * which will be released by another thread in
	 * io_complete_rw().  Fool lockdep by telling it the lock got
	 * released so that it doesn't complain about the held lock when
	 * we return to userspace.
	 */
	if (req->flags & REQ_F_ISREG) {
		sb_start_write(file_inode(req->file)->i_sb);
		__sb_writers_release(file_inode(req->file)->i_sb,
				     SB_FREEZE_WRITE);
	}
	kiocb->ki_flags |= IOCB_WRITE;

	if (likely(req->file->f_op->write_iter))
		ret2 = call_write_iter(req->file, kiocb, &s->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, req, &s->iter);
	else
		ret2 = -EINVAL;

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto copy_iov;
done:
		kiocb_done(kiocb, ret2, issue_flags);
	} else {
copy_iov:
		iov_iter_restore(&s->iter, &s->iter_state);
		ret = io_setup_async_rw(req, iovec, s, false);
		return ret ?: -EAGAIN;
	}
out_free:
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}

static int io_renameat_prep(struct io_kiocb *req,
			    const struct io_uring_sqe *sqe)
{
	struct io_rename *ren = &req->rename;
	const char __user *oldf, *newf;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	if (unlikely(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	ren->old_dfd = READ_ONCE(sqe->fd);
	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	ren->new_dfd = READ_ONCE(sqe->len);
	ren->flags = READ_ONCE(sqe->rename_flags);

	ren->oldpath = getname(oldf);
	if (IS_ERR(ren->oldpath))
		return PTR_ERR(ren->oldpath);

	ren->newpath = getname(newf);
	if (IS_ERR(ren->newpath)) {
		putname(ren->oldpath);
		return PTR_ERR(ren->newpath);
	}

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rename *ren = &req->rename;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
			   ren->newpath, ren->flags);

	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_unlinkat_prep(struct io_kiocb *req,
			    const struct io_uring_sqe *sqe)
{
	struct io_unlink *un = &req->unlink;
	const char __user *fname;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
	    sqe->splice_fd_in)
		return -EINVAL;
	if (unlikely(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	un->dfd = READ_ONCE(sqe->fd);

	un->flags = READ_ONCE(sqe->unlink_flags);
	if (un->flags & ~AT_REMOVEDIR)
		return -EINVAL;

	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
	un->filename = getname(fname);
	if (IS_ERR(un->filename))
		return PTR_ERR(un->filename);

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_unlink *un = &req->unlink;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	if (un->flags & AT_REMOVEDIR)
		ret = do_rmdir(un->dfd, un->filename);
	else
		ret = do_unlinkat(un->dfd, un->filename);

	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_mkdirat_prep(struct io_kiocb *req,
			   const struct io_uring_sqe *sqe)
{
	struct io_mkdir *mkd = &req->mkdir;
	const char __user *fname;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
	    sqe->splice_fd_in)
		return -EINVAL;
	if (unlikely(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	mkd->dfd = READ_ONCE(sqe->fd);
	mkd->mode = READ_ONCE(sqe->len);

	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
	mkd->filename = getname(fname);
	if (IS_ERR(mkd->filename))
		return PTR_ERR(mkd->filename);

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_mkdir *mkd = &req->mkdir;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);

	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_symlinkat_prep(struct io_kiocb *req,
			     const struct io_uring_sqe *sqe)
{
	struct io_symlink *sl = &req->symlink;
	const char __user *oldpath, *newpath;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
	    sqe->splice_fd_in)
		return -EINVAL;
	if (unlikely(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	sl->new_dfd = READ_ONCE(sqe->fd);
	oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
	newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));

	sl->oldpath = getname(oldpath);
	if (IS_ERR(sl->oldpath))
		return PTR_ERR(sl->oldpath);

	sl->newpath = getname(newpath);
	if (IS_ERR(sl->newpath)) {
		putname(sl->oldpath);
		return PTR_ERR(sl->newpath);
	}

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_symlink *sl = &req->symlink;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);

	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_linkat_prep(struct io_kiocb *req,
			  const struct io_uring_sqe *sqe)
{
	struct io_hardlink *lnk = &req->hardlink;
	const char __user *oldf, *newf;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	if (unlikely(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	lnk->old_dfd = READ_ONCE(sqe->fd);
	lnk->new_dfd = READ_ONCE(sqe->len);
	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	lnk->flags = READ_ONCE(sqe->hardlink_flags);

	lnk->oldpath = getname(oldf);
	if (IS_ERR(lnk->oldpath))
		return PTR_ERR(lnk->oldpath);

	lnk->newpath = getname(newf);
	if (IS_ERR(lnk->newpath)) {
		putname(lnk->oldpath);
		return PTR_ERR(lnk->newpath);
	}

	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_hardlink *lnk = &req->hardlink;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
			lnk->newpath, lnk->flags);

	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_shutdown_prep(struct io_kiocb *req,
			    const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	req->shutdown.how = READ_ONCE(sqe->len);
	return 0;
#else
	return -EOPNOTSUPP;
#endif
}

static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
#if defined(CONFIG_NET)
	struct socket *sock;
	int ret;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, req->shutdown.how);
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
#else
	return -EOPNOTSUPP;
#endif
}

static int __io_splice_prep(struct io_kiocb *req,
			    const struct io_uring_sqe *sqe)
{
	struct io_splice *sp = &req->splice;
	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	sp->file_in = NULL;
	sp->len = READ_ONCE(sqe->len);
	sp->flags = READ_ONCE(sqe->splice_flags);

	if (unlikely(sp->flags & ~valid_flags))
		return -EINVAL;

	sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
				  (sp->flags & SPLICE_F_FD_IN_FIXED));
	if (!sp->file_in)
		return -EBADF;
	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_tee_prep(struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
{
	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
		return -EINVAL;
	return __io_splice_prep(req, sqe);
}

static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_splice *sp = &req->splice;
	struct file *in = sp->file_in;
	struct file *out = sp->file_out;
	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
	long ret = 0;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;
	if (sp->len)
		ret = do_tee(in, out, sp->len, flags);

	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
		io_put_file(in);
	req->flags &= ~REQ_F_NEED_CLEANUP;

	if (ret != sp->len)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_splice *sp = &req->splice;

	sp->off_in = READ_ONCE(sqe->splice_off_in);
	sp->off_out = READ_ONCE(sqe->off);
	return __io_splice_prep(req, sqe);
}

static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_splice *sp = &req->splice;
	struct file *in = sp->file_in;
	struct file *out = sp->file_out;
	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
	loff_t *poff_in, *poff_out;
	long ret = 0;

	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;

	if (sp->len)
		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);

	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
		io_put_file(in);
	req->flags &= ~REQ_F_NEED_CLEANUP;

	if (ret != sp->len)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

/*
 * IORING_OP_NOP just posts a completion event, nothing else.
 */
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	__io_req_complete(req, issue_flags, 0, 0);
	return 0;
}

static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (!req->file)
		return -EBADF;

	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
		     sqe->splice_fd_in))
		return -EINVAL;

	req->sync.flags = READ_ONCE(sqe->fsync_flags);
	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
		return -EINVAL;

	req->sync.off = READ_ONCE(sqe->off);
	req->sync.len = READ_ONCE(sqe->len);
	return 0;
}

static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
{
	loff_t end = req->sync.off + req->sync.len;
	int ret;

	/* fsync always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = vfs_fsync_range(req->file, req->sync.off,
			      end > 0 ? end : LLONG_MAX,
			      req->sync.flags & IORING_FSYNC_DATASYNC);
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int io_fallocate_prep(struct io_kiocb *req,
			     const struct io_uring_sqe *sqe)
{
	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
	    sqe->splice_fd_in)
		return -EINVAL;
	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	req->sync.off = READ_ONCE(sqe->off);
	req->sync.len = READ_ONCE(sqe->addr);
	req->sync.mode = READ_ONCE(sqe->len);
	return 0;
}

static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	/* fallocate always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;
	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
			    req->sync.len);
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	const char __user *fname;
	int ret;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(sqe->ioprio || sqe->buf_index))
		return -EINVAL;
	if (unlikely(req->flags & REQ_F_FIXED_FILE))
		return -EBADF;

	/* open.how should be already initialised */
	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
		req->open.how.flags |= O_LARGEFILE;

	req->open.dfd = READ_ONCE(sqe->fd);
	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
	req->open.filename = getname(fname);
	if (IS_ERR(req->open.filename)) {
		ret = PTR_ERR(req->open.filename);
		req->open.filename = NULL;
		return ret;
	}

	req->open.file_slot = READ_ONCE(sqe->file_index);
	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
		return -EINVAL;

	req->open.nofile = rlimit(RLIMIT_NOFILE);
	req->flags |= REQ_F_NEED_CLEANUP;
	return 0;
}

static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	u64 mode = READ_ONCE(sqe->len);
	u64 flags = READ_ONCE(sqe->open_flags);

	req->open.how = build_open_how(flags, mode);
	return __io_openat_prep(req, sqe);
}

static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct open_how __user *how;
	size_t len;
	int ret;

	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	len = READ_ONCE(sqe->len);
	if (len < OPEN_HOW_SIZE_VER0)
		return -EINVAL;

	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
				    len);
	if (ret)
		return ret;

	return __io_openat_prep(req, sqe);
}

static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
	struct open_flags op;
	struct file *file;
	bool resolve_nonblock, nonblock_set;
	bool fixed = !!req->open.file_slot;
	int ret;

	ret = build_open_flags(&req->open.how, &op);
	if (ret)
		goto err;
	nonblock_set = op.open_flag & O_NONBLOCK;
	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
	if (issue_flags & IO_URING_F_NONBLOCK) {
		/*
		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
		 * it'll always -EAGAIN
		 */
		if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
			return -EAGAIN;
		op.lookup_flags |= LOOKUP_CACHED;
		op.open_flag |= O_NONBLOCK;
	}

	if (!fixed) {
		ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
		if (ret < 0)
			goto err;
	}

	file = do_filp_open(req->open.dfd, req->open.filename, &op);
	if (IS_ERR(file)) {
		/*
		 * We could hang on to this 'fd' on retrying, but seems like
		 * marginal gain for something that is now known to be a slower
		 * path. So just put it, and we'll get a new one when we retry.
		 */
		if (!fixed)
			put_unused_fd(ret);

		ret = PTR_ERR(file);
		/* only retry if RESOLVE_CACHED wasn't already set by application */
		if (ret == -EAGAIN &&
		    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
			return -EAGAIN;
		goto err;
	}

	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
		file->f_flags &= ~O_NONBLOCK;
	fsnotify_open(file);

	if (!fixed)
		fd_install(ret, file);
	else
		ret = io_install_fixed_file(req, file, issue_flags,
					    req->open.file_slot - 1);
err:
	putname(req->open.filename);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < 0)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}

static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
{
	return io_openat2(req, issue_flags);
}

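/*
 * IORING_OP_REMOVE_BUFFERS takes the buffer group ID in sqe->buf_group and
 * the count of buffers to remove in sqe->fd (capped at USHRT_MAX); the
 * completion result is how many buffers were actually freed, which may be
 * fewer than requested.
 */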
static int io_remove_buffers_prep(struct io_kiocb *req,
				  const struct io_uring_sqe *sqe)
{
	struct io_provide_buf *p = &req->pbuf;
	u64 tmp;

	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
	    sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -EINVAL;

	memset(p, 0, sizeof(*p));
	p->nbufs = tmp;
	p->bgid = READ_ONCE(sqe->buf_group);
	return 0;
}

static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
			       int bgid, unsigned nbufs)
{
	unsigned i = 0;

	/* shouldn't happen */
	if (!nbufs)
		return 0;

	/* the head kbuf is the list itself */
	while (!list_empty(&buf->list)) {
		struct io_buffer *nxt;

		nxt = list_first_entry(&buf->list, struct io_buffer, list);
		list_del(&nxt->list);
		kfree(nxt);
		if (++i == nbufs)
			return i;
		cond_resched();
	}
	i++;
	kfree(buf);
	xa_erase(&ctx->io_buffers, bgid);

	return i;
}

static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_provide_buf *p = &req->pbuf;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_buffer *head;
	int ret = 0;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;

	io_ring_submit_lock(ctx, needs_lock);

	lockdep_assert_held(&ctx->uring_lock);

	ret = -ENOENT;
	head = xa_load(&ctx->io_buffers, p->bgid);
	if (head)
		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
	if (ret < 0)
		req_set_fail(req);

	/* complete before unlock, IOPOLL may need the lock */
	__io_req_complete(req, issue_flags, ret, 0);
	io_ring_submit_unlock(ctx, needs_lock);
	return 0;
}

static int io_provide_buffers_prep(struct io_kiocb *req,
				   const struct io_uring_sqe *sqe)
{
	unsigned long size, tmp_check;
	struct io_provide_buf *p = &req->pbuf;
	u64 tmp;

	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	tmp = READ_ONCE(sqe->fd);
	if (!tmp || tmp > USHRT_MAX)
		return -E2BIG;
	p->nbufs = tmp;
	p->addr = READ_ONCE(sqe->addr);
	p->len = READ_ONCE(sqe->len);

	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
			       &size))
		return -EOVERFLOW;
	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
		return -EOVERFLOW;

	size = (unsigned long)p->len * p->nbufs;
	if (!access_ok(u64_to_user_ptr(p->addr), size))
		return -EFAULT;

	p->bgid = READ_ONCE(sqe->buf_group);
	tmp = READ_ONCE(sqe->off);
	if (tmp > USHRT_MAX)
		return -E2BIG;
	p->bid = tmp;
	return 0;
}

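/*
 * io_add_buffers() below carves one contiguous user range into nbufs
 * buffers of len bytes each, with consecutive buffer IDs. E.g. addr=0x1000,
 * len=0x100, nbufs=4, bid=0 yields buffers (0x1000, bid 0), (0x1100, bid 1),
 * (0x1200, bid 2) and (0x1300, bid 3). A partial allocation failure still
 * reports the number of buffers added so far.
 */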
4442static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4443{
4444 struct io_buffer *buf;
4445 u64 addr = pbuf->addr;
4446 int i, bid = pbuf->bid;
4447
4448 for (i = 0; i < pbuf->nbufs; i++) {
4449 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
4450 if (!buf)
4451 break;
4452
4453 buf->addr = addr;
4454 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4455 buf->bid = bid;
4456 addr += pbuf->len;
4457 bid++;
4458 if (!*head) {
4459 INIT_LIST_HEAD(&buf->list);
4460 *head = buf;
4461 } else {
4462 list_add_tail(&buf->list, &(*head)->list);
4463 }
4464 }
4465
4466 return i ? i : -ENOMEM;
4467}
4468
4469static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4470{
4471 struct io_provide_buf *p = &req->pbuf;
4472 struct io_ring_ctx *ctx = req->ctx;
4473 struct io_buffer *head, *list;
4474 int ret = 0;
4475 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
4476
4477 io_ring_submit_lock(ctx, needs_lock);
4478
4479 lockdep_assert_held(&ctx->uring_lock);
4480
4481 list = head = xa_load(&ctx->io_buffers, p->bgid);
4482
4483 ret = io_add_buffers(p, &head);
4484 if (ret >= 0 && !list) {
4485 ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4486 if (ret < 0)
4487 __io_remove_buffers(ctx, head, p->bgid, -1U);
4488 }
4489 if (ret < 0)
4490 req_set_fail(req);
4491
4492 __io_req_complete(req, issue_flags, ret, 0);
4493 io_ring_submit_unlock(ctx, needs_lock);
4494 return 0;
4495}
4496
4497static int io_epoll_ctl_prep(struct io_kiocb *req,
4498 const struct io_uring_sqe *sqe)
4499{
4500#if defined(CONFIG_EPOLL)
4501 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4502 return -EINVAL;
4503 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4504 return -EINVAL;
4505
4506 req->epoll.epfd = READ_ONCE(sqe->fd);
4507 req->epoll.op = READ_ONCE(sqe->len);
4508 req->epoll.fd = READ_ONCE(sqe->off);
4509
4510 if (ep_op_has_event(req->epoll.op)) {
4511 struct epoll_event __user *ev;
4512
4513 ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4514 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4515 return -EFAULT;
4516 }
4517
4518 return 0;
4519#else
4520 return -EOPNOTSUPP;
4521#endif
4522}
4523
4524static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4525{
4526#if defined(CONFIG_EPOLL)
4527 struct io_epoll *ie = &req->epoll;
4528 int ret;
4529 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4530
4531 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4532 if (force_nonblock && ret == -EAGAIN)
4533 return -EAGAIN;
4534
4535 if (ret < 0)
4536 req_set_fail(req);
4537 __io_req_complete(req, issue_flags, ret, 0);
4538 return 0;
4539#else
4540 return -EOPNOTSUPP;
4541#endif
4542}
4543
4544static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4545{
4546#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4547 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4548 return -EINVAL;
4549 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4550 return -EINVAL;
4551
4552 req->madvise.addr = READ_ONCE(sqe->addr);
4553 req->madvise.len = READ_ONCE(sqe->len);
4554 req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4555 return 0;
4556#else
4557 return -EOPNOTSUPP;
4558#endif
4559}
4560
4561static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4562{
4563#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4564 struct io_madvise *ma = &req->madvise;
4565 int ret;
4566
4567 if (issue_flags & IO_URING_F_NONBLOCK)
4568 return -EAGAIN;
4569
4570 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4571 if (ret < 0)
4572 req_set_fail(req);
4573 io_req_complete(req, ret);
4574 return 0;
4575#else
4576 return -EOPNOTSUPP;
4577#endif
4578}
4579
4580static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4581{
4582 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4583 return -EINVAL;
4584 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4585 return -EINVAL;
4586
4587 req->fadvise.offset = READ_ONCE(sqe->off);
4588 req->fadvise.len = READ_ONCE(sqe->len);
4589 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4590 return 0;
4591}
4592
4593static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4594{
4595 struct io_fadvise *fa = &req->fadvise;
4596 int ret;
4597
4598 if (issue_flags & IO_URING_F_NONBLOCK) {
4599 switch (fa->advice) {
4600 case POSIX_FADV_NORMAL:
4601 case POSIX_FADV_RANDOM:
4602 case POSIX_FADV_SEQUENTIAL:
4603 break;
4604 default:
4605 return -EAGAIN;
4606 }
4607 }
4608
4609 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4610 if (ret < 0)
4611 req_set_fail(req);
4612 __io_req_complete(req, issue_flags, ret, 0);
4613 return 0;
4614}
4615
4616static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4617{
4618 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4619 return -EINVAL;
4620 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4621 return -EINVAL;
4622 if (req->flags & REQ_F_FIXED_FILE)
4623 return -EBADF;
4624
4625 req->statx.dfd = READ_ONCE(sqe->fd);
4626 req->statx.mask = READ_ONCE(sqe->len);
4627 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4628 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4629 req->statx.flags = READ_ONCE(sqe->statx_flags);
4630
4631 return 0;
4632}
4633
4634static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4635{
4636 struct io_statx *ctx = &req->statx;
4637 int ret;
4638
4639 if (issue_flags & IO_URING_F_NONBLOCK)
4640 return -EAGAIN;
4641
4642 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4643 ctx->buffer);
4644
4645 if (ret < 0)
4646 req_set_fail(req);
4647 io_req_complete(req, ret);
4648 return 0;
4649}

static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
	    sqe->rw_flags || sqe->buf_index)
		return -EINVAL;
	if (req->flags & REQ_F_FIXED_FILE)
		return -EBADF;

	req->close.fd = READ_ONCE(sqe->fd);
	req->close.file_slot = READ_ONCE(sqe->file_index);
	if (req->close.file_slot && req->close.fd)
		return -EINVAL;

	return 0;
}

static int io_close(struct io_kiocb *req, unsigned int issue_flags)
{
	struct files_struct *files = current->files;
	struct io_close *close = &req->close;
	struct fdtable *fdt;
	struct file *file = NULL;
	int ret = -EBADF;

	if (req->close.file_slot) {
		ret = io_close_fixed(req, issue_flags);
		goto err;
	}

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (close->fd >= fdt->max_fds) {
		spin_unlock(&files->file_lock);
		goto err;
	}
	file = fdt->fd[close->fd];
	if (!file || file->f_op == &io_uring_fops) {
		spin_unlock(&files->file_lock);
		file = NULL;
		goto err;
	}

	/* if the file has a flush method, be safe and punt to async */
	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
		spin_unlock(&files->file_lock);
		return -EAGAIN;
	}

	ret = __close_fd_get_file(close->fd, &file);
	spin_unlock(&files->file_lock);
	if (ret < 0) {
		if (ret == -ENOENT)
			ret = -EBADF;
		goto err;
	}

	/* No ->flush() or already async, safely close from here */
	ret = filp_close(file, current->files);
err:
	if (ret < 0)
		req_set_fail(req);
	if (file)
		fput(file);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}

static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_ring_ctx *ctx = req->ctx;

	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
		     sqe->splice_fd_in))
		return -EINVAL;

	req->sync.off = READ_ONCE(sqe->off);
	req->sync.len = READ_ONCE(sqe->len);
	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
	return 0;
}

static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret;

	/* sync_file_range always requires a blocking context */
	if (issue_flags & IO_URING_F_NONBLOCK)
		return -EAGAIN;

	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
			      req->sync.flags);
	if (ret < 0)
		req_set_fail(req);
	io_req_complete(req, ret);
	return 0;
}

#if defined(CONFIG_NET)
static int io_setup_async_msg(struct io_kiocb *req,
			      struct io_async_msghdr *kmsg)
{
	struct io_async_msghdr *async_msg = req->async_data;

	if (async_msg)
		return -EAGAIN;
	if (io_alloc_async_data(req)) {
		kfree(kmsg->free_iov);
		return -ENOMEM;
	}
	async_msg = req->async_data;
	req->flags |= REQ_F_NEED_CLEANUP;
	memcpy(async_msg, kmsg, sizeof(*kmsg));
	async_msg->msg.msg_name = &async_msg->addr;
	/* if were using fast_iov, set it to the new one */
	if (!async_msg->free_iov)
		async_msg->msg.msg_iter.iov = async_msg->fast_iov;

	return -EAGAIN;
}
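
/*
 * The function above is the common io_uring "go async" dance: the
 * first, non-blocking issue builds the msghdr on the stack; if the
 * socket returns -EAGAIN, the stack copy is snapshotted into
 * req->async_data so that a later retry from io-wq or task_work sees
 * the same iterator state. msg_name and the fast_iov pointer must be
 * re-pointed at the copy, since they would otherwise still reference
 * the on-stack original that is about to go away.
 */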

static int io_sendmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->free_iov = iomsg->fast_iov;
	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
				   req->sr_msg.msg_flags, &iomsg->free_iov);
}

static int io_sendmsg_prep_async(struct io_kiocb *req)
{
	int ret;

	ret = io_sendmsg_copy_hdr(req, req->async_data);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = &req->sr_msg;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	return 0;
}

static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	flags = req->sr_msg.msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
		return io_setup_async_msg(req, kmsg);
	if (ret == -ERESTARTSYS)
		ret = -EINTR;

	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov)
		kfree(kmsg->free_iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < min_ret)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
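
/*
 * Note on MSG_WAITALL above: min_ret is latched to the full iterator
 * size before the send, so a short transfer marks the request failed
 * (req_set_fail(), which also breaks any IOSQE_IO_LINK chain) even
 * though the CQE still carries the positive short byte count. E.g. a
 * 4096-byte WAITALL send that moves only 1024 bytes posts res = 1024
 * with the request flagged as failed.
 */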

static int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = &req->sr_msg;
	struct msghdr msg;
	struct iovec iov;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
	if (unlikely(ret))
		return ret;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;

	flags = req->sr_msg.msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &msg);
	if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
		return -EAGAIN;
	if (ret == -ERESTARTSYS)
		ret = -EINTR;

	if (ret < min_ret)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}

static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = &req->sr_msg;
	struct iovec __user *uiov;
	size_t iov_len;
	int ret;

	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
				      &iomsg->uaddr, &uiov, &iov_len);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (iov_len > 1)
			return -EINVAL;
		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
			return -EFAULT;
		sr->len = iomsg->fast_iov[0].iov_len;
		iomsg->free_iov = NULL;
	} else {
		iomsg->free_iov = iomsg->fast_iov;
		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
				     &iomsg->free_iov, &iomsg->msg.msg_iter,
				     false);
		if (ret > 0)
			ret = 0;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
					struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = &req->sr_msg;
	struct compat_iovec __user *uiov;
	compat_uptr_t ptr;
	compat_size_t len;
	int ret;

	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
				  &ptr, &len);
	if (ret)
		return ret;

	uiov = compat_ptr(ptr);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		compat_ssize_t clen;

		if (len > 1)
			return -EINVAL;
		if (!access_ok(uiov, sizeof(*uiov)))
			return -EFAULT;
		if (__get_user(clen, &uiov->iov_len))
			return -EFAULT;
		if (clen < 0)
			return -EINVAL;
		sr->len = clen;
		iomsg->free_iov = NULL;
	} else {
		iomsg->free_iov = iomsg->fast_iov;
		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
				     UIO_FASTIOV, &iomsg->free_iov,
				     &iomsg->msg.msg_iter, true);
		if (ret < 0)
			return ret;
	}

	return 0;
}
#endif

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	iomsg->msg.msg_name = &iomsg->addr;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return __io_compat_recvmsg_copy_hdr(req, iomsg);
#endif

	return __io_recvmsg_copy_hdr(req, iomsg);
}

static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
					       unsigned int issue_flags)
{
	struct io_sr_msg *sr = &req->sr_msg;

	return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
}

static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
{
	return io_put_kbuf(req, req->kbuf);
}

static int io_recvmsg_prep_async(struct io_kiocb *req)
{
	int ret;

	ret = io_recvmsg_copy_hdr(req, req->async_data);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = &req->sr_msg;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->bgid = READ_ONCE(sqe->buf_group);
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	return 0;
}

static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	struct io_buffer *kbuf;
	unsigned flags;
	int min_ret = 0;
	int ret, cflags = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_recvmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (req->flags & REQ_F_BUFFER_SELECT) {
		kbuf = io_recv_buffer_select(req, issue_flags);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
			      1, req->sr_msg.len);
	}

	flags = req->sr_msg.msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
				 kmsg->uaddr, flags);
	if (force_nonblock && ret == -EAGAIN)
		return io_setup_async_msg(req, kmsg);
	if (ret == -ERESTARTSYS)
		ret = -EINTR;

	if (req->flags & REQ_F_BUFFER_SELECTED)
		cflags = io_put_recv_kbuf(req);

	if (kmsg->free_iov)
		kfree(kmsg->free_iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, cflags);
	return 0;
}
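
/*
 * Example (a sketch, assuming liburing's helpers and an initialized
 * ring; BGID is an arbitrary buffer group id chosen by the caller):
 * with IOSQE_BUFFER_SELECT, the recv paths above pick a previously
 * provided buffer at issue time instead of using a caller-supplied
 * address, and report the chosen buffer ID in cqe->flags:
 *
 *	io_uring_prep_provide_buffers(sqe, pool, 4096, 8, BGID, 0);
 *	...
 *	io_uring_prep_recv(sqe, sockfd, NULL, 4096, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = BGID;
 *	...
 *	bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;
 */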

static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_buffer *kbuf;
	struct io_sr_msg *sr = &req->sr_msg;
	struct msghdr msg;
	void __user *buf = sr->buf;
	struct socket *sock;
	struct iovec iov;
	unsigned flags;
	int min_ret = 0;
	int ret, cflags = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		kbuf = io_recv_buffer_select(req, issue_flags);
		if (IS_ERR(kbuf))
			return PTR_ERR(kbuf);
		buf = u64_to_user_ptr(kbuf->addr);
	}

	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
	if (unlikely(ret))
		goto out_free;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	msg.msg_iocb = NULL;
	msg.msg_flags = 0;

	flags = req->sr_msg.msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	ret = sock_recvmsg(sock, &msg, flags);
	if (force_nonblock && ret == -EAGAIN)
		return -EAGAIN;
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out_free:
	if (req->flags & REQ_F_BUFFER_SELECTED)
		cflags = io_put_recv_kbuf(req);
	if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, cflags);
	return 0;
}

static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = &req->accept;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	return 0;
}
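
/*
 * When file_slot is non-zero, the accepted file never enters the
 * process fd table; it is installed directly into the ring's fixed
 * file table at slot (file_slot - 1) by io_accept() below. That is
 * also why SOCK_CLOEXEC is rejected for this mode: close-on-exec is a
 * property of an fd table entry, and a fixed file has none.
 */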

static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = &req->accept;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
	bool fixed = !!accept->file_slot;
	struct file *file;
	int ret, fd;

	if (req->file->f_flags & O_NONBLOCK)
		req->flags |= REQ_F_NOWAIT;

	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock)
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_install_fixed_file(req, file, issue_flags,
					    accept->file_slot - 1);
	}
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}

static int io_connect_prep_async(struct io_kiocb *req)
{
	struct io_async_connect *io = req->async_data;
	struct io_connect *conn = &req->connect;

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
}

static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = &req->connect;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
	    sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	return 0;
}

static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_connect __io, *io;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (req_has_async_data(req)) {
		io = req->async_data;
	} else {
		ret = move_addr_to_kernel(req->connect.addr,
					  req->connect.addr_len,
					  &__io.address);
		if (ret)
			goto out;
		io = &__io;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->address,
				 req->connect.addr_len, file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
		if (req_has_async_data(req))
			return -EAGAIN;
		if (io_alloc_async_data(req)) {
			ret = -ENOMEM;
			goto out;
		}
		memcpy(req->async_data, &__io, sizeof(__io));
		return -EAGAIN;
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}
#else
#define IO_NETOP_FN(op) \
static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \
{ \
	return -EOPNOTSUPP; \
}

#define IO_NETOP_PREP(op) \
IO_NETOP_FN(op) \
static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
{ \
	return -EOPNOTSUPP; \
} \

#define IO_NETOP_PREP_ASYNC(op) \
IO_NETOP_PREP(op) \
static int io_##op##_prep_async(struct io_kiocb *req) \
{ \
	return -EOPNOTSUPP; \
}

IO_NETOP_PREP_ASYNC(sendmsg);
IO_NETOP_PREP_ASYNC(recvmsg);
IO_NETOP_PREP_ASYNC(connect);
IO_NETOP_PREP(accept);
IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif

struct io_poll_table {
	struct poll_table_struct pt;
	struct io_kiocb *req;
	int nr_entries;
	int error;
};

static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
			   __poll_t mask, io_req_tw_func_t func)
{
	/* for instances that support it check for an event match first: */
	if (mask && !(mask & poll->events))
		return 0;

	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);

	list_del_init(&poll->wait.entry);

	req->result = mask;
	req->io_task_work.func = func;

	/*
	 * Queue the completion via task_work so it runs in the context of
	 * the submitting task rather than from this waitqueue callback,
	 * which may be invoked from interrupt context.
	 */
	io_req_task_work_add(req);
	return 1;
}
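
/*
 * Control-flow sketch of the poll wake path implemented above and
 * below (prose, not additional code):
 *
 *	file becomes ready
 *	  -> waitqueue callback (io_poll_wake()/io_async_wake()) fires
 *	    -> __io_async_wake() stashes the mask in req->result and
 *	       queues task_work
 *	      -> io_poll_task_func()/io_async_task_func() runs in task
 *	         context and posts the CQE or re-issues the request
 *
 * The two-stage split exists because the waitqueue callback may run in
 * IRQ context, where the CQ ring and most request state must not be
 * touched.
 */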

static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
	__acquires(&req->ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;

	/* req->task == current here, checking PF_EXITING is safe */
	if (unlikely(req->task->flags & PF_EXITING))
		WRITE_ONCE(poll->canceled, true);

	if (!req->result && !READ_ONCE(poll->canceled)) {
		struct poll_table_struct pt = { ._key = poll->events };

		req->result = vfs_poll(req->file, &pt) & poll->events;
	}

	spin_lock(&ctx->completion_lock);
	if (!req->result && !READ_ONCE(poll->canceled)) {
		add_wait_queue(poll->head, &poll->wait);
		return true;
	}

	return false;
}

static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
{
	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
	if (req->opcode == IORING_OP_POLL_ADD)
		return req->async_data;
	return req->apoll->double_poll;
}

static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
{
	if (req->opcode == IORING_OP_POLL_ADD)
		return &req->poll;
	return &req->apoll->poll;
}

static void io_poll_remove_double(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_poll_iocb *poll = io_poll_get_double(req);

	lockdep_assert_held(&req->ctx->completion_lock);

	if (poll && poll->head) {
		struct wait_queue_head *head = poll->head;

		spin_lock_irq(&head->lock);
		list_del_init(&poll->wait.entry);
		if (poll->wait.private)
			req_ref_put(req);
		poll->head = NULL;
		spin_unlock_irq(&head->lock);
	}
}

static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
	__must_hold(&req->ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;
	unsigned flags = IORING_CQE_F_MORE;
	int error;

	if (READ_ONCE(req->poll.canceled)) {
		error = -ECANCELED;
		req->poll.events |= EPOLLONESHOT;
	} else {
		error = mangle_poll(mask);
	}
	if (req->poll.events & EPOLLONESHOT)
		flags = 0;
	if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
		req->poll.events |= EPOLLONESHOT;
		flags = 0;
	}
	if (flags & IORING_CQE_F_MORE)
		ctx->cq_extra++;

	return !(flags & IORING_CQE_F_MORE);
}

static void io_poll_task_func(struct io_kiocb *req, bool *locked)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *nxt;

	if (io_poll_rewait(req, &req->poll)) {
		spin_unlock(&ctx->completion_lock);
	} else {
		bool done;

		if (req->poll.done) {
			spin_unlock(&ctx->completion_lock);
			return;
		}
		done = __io_poll_complete(req, req->result);
		if (done) {
			io_poll_remove_double(req);
			hash_del(&req->hash_node);
			req->poll.done = true;
		} else {
			req->result = 0;
			add_wait_queue(req->poll.head, &req->poll.wait);
		}
		io_commit_cqring(ctx);
		spin_unlock(&ctx->completion_lock);
		io_cqring_ev_posted(ctx);

		if (done) {
			nxt = io_put_req_find_next(req);
			if (nxt)
				io_req_task_submit(nxt, locked);
		}
	}
}

static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
			       int sync, void *key)
{
	struct io_kiocb *req = wait->private;
	struct io_poll_iocb *poll = io_poll_get_single(req);
	__poll_t mask = key_to_poll(key);
	unsigned long flags;

	/* for instances that support it check for an event match first: */
	if (mask && !(mask & poll->events))
		return 0;
	if (!(poll->events & EPOLLONESHOT))
		return poll->wait.func(&poll->wait, mode, sync, key);

	list_del_init(&wait->entry);

	if (poll->head) {
		bool done;

		spin_lock_irqsave(&poll->head->lock, flags);
		done = list_empty(&poll->wait.entry);
		if (!done)
			list_del_init(&poll->wait.entry);
		/* make sure double remove sees this as being gone */
		wait->private = NULL;
		spin_unlock_irqrestore(&poll->head->lock, flags);
		if (!done) {
			/* use wait func handler, so it matches the rq type */
			poll->wait.func(&poll->wait, mode, sync, key);
		}
	}
	req_ref_put(req);
	return 1;
}

static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
			      wait_queue_func_t wake_func)
{
	poll->head = NULL;
	poll->done = false;
	poll->canceled = false;
#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
	/* mask in events that we always want/need */
	poll->events = events | IO_POLL_UNMASK;
	INIT_LIST_HEAD(&poll->wait.entry);
	init_waitqueue_func_entry(&poll->wait, wake_func);
}

static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
			    struct wait_queue_head *head,
			    struct io_poll_iocb **poll_ptr)
{
	struct io_kiocb *req = pt->req;

	/*
	 * The file being polled uses multiple waitqueues for poll handling
	 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
	 * if this happens.
	 */
	if (unlikely(pt->nr_entries)) {
		struct io_poll_iocb *poll_one = poll;

		/* double add on the same waitqueue head, ignore */
		if (poll_one->head == head)
			return;
		/* already have a 2nd entry, fail a third attempt */
		if (*poll_ptr) {
			if ((*poll_ptr)->head == head)
				return;
			pt->error = -EINVAL;
			return;
		}
		/*
		 * Can't handle multishot for double wait for now, turn it
		 * into one-shot mode.
		 */
		if (!(poll_one->events & EPOLLONESHOT))
			poll_one->events |= EPOLLONESHOT;
		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
		if (!poll) {
			pt->error = -ENOMEM;
			return;
		}
		io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
		req_ref_get(req);
		poll->wait.private = req;

		*poll_ptr = poll;
		if (req->opcode == IORING_OP_POLL_ADD)
			req->flags |= REQ_F_ASYNC_DATA;
	}

	pt->nr_entries++;
	poll->head = head;

	if (poll->events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(head, &poll->wait);
	else
		add_wait_queue(head, &poll->wait);
}
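
/*
 * A single file can register the poll entry on more than one waitqueue,
 * e.g. separate queues for read and write readiness. The first queue
 * uses the io_poll_iocb embedded in the request; the second gets a
 * dynamically allocated copy (GFP_ATOMIC, since ->poll() may be called
 * with locks held and from non-sleeping context). A third distinct
 * queue is rejected with -EINVAL above.
 */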

static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
				struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
	struct async_poll *apoll = pt->req->apoll;

	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
}

static void io_async_task_func(struct io_kiocb *req, bool *locked)
{
	struct async_poll *apoll = req->apoll;
	struct io_ring_ctx *ctx = req->ctx;

	trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);

	if (io_poll_rewait(req, &apoll->poll)) {
		spin_unlock(&ctx->completion_lock);
		return;
	}

	hash_del(&req->hash_node);
	io_poll_remove_double(req);
	apoll->poll.done = true;
	spin_unlock(&ctx->completion_lock);

	if (!READ_ONCE(apoll->poll.canceled))
		io_req_task_submit(req, locked);
	else
		io_req_complete_failed(req, -ECANCELED);
}

static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			 void *key)
{
	struct io_kiocb *req = wait->private;
	struct io_poll_iocb *poll = &req->apoll->poll;

	trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
				 key_to_poll(key));

	return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
}

static void io_poll_req_insert(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct hlist_head *list;

	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
	hlist_add_head(&req->hash_node, list);
}

static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
				      struct io_poll_iocb *poll,
				      struct io_poll_table *ipt, __poll_t mask,
				      wait_queue_func_t wake_func)
	__acquires(&ctx->completion_lock)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool cancel = false;

	INIT_HLIST_NODE(&req->hash_node);
	io_init_poll_iocb(poll, mask, wake_func);
	poll->file = req->file;
	poll->wait.private = req;

	ipt->pt._key = mask;
	ipt->req = req;
	ipt->error = 0;
	ipt->nr_entries = 0;

	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
	if (unlikely(!ipt->nr_entries) && !ipt->error)
		ipt->error = -EINVAL;

	spin_lock(&ctx->completion_lock);
	if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
		io_poll_remove_double(req);
	if (likely(poll->head)) {
		spin_lock_irq(&poll->head->lock);
		if (unlikely(list_empty(&poll->wait.entry))) {
			if (ipt->error)
				cancel = true;
			ipt->error = 0;
			mask = 0;
		}
		if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
			list_del_init(&poll->wait.entry);
		else if (cancel)
			WRITE_ONCE(poll->canceled, true);
		else if (!poll->done)
			io_poll_req_insert(req);
		spin_unlock_irq(&poll->head->lock);
	}

	return mask;
}

enum {
	IO_APOLL_OK,
	IO_APOLL_ABORTED,
	IO_APOLL_READY
};

static int io_arm_poll_handler(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;
	struct async_poll *apoll;
	struct io_poll_table ipt;
	__poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;

	if (!def->pollin && !def->pollout)
		return IO_APOLL_ABORTED;
	if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
		return IO_APOLL_ABORTED;

	if (def->pollin) {
		mask |= POLLIN | POLLRDNORM;

		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
		if ((req->opcode == IORING_OP_RECVMSG) &&
		    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
			mask &= ~POLLIN;
	} else {
		mask |= POLLOUT | POLLWRNORM;
	}

	apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
	if (unlikely(!apoll))
		return IO_APOLL_ABORTED;
	apoll->double_poll = NULL;
	req->apoll = apoll;
	req->flags |= REQ_F_POLLED;
	ipt.pt._qproc = io_async_queue_proc;
	io_req_set_refcount(req);

	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
				    io_async_wake);
	spin_unlock(&ctx->completion_lock);
	if (ret || ipt.error)
		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;

	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
				mask, apoll->poll.events);
	return IO_APOLL_OK;
}
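
/*
 * "Fast poll" in a nutshell: when a request such as a recv would block,
 * rather than handing it to an io-wq worker thread, io_uring arms a
 * one-shot internal poll on the file. The eventual wakeup re-issues the
 * request from task_work, so the common readiness-driven case never
 * ties up a worker. IO_APOLL_OK means the poll is armed, IO_APOLL_READY
 * means the file was already ready (retry immediately), and
 * IO_APOLL_ABORTED means poll can't be used and io-wq must take over.
 */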

static bool __io_poll_remove_one(struct io_kiocb *req,
				 struct io_poll_iocb *poll, bool do_cancel)
	__must_hold(&req->ctx->completion_lock)
{
	bool do_complete = false;

	if (!poll->head)
		return false;
	spin_lock_irq(&poll->head->lock);
	if (do_cancel)
		WRITE_ONCE(poll->canceled, true);
	if (!list_empty(&poll->wait.entry)) {
		list_del_init(&poll->wait.entry);
		do_complete = true;
	}
	spin_unlock_irq(&poll->head->lock);
	hash_del(&req->hash_node);
	return do_complete;
}

static bool io_poll_remove_one(struct io_kiocb *req)
	__must_hold(&req->ctx->completion_lock)
{
	bool do_complete;

	io_poll_remove_double(req);
	do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);

	if (do_complete) {
		io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
		io_commit_cqring(req->ctx);
		req_set_fail(req);
		io_put_req_deferred(req);
	}
	return do_complete;
}

/*
 * Returns true if we found and killed one or more poll requests
 */
static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
				      struct task_struct *tsk, bool cancel_all)
{
	struct hlist_node *tmp;
	struct io_kiocb *req;
	int posted = 0, i;

	spin_lock(&ctx->completion_lock);
	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
		struct hlist_head *list;

		list = &ctx->cancel_hash[i];
		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
			if (io_match_task_safe(req, tsk, cancel_all))
				posted += io_poll_remove_one(req);
		}
	}
	spin_unlock(&ctx->completion_lock);

	if (posted)
		io_cqring_ev_posted(ctx);

	return posted != 0;
}

static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
				     bool poll_only)
	__must_hold(&ctx->completion_lock)
{
	struct hlist_head *list;
	struct io_kiocb *req;

	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
	hlist_for_each_entry(req, list, hash_node) {
		if (sqe_addr != req->user_data)
			continue;
		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
			continue;
		return req;
	}
	return NULL;
}

static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
			  bool poll_only)
	__must_hold(&ctx->completion_lock)
{
	struct io_kiocb *req;

	req = io_poll_find(ctx, sqe_addr, poll_only);
	if (!req)
		return -ENOENT;
	if (io_poll_remove_one(req))
		return 0;

	return -EALREADY;
}

static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
				     unsigned int flags)
{
	u32 events;

	events = READ_ONCE(sqe->poll32_events);
#ifdef __BIG_ENDIAN
	events = swahw32(events);
#endif
	if (!(flags & IORING_POLL_ADD_MULTI))
		events |= EPOLLONESHOT;
	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
}
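
/*
 * sqe->poll32_events carries userspace POLL* bits, which
 * demangle_poll() converts to in-kernel EPOLL* values; the halfword
 * swap on big-endian keeps compatibility with the original 16-bit
 * poll_events field, which occupied the low half of the same word.
 * For example, a multishot POLLIN poll arrives as poll32_events =
 * POLLIN with IORING_POLL_ADD_MULTI in sqe->len, and leaves here as
 * EPOLLIN without EPOLLONESHOT.
 */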

static int io_poll_update_prep(struct io_kiocb *req,
			       const struct io_uring_sqe *sqe)
{
	struct io_poll_update *upd = &req->poll_update;
	u32 flags;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
		      IORING_POLL_ADD_MULTI))
		return -EINVAL;
	/* meaningless without update */
	if (flags == IORING_POLL_ADD_MULTI)
		return -EINVAL;

	upd->old_user_data = READ_ONCE(sqe->addr);
	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;

	upd->new_user_data = READ_ONCE(sqe->off);
	if (!upd->update_user_data && upd->new_user_data)
		return -EINVAL;
	if (upd->update_events)
		upd->events = io_poll_parse_events(sqe, flags);
	else if (sqe->poll32_events)
		return -EINVAL;

	return 0;
}

static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
			void *key)
{
	struct io_kiocb *req = wait->private;
	struct io_poll_iocb *poll = &req->poll;

	return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
}

static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
			       struct poll_table_struct *p)
{
	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);

	__io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
}

static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_poll_iocb *poll = &req->poll;
	u32 flags;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
		return -EINVAL;
	flags = READ_ONCE(sqe->len);
	if (flags & ~IORING_POLL_ADD_MULTI)
		return -EINVAL;

	io_req_set_refcount(req);
	poll->events = io_poll_parse_events(sqe, flags);
	return 0;
}

static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_poll_iocb *poll = &req->poll;
	struct io_ring_ctx *ctx = req->ctx;
	struct io_poll_table ipt;
	__poll_t mask;
	bool done;

	ipt.pt._qproc = io_poll_queue_proc;

	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
				     io_poll_wake);

	if (mask) { /* no async, we'd stolen it */
		ipt.error = 0;
		done = __io_poll_complete(req, mask);
		io_commit_cqring(req->ctx);
	}
	spin_unlock(&ctx->completion_lock);

	if (mask) {
		io_cqring_ev_posted(ctx);
		if (done)
			io_put_req(req);
	}
	return ipt.error;
}

static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *preq;
	bool completing;
	int ret;

	spin_lock(&ctx->completion_lock);
	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
	if (!preq) {
		ret = -ENOENT;
		goto err;
	}

	if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
		completing = true;
		ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
		goto err;
	}

	/*
	 * Don't allow racy completion with singleshot, as we cannot safely
	 * update those. For multishot, if we're racing with completion, just
	 * let completion re-add it.
	 */
	completing = !__io_poll_remove_one(preq, &preq->poll, false);
	if (completing && (preq->poll.events & EPOLLONESHOT)) {
		ret = -EALREADY;
		goto err;
	}

	ret = 0;
err:
	if (ret < 0) {
		spin_unlock(&ctx->completion_lock);
		req_set_fail(req);
		io_req_complete(req, ret);
		return 0;
	}

	if (req->poll_update.update_events) {
		preq->poll.events &= ~0xffff;
		preq->poll.events |= req->poll_update.events & 0xffff;
		preq->poll.events |= IO_POLL_UNMASK;
	}
	if (req->poll_update.update_user_data)
		preq->user_data = req->poll_update.new_user_data;
	spin_unlock(&ctx->completion_lock);

	/* complete update request, we're done with it */
	io_req_complete(req, ret);

	if (!completing) {
		ret = io_poll_add(preq, issue_flags);
		if (ret < 0) {
			req_set_fail(preq);
			io_req_complete(preq, ret);
		}
	}
	return 0;
}
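
/*
 * Example (a sketch using raw SQE fields rather than any particular
 * helper; the user_data values are arbitrary): to retarget an existing
 * multishot poll identified by user_data 0x1234 onto write readiness
 * and give it a new user_data, userspace would submit an
 * IORING_OP_POLL_REMOVE with:
 *
 *	sqe->addr = 0x1234;	(old user_data)
 *	sqe->off  = 0x5678;	(new user_data)
 *	sqe->len  = IORING_POLL_UPDATE_EVENTS |
 *		    IORING_POLL_UPDATE_USER_DATA | IORING_POLL_ADD_MULTI;
 *	sqe->poll32_events = POLLOUT;
 */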

static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
{
	struct io_timeout_data *data = req->async_data;

	if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
		req_set_fail(req);
	io_req_complete_post(req, -ETIME, 0);
}

static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
{
	struct io_timeout_data *data = container_of(timer,
						struct io_timeout_data, timer);
	struct io_kiocb *req = data->req;
	struct io_ring_ctx *ctx = req->ctx;
	unsigned long flags;

	spin_lock_irqsave(&ctx->timeout_lock, flags);
	list_del_init(&req->timeout.list);
	atomic_set(&req->ctx->cq_timeouts,
		   atomic_read(&req->ctx->cq_timeouts) + 1);
	spin_unlock_irqrestore(&ctx->timeout_lock, flags);

	req->io_task_work.func = io_req_task_timeout;
	io_req_task_work_add(req);
	return HRTIMER_NORESTART;
}

static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
					   __u64 user_data)
	__must_hold(&ctx->timeout_lock)
{
	struct io_timeout_data *io;
	struct io_kiocb *req;
	bool found = false;

	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
		found = user_data == req->user_data;
		if (found)
			break;
	}
	if (!found)
		return ERR_PTR(-ENOENT);

	io = req->async_data;
	if (hrtimer_try_to_cancel(&io->timer) == -1)
		return ERR_PTR(-EALREADY);
	list_del_init(&req->timeout.list);
	return req;
}

static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
	__must_hold(&ctx->completion_lock)
	__must_hold(&ctx->timeout_lock)
{
	struct io_kiocb *req = io_timeout_extract(ctx, user_data);

	if (IS_ERR(req))
		return PTR_ERR(req);

	req_set_fail(req);
	io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
	io_put_req_deferred(req);
	return 0;
}

static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
{
	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
	case IORING_TIMEOUT_BOOTTIME:
		return CLOCK_BOOTTIME;
	case IORING_TIMEOUT_REALTIME:
		return CLOCK_REALTIME;
	default:
		/* can't happen, vetted at prep time */
		WARN_ON_ONCE(1);
		fallthrough;
	case 0:
		return CLOCK_MONOTONIC;
	}
}

static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
				    struct timespec64 *ts, enum hrtimer_mode mode)
	__must_hold(&ctx->timeout_lock)
{
	struct io_timeout_data *io;
	struct io_kiocb *req;
	bool found = false;

	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
		found = user_data == req->user_data;
		if (found)
			break;
	}
	if (!found)
		return -ENOENT;

	io = req->async_data;
	if (hrtimer_try_to_cancel(&io->timer) == -1)
		return -EALREADY;
	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
	io->timer.function = io_link_timeout_fn;
	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
	return 0;
}

static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
			     struct timespec64 *ts, enum hrtimer_mode mode)
	__must_hold(&ctx->timeout_lock)
{
	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
	struct io_timeout_data *data;

	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout.off = 0; /* noseq */
	data = req->async_data;
	list_add_tail(&req->timeout.list, &ctx->timeout_list);
	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
	return 0;
}

static int io_timeout_remove_prep(struct io_kiocb *req,
				  const struct io_uring_sqe *sqe)
{
	struct io_timeout_rem *tr = &req->timeout_rem;

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
		return -EINVAL;

	tr->ltimeout = false;
	tr->addr = READ_ONCE(sqe->addr);
	tr->flags = READ_ONCE(sqe->timeout_flags);
	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
		if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
			return -EINVAL;
		if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
			tr->ltimeout = true;
		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
			return -EINVAL;
		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
			return -EFAULT;
	} else if (tr->flags) {
		/* timeout removal doesn't support flags */
		return -EINVAL;
	}

	return 0;
}

static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
{
	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
					    : HRTIMER_MODE_REL;
}

/*
 * Remove or update an existing timeout command
 */
static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_timeout_rem *tr = &req->timeout_rem;
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
		spin_lock(&ctx->completion_lock);
		spin_lock_irq(&ctx->timeout_lock);
		ret = io_timeout_cancel(ctx, tr->addr);
		spin_unlock_irq(&ctx->timeout_lock);
		spin_unlock(&ctx->completion_lock);
	} else {
		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);

		spin_lock_irq(&ctx->timeout_lock);
		if (tr->ltimeout)
			ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
		else
			ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
		spin_unlock_irq(&ctx->timeout_lock);
	}

	if (ret < 0)
		req_set_fail(req);
	io_req_complete_post(req, ret, 0);
	return 0;
}

static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
			   bool is_timeout_link)
{
	struct io_timeout_data *data;
	unsigned flags;
	u32 off = READ_ONCE(sqe->off);

	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
	    sqe->splice_fd_in)
		return -EINVAL;
	if (off && is_timeout_link)
		return -EINVAL;
	flags = READ_ONCE(sqe->timeout_flags);
	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
		      IORING_TIMEOUT_ETIME_SUCCESS))
		return -EINVAL;
	/* more than one clock specified is invalid, obviously */
	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
		return -EINVAL;

	INIT_LIST_HEAD(&req->timeout.list);
	req->timeout.off = off;
	if (unlikely(off && !req->ctx->off_timeout_used))
		req->ctx->off_timeout_used = true;

	if (WARN_ON_ONCE(req_has_async_data(req)))
		return -EFAULT;
	if (io_alloc_async_data(req))
		return -ENOMEM;

	data = req->async_data;
	data->req = req;
	data->flags = flags;

	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
		return -EFAULT;

	if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
		return -EINVAL;

	data->mode = io_translate_timeout_mode(flags);
	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);

	if (is_timeout_link) {
		struct io_submit_link *link = &req->ctx->submit_state.link;

		if (!link->head)
			return -EINVAL;
		if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
			return -EINVAL;
		req->timeout.head = link->last;
		link->last->flags |= REQ_F_ARM_LTIMEOUT;
	}
	return 0;
}

static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_timeout_data *data = req->async_data;
	struct list_head *entry;
	u32 tail, off = req->timeout.off;

	spin_lock_irq(&ctx->timeout_lock);

	/*
	 * sqe->off holds how many events that need to occur for this
	 * timeout event to be satisfied. If it isn't set, then this is
	 * a pure timeout request, sequence isn't used.
	 */
	if (io_is_timeout_noseq(req)) {
		entry = ctx->timeout_list.prev;
		goto add;
	}

	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
	req->timeout.target_seq = tail + off;

	/* Update the last seq here in case io_flush_timeouts() hasn't.
	 * This is safe because ->completion_lock is held, and submissions
	 * and completions are never mixed in the same ->completion_lock section.
	 */
	ctx->cq_last_tm_flush = tail;

	/*
	 * Insertion sort, ensuring the first entry in the list is always
	 * the one we need first.
	 */
	list_for_each_prev(entry, &ctx->timeout_list) {
		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
						  timeout.list);

		if (io_is_timeout_noseq(nxt))
			continue;
		/* nxt.seq is behind @tail, otherwise would've been completed */
		if (off >= nxt->timeout.target_seq - tail)
			break;
	}
add:
	list_add(&req->timeout.list, entry);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
	spin_unlock_irq(&ctx->timeout_lock);
	return 0;
}
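
/*
 * Worked example of the sequence math above: with cached_cq_tail = 100,
 * cq_timeouts = 2 and sqe->off = 8, tail is 98 and target_seq is 106,
 * i.e. the timeout fires after 8 more non-timeout completions (or when
 * the timer expires, whichever comes first). Subtracting in u32 space
 * (off >= nxt->timeout.target_seq - tail) keeps the comparison correct
 * even when the tail counter wraps around.
 */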

struct io_cancel_data {
	struct io_ring_ctx *ctx;
	u64 user_data;
};

static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct io_cancel_data *cd = data;

	return req->ctx == cd->ctx && req->user_data == cd->user_data;
}

static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
			       struct io_ring_ctx *ctx)
{
	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
	enum io_wq_cancel cancel_ret;
	int ret = 0;

	if (!tctx || !tctx->io_wq)
		return -ENOENT;

	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
	switch (cancel_ret) {
	case IO_WQ_CANCEL_OK:
		ret = 0;
		break;
	case IO_WQ_CANCEL_RUNNING:
		ret = -EALREADY;
		break;
	case IO_WQ_CANCEL_NOTFOUND:
		ret = -ENOENT;
		break;
	}

	return ret;
}

static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
{
	struct io_ring_ctx *ctx = req->ctx;
	int ret;

	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);

	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
	if (ret != -ENOENT)
		return ret;

	spin_lock(&ctx->completion_lock);
	spin_lock_irq(&ctx->timeout_lock);
	ret = io_timeout_cancel(ctx, sqe_addr);
	spin_unlock_irq(&ctx->timeout_lock);
	if (ret != -ENOENT)
		goto out;
	ret = io_poll_cancel(ctx, sqe_addr, false);
out:
	spin_unlock(&ctx->completion_lock);
	return ret;
}
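
/*
 * Cancellation probes the possible homes of a request in order: first
 * the io-wq queues of the owning task, then the timeout list, then the
 * poll cancel hash. -ENOENT from one stage simply falls through to the
 * next; -EALREADY means the request was found but is past the point of
 * no return and will complete on its own.
 */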

static int io_async_cancel_prep(struct io_kiocb *req,
				const struct io_uring_sqe *sqe)
{
	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
		return -EINVAL;
	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
	    sqe->splice_fd_in)
		return -EINVAL;

	req->cancel.addr = READ_ONCE(sqe->addr);
	return 0;
}

static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	u64 sqe_addr = req->cancel.addr;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
	struct io_tctx_node *node;
	int ret;

	ret = io_try_cancel_userdata(req, sqe_addr);
	if (ret != -ENOENT)
		goto done;

	/* slow path, try all io-wq's */
	io_ring_submit_lock(ctx, needs_lock);
	ret = -ENOENT;
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
		if (ret != -ENOENT)
			break;
	}
	io_ring_submit_unlock(ctx, needs_lock);
done:
	if (ret < 0)
		req_set_fail(req);
	io_req_complete_post(req, ret, 0);
	return 0;
}

static int io_rsrc_update_prep(struct io_kiocb *req,
			       const struct io_uring_sqe *sqe)
{
	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
		return -EINVAL;
	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	req->rsrc_update.offset = READ_ONCE(sqe->off);
	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
	if (!req->rsrc_update.nr_args)
		return -EINVAL;
	req->rsrc_update.arg = READ_ONCE(sqe->addr);
	return 0;
}

static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
	struct io_uring_rsrc_update2 up;
	int ret;

	up.offset = req->rsrc_update.offset;
	up.data = req->rsrc_update.arg;
	up.nr = 0;
	up.tags = 0;
	up.resv = 0;

	io_ring_submit_lock(ctx, needs_lock);
	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
					&up, req->rsrc_update.nr_args);
	io_ring_submit_unlock(ctx, needs_lock);

	if (ret < 0)
		req_set_fail(req);
	__io_req_complete(req, issue_flags, ret, 0);
	return 0;
}

static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	switch (req->opcode) {
	case IORING_OP_NOP:
		return 0;
	case IORING_OP_READV:
	case IORING_OP_READ_FIXED:
	case IORING_OP_READ:
		return io_read_prep(req, sqe);
	case IORING_OP_WRITEV:
	case IORING_OP_WRITE_FIXED:
	case IORING_OP_WRITE:
		return io_write_prep(req, sqe);
	case IORING_OP_POLL_ADD:
		return io_poll_add_prep(req, sqe);
	case IORING_OP_POLL_REMOVE:
		return io_poll_update_prep(req, sqe);
	case IORING_OP_FSYNC:
		return io_fsync_prep(req, sqe);
	case IORING_OP_SYNC_FILE_RANGE:
		return io_sfr_prep(req, sqe);
	case IORING_OP_SENDMSG:
	case IORING_OP_SEND:
		return io_sendmsg_prep(req, sqe);
	case IORING_OP_RECVMSG:
	case IORING_OP_RECV:
		return io_recvmsg_prep(req, sqe);
	case IORING_OP_CONNECT:
		return io_connect_prep(req, sqe);
	case IORING_OP_TIMEOUT:
		return io_timeout_prep(req, sqe, false);
	case IORING_OP_TIMEOUT_REMOVE:
		return io_timeout_remove_prep(req, sqe);
	case IORING_OP_ASYNC_CANCEL:
		return io_async_cancel_prep(req, sqe);
	case IORING_OP_LINK_TIMEOUT:
		return io_timeout_prep(req, sqe, true);
	case IORING_OP_ACCEPT:
		return io_accept_prep(req, sqe);
	case IORING_OP_FALLOCATE:
		return io_fallocate_prep(req, sqe);
	case IORING_OP_OPENAT:
		return io_openat_prep(req, sqe);
	case IORING_OP_CLOSE:
		return io_close_prep(req, sqe);
	case IORING_OP_FILES_UPDATE:
		return io_rsrc_update_prep(req, sqe);
	case IORING_OP_STATX:
		return io_statx_prep(req, sqe);
	case IORING_OP_FADVISE:
		return io_fadvise_prep(req, sqe);
	case IORING_OP_MADVISE:
		return io_madvise_prep(req, sqe);
	case IORING_OP_OPENAT2:
		return io_openat2_prep(req, sqe);
	case IORING_OP_EPOLL_CTL:
		return io_epoll_ctl_prep(req, sqe);
	case IORING_OP_SPLICE:
		return io_splice_prep(req, sqe);
	case IORING_OP_PROVIDE_BUFFERS:
		return io_provide_buffers_prep(req, sqe);
	case IORING_OP_REMOVE_BUFFERS:
		return io_remove_buffers_prep(req, sqe);
	case IORING_OP_TEE:
		return io_tee_prep(req, sqe);
	case IORING_OP_SHUTDOWN:
		return io_shutdown_prep(req, sqe);
	case IORING_OP_RENAMEAT:
		return io_renameat_prep(req, sqe);
	case IORING_OP_UNLINKAT:
		return io_unlinkat_prep(req, sqe);
	case IORING_OP_MKDIRAT:
		return io_mkdirat_prep(req, sqe);
	case IORING_OP_SYMLINKAT:
		return io_symlinkat_prep(req, sqe);
	case IORING_OP_LINKAT:
		return io_linkat_prep(req, sqe);
	}

	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
		    req->opcode);
	return -EINVAL;
}

static int io_req_prep_async(struct io_kiocb *req)
{
	if (!io_op_defs[req->opcode].needs_async_setup)
		return 0;
	if (WARN_ON_ONCE(req_has_async_data(req)))
		return -EFAULT;
	if (io_alloc_async_data(req))
		return -EAGAIN;

	switch (req->opcode) {
	case IORING_OP_READV:
		return io_rw_prep_async(req, READ);
	case IORING_OP_WRITEV:
		return io_rw_prep_async(req, WRITE);
	case IORING_OP_SENDMSG:
		return io_sendmsg_prep_async(req);
	case IORING_OP_RECVMSG:
		return io_recvmsg_prep_async(req);
	case IORING_OP_CONNECT:
		return io_connect_prep_async(req);
	}
	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
		    req->opcode);
	return -EFAULT;
}

static u32 io_get_sequence(struct io_kiocb *req)
{
	u32 seq = req->ctx->cached_sq_head;

	/* need original cached_sq_head, but it was increased for each req */
	io_for_each_link(req, req)
		seq--;
	return seq;
}
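
/*
 * Example: if three linked requests were just pulled off the SQ ring,
 * cached_sq_head has already been advanced past all of them. Walking
 * the link chain and decrementing once per request recovers the
 * sequence number of the head request, which is what the drain
 * ordering below is keyed on.
 */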

static __cold void io_drain_req(struct io_kiocb *req)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_defer_entry *de;
	int ret;
	u32 seq = io_get_sequence(req);

	/* Still need defer if there is pending req in defer list. */
	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
queue:
		ctx->drain_active = false;
		io_req_task_queue(req);
		return;
	}

	ret = io_req_prep_async(req);
	if (ret) {
fail:
		io_req_complete_failed(req, ret);
		return;
	}
	io_prep_async_link(req);
	de = kmalloc(sizeof(*de), GFP_KERNEL);
	if (!de) {
		ret = -ENOMEM;
		goto fail;
	}

	spin_lock(&ctx->completion_lock);
	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
		spin_unlock(&ctx->completion_lock);
		kfree(de);
		goto queue;
	}

	trace_io_uring_defer(ctx, req, req->user_data);
	de->req = req;
	de->seq = seq;
	list_add_tail(&de->list, &ctx->defer_list);
	spin_unlock(&ctx->completion_lock);
}
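
/*
 * IOSQE_IO_DRAIN handling: a drain request may only run once every
 * request submitted before it has completed. If that isn't yet true,
 * the request is prepared for async execution and parked on
 * ->defer_list as an io_defer_entry; the list is flushed as earlier
 * completions arrive. The second, locked re-check above avoids a race
 * where the last blocker completes between the unlocked test and the
 * list insertion.
 */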
6580
6581static void io_clean_op(struct io_kiocb *req)
6582{
6583 if (req->flags & REQ_F_BUFFER_SELECTED) {
6584 kfree(req->kbuf);
6585 req->kbuf = NULL;
6586 }
6587
6588 if (req->flags & REQ_F_NEED_CLEANUP) {
6589 switch (req->opcode) {
6590 case IORING_OP_READV:
6591 case IORING_OP_READ_FIXED:
6592 case IORING_OP_READ:
6593 case IORING_OP_WRITEV:
6594 case IORING_OP_WRITE_FIXED:
6595 case IORING_OP_WRITE: {
6596 struct io_async_rw *io = req->async_data;
6597
6598 kfree(io->free_iovec);
6599 break;
6600 }
6601 case IORING_OP_RECVMSG:
6602 case IORING_OP_SENDMSG: {
6603 struct io_async_msghdr *io = req->async_data;
6604
6605 kfree(io->free_iov);
6606 break;
6607 }
6608 case IORING_OP_SPLICE:
6609 case IORING_OP_TEE:
6610 if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
6611 io_put_file(req->splice.file_in);
6612 break;
6613 case IORING_OP_OPENAT:
6614 case IORING_OP_OPENAT2:
6615 if (req->open.filename)
6616 putname(req->open.filename);
6617 break;
6618 case IORING_OP_RENAMEAT:
6619 putname(req->rename.oldpath);
6620 putname(req->rename.newpath);
6621 break;
6622 case IORING_OP_UNLINKAT:
6623 putname(req->unlink.filename);
6624 break;
6625 case IORING_OP_MKDIRAT:
6626 putname(req->mkdir.filename);
6627 break;
6628 case IORING_OP_SYMLINKAT:
6629 putname(req->symlink.oldpath);
6630 putname(req->symlink.newpath);
6631 break;
6632 case IORING_OP_LINKAT:
6633 putname(req->hardlink.oldpath);
6634 putname(req->hardlink.newpath);
6635 break;
6636 }
6637 }
6638 if ((req->flags & REQ_F_POLLED) && req->apoll) {
6639 kfree(req->apoll->double_poll);
6640 kfree(req->apoll);
6641 req->apoll = NULL;
6642 }
6643 if (req->flags & REQ_F_INFLIGHT) {
6644 struct io_uring_task *tctx = req->task->io_uring;
6645
6646 atomic_dec(&tctx->inflight_tracked);
6647 }
6648 if (req->flags & REQ_F_CREDS)
6649 put_cred(req->creds);
6650 if (req->flags & REQ_F_ASYNC_DATA) {
6651 kfree(req->async_data);
6652 req->async_data = NULL;
6653 }
6654 req->flags &= ~IO_REQ_CLEAN_FLAGS;
6655}
6656
6657static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
6658{
6659 const struct cred *creds = NULL;
6660 int ret;
6661
6662 if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
6663 creds = override_creds(req->creds);
6664
6665 if (!io_op_defs[req->opcode].audit_skip)
6666 audit_uring_entry(req->opcode);
6667
6668 switch (req->opcode) {
6669 case IORING_OP_NOP:
6670 ret = io_nop(req, issue_flags);
6671 break;
6672 case IORING_OP_READV:
6673 case IORING_OP_READ_FIXED:
6674 case IORING_OP_READ:
6675 ret = io_read(req, issue_flags);
6676 break;
6677 case IORING_OP_WRITEV:
6678 case IORING_OP_WRITE_FIXED:
6679 case IORING_OP_WRITE:
6680 ret = io_write(req, issue_flags);
6681 break;
6682 case IORING_OP_FSYNC:
6683 ret = io_fsync(req, issue_flags);
6684 break;
6685 case IORING_OP_POLL_ADD:
6686 ret = io_poll_add(req, issue_flags);
6687 break;
6688 case IORING_OP_POLL_REMOVE:
6689 ret = io_poll_update(req, issue_flags);
6690 break;
6691 case IORING_OP_SYNC_FILE_RANGE:
6692 ret = io_sync_file_range(req, issue_flags);
6693 break;
6694 case IORING_OP_SENDMSG:
6695 ret = io_sendmsg(req, issue_flags);
6696 break;
6697 case IORING_OP_SEND:
6698 ret = io_send(req, issue_flags);
6699 break;
6700 case IORING_OP_RECVMSG:
6701 ret = io_recvmsg(req, issue_flags);
6702 break;
6703 case IORING_OP_RECV:
6704 ret = io_recv(req, issue_flags);
6705 break;
6706 case IORING_OP_TIMEOUT:
6707 ret = io_timeout(req, issue_flags);
6708 break;
6709 case IORING_OP_TIMEOUT_REMOVE:
6710 ret = io_timeout_remove(req, issue_flags);
6711 break;
6712 case IORING_OP_ACCEPT:
6713 ret = io_accept(req, issue_flags);
6714 break;
6715 case IORING_OP_CONNECT:
6716 ret = io_connect(req, issue_flags);
6717 break;
6718 case IORING_OP_ASYNC_CANCEL:
6719 ret = io_async_cancel(req, issue_flags);
6720 break;
6721 case IORING_OP_FALLOCATE:
6722 ret = io_fallocate(req, issue_flags);
6723 break;
6724 case IORING_OP_OPENAT:
6725 ret = io_openat(req, issue_flags);
6726 break;
6727 case IORING_OP_CLOSE:
6728 ret = io_close(req, issue_flags);
6729 break;
6730 case IORING_OP_FILES_UPDATE:
6731 ret = io_files_update(req, issue_flags);
6732 break;
6733 case IORING_OP_STATX:
6734 ret = io_statx(req, issue_flags);
6735 break;
6736 case IORING_OP_FADVISE:
6737 ret = io_fadvise(req, issue_flags);
6738 break;
6739 case IORING_OP_MADVISE:
6740 ret = io_madvise(req, issue_flags);
6741 break;
6742 case IORING_OP_OPENAT2:
6743 ret = io_openat2(req, issue_flags);
6744 break;
6745 case IORING_OP_EPOLL_CTL:
6746 ret = io_epoll_ctl(req, issue_flags);
6747 break;
6748 case IORING_OP_SPLICE:
6749 ret = io_splice(req, issue_flags);
6750 break;
6751 case IORING_OP_PROVIDE_BUFFERS:
6752 ret = io_provide_buffers(req, issue_flags);
6753 break;
6754 case IORING_OP_REMOVE_BUFFERS:
6755 ret = io_remove_buffers(req, issue_flags);
6756 break;
6757 case IORING_OP_TEE:
6758 ret = io_tee(req, issue_flags);
6759 break;
6760 case IORING_OP_SHUTDOWN:
6761 ret = io_shutdown(req, issue_flags);
6762 break;
6763 case IORING_OP_RENAMEAT:
6764 ret = io_renameat(req, issue_flags);
6765 break;
6766 case IORING_OP_UNLINKAT:
6767 ret = io_unlinkat(req, issue_flags);
6768 break;
6769 case IORING_OP_MKDIRAT:
6770 ret = io_mkdirat(req, issue_flags);
6771 break;
6772 case IORING_OP_SYMLINKAT:
6773 ret = io_symlinkat(req, issue_flags);
6774 break;
6775 case IORING_OP_LINKAT:
6776 ret = io_linkat(req, issue_flags);
6777 break;
6778 default:
6779 ret = -EINVAL;
6780 break;
6781 }
6782
6783 if (!io_op_defs[req->opcode].audit_skip)
6784 audit_uring_exit(!ret, ret);
6785
6786 if (creds)
6787 revert_creds(creds);
6788 if (ret)
6789 return ret;
6790
6791 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6792 io_iopoll_req_issued(req, issue_flags);
6793
6794 return 0;
6795}
6796
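/*
 * Called by io-wq when a work item has finished: drop our reference to
 * the request and hand back the next linked request, if any, for the
 * worker to run.
 */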
6797static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6798{
6799 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6800
6801 req = io_put_req_find_next(req);
6802 return req ? &req->work : NULL;
6803}
6804
6805static void io_wq_submit_work(struct io_wq_work *work)
6806{
6807 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6808 unsigned int issue_flags = IO_URING_F_UNLOCKED;
6809 bool needs_poll = false;
6810 struct io_kiocb *timeout;
6811 int ret = 0;
6812
/* one will be dropped by ->io_free_work() after returning to io-wq */
6814 if (!(req->flags & REQ_F_REFCOUNT))
6815 __io_req_set_refcount(req, 2);
6816 else
6817 req_ref_get(req);
6818
6819 timeout = io_prep_linked_timeout(req);
6820 if (timeout)
6821 io_queue_linked_timeout(timeout);
6822
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
6824 if (work->flags & IO_WQ_WORK_CANCEL) {
6825 io_req_task_queue_fail(req, -ECANCELED);
6826 return;
6827 }
6828
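/*
 * For a forced-async request whose opcode supports polling, try a
 * non-blocking issue first and arm poll on -EAGAIN below, rather than
 * tying up an io-wq worker with a blocking attempt.
 */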
6829 if (req->flags & REQ_F_FORCE_ASYNC) {
6830 const struct io_op_def *def = &io_op_defs[req->opcode];
6831 bool opcode_poll = def->pollin || def->pollout;
6832
6833 if (opcode_poll && file_can_poll(req->file)) {
6834 needs_poll = true;
6835 issue_flags |= IO_URING_F_NONBLOCK;
6836 }
6837 }
6838
6839 do {
6840 ret = io_issue_sqe(req, issue_flags);
6841 if (ret != -EAGAIN)
6842 break;
6843
/*
 * We can get EAGAIN for iopolled IO even though we're
 * forcing a sync submission from here, since we can't
 * wait for request slots on the block side.
 */
6848 if (!needs_poll) {
6849 cond_resched();
6850 continue;
6851 }
6852
6853 if (io_arm_poll_handler(req) == IO_APOLL_OK)
6854 return;
/* aborted or ready, in either case retry blocking */
6856 needs_poll = false;
6857 issue_flags &= ~IO_URING_F_NONBLOCK;
6858 } while (1);
6859
/* avoid locking problems by failing it from a clean context */
6861 if (ret)
6862 io_req_task_queue_fail(req, ret);
6863}
6864
6865static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
6866 unsigned i)
6867{
6868 return &table->files[i];
6869}
6870
6871static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6872 int index)
6873{
6874 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
6875
6876 return (struct file *) (slot->file_ptr & FFS_MASK);
6877}
6878
6879static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
6880{
6881 unsigned long file_ptr = (unsigned long) file;
6882
6883 file_ptr |= io_file_get_flags(file);
6884 file_slot->file_ptr = file_ptr;
6885}
6886
6887static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6888 struct io_kiocb *req, int fd)
6889{
6890 struct file *file;
6891 unsigned long file_ptr;
6892
6893 if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6894 return NULL;
6895 fd = array_index_nospec(fd, ctx->nr_user_files);
6896 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6897 file = (struct file *) (file_ptr & FFS_MASK);
6898 file_ptr &= ~FFS_MASK;
6899
6900 req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
6901 io_req_set_rsrc_node(req, ctx);
6902 return file;
6903}
6904
6905static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
6906 struct io_kiocb *req, int fd)
6907{
6908 struct file *file = fget(fd);
6909
6910 trace_io_uring_file_get(ctx, fd);
6911
/* we don't allow fixed io_uring files */
6913 if (file && unlikely(file->f_op == &io_uring_fops))
6914 io_req_track_inflight(req);
6915 return file;
6916}
6917
6918static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6919 struct io_kiocb *req, int fd, bool fixed)
6920{
6921 if (fixed)
6922 return io_file_get_fixed(ctx, req, fd);
6923 else
6924 return io_file_get_normal(ctx, req, fd);
6925}
6926
6927static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
6928{
6929 struct io_kiocb *prev = req->timeout.prev;
6930 int ret = -ENOENT;
6931
6932 if (prev) {
6933 if (!(req->task->flags & PF_EXITING))
6934 ret = io_try_cancel_userdata(req, prev->user_data);
6935 io_req_complete_post(req, ret ?: -ETIME, 0);
6936 io_put_req(prev);
6937 } else {
6938 io_req_complete_post(req, -ETIME, 0);
6939 }
6940}
6941
6942static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6943{
6944 struct io_timeout_data *data = container_of(timer,
6945 struct io_timeout_data, timer);
6946 struct io_kiocb *prev, *req = data->req;
6947 struct io_ring_ctx *ctx = req->ctx;
6948 unsigned long flags;
6949
6950 spin_lock_irqsave(&ctx->timeout_lock, flags);
6951 prev = req->timeout.head;
6952 req->timeout.head = NULL;
6953
/*
 * We don't expect the list to be empty, that will only happen if we
 * race with the completion of the linked work.
 */
6958 if (prev) {
6959 io_remove_next_linked(prev);
6960 if (!req_ref_inc_not_zero(prev))
6961 prev = NULL;
6962 }
6963 list_del(&req->timeout.list);
6964 req->timeout.prev = prev;
6965 spin_unlock_irqrestore(&ctx->timeout_lock, flags);
6966
6967 req->io_task_work.func = io_req_task_link_timeout;
6968 io_req_task_work_add(req);
6969 return HRTIMER_NORESTART;
6970}
6971
6972static void io_queue_linked_timeout(struct io_kiocb *req)
6973{
6974 struct io_ring_ctx *ctx = req->ctx;
6975
6976 spin_lock_irq(&ctx->timeout_lock);
/*
 * If the back reference is NULL, then our linked request finished
 * before we got a chance to setup the timer.
 */
6981 if (req->timeout.head) {
6982 struct io_timeout_data *data = req->async_data;
6983
6984 data->timer.function = io_link_timeout_fn;
6985 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6986 data->mode);
6987 list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
6988 }
6989 spin_unlock_irq(&ctx->timeout_lock);
6990
6991 io_put_req(req);
6992}
6993
6994static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
6995 __must_hold(&req->ctx->uring_lock)
6996{
6997 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6998
6999 switch (io_arm_poll_handler(req)) {
7000 case IO_APOLL_READY:
7001 io_req_task_queue(req);
7002 break;
7003 case IO_APOLL_ABORTED:
/*
 * Poll arming was aborted: punt the request to io-wq for
 * blocking execution instead.
 */
7008 io_queue_async_work(req, NULL);
7009 break;
7010 }
7011
7012 if (linked_timeout)
7013 io_queue_linked_timeout(linked_timeout);
7014}
7015
7016static inline void __io_queue_sqe(struct io_kiocb *req)
7017 __must_hold(&req->ctx->uring_lock)
7018{
7019 struct io_kiocb *linked_timeout;
7020 int ret;
7021
7022 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
7023
7024 if (req->flags & REQ_F_COMPLETE_INLINE) {
7025 io_req_add_compl_list(req);
7026 return;
7027 }
7028
/*
 * We async punt it if the file wasn't marked NOWAIT, or if the file
 * doesn't support non-blocking read/write attempts.
 */
7032 if (likely(!ret)) {
7033 linked_timeout = io_prep_linked_timeout(req);
7034 if (linked_timeout)
7035 io_queue_linked_timeout(linked_timeout);
7036 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
7037 io_queue_sqe_arm_apoll(req);
7038 } else {
7039 io_req_complete_failed(req, ret);
7040 }
7041}
7042
7043static void io_queue_sqe_fallback(struct io_kiocb *req)
7044 __must_hold(&req->ctx->uring_lock)
7045{
7046 if (req->flags & REQ_F_FAIL) {
7047 io_req_complete_fail_submit(req);
7048 } else if (unlikely(req->ctx->drain_active)) {
7049 io_drain_req(req);
7050 } else {
7051 int ret = io_req_prep_async(req);
7052
7053 if (unlikely(ret))
7054 io_req_complete_failed(req, ret);
7055 else
7056 io_queue_async_work(req, NULL);
7057 }
7058}
7059
7060static inline void io_queue_sqe(struct io_kiocb *req)
7061 __must_hold(&req->ctx->uring_lock)
7062{
7063 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
7064 __io_queue_sqe(req);
7065 else
7066 io_queue_sqe_fallback(req);
7067}
7068
/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if SQE is allowed, 'false' otherwise.
 */
7074static inline bool io_check_restriction(struct io_ring_ctx *ctx,
7075 struct io_kiocb *req,
7076 unsigned int sqe_flags)
7077{
7078 if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
7079 return false;
7080
7081 if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
7082 ctx->restrictions.sqe_flags_required)
7083 return false;
7084
7085 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
7086 ctx->restrictions.sqe_flags_required))
7087 return false;
7088
7089 return true;
7090}
7091
7092static void io_init_req_drain(struct io_kiocb *req)
7093{
7094 struct io_ring_ctx *ctx = req->ctx;
7095 struct io_kiocb *head = ctx->submit_state.link.head;
7096
7097 ctx->drain_active = true;
7098 if (head) {
/*
 * If we need to drain a request in the middle of a link, drain
 * the head request and the next request/link after the current
 * link. Given sequential execution of links, the drain flag is
 * then maintained for every request of our link.
 */
7106 head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
7107 ctx->drain_next = true;
7108 }
7109}
7110
7111static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7112 const struct io_uring_sqe *sqe)
7113 __must_hold(&ctx->uring_lock)
7114{
7115 unsigned int sqe_flags;
7116 int personality;
7117 u8 opcode;
7118
/* req is partially pre-initialised, see io_preinit_req() */
7120 req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
7122 req->flags = sqe_flags = READ_ONCE(sqe->flags);
7123 req->user_data = READ_ONCE(sqe->user_data);
7124 req->file = NULL;
7125 req->fixed_rsrc_refs = NULL;
7126 req->task = current;
7127
7128 if (unlikely(opcode >= IORING_OP_LAST)) {
7129 req->opcode = 0;
7130 return -EINVAL;
7131 }
7132 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
7134 if (sqe_flags & ~SQE_VALID_FLAGS)
7135 return -EINVAL;
7136 if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
7137 !io_op_defs[opcode].buffer_select)
7138 return -EOPNOTSUPP;
7139 if (sqe_flags & IOSQE_IO_DRAIN)
7140 io_init_req_drain(req);
7141 }
7142 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7143 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7144 return -EACCES;
/* knock it to the slow queue path, will be drained there */
7146 if (ctx->drain_active)
7147 req->flags |= REQ_F_FORCE_ASYNC;
/* if there is no link, we're at "next" request and need to drain */
7149 if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7150 ctx->drain_next = false;
7151 ctx->drain_active = true;
7152 req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
7153 }
7154 }
7155
7156 if (io_op_defs[opcode].needs_file) {
7157 struct io_submit_state *state = &ctx->submit_state;
7158
/*
 * Plug now if we have more than 2 IO left after this, and the
 * target is potentially a read/write to block based storage.
 */
7163 if (state->need_plug && io_op_defs[opcode].plug) {
7164 state->plug_started = true;
7165 state->need_plug = false;
7166 blk_start_plug_nr_ios(&state->plug, state->submit_nr);
7167 }
7168
7169 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
7170 (sqe_flags & IOSQE_FIXED_FILE));
7171 if (unlikely(!req->file))
7172 return -EBADF;
7173 }
7174
7175 personality = READ_ONCE(sqe->personality);
7176 if (personality) {
7177 int ret;
7178
7179 req->creds = xa_load(&ctx->personalities, personality);
7180 if (!req->creds)
7181 return -EINVAL;
7182 get_cred(req->creds);
7183 ret = security_uring_override_creds(req->creds);
7184 if (ret) {
7185 put_cred(req->creds);
7186 return ret;
7187 }
7188 req->flags |= REQ_F_CREDS;
7189 }
7190
7191 return io_req_prep(req, sqe);
7192}
7193
7194static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
7195 const struct io_uring_sqe *sqe)
7196 __must_hold(&ctx->uring_lock)
7197{
7198 struct io_submit_link *link = &ctx->submit_state.link;
7199 int ret;
7200
7201 ret = io_init_req(ctx, req, sqe);
7202 if (unlikely(ret)) {
7203 trace_io_uring_req_failed(sqe, ret);
7204
/* fail even hard links since we don't submit */
7206 if (link->head) {
/*
 * Whether a link request failed or was cancelled can be judged by
 * REQ_F_FAIL, but the head is an exception: it may have REQ_F_FAIL
 * set only because another request in the chain failed. Leverage
 * req->result to distinguish a head that failed itself from one
 * failed on behalf of another request, so the correct return code
 * can be set for it. Initialising the result here avoids affecting
 * the normal path.
 */
7216 if (!(link->head->flags & REQ_F_FAIL))
7217 req_fail_link_node(link->head, -ECANCELED);
7218 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
/*
 * The current req is a normal, unlinked request: return the
 * error and break out of the submission loop.
 */
7223 io_req_complete_failed(req, ret);
7224 return ret;
7225 }
7226 req_fail_link_node(req, ret);
7227 }
7228
/* don't need @sqe from now on */
7230 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
7231 req->flags, true,
7232 ctx->flags & IORING_SETUP_SQPOLL);
7233
/*
 * If we already have a head request, queue this one for async
 * submittal once the head completes. If we don't have a head but
 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
 * submitted sync once the chain is complete. If none of those
 * conditions are true (normal request), then just queue it.
 */
7241 if (link->head) {
7242 struct io_kiocb *head = link->head;
7243
7244 if (!(req->flags & REQ_F_FAIL)) {
7245 ret = io_req_prep_async(req);
7246 if (unlikely(ret)) {
7247 req_fail_link_node(req, ret);
7248 if (!(head->flags & REQ_F_FAIL))
7249 req_fail_link_node(head, -ECANCELED);
7250 }
7251 }
7252 trace_io_uring_link(ctx, req, head);
7253 link->last->link = req;
7254 link->last = req;
7255
7256 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
7257 return 0;
7258
7259 link->head = NULL;
7260 req = head;
7261 } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7262 link->head = req;
7263 link->last = req;
7264 return 0;
7265 }
7266
7267 io_queue_sqe(req);
7268 return 0;
7269}
7270
/*
 * Batched submission is done, ensure local IO is flushed out.
 */
7274static void io_submit_state_end(struct io_ring_ctx *ctx)
7275{
7276 struct io_submit_state *state = &ctx->submit_state;
7277
7278 if (state->link.head)
7279 io_queue_sqe(state->link.head);
7280
7281 io_submit_flush_completions(ctx);
7282 if (state->plug_started)
7283 blk_finish_plug(&state->plug);
7284}
7285
/*
 * Start submission side cache.
 */
7289static void io_submit_state_start(struct io_submit_state *state,
7290 unsigned int max_ios)
7291{
7292 state->plug_started = false;
7293 state->need_plug = max_ios > 2;
7294 state->submit_nr = max_ios;
7295
7296 state->link.head = NULL;
7297}
7298
7299static void io_commit_sqring(struct io_ring_ctx *ctx)
7300{
7301 struct io_rings *rings = ctx->rings;
7302
/*
 * Ensure any loads from the SQEs are done at this point,
 * since once we write the new head, the application could
 * write new data to them.
 */
7308 smp_store_release(&rings->sq.head, ctx->cached_sq_head);
7309}
7310
/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-read down the line.
 */
7319static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
7320{
7321 unsigned head, mask = ctx->sq_entries - 1;
7322 unsigned sq_idx = ctx->cached_sq_head++ & mask;
7323
/*
 * The cached sq head (or cq tail) serves two purposes:
 *
 * 1) allows us to batch the cost of updating the user visible
 *    head updates.
 * 2) allows the kernel side to track the head on its own, even
 *    though the application is the one updating it.
 */
7332 head = READ_ONCE(ctx->sq_array[sq_idx]);
7333 if (likely(head < ctx->sq_entries))
7334 return &ctx->sq_sqes[head];
7335
/* drop invalid entries */
7337 ctx->cq_extra--;
7338 WRITE_ONCE(ctx->rings->sq_dropped,
7339 READ_ONCE(ctx->rings->sq_dropped) + 1);
7340 return NULL;
7341}
7342
7343static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
7344 __must_hold(&ctx->uring_lock)
7345{
7346 unsigned int entries = io_sqring_entries(ctx);
7347 int submitted = 0;
7348
7349 if (unlikely(!entries))
7350 return 0;
/* make sure SQ entry isn't read before tail */
7352 nr = min3(nr, ctx->sq_entries, entries);
7353 io_get_task_refs(nr);
7354
7355 io_submit_state_start(&ctx->submit_state, nr);
7356 do {
7357 const struct io_uring_sqe *sqe;
7358 struct io_kiocb *req;
7359
7360 if (unlikely(!io_alloc_req_refill(ctx))) {
7361 if (!submitted)
7362 submitted = -EAGAIN;
7363 break;
7364 }
7365 req = io_alloc_req(ctx);
7366 sqe = io_get_sqe(ctx);
7367 if (unlikely(!sqe)) {
7368 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
7369 break;
7370 }
7371
7372 submitted++;
7373 if (io_submit_sqe(ctx, req, sqe))
7374 break;
7375 } while (submitted < nr);
7376
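/* hand back the task refs we grabbed up front but didn't consume */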
7377 if (unlikely(submitted != nr)) {
7378 int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
7379 int unused = nr - ref_used;
7380
7381 current->io_uring->cached_refs += unused;
7382 }
7383
7384 io_submit_state_end(ctx);
7385
7386 io_commit_sqring(ctx);
7387
7388 return submitted;
7389}
7390
7391static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7392{
7393 return READ_ONCE(sqd->state);
7394}
7395
7396static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7397{
/* Tell userspace we may need a wakeup call */
7399 spin_lock(&ctx->completion_lock);
7400 WRITE_ONCE(ctx->rings->sq_flags,
7401 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
7402 spin_unlock(&ctx->completion_lock);
7403}
7404
7405static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7406{
7407 spin_lock(&ctx->completion_lock);
7408 WRITE_ONCE(ctx->rings->sq_flags,
7409 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
7410 spin_unlock(&ctx->completion_lock);
7411}
7412
7413static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
7414{
7415 unsigned int to_submit;
7416 int ret = 0;
7417
7418 to_submit = io_sqring_entries(ctx);
7419
7420 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7421 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
7422
7423 if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
7424 const struct cred *creds = NULL;
7425
7426 if (ctx->sq_creds != current_cred())
7427 creds = override_creds(ctx->sq_creds);
7428
7429 mutex_lock(&ctx->uring_lock);
7430 if (!wq_list_empty(&ctx->iopoll_list))
7431 io_do_iopoll(ctx, true);
7432
/*
 * Don't submit if refs are dying, good for io_uring_register(),
 * but also it is relied upon by io_ring_exit_work().
 */
7437 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7438 !(ctx->flags & IORING_SETUP_R_DISABLED))
7439 ret = io_submit_sqes(ctx, to_submit);
7440 mutex_unlock(&ctx->uring_lock);
7441
7442 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7443 wake_up(&ctx->sqo_sq_wait);
7444 if (creds)
7445 revert_creds(creds);
7446 }
7447
7448 return ret;
7449}
7450
7451static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7452{
7453 struct io_ring_ctx *ctx;
7454 unsigned sq_thread_idle = 0;
7455
7456 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7457 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
7458 sqd->sq_thread_idle = sq_thread_idle;
7459}
7460
7461static bool io_sqd_handle_event(struct io_sq_data *sqd)
7462{
7463 bool did_sig = false;
7464 struct ksignal ksig;
7465
7466 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7467 signal_pending(current)) {
7468 mutex_unlock(&sqd->lock);
7469 if (signal_pending(current))
7470 did_sig = get_signal(&ksig);
7471 cond_resched();
7472 mutex_lock(&sqd->lock);
7473 }
7474 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7475}
7476
7477static int io_sq_thread(void *data)
7478{
7479 struct io_sq_data *sqd = data;
7480 struct io_ring_ctx *ctx;
7481 unsigned long timeout = 0;
7482 char buf[TASK_COMM_LEN];
7483 DEFINE_WAIT(wait);
7484
7485 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
7486 set_task_comm(current, buf);
7487
7488 if (sqd->sq_cpu != -1)
7489 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
7490 else
7491 set_cpus_allowed_ptr(current, cpu_online_mask);
7492 current->flags |= PF_NO_SETAFFINITY;
7493
7494 audit_alloc_kernel(current);
7495
7496 mutex_lock(&sqd->lock);
7497 while (1) {
7498 bool cap_entries, sqt_spin = false;
7499
7500 if (io_sqd_events_pending(sqd) || signal_pending(current)) {
7501 if (io_sqd_handle_event(sqd))
7502 break;
7503 timeout = jiffies + sqd->sq_thread_idle;
7504 }
7505
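/* if more than one ring shares this thread, cap each ring's batch */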
7506 cap_entries = !list_is_singular(&sqd->ctx_list);
7507 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7508 int ret = __io_sq_thread(ctx, cap_entries);
7509
7510 if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
7511 sqt_spin = true;
7512 }
7513 if (io_run_task_work())
7514 sqt_spin = true;
7515
7516 if (sqt_spin || !time_after(jiffies, timeout)) {
7517 cond_resched();
7518 if (sqt_spin)
7519 timeout = jiffies + sqd->sq_thread_idle;
7520 continue;
7521 }
7522
7523 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7524 if (!io_sqd_events_pending(sqd) && !current->task_works) {
7525 bool needs_sched = true;
7526
7527 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7528 io_ring_set_wakeup_flag(ctx);
7529
7530 if ((ctx->flags & IORING_SETUP_IOPOLL) &&
7531 !wq_list_empty(&ctx->iopoll_list)) {
7532 needs_sched = false;
7533 break;
7534 }
7535 if (io_sqring_entries(ctx)) {
7536 needs_sched = false;
7537 break;
7538 }
7539 }
7540
7541 if (needs_sched) {
7542 mutex_unlock(&sqd->lock);
7543 schedule();
7544 mutex_lock(&sqd->lock);
7545 }
7546 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7547 io_ring_clear_wakeup_flag(ctx);
7548 }
7549
7550 finish_wait(&sqd->wait, &wait);
7551 timeout = jiffies + sqd->sq_thread_idle;
7552 }
7553
7554 io_uring_cancel_generic(true, sqd);
7555 sqd->thread = NULL;
7556 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7557 io_ring_set_wakeup_flag(ctx);
7558 io_run_task_work();
7559 mutex_unlock(&sqd->lock);
7560
7561 audit_free(current);
7562
7563 complete(&sqd->exited);
7564 do_exit(0);
7565}
7566
7567struct io_wait_queue {
7568 struct wait_queue_entry wq;
7569 struct io_ring_ctx *ctx;
7570 unsigned cq_tail;
7571 unsigned nr_timeouts;
7572};
7573
7574static inline bool io_should_wake(struct io_wait_queue *iowq)
7575{
7576 struct io_ring_ctx *ctx = iowq->ctx;
7577 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
7578
/*
 * Wake up if we have enough events, or if a timeout occurred since we
 * started waiting. For timeouts, we always want to return to userspace,
 * regardless of event count.
 */
7584 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
7585}
7586
7587static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7588 int wake_flags, void *key)
7589{
7590 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7591 wq);
7592
/*
 * Cannot safely flush overflowed CQEs from here, ensure we wake up
 * the task, and the next invocation will do it.
 */
7597 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
7598 return autoremove_wake_function(curr, mode, wake_flags, key);
7599 return -1;
7600}
7601
7602static int io_run_task_work_sig(void)
7603{
7604 if (io_run_task_work())
7605 return 1;
7606 if (!signal_pending(current))
7607 return 0;
7608 if (test_thread_flag(TIF_NOTIFY_SIGNAL))
7609 return -ERESTARTSYS;
7610 return -EINTR;
7611}
7612
/* when returns >0, the caller should retry */
7614static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7615 struct io_wait_queue *iowq,
7616 signed long *timeout)
7617{
7618 int ret;
7619
/* make sure we run task_work before checking for signals */
7621 ret = io_run_task_work_sig();
7622 if (ret || io_should_wake(iowq))
7623 return ret;
/* let the caller flush overflows, retry */
7625 if (test_bit(0, &ctx->check_cq_overflow))
7626 return 1;
7627
7628 *timeout = schedule_timeout(*timeout);
7629 return !*timeout ? -ETIME : 1;
7630}
7631
/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the userspace cq ring.
 */
7636static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
7637 const sigset_t __user *sig, size_t sigsz,
7638 struct __kernel_timespec __user *uts)
7639{
7640 struct io_wait_queue iowq;
7641 struct io_rings *rings = ctx->rings;
7642 signed long timeout = MAX_SCHEDULE_TIMEOUT;
7643 int ret;
7644
7645 do {
7646 io_cqring_overflow_flush(ctx);
7647 if (io_cqring_events(ctx) >= min_events)
7648 return 0;
7649 if (!io_run_task_work())
7650 break;
7651 } while (1);
7652
7653 if (uts) {
7654 struct timespec64 ts;
7655
7656 if (get_timespec64(&ts, uts))
7657 return -EFAULT;
7658 timeout = timespec64_to_jiffies(&ts);
7659 }
7660
7661 if (sig) {
7662#ifdef CONFIG_COMPAT
7663 if (in_compat_syscall())
7664 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
7665 sigsz);
7666 else
7667#endif
7668 ret = set_user_sigmask(sig, sigsz);
7669
7670 if (ret)
7671 return ret;
7672 }
7673
7674 init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7675 iowq.wq.private = current;
7676 INIT_LIST_HEAD(&iowq.wq.entry);
7677 iowq.ctx = ctx;
7678 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
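/* wake once the CQ tail has advanced min_events past the current head */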
7679 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
7680
7681 trace_io_uring_cqring_wait(ctx, min_events);
7682 do {
/* if we can't even flush overflow, don't wait for more */
7684 if (!io_cqring_overflow_flush(ctx)) {
7685 ret = -EBUSY;
7686 break;
7687 }
7688 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
7689 TASK_INTERRUPTIBLE);
7690 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7691 finish_wait(&ctx->cq_wait, &iowq.wq);
7692 cond_resched();
7693 } while (ret > 0);
7694
7695 restore_saved_sigmask_unless(ret == -EINTR);
7696
7697 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
7698}
7699
7700static void io_free_page_table(void **table, size_t size)
7701{
7702 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7703
7704 for (i = 0; i < nr_tables; i++)
7705 kfree(table[i]);
7706 kfree(table);
7707}
7708
7709static __cold void **io_alloc_page_table(size_t size)
7710{
7711 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7712 size_t init_size = size;
7713 void **table;
7714
7715 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
7716 if (!table)
7717 return NULL;
7718
7719 for (i = 0; i < nr_tables; i++) {
7720 unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
7721
7722 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
7723 if (!table[i]) {
7724 io_free_page_table(table, init_size);
7725 return NULL;
7726 }
7727 size -= this_size;
7728 }
7729 return table;
7730}
7731
7732static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
7733{
7734 percpu_ref_exit(&ref_node->refs);
7735 kfree(ref_node);
7736}
7737
7738static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7739{
7740 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7741 struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7742 unsigned long flags;
7743 bool first_add = false;
7744
7745 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7746 node->done = true;
7747
7748 while (!list_empty(&ctx->rsrc_ref_list)) {
7749 node = list_first_entry(&ctx->rsrc_ref_list,
7750 struct io_rsrc_node, node);
/* recycle ref nodes in order */
7752 if (!node->done)
7753 break;
7754 list_del(&node->node);
7755 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7756 }
7757 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7758
7759 if (first_add)
7760 mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7761}
7762
7763static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7764{
7765 struct io_rsrc_node *ref_node;
7766
7767 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7768 if (!ref_node)
7769 return NULL;
7770
7771 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7772 0, GFP_KERNEL)) {
7773 kfree(ref_node);
7774 return NULL;
7775 }
7776 INIT_LIST_HEAD(&ref_node->node);
7777 INIT_LIST_HEAD(&ref_node->rsrc_list);
7778 ref_node->done = false;
7779 return ref_node;
7780}
7781
7782static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7783 struct io_rsrc_data *data_to_kill)
7784 __must_hold(&ctx->uring_lock)
7785{
7786 WARN_ON_ONCE(!ctx->rsrc_backup_node);
7787 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
7788
7789 io_rsrc_refs_drop(ctx);
7790
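/*
 * Park the current node on the ref list and kill its percpu ref; once
 * it drains, io_rsrc_node_ref_zero() queues the deferred rsrc puts.
 */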
7791 if (data_to_kill) {
7792 struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
7793
7794 rsrc_node->rsrc_data = data_to_kill;
7795 spin_lock_irq(&ctx->rsrc_ref_lock);
7796 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
7797 spin_unlock_irq(&ctx->rsrc_ref_lock);
7798
7799 atomic_inc(&data_to_kill->refs);
7800 percpu_ref_kill(&rsrc_node->refs);
7801 ctx->rsrc_node = NULL;
7802 }
7803
7804 if (!ctx->rsrc_node) {
7805 ctx->rsrc_node = ctx->rsrc_backup_node;
7806 ctx->rsrc_backup_node = NULL;
7807 }
7808}
7809
7810static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
7811{
7812 if (ctx->rsrc_backup_node)
7813 return 0;
7814 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
7815 return ctx->rsrc_backup_node ? 0 : -ENOMEM;
7816}
7817
7818static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
7819 struct io_ring_ctx *ctx)
7820{
7821 int ret;
7822
/* As we may drop ->uring_lock, other task may have started quiesce */
7824 if (data->quiesce)
7825 return -ENXIO;
7826
7827 data->quiesce = true;
7828 do {
7829 ret = io_rsrc_node_switch_start(ctx);
7830 if (ret)
7831 break;
7832 io_rsrc_node_switch(ctx, data);
7833
/* kill initial ref, already quiesced if zero */
7835 if (atomic_dec_and_test(&data->refs))
7836 break;
7837 mutex_unlock(&ctx->uring_lock);
7838 flush_delayed_work(&ctx->rsrc_put_work);
7839 ret = wait_for_completion_interruptible(&data->done);
7840 if (!ret) {
7841 mutex_lock(&ctx->uring_lock);
7842 break;
7843 }
7844
7845 atomic_inc(&data->refs);
/* wait for all works potentially completing data->done */
7847 flush_delayed_work(&ctx->rsrc_put_work);
7848 reinit_completion(&data->done);
7849
7850 ret = io_run_task_work_sig();
7851 mutex_lock(&ctx->uring_lock);
7852 } while (ret >= 0);
7853 data->quiesce = false;
7854
7855 return ret;
7856}
7857
7858static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7859{
7860 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7861 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7862
7863 return &data->tags[table_idx][off];
7864}
7865
7866static void io_rsrc_data_free(struct io_rsrc_data *data)
7867{
7868 size_t size = data->nr * sizeof(data->tags[0][0]);
7869
7870 if (data->tags)
7871 io_free_page_table((void **)data->tags, size);
7872 kfree(data);
7873}
7874
7875static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7876 u64 __user *utags, unsigned nr,
7877 struct io_rsrc_data **pdata)
7878{
7879 struct io_rsrc_data *data;
7880 int ret = -ENOMEM;
7881 unsigned i;
7882
7883 data = kzalloc(sizeof(*data), GFP_KERNEL);
7884 if (!data)
7885 return -ENOMEM;
7886 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
7887 if (!data->tags) {
7888 kfree(data);
7889 return -ENOMEM;
7890 }
7891
7892 data->nr = nr;
7893 data->ctx = ctx;
7894 data->do_put = do_put;
7895 if (utags) {
7896 ret = -EFAULT;
7897 for (i = 0; i < nr; i++) {
7898 u64 *tag_slot = io_get_tag_slot(data, i);
7899
7900 if (copy_from_user(tag_slot, &utags[i],
7901 sizeof(*tag_slot)))
7902 goto fail;
7903 }
7904 }
7905
7906 atomic_set(&data->refs, 1);
7907 init_completion(&data->done);
7908 *pdata = data;
7909 return 0;
7910fail:
7911 io_rsrc_data_free(data);
7912 return ret;
7913}
7914
7915static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7916{
7917 table->files = kvcalloc(nr_files, sizeof(table->files[0]),
7918 GFP_KERNEL_ACCOUNT);
7919 return !!table->files;
7920}
7921
7922static void io_free_file_tables(struct io_file_table *table)
7923{
7924 kvfree(table->files);
7925 table->files = NULL;
7926}
7927
7928static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7929{
7930#if defined(CONFIG_UNIX)
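/*
 * The SCM_RIGHTS skbs queued on the ring socket own the file
 * references; freeing them drops the files via the skb destructor.
 */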
7931 if (ctx->ring_sock) {
7932 struct sock *sock = ctx->ring_sock->sk;
7933 struct sk_buff *skb;
7934
7935 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7936 kfree_skb(skb);
7937 }
7938#else
7939 int i;
7940
7941 for (i = 0; i < ctx->nr_user_files; i++) {
7942 struct file *file;
7943
7944 file = io_file_from_index(ctx, i);
7945 if (file)
7946 fput(file);
7947 }
7948#endif
7949 io_free_file_tables(&ctx->file_table);
7950 io_rsrc_data_free(ctx->file_data);
7951 ctx->file_data = NULL;
7952 ctx->nr_user_files = 0;
7953}
7954
7955static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7956{
7957 int ret;
7958
7959 if (!ctx->file_data)
7960 return -ENXIO;
7961 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7962 if (!ret)
7963 __io_sqe_files_unregister(ctx);
7964 return ret;
7965}
7966
7967static void io_sq_thread_unpark(struct io_sq_data *sqd)
7968 __releases(&sqd->lock)
7969{
7970 WARN_ON_ONCE(sqd->thread == current);
7971
/*
 * Do the dance but not conditional clear_bit() because it'd race with
 * other threads incrementing park_pending and setting the bit.
 */
7976 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7977 if (atomic_dec_return(&sqd->park_pending))
7978 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7979 mutex_unlock(&sqd->lock);
7980}
7981
7982static void io_sq_thread_park(struct io_sq_data *sqd)
7983 __acquires(&sqd->lock)
7984{
7985 WARN_ON_ONCE(sqd->thread == current);
7986
7987 atomic_inc(&sqd->park_pending);
7988 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7989 mutex_lock(&sqd->lock);
7990 if (sqd->thread)
7991 wake_up_process(sqd->thread);
7992}
7993
7994static void io_sq_thread_stop(struct io_sq_data *sqd)
7995{
7996 WARN_ON_ONCE(sqd->thread == current);
7997 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
7998
7999 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8000 mutex_lock(&sqd->lock);
8001 if (sqd->thread)
8002 wake_up_process(sqd->thread);
8003 mutex_unlock(&sqd->lock);
8004 wait_for_completion(&sqd->exited);
8005}
8006
8007static void io_put_sq_data(struct io_sq_data *sqd)
8008{
8009 if (refcount_dec_and_test(&sqd->refs)) {
8010 WARN_ON_ONCE(atomic_read(&sqd->park_pending));
8011
8012 io_sq_thread_stop(sqd);
8013 kfree(sqd);
8014 }
8015}
8016
8017static void io_sq_thread_finish(struct io_ring_ctx *ctx)
8018{
8019 struct io_sq_data *sqd = ctx->sq_data;
8020
8021 if (sqd) {
8022 io_sq_thread_park(sqd);
8023 list_del_init(&ctx->sqd_list);
8024 io_sqd_update_thread_idle(sqd);
8025 io_sq_thread_unpark(sqd);
8026
8027 io_put_sq_data(sqd);
8028 ctx->sq_data = NULL;
8029 }
8030}
8031
8032static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
8033{
8034 struct io_ring_ctx *ctx_attach;
8035 struct io_sq_data *sqd;
8036 struct fd f;
8037
8038 f = fdget(p->wq_fd);
8039 if (!f.file)
8040 return ERR_PTR(-ENXIO);
8041 if (f.file->f_op != &io_uring_fops) {
8042 fdput(f);
8043 return ERR_PTR(-EINVAL);
8044 }
8045
8046 ctx_attach = f.file->private_data;
8047 sqd = ctx_attach->sq_data;
8048 if (!sqd) {
8049 fdput(f);
8050 return ERR_PTR(-EINVAL);
8051 }
8052 if (sqd->task_tgid != current->tgid) {
8053 fdput(f);
8054 return ERR_PTR(-EPERM);
8055 }
8056
8057 refcount_inc(&sqd->refs);
8058 fdput(f);
8059 return sqd;
8060}
8061
8062static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
8063 bool *attached)
8064{
8065 struct io_sq_data *sqd;
8066
8067 *attached = false;
8068 if (p->flags & IORING_SETUP_ATTACH_WQ) {
8069 sqd = io_attach_sq_data(p);
8070 if (!IS_ERR(sqd)) {
8071 *attached = true;
8072 return sqd;
8073 }
8074
8075 if (PTR_ERR(sqd) != -EPERM)
8076 return sqd;
8077 }
8078
8079 sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
8080 if (!sqd)
8081 return ERR_PTR(-ENOMEM);
8082
8083 atomic_set(&sqd->park_pending, 0);
8084 refcount_set(&sqd->refs, 1);
8085 INIT_LIST_HEAD(&sqd->ctx_list);
8086 mutex_init(&sqd->lock);
8087 init_waitqueue_head(&sqd->wait);
8088 init_completion(&sqd->exited);
8089 return sqd;
8090}
8091
8092#if defined(CONFIG_UNIX)
/*
 * Ensure the UNIX gc is aware of our file set, so we are certain that
 * the io_uring can be safely unregistered on process exit, even if we have
 * loops in the file referencing.
 */
8098static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
8099{
8100 struct sock *sk = ctx->ring_sock->sk;
8101 struct scm_fp_list *fpl;
8102 struct sk_buff *skb;
8103 int i, nr_files;
8104
8105 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
8106 if (!fpl)
8107 return -ENOMEM;
8108
8109 skb = alloc_skb(0, GFP_KERNEL);
8110 if (!skb) {
8111 kfree(fpl);
8112 return -ENOMEM;
8113 }
8114
8115 skb->sk = sk;
8116
8117 nr_files = 0;
8118 fpl->user = get_uid(current_user());
8119 for (i = 0; i < nr; i++) {
8120 struct file *file = io_file_from_index(ctx, i + offset);
8121
8122 if (!file)
8123 continue;
8124 fpl->fp[nr_files] = get_file(file);
8125 unix_inflight(fpl->user, fpl->fp[nr_files]);
8126 nr_files++;
8127 }
8128
8129 if (nr_files) {
8130 fpl->max = SCM_MAX_FD;
8131 fpl->count = nr_files;
8132 UNIXCB(skb).fp = fpl;
8133 skb->destructor = unix_destruct_scm;
8134 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8135 skb_queue_head(&sk->sk_receive_queue, skb);
8136
8137 for (i = 0; i < nr_files; i++)
8138 fput(fpl->fp[i]);
8139 } else {
8140 kfree_skb(skb);
8141 kfree(fpl);
8142 }
8143
8144 return 0;
8145}
8146
/*
 * If UNIX sockets are enabled, fd passing can cause a reference cycle
 * which means that regular reference counting breaks down. We rely on
 * the UNIX garbage collection to take care of this problem for us.
 */
8152static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8153{
8154 unsigned left, total;
8155 int ret = 0;
8156
8157 total = 0;
8158 left = ctx->nr_user_files;
8159 while (left) {
8160 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
8161
8162 ret = __io_sqe_files_scm(ctx, this_files, total);
8163 if (ret)
8164 break;
8165 left -= this_files;
8166 total += this_files;
8167 }
8168
8169 if (!ret)
8170 return 0;
8171
8172 while (total < ctx->nr_user_files) {
8173 struct file *file = io_file_from_index(ctx, total);
8174
8175 if (file)
8176 fput(file);
8177 total++;
8178 }
8179
8180 return ret;
8181}
8182#else
8183static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8184{
8185 return 0;
8186}
8187#endif
8188
8189static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8190{
8191 struct file *file = prsrc->file;
8192#if defined(CONFIG_UNIX)
8193 struct sock *sock = ctx->ring_sock->sk;
8194 struct sk_buff_head list, *head = &sock->sk_receive_queue;
8195 struct sk_buff *skb;
8196 int i;
8197
8198 __skb_queue_head_init(&list);
8199
/*
 * Find the skb that holds this file in its SCM_RIGHTS. When found,
 * remove this entry and rearrange the file array.
 */
8204 skb = skb_dequeue(head);
8205 while (skb) {
8206 struct scm_fp_list *fp;
8207
8208 fp = UNIXCB(skb).fp;
8209 for (i = 0; i < fp->count; i++) {
8210 int left;
8211
8212 if (fp->fp[i] != file)
8213 continue;
8214
8215 unix_notinflight(fp->user, fp->fp[i]);
8216 left = fp->count - 1 - i;
8217 if (left) {
8218 memmove(&fp->fp[i], &fp->fp[i + 1],
8219 left * sizeof(struct file *));
8220 }
8221 fp->count--;
8222 if (!fp->count) {
8223 kfree_skb(skb);
8224 skb = NULL;
8225 } else {
8226 __skb_queue_tail(&list, skb);
8227 }
8228 fput(file);
8229 file = NULL;
8230 break;
8231 }
8232
8233 if (!file)
8234 break;
8235
8236 __skb_queue_tail(&list, skb);
8237
8238 skb = skb_dequeue(head);
8239 }
8240
8241 if (skb_peek(&list)) {
8242 spin_lock_irq(&head->lock);
8243 while ((skb = __skb_dequeue(&list)) != NULL)
8244 __skb_queue_tail(head, skb);
8245 spin_unlock_irq(&head->lock);
8246 }
8247#else
8248 fput(file);
8249#endif
8250}
8251
8252static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
8253{
8254 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
8255 struct io_ring_ctx *ctx = rsrc_data->ctx;
8256 struct io_rsrc_put *prsrc, *tmp;
8257
8258 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8259 list_del(&prsrc->list);
8260
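/* a non-zero tag requests a CQE to be posted once the resource is put */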
8261 if (prsrc->tag) {
8262 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
8263
8264 io_ring_submit_lock(ctx, lock_ring);
8265 spin_lock(&ctx->completion_lock);
8266 io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
8267 ctx->cq_extra++;
8268 io_commit_cqring(ctx);
8269 spin_unlock(&ctx->completion_lock);
8270 io_cqring_ev_posted(ctx);
8271 io_ring_submit_unlock(ctx, lock_ring);
8272 }
8273
8274 rsrc_data->do_put(ctx, prsrc);
8275 kfree(prsrc);
8276 }
8277
8278 io_rsrc_node_destroy(ref_node);
8279 if (atomic_dec_and_test(&rsrc_data->refs))
8280 complete(&rsrc_data->done);
8281}
8282
8283static void io_rsrc_put_work(struct work_struct *work)
8284{
8285 struct io_ring_ctx *ctx;
8286 struct llist_node *node;
8287
8288 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8289 node = llist_del_all(&ctx->rsrc_put_llist);
8290
8291 while (node) {
8292 struct io_rsrc_node *ref_node;
8293 struct llist_node *next = node->next;
8294
8295 ref_node = llist_entry(node, struct io_rsrc_node, llist);
8296 __io_rsrc_put_work(ref_node);
8297 node = next;
8298 }
8299}
8300
8301static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
8302 unsigned nr_args, u64 __user *tags)
8303{
8304 __s32 __user *fds = (__s32 __user *) arg;
8305 struct file *file;
8306 int fd, ret;
8307 unsigned i;
8308
8309 if (ctx->file_data)
8310 return -EBUSY;
8311 if (!nr_args)
8312 return -EINVAL;
8313 if (nr_args > IORING_MAX_FIXED_FILES)
8314 return -EMFILE;
8315 if (nr_args > rlimit(RLIMIT_NOFILE))
8316 return -EMFILE;
8317 ret = io_rsrc_node_switch_start(ctx);
8318 if (ret)
8319 return ret;
8320 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8321 &ctx->file_data);
8322 if (ret)
8323 return ret;
8324
8325 ret = -ENOMEM;
8326 if (!io_alloc_file_tables(&ctx->file_table, nr_args))
8327 goto out_free;
8328
8329 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
8330 if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
8331 ret = -EFAULT;
8332 goto out_fput;
8333 }
8334
8335 if (fd == -1) {
8336 ret = -EINVAL;
8337 if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
8338 goto out_fput;
8339 continue;
8340 }
8341
8342 file = fget(fd);
8343 ret = -EBADF;
8344 if (unlikely(!file))
8345 goto out_fput;
8346
/*
 * Don't allow io_uring instances to be registered. If UNIX
 * isn't enabled, then this causes a reference cycle and this
 * instance can never get freed. If UNIX is enabled we'll
 * handle it just fine, but there's still no point in allowing
 * a ring fd as it doesn't support regular read/write anyway.
 */
8354 if (file->f_op == &io_uring_fops) {
8355 fput(file);
8356 goto out_fput;
8357 }
8358 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
8359 }
8360
8361 ret = io_sqe_files_scm(ctx);
8362 if (ret) {
8363 __io_sqe_files_unregister(ctx);
8364 return ret;
8365 }
8366
8367 io_rsrc_node_switch(ctx, NULL);
8368 return ret;
8369out_fput:
8370 for (i = 0; i < ctx->nr_user_files; i++) {
8371 file = io_file_from_index(ctx, i);
8372 if (file)
8373 fput(file);
8374 }
8375 io_free_file_tables(&ctx->file_table);
8376 ctx->nr_user_files = 0;
8377out_free:
8378 io_rsrc_data_free(ctx->file_data);
8379 ctx->file_data = NULL;
8380 return ret;
8381}
8382
8383static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
8384 int index)
8385{
8386#if defined(CONFIG_UNIX)
8387 struct sock *sock = ctx->ring_sock->sk;
8388 struct sk_buff_head *head = &sock->sk_receive_queue;
8389 struct sk_buff *skb;
8390
/*
 * See if we can merge this file into an existing skb SCM_RIGHTS
 * file set. If there's no room for the file in the file set, allocate
 * a new skb and file set with room for 1 file.
 */
8396 spin_lock_irq(&head->lock);
8397 skb = skb_peek(head);
8398 if (skb) {
8399 struct scm_fp_list *fpl = UNIXCB(skb).fp;
8400
8401 if (fpl->count < SCM_MAX_FD) {
8402 __skb_unlink(skb, head);
8403 spin_unlock_irq(&head->lock);
8404 fpl->fp[fpl->count] = get_file(file);
8405 unix_inflight(fpl->user, fpl->fp[fpl->count]);
8406 fpl->count++;
8407 spin_lock_irq(&head->lock);
8408 __skb_queue_head(head, skb);
8409 } else {
8410 skb = NULL;
8411 }
8412 }
8413 spin_unlock_irq(&head->lock);
8414
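/* merged into an existing skb, which now holds a reference: drop ours */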
8415 if (skb) {
8416 fput(file);
8417 return 0;
8418 }
8419
8420 return __io_sqe_files_scm(ctx, 1, index);
8421#else
8422 return 0;
8423#endif
8424}
8425
8426static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8427 struct io_rsrc_node *node, void *rsrc)
8428{
8429 struct io_rsrc_put *prsrc;
8430
8431 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8432 if (!prsrc)
8433 return -ENOMEM;
8434
8435 prsrc->tag = *io_get_tag_slot(data, idx);
8436 prsrc->rsrc = rsrc;
8437 list_add(&prsrc->list, &node->rsrc_list);
8438 return 0;
8439}
8440
8441static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8442 unsigned int issue_flags, u32 slot_index)
8443{
8444 struct io_ring_ctx *ctx = req->ctx;
8445 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
8446 bool needs_switch = false;
8447 struct io_fixed_file *file_slot;
8448 int ret = -EBADF;
8449
8450 io_ring_submit_lock(ctx, needs_lock);
8451 if (file->f_op == &io_uring_fops)
8452 goto err;
8453 ret = -ENXIO;
8454 if (!ctx->file_data)
8455 goto err;
8456 ret = -EINVAL;
8457 if (slot_index >= ctx->nr_user_files)
8458 goto err;
8459
8460 slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8461 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
8462
8463 if (file_slot->file_ptr) {
8464 struct file *old_file;
8465
8466 ret = io_rsrc_node_switch_start(ctx);
8467 if (ret)
8468 goto err;
8469
8470 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8471 ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8472 ctx->rsrc_node, old_file);
8473 if (ret)
8474 goto err;
8475 file_slot->file_ptr = 0;
8476 needs_switch = true;
8477 }
8478
8479 *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8480 io_fixed_file_set(file_slot, file);
8481 ret = io_sqe_file_register(ctx, file, slot_index);
8482 if (ret) {
8483 file_slot->file_ptr = 0;
8484 goto err;
8485 }
8486
8487 ret = 0;
8488err:
8489 if (needs_switch)
8490 io_rsrc_node_switch(ctx, ctx->file_data);
8491 io_ring_submit_unlock(ctx, needs_lock);
8492 if (ret)
8493 fput(file);
8494 return ret;
8495}
8496
8497static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8498{
8499 unsigned int offset = req->close.file_slot - 1;
8500 struct io_ring_ctx *ctx = req->ctx;
8501 bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
8502 struct io_fixed_file *file_slot;
8503 struct file *file;
8504 int ret, i;
8505
8506 io_ring_submit_lock(ctx, needs_lock);
8507 ret = -ENXIO;
8508 if (unlikely(!ctx->file_data))
8509 goto out;
8510 ret = -EINVAL;
8511 if (offset >= ctx->nr_user_files)
8512 goto out;
8513 ret = io_rsrc_node_switch_start(ctx);
8514 if (ret)
8515 goto out;
8516
8517 i = array_index_nospec(offset, ctx->nr_user_files);
8518 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8519 ret = -EBADF;
8520 if (!file_slot->file_ptr)
8521 goto out;
8522
8523 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8524 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8525 if (ret)
8526 goto out;
8527
8528 file_slot->file_ptr = 0;
8529 io_rsrc_node_switch(ctx, ctx->file_data);
8530 ret = 0;
8531out:
8532 io_ring_submit_unlock(ctx, needs_lock);
8533 return ret;
8534}
8535
8536static int __io_sqe_files_update(struct io_ring_ctx *ctx,
8537 struct io_uring_rsrc_update2 *up,
8538 unsigned nr_args)
8539{
8540 u64 __user *tags = u64_to_user_ptr(up->tags);
8541 __s32 __user *fds = u64_to_user_ptr(up->data);
8542 struct io_rsrc_data *data = ctx->file_data;
8543 struct io_fixed_file *file_slot;
8544 struct file *file;
8545 int fd, i, err = 0;
8546 unsigned int done;
8547 bool needs_switch = false;
8548
8549 if (!ctx->file_data)
8550 return -ENXIO;
8551 if (up->offset + nr_args > ctx->nr_user_files)
8552 return -EINVAL;
8553
8554 for (done = 0; done < nr_args; done++) {
8555 u64 tag = 0;
8556
8557 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
8558 copy_from_user(&fd, &fds[done], sizeof(fd))) {
8559 err = -EFAULT;
8560 break;
8561 }
8562 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
8563 err = -EINVAL;
8564 break;
8565 }
8566 if (fd == IORING_REGISTER_FILES_SKIP)
8567 continue;
8568
8569 i = array_index_nospec(up->offset + done, ctx->nr_user_files);
8570 file_slot = io_fixed_file_slot(&ctx->file_table, i);
8571
8572 if (file_slot->file_ptr) {
8573 file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8574 err = io_queue_rsrc_removal(data, up->offset + done,
8575 ctx->rsrc_node, file);
8576 if (err)
8577 break;
8578 file_slot->file_ptr = 0;
8579 needs_switch = true;
8580 }
8581 if (fd != -1) {
8582 file = fget(fd);
8583 if (!file) {
8584 err = -EBADF;
8585 break;
8586 }
/*
 * Don't allow io_uring instances to be registered. If UNIX
 * isn't enabled, then this causes a reference cycle and this
 * instance can never get freed. If UNIX is enabled we'll
 * handle it just fine, but there's still no point in allowing
 * a ring fd as it doesn't support regular read/write anyway.
 */
8595 if (file->f_op == &io_uring_fops) {
8596 fput(file);
8597 err = -EBADF;
8598 break;
8599 }
8600 *io_get_tag_slot(data, up->offset + done) = tag;
8601 io_fixed_file_set(file_slot, file);
8602 err = io_sqe_file_register(ctx, file, i);
8603 if (err) {
8604 file_slot->file_ptr = 0;
8605 fput(file);
8606 break;
8607 }
8608 }
8609 }
8610
8611 if (needs_switch)
8612 io_rsrc_node_switch(ctx, data);
8613 return done ? done : err;
8614}
8615
8616static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
8617 struct task_struct *task)
8618{
8619 struct io_wq_hash *hash;
8620 struct io_wq_data data;
8621 unsigned int concurrency;
8622
8623 mutex_lock(&ctx->uring_lock);
8624 hash = ctx->hash_map;
8625 if (!hash) {
8626 hash = kzalloc(sizeof(*hash), GFP_KERNEL);
8627 if (!hash) {
8628 mutex_unlock(&ctx->uring_lock);
8629 return ERR_PTR(-ENOMEM);
8630 }
8631 refcount_set(&hash->refs, 1);
8632 init_waitqueue_head(&hash->wait);
8633 ctx->hash_map = hash;
8634 }
8635 mutex_unlock(&ctx->uring_lock);
8636
8637 data.hash = hash;
8638 data.task = task;
8639 data.free_work = io_wq_free_work;
8640 data.do_work = io_wq_submit_work;
8641
/* Do QD, or 4 * CPUS, whatever is smallest */
8643 concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
8644
8645 return io_wq_create(concurrency, &data);
8646}
8647
8648static __cold int io_uring_alloc_task_context(struct task_struct *task,
8649 struct io_ring_ctx *ctx)
8650{
8651 struct io_uring_task *tctx;
8652 int ret;
8653
8654 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
8655 if (unlikely(!tctx))
8656 return -ENOMEM;
8657
8658 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8659 if (unlikely(ret)) {
8660 kfree(tctx);
8661 return ret;
8662 }
8663
8664 tctx->io_wq = io_init_wq_offload(ctx, task);
8665 if (IS_ERR(tctx->io_wq)) {
8666 ret = PTR_ERR(tctx->io_wq);
8667 percpu_counter_destroy(&tctx->inflight);
8668 kfree(tctx);
8669 return ret;
8670 }
8671
8672 xa_init(&tctx->xa);
8673 init_waitqueue_head(&tctx->wait);
8674 atomic_set(&tctx->in_idle, 0);
8675 atomic_set(&tctx->inflight_tracked, 0);
8676 task->io_uring = tctx;
8677 spin_lock_init(&tctx->task_lock);
8678 INIT_WQ_LIST(&tctx->task_list);
8679 init_task_work(&tctx->task_work, tctx_task_work);
8680 return 0;
8681}
8682
8683void __io_uring_free(struct task_struct *tsk)
8684{
8685 struct io_uring_task *tctx = tsk->io_uring;
8686
8687 WARN_ON_ONCE(!xa_empty(&tctx->xa));
8688 WARN_ON_ONCE(tctx->io_wq);
8689 WARN_ON_ONCE(tctx->cached_refs);
8690
8691 percpu_counter_destroy(&tctx->inflight);
8692 kfree(tctx);
8693 tsk->io_uring = NULL;
8694}
8695
8696static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
8697 struct io_uring_params *p)
8698{
8699 int ret;
8700
/* Retain compatibility with failing for an invalid attach attempt */
8702 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
8703 IORING_SETUP_ATTACH_WQ) {
8704 struct fd f;
8705
8706 f = fdget(p->wq_fd);
8707 if (!f.file)
8708 return -ENXIO;
8709 if (f.file->f_op != &io_uring_fops) {
8710 fdput(f);
8711 return -EINVAL;
8712 }
8713 fdput(f);
8714 }
8715 if (ctx->flags & IORING_SETUP_SQPOLL) {
8716 struct task_struct *tsk;
8717 struct io_sq_data *sqd;
8718 bool attached;
8719
8720 ret = security_uring_sqpoll();
8721 if (ret)
8722 return ret;
8723
8724 sqd = io_get_sq_data(p, &attached);
8725 if (IS_ERR(sqd)) {
8726 ret = PTR_ERR(sqd);
8727 goto err;
8728 }
8729
8730 ctx->sq_creds = get_current_cred();
8731 ctx->sq_data = sqd;
8732 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8733 if (!ctx->sq_thread_idle)
8734 ctx->sq_thread_idle = HZ;
8735
8736 io_sq_thread_park(sqd);
8737 list_add(&ctx->sqd_list, &sqd->ctx_list);
8738 io_sqd_update_thread_idle(sqd);
8739
8740 ret = (attached && !sqd->thread) ? -ENXIO : 0;
8741 io_sq_thread_unpark(sqd);
8742
8743 if (ret < 0)
8744 goto err;
8745 if (attached)
8746 return 0;
8747
8748 if (p->flags & IORING_SETUP_SQ_AFF) {
8749 int cpu = p->sq_thread_cpu;
8750
8751 ret = -EINVAL;
8752 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
8753 goto err_sqpoll;
8754 sqd->sq_cpu = cpu;
8755 } else {
8756 sqd->sq_cpu = -1;
8757 }
8758
8759 sqd->task_pid = current->pid;
8760 sqd->task_tgid = current->tgid;
8761 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8762 if (IS_ERR(tsk)) {
8763 ret = PTR_ERR(tsk);
8764 goto err_sqpoll;
8765 }
8766
8767 sqd->thread = tsk;
8768 ret = io_uring_alloc_task_context(tsk, ctx);
8769 wake_up_new_task(tsk);
8770 if (ret)
8771 goto err;
8772 } else if (p->flags & IORING_SETUP_SQ_AFF) {
/* Can't have SQ_AFF without SQPOLL */
8774 ret = -EINVAL;
8775 goto err;
8776 }
8777
8778 return 0;
8779err_sqpoll:
8780 complete(&ctx->sq_data->exited);
8781err:
8782 io_sq_thread_finish(ctx);
8783 return ret;
8784}
8785
8786static inline void __io_unaccount_mem(struct user_struct *user,
8787 unsigned long nr_pages)
8788{
8789 atomic_long_sub(nr_pages, &user->locked_vm);
8790}
8791
8792static inline int __io_account_mem(struct user_struct *user,
8793 unsigned long nr_pages)
8794{
8795 unsigned long page_limit, cur_pages, new_pages;
8796
/* Don't allow more pages than we can safely lock */
8798 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8799
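/* speculatively bump locked_vm; the cmpxchg retries if another task raced */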
8800 do {
8801 cur_pages = atomic_long_read(&user->locked_vm);
8802 new_pages = cur_pages + nr_pages;
8803 if (new_pages > page_limit)
8804 return -ENOMEM;
8805 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8806 new_pages) != cur_pages);
8807
8808 return 0;
8809}
8810
8811static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8812{
8813 if (ctx->user)
8814 __io_unaccount_mem(ctx->user, nr_pages);
8815
8816 if (ctx->mm_account)
8817 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
8818}
8819
8820static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8821{
8822 int ret;
8823
8824 if (ctx->user) {
8825 ret = __io_account_mem(ctx->user, nr_pages);
8826 if (ret)
8827 return ret;
8828 }
8829
8830 if (ctx->mm_account)
8831 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
8832
8833 return 0;
8834}
8835
8836static void io_mem_free(void *ptr)
8837{
8838 struct page *page;
8839
8840 if (!ptr)
8841 return;
8842
8843 page = virt_to_head_page(ptr);
8844 if (put_page_testzero(page))
8845 free_compound_page(page);
8846}
8847
8848static void *io_mem_alloc(size_t size)
8849{
8850 gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
8851 __GFP_NORETRY | __GFP_ACCOUNT;
8852
8853 return (void *) __get_free_pages(gfp_flags, get_order(size));
8854}
8855
8856static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8857 size_t *sq_offset)
8858{
8859 struct io_rings *rings;
8860 size_t off, sq_array_size;
8861
8862 off = struct_size(rings, cqes, cq_entries);
8863 if (off == SIZE_MAX)
8864 return SIZE_MAX;
8865
8866#ifdef CONFIG_SMP
8867 off = ALIGN(off, SMP_CACHE_BYTES);
8868 if (off == 0)
8869 return SIZE_MAX;
8870#endif
8871
8872 if (sq_offset)
8873 *sq_offset = off;
8874
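/* the SQ index array is laid out right after the CQE array */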
8875 sq_array_size = array_size(sizeof(u32), sq_entries);
8876 if (sq_array_size == SIZE_MAX)
8877 return SIZE_MAX;
8878
8879 if (check_add_overflow(off, sq_array_size, &off))
8880 return SIZE_MAX;
8881
8882 return off;
8883}
8884
8885static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
8886{
8887 struct io_mapped_ubuf *imu = *slot;
8888 unsigned int i;
8889
8890 if (imu != ctx->dummy_ubuf) {
8891 for (i = 0; i < imu->nr_bvecs; i++)
8892 unpin_user_page(imu->bvec[i].bv_page);
8893 if (imu->acct_pages)
8894 io_unaccount_mem(ctx, imu->acct_pages);
8895 kvfree(imu);
8896 }
8897 *slot = NULL;
8898}
8899
8900static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8901{
8902 io_buffer_unmap(ctx, &prsrc->buf);
8903 prsrc->buf = NULL;
8904}
8905
8906static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8907{
8908 unsigned int i;
8909
8910 for (i = 0; i < ctx->nr_user_bufs; i++)
8911 io_buffer_unmap(ctx, &ctx->user_bufs[i]);
8912 kfree(ctx->user_bufs);
8913 io_rsrc_data_free(ctx->buf_data);
8914 ctx->user_bufs = NULL;
8915 ctx->buf_data = NULL;
8916 ctx->nr_user_bufs = 0;
8917}
8918
8919static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8920{
8921 int ret;
8922
8923 if (!ctx->buf_data)
8924 return -ENXIO;
8925
8926 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8927 if (!ret)
8928 __io_sqe_buffers_unregister(ctx);
8929 return ret;
8930}
8931
8932static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8933 void __user *arg, unsigned index)
8934{
8935 struct iovec __user *src;
8936
8937#ifdef CONFIG_COMPAT
8938 if (ctx->compat) {
8939 struct compat_iovec __user *ciovs;
8940 struct compat_iovec ciov;
8941
8942 ciovs = (struct compat_iovec __user *) arg;
8943 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8944 return -EFAULT;
8945
8946 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8947 dst->iov_len = ciov.iov_len;
8948 return 0;
8949 }
8950#endif
8951 src = (struct iovec __user *) arg;
8952 if (copy_from_user(dst, &src[index], sizeof(*dst)))
8953 return -EFAULT;
8954 return 0;
8955}
8956
/*
 * Not super efficient, but this is just done at registration time. And
 * we do cache the last compound head, so generally we'll only do a full
 * search if we don't match that one.
 *
 * We check if the given compound head page has already been accounted,
 * to avoid double accounting it. This allows us to account the full
 * size of the page, not just the constituent pages of a huge page.
 */
8966static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8967 int nr_pages, struct page *hpage)
8968{
8969 int i, j;
8970
/* check current page array */
8972 for (i = 0; i < nr_pages; i++) {
8973 if (!PageCompound(pages[i]))
8974 continue;
8975 if (compound_head(pages[i]) == hpage)
8976 return true;
8977 }
8978
/* check previously registered pages */
8980 for (i = 0; i < ctx->nr_user_bufs; i++) {
8981 struct io_mapped_ubuf *imu = ctx->user_bufs[i];
8982
8983 for (j = 0; j < imu->nr_bvecs; j++) {
8984 if (!PageCompound(imu->bvec[j].bv_page))
8985 continue;
8986 if (compound_head(imu->bvec[j].bv_page) == hpage)
8987 return true;
8988 }
8989 }
8990
8991 return false;
8992}
8993
8994static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8995 int nr_pages, struct io_mapped_ubuf *imu,
8996 struct page **last_hpage)
8997{
8998 int i, ret;
8999
9000 imu->acct_pages = 0;
9001 for (i = 0; i < nr_pages; i++) {
9002 if (!PageCompound(pages[i])) {
9003 imu->acct_pages++;
9004 } else {
9005 struct page *hpage;
9006
9007 hpage = compound_head(pages[i]);
9008 if (hpage == *last_hpage)
9009 continue;
9010 *last_hpage = hpage;
9011 if (headpage_already_acct(ctx, pages, i, hpage))
9012 continue;
9013 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
9014 }
9015 }
9016
9017 if (!imu->acct_pages)
9018 return 0;
9019
9020 ret = io_account_mem(ctx, imu->acct_pages);
9021 if (ret)
9022 imu->acct_pages = 0;
9023 return ret;
9024}
9025
9026static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
9027 struct io_mapped_ubuf **pimu,
9028 struct page **last_hpage)
9029{
9030 struct io_mapped_ubuf *imu = NULL;
9031 struct vm_area_struct **vmas = NULL;
9032 struct page **pages = NULL;
9033 unsigned long off, start, end, ubuf;
9034 size_t size;
9035 int ret, pret, nr_pages, i;
9036
9037 if (!iov->iov_base) {
9038 *pimu = ctx->dummy_ubuf;
9039 return 0;
9040 }
9041
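/* round the buffer out to page boundaries to find the pages to pin */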
9042 ubuf = (unsigned long) iov->iov_base;
9043 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
9044 start = ubuf >> PAGE_SHIFT;
9045 nr_pages = end - start;
9046
9047 *pimu = NULL;
9048 ret = -ENOMEM;
9049
9050 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
9051 if (!pages)
9052 goto done;
9053
9054 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
9055 GFP_KERNEL);
9056 if (!vmas)
9057 goto done;
9058
9059 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
9060 if (!imu)
9061 goto done;
9062
9063 ret = 0;
9064 mmap_read_lock(current->mm);
9065 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
9066 pages, vmas);
9067 if (pret == nr_pages) {
/* don't support file backed memory */
9069 for (i = 0; i < nr_pages; i++) {
9070 struct vm_area_struct *vma = vmas[i];
9071
9072 if (vma_is_shmem(vma))
9073 continue;
9074 if (vma->vm_file &&
9075 !is_file_hugepages(vma->vm_file)) {
9076 ret = -EOPNOTSUPP;
9077 break;
9078 }
9079 }
9080 } else {
9081 ret = pret < 0 ? pret : -EFAULT;
9082 }
9083 mmap_read_unlock(current->mm);
9084 if (ret) {
/*
 * If we did partial map, or found file backed vmas,
 * release any pages we did get.
 */
9089 if (pret > 0)
9090 unpin_user_pages(pages, pret);
9091 goto done;
9092 }
9093
9094 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
9095 if (ret) {
9096 unpin_user_pages(pages, pret);
9097 goto done;
9098 }
9099
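/* build the bvec covering the user buffer, splitting at page boundaries */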
9100 off = ubuf & ~PAGE_MASK;
9101 size = iov->iov_len;
9102 for (i = 0; i < nr_pages; i++) {
9103 size_t vec_len;
9104
9105 vec_len = min_t(size_t, size, PAGE_SIZE - off);
9106 imu->bvec[i].bv_page = pages[i];
9107 imu->bvec[i].bv_len = vec_len;
9108 imu->bvec[i].bv_offset = off;
9109 off = 0;
9110 size -= vec_len;
9111 }
9112
9113 imu->ubuf = ubuf;
9114 imu->ubuf_end = ubuf + iov->iov_len;
9115 imu->nr_bvecs = nr_pages;
9116 *pimu = imu;
9117 ret = 0;
9118done:
9119 if (ret)
9120 kvfree(imu);
9121 kvfree(pages);
9122 kvfree(vmas);
9123 return ret;
9124}
9125
9126static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
9127{
9128 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9129 return ctx->user_bufs ? 0 : -ENOMEM;
9130}
9131
9132static int io_buffer_validate(struct iovec *iov)
9133{
9134 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9135
	/*
	 * Don't impose further limits on the size and buffer
	 * constraints here, we'll -EINVAL later when IO is
	 * submitted if they are wrong.
	 */
9141 if (!iov->iov_base)
9142 return iov->iov_len ? -EFAULT : 0;
9143 if (!iov->iov_len)
9144 return -EFAULT;
9145
	/* arbitrary limit, but we need something */
9147 if (iov->iov_len > SZ_1G)
9148 return -EFAULT;
9149
9150 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9151 return -EOVERFLOW;
9152
9153 return 0;
9154}
9155
9156static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
9157 unsigned int nr_args, u64 __user *tags)
9158{
9159 struct page *last_hpage = NULL;
9160 struct io_rsrc_data *data;
9161 int i, ret;
9162 struct iovec iov;
9163
9164 if (ctx->user_bufs)
9165 return -EBUSY;
9166 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
9167 return -EINVAL;
9168 ret = io_rsrc_node_switch_start(ctx);
9169 if (ret)
9170 return ret;
9171 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9172 if (ret)
9173 return ret;
9174 ret = io_buffers_map_alloc(ctx, nr_args);
9175 if (ret) {
9176 io_rsrc_data_free(data);
9177 return ret;
9178 }
9179
9180 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
9181 ret = io_copy_iov(ctx, &iov, arg, i);
9182 if (ret)
9183 break;
9184 ret = io_buffer_validate(&iov);
9185 if (ret)
9186 break;
9187 if (!iov.iov_base && *io_get_tag_slot(data, i)) {
9188 ret = -EINVAL;
9189 break;
9190 }
9191
9192 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9193 &last_hpage);
9194 if (ret)
9195 break;
9196 }
9197
9198 WARN_ON_ONCE(ctx->buf_data);
9199
9200 ctx->buf_data = data;
9201 if (ret)
9202 __io_sqe_buffers_unregister(ctx);
9203 else
9204 io_rsrc_node_switch(ctx, NULL);
9205 return ret;
9206}
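
/*
 * Example (userspace sketch, not compiled as part of this file): the classic
 * IORING_REGISTER_BUFFERS opcode lands in io_sqe_buffers_register() above
 * with tags == NULL. Registered buffers are later referenced by
 * IORING_OP_READ_FIXED/IORING_OP_WRITE_FIXED through sqe->buf_index.
 * "ring_fd" is assumed to be an fd returned by io_uring_setup(2).
 *
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int register_one_buffer(int ring_fd)
 *	{
 *		static char buf[64 * 1024];
 *		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *
 *		return syscall(__NR_io_uring_register, ring_fd,
 *			       IORING_REGISTER_BUFFERS, &iov, 1);
 *	}
 */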
9207
9208static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9209 struct io_uring_rsrc_update2 *up,
9210 unsigned int nr_args)
9211{
9212 u64 __user *tags = u64_to_user_ptr(up->tags);
9213 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
9214 struct page *last_hpage = NULL;
9215 bool needs_switch = false;
9216 __u32 done;
9217 int i, err;
9218
9219 if (!ctx->buf_data)
9220 return -ENXIO;
9221 if (up->offset + nr_args > ctx->nr_user_bufs)
9222 return -EINVAL;
9223
9224 for (done = 0; done < nr_args; done++) {
9225 struct io_mapped_ubuf *imu;
9226 int offset = up->offset + done;
9227 u64 tag = 0;
9228
9229 err = io_copy_iov(ctx, &iov, iovs, done);
9230 if (err)
9231 break;
9232 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9233 err = -EFAULT;
9234 break;
9235 }
9236 err = io_buffer_validate(&iov);
9237 if (err)
9238 break;
9239 if (!iov.iov_base && tag) {
9240 err = -EINVAL;
9241 break;
9242 }
9243 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9244 if (err)
9245 break;
9246
9247 i = array_index_nospec(offset, ctx->nr_user_bufs);
9248 if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
9249 err = io_queue_rsrc_removal(ctx->buf_data, offset,
9250 ctx->rsrc_node, ctx->user_bufs[i]);
9251 if (unlikely(err)) {
9252 io_buffer_unmap(ctx, &imu);
9253 break;
9254 }
9255 ctx->user_bufs[i] = NULL;
9256 needs_switch = true;
9257 }
9258
9259 ctx->user_bufs[i] = imu;
9260 *io_get_tag_slot(ctx->buf_data, offset) = tag;
9261 }
9262
9263 if (needs_switch)
9264 io_rsrc_node_switch(ctx, ctx->buf_data);
9265 return done ? done : err;
9266}
9267
9268static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
9269{
9270 __s32 __user *fds = arg;
9271 int fd;
9272
9273 if (ctx->cq_ev_fd)
9274 return -EBUSY;
9275
9276 if (copy_from_user(&fd, fds, sizeof(*fds)))
9277 return -EFAULT;
9278
9279 ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
9280 if (IS_ERR(ctx->cq_ev_fd)) {
9281 int ret = PTR_ERR(ctx->cq_ev_fd);
9282
9283 ctx->cq_ev_fd = NULL;
9284 return ret;
9285 }
9286
9287 return 0;
9288}
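
/*
 * Example (userspace sketch): wiring CQE notifications to an eventfd, which
 * is what the registration above enables. With IORING_REGISTER_EVENTFD_ASYNC,
 * only completions generated out of line signal the eventfd. "ring_fd" is
 * assumed to be an io_uring fd.
 *
 *	#include <sys/eventfd.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
 *		&efd, 1);
 */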
9289
9290static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9291{
9292 if (ctx->cq_ev_fd) {
9293 eventfd_ctx_put(ctx->cq_ev_fd);
9294 ctx->cq_ev_fd = NULL;
9295 return 0;
9296 }
9297
9298 return -ENXIO;
9299}
9300
9301static void io_destroy_buffers(struct io_ring_ctx *ctx)
9302{
9303 struct io_buffer *buf;
9304 unsigned long index;
9305
9306 xa_for_each(&ctx->io_buffers, index, buf)
9307 __io_remove_buffers(ctx, buf, index, -1U);
9308}
9309
9310static void io_req_caches_free(struct io_ring_ctx *ctx)
9311{
9312 struct io_submit_state *state = &ctx->submit_state;
9313 int nr = 0;
9314
9315 mutex_lock(&ctx->uring_lock);
9316 io_flush_cached_locked_reqs(ctx, state);
9317
9318 while (state->free_list.next) {
9319 struct io_wq_work_node *node;
9320 struct io_kiocb *req;
9321
9322 node = wq_stack_extract(&state->free_list);
9323 req = container_of(node, struct io_kiocb, comp_list);
9324 kmem_cache_free(req_cachep, req);
9325 nr++;
9326 }
9327 if (nr)
9328 percpu_ref_put_many(&ctx->refs, nr);
9329 mutex_unlock(&ctx->uring_lock);
9330}
9331
9332static void io_wait_rsrc_data(struct io_rsrc_data *data)
9333{
9334 if (data && !atomic_dec_and_test(&data->refs))
9335 wait_for_completion(&data->done);
9336}
9337
9338static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
9339{
9340 io_sq_thread_finish(ctx);
9341
9342 if (ctx->mm_account) {
9343 mmdrop(ctx->mm_account);
9344 ctx->mm_account = NULL;
9345 }
9346
9347 io_rsrc_refs_drop(ctx);
9348
9349 io_wait_rsrc_data(ctx->buf_data);
9350 io_wait_rsrc_data(ctx->file_data);
9351
9352 mutex_lock(&ctx->uring_lock);
9353 if (ctx->buf_data)
9354 __io_sqe_buffers_unregister(ctx);
9355 if (ctx->file_data)
9356 __io_sqe_files_unregister(ctx);
9357 if (ctx->rings)
9358 __io_cqring_overflow_flush(ctx, true);
9359 mutex_unlock(&ctx->uring_lock);
9360 io_eventfd_unregister(ctx);
9361 io_destroy_buffers(ctx);
9362 if (ctx->sq_creds)
9363 put_cred(ctx->sq_creds);
9364
	/* there are no registered resources left, nobody uses it */
9366 if (ctx->rsrc_node)
9367 io_rsrc_node_destroy(ctx->rsrc_node);
9368 if (ctx->rsrc_backup_node)
9369 io_rsrc_node_destroy(ctx->rsrc_backup_node);
9370 flush_delayed_work(&ctx->rsrc_put_work);
9371 flush_delayed_work(&ctx->fallback_work);
9372
9373 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9374 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
9375
9376#if defined(CONFIG_UNIX)
9377 if (ctx->ring_sock) {
9378 ctx->ring_sock->file = NULL;
9379 sock_release(ctx->ring_sock);
9380 }
9381#endif
9382 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
9383
9384 io_mem_free(ctx->rings);
9385 io_mem_free(ctx->sq_sqes);
9386
9387 percpu_ref_exit(&ctx->refs);
9388 free_uid(ctx->user);
9389 io_req_caches_free(ctx);
9390 if (ctx->hash_map)
9391 io_wq_put_hash(ctx->hash_map);
9392 kfree(ctx->cancel_hash);
9393 kfree(ctx->dummy_ubuf);
9394 kfree(ctx);
9395}
9396
9397static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9398{
9399 struct io_ring_ctx *ctx = file->private_data;
9400 __poll_t mask = 0;
9401
9402 poll_wait(file, &ctx->cq_wait, wait);
9403
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
9407 smp_rmb();
9408 if (!io_sqring_full(ctx))
9409 mask |= EPOLLOUT | EPOLLWRNORM;
9410
	/*
	 * Don't flush cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be an ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
	 * pushes them to do the flush.
	 */
9424 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
9425 mask |= EPOLLIN | EPOLLRDNORM;
9426
9427 return mask;
9428}
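
/*
 * Example (userspace sketch): the ->poll handler above is what lets a ring
 * fd sit in an epoll set - EPOLLIN signals pending CQEs (or overflow),
 * EPOLLOUT signals free SQ ring space. "ring_fd" and "epfd" are assumed to
 * exist already.
 *
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event ev = {
 *		.events = EPOLLIN | EPOLLOUT,
 *		.data.fd = ring_fd,
 *	};
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, ring_fd, &ev);
 */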
9429
9430static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
9431{
9432 const struct cred *creds;
9433
9434 creds = xa_erase(&ctx->personalities, id);
9435 if (creds) {
9436 put_cred(creds);
9437 return 0;
9438 }
9439
9440 return -EINVAL;
9441}
9442
9443struct io_tctx_exit {
9444 struct callback_head task_work;
9445 struct completion completion;
9446 struct io_ring_ctx *ctx;
9447};
9448
9449static __cold void io_tctx_exit_cb(struct callback_head *cb)
9450{
9451 struct io_uring_task *tctx = current->io_uring;
9452 struct io_tctx_exit *work;
9453
9454 work = container_of(cb, struct io_tctx_exit, task_work);
9455
	/*
	 * When @in_idle, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
	 */
9459 if (!atomic_read(&tctx->in_idle))
9460 io_uring_del_tctx_node((unsigned long)work->ctx);
9461 complete(&work->completion);
9462}
9463
9464static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
9465{
9466 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9467
9468 return req->ctx == data;
9469}
9470
9471static __cold void io_ring_exit_work(struct work_struct *work)
9472{
9473 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
9474 unsigned long timeout = jiffies + HZ * 60 * 5;
9475 unsigned long interval = HZ / 20;
9476 struct io_tctx_exit exit;
9477 struct io_tctx_node *node;
9478 int ret;
9479
	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
9486 do {
9487 io_uring_try_cancel_requests(ctx, NULL, true);
9488 if (ctx->sq_data) {
9489 struct io_sq_data *sqd = ctx->sq_data;
9490 struct task_struct *tsk;
9491
9492 io_sq_thread_park(sqd);
9493 tsk = sqd->thread;
9494 if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
9495 io_wq_cancel_cb(tsk->io_uring->io_wq,
9496 io_cancel_ctx_cb, ctx, true);
9497 io_sq_thread_unpark(sqd);
9498 }
9499
9500 io_req_caches_free(ctx);
9501
9502 if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
9504 interval = HZ * 60;
9505 }
9506 } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
9507
9508 init_completion(&exit.completion);
9509 init_task_work(&exit.task_work, io_tctx_exit_cb);
9510 exit.ctx = ctx;
9511
	/*
	 * Some may use context even when all refs and requests have been put,
	 * and they are free to do so while still holding uring_lock or
	 * completion_lock, see io_req_task_submit(). Apart from other work,
	 * this lock/unlock section also waits for them to finish.
	 */
9517 mutex_lock(&ctx->uring_lock);
9518 while (!list_empty(&ctx->tctx_list)) {
9519 WARN_ON_ONCE(time_after(jiffies, timeout));
9520
9521 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
9522 ctx_node);
		/* don't spin on a single task if cancellation failed */
9524 list_rotate_left(&ctx->tctx_list);
9525 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
9526 if (WARN_ON_ONCE(ret))
9527 continue;
9528
9529 mutex_unlock(&ctx->uring_lock);
9530 wait_for_completion(&exit.completion);
9531 mutex_lock(&ctx->uring_lock);
9532 }
9533 mutex_unlock(&ctx->uring_lock);
9534 spin_lock(&ctx->completion_lock);
9535 spin_unlock(&ctx->completion_lock);
9536
9537 io_ring_ctx_free(ctx);
9538}
9539
/* Returns true if we found and killed one or more timeouts */
9541static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
9542 struct task_struct *tsk, bool cancel_all)
9543{
9544 struct io_kiocb *req, *tmp;
9545 int canceled = 0;
9546
9547 spin_lock(&ctx->completion_lock);
9548 spin_lock_irq(&ctx->timeout_lock);
9549 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
9550 if (io_match_task(req, tsk, cancel_all)) {
9551 io_kill_timeout(req, -ECANCELED);
9552 canceled++;
9553 }
9554 }
9555 spin_unlock_irq(&ctx->timeout_lock);
9556 if (canceled != 0)
9557 io_commit_cqring(ctx);
9558 spin_unlock(&ctx->completion_lock);
9559 if (canceled != 0)
9560 io_cqring_ev_posted(ctx);
9561 return canceled != 0;
9562}
9563
9564static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
9565{
9566 unsigned long index;
9567 struct creds *creds;
9568
9569 mutex_lock(&ctx->uring_lock);
9570 percpu_ref_kill(&ctx->refs);
9571 if (ctx->rings)
9572 __io_cqring_overflow_flush(ctx, true);
9573 xa_for_each(&ctx->personalities, index, creds)
9574 io_unregister_personality(ctx, index);
9575 mutex_unlock(&ctx->uring_lock);
9576
9577 io_kill_timeouts(ctx, NULL, true);
9578 io_poll_remove_all(ctx, NULL, true);
9579
	/* if we failed setting up the ctx, we might not have any rings */
9581 io_iopoll_try_reap_events(ctx);
9582
9583 INIT_WORK(&ctx->exit_work, io_ring_exit_work);
9584
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noticeable overhead we don't need, and creating new workers
	 * would impede things significantly.
	 */
9590 queue_work(system_unbound_wq, &ctx->exit_work);
9591}
9592
9593static int io_uring_release(struct inode *inode, struct file *file)
9594{
9595 struct io_ring_ctx *ctx = file->private_data;
9596
9597 file->private_data = NULL;
9598 io_ring_ctx_wait_and_kill(ctx);
9599 return 0;
9600}
9601
9602struct io_task_cancel {
9603 struct task_struct *task;
9604 bool all;
9605};
9606
9607static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
9608{
9609 struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9610 struct io_task_cancel *cancel = data;
9611
9612 return io_match_task_safe(req, cancel->task, cancel->all);
9613}
9614
9615static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
9616 struct task_struct *task,
9617 bool cancel_all)
9618{
9619 struct io_defer_entry *de;
9620 LIST_HEAD(list);
9621
9622 spin_lock(&ctx->completion_lock);
9623 list_for_each_entry_reverse(de, &ctx->defer_list, list) {
9624 if (io_match_task_safe(de->req, task, cancel_all)) {
9625 list_cut_position(&list, &ctx->defer_list, &de->list);
9626 break;
9627 }
9628 }
9629 spin_unlock(&ctx->completion_lock);
9630 if (list_empty(&list))
9631 return false;
9632
9633 while (!list_empty(&list)) {
9634 de = list_first_entry(&list, struct io_defer_entry, list);
9635 list_del_init(&de->list);
9636 io_req_complete_failed(de->req, -ECANCELED);
9637 kfree(de);
9638 }
9639 return true;
9640}
9641
9642static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
9643{
9644 struct io_tctx_node *node;
9645 enum io_wq_cancel cret;
9646 bool ret = false;
9647
9648 mutex_lock(&ctx->uring_lock);
9649 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
9650 struct io_uring_task *tctx = node->task->io_uring;
9651
		/*
		 * io_wq will stay alive while we hold uring_lock, because it's
		 * killed after ctx nodes, which requires to take the lock.
		 */
9656 if (!tctx || !tctx->io_wq)
9657 continue;
9658 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
9659 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9660 }
9661 mutex_unlock(&ctx->uring_lock);
9662
9663 return ret;
9664}
9665
9666static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
9667 struct task_struct *task,
9668 bool cancel_all)
9669{
9670 struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
9671 struct io_uring_task *tctx = task ? task->io_uring : NULL;
9672
9673 while (1) {
9674 enum io_wq_cancel cret;
9675 bool ret = false;
9676
9677 if (!task) {
9678 ret |= io_uring_try_cancel_iowq(ctx);
9679 } else if (tctx && tctx->io_wq) {
			/*
			 * Cancels requests of all rings, not only @ctx, but
			 * it's fine as the task is in exit/exec.
			 */
9684 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9685 &cancel, true);
9686 ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9687 }
9688
		/* SQPOLL thread does its own polling */
9690 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
9691 (ctx->sq_data && ctx->sq_data->thread == current)) {
9692 while (!wq_list_empty(&ctx->iopoll_list)) {
9693 io_iopoll_try_reap_events(ctx);
9694 ret = true;
9695 }
9696 }
9697
9698 ret |= io_cancel_defer_files(ctx, task, cancel_all);
9699 ret |= io_poll_remove_all(ctx, task, cancel_all);
9700 ret |= io_kill_timeouts(ctx, task, cancel_all);
9701 if (task)
9702 ret |= io_run_task_work();
9703 if (!ret)
9704 break;
9705 cond_resched();
9706 }
9707}
9708
9709static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9710{
9711 struct io_uring_task *tctx = current->io_uring;
9712 struct io_tctx_node *node;
9713 int ret;
9714
9715 if (unlikely(!tctx)) {
9716 ret = io_uring_alloc_task_context(current, ctx);
9717 if (unlikely(ret))
9718 return ret;
9719
9720 tctx = current->io_uring;
9721 if (ctx->iowq_limits_set) {
9722 unsigned int limits[2] = { ctx->iowq_limits[0],
9723 ctx->iowq_limits[1], };
9724
9725 ret = io_wq_max_workers(tctx->io_wq, limits);
9726 if (ret)
9727 return ret;
9728 }
9729 }
9730 if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9731 node = kmalloc(sizeof(*node), GFP_KERNEL);
9732 if (!node)
9733 return -ENOMEM;
9734 node->ctx = ctx;
9735 node->task = current;
9736
9737 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9738 node, GFP_KERNEL));
9739 if (ret) {
9740 kfree(node);
9741 return ret;
9742 }
9743
9744 mutex_lock(&ctx->uring_lock);
9745 list_add(&node->ctx_node, &ctx->tctx_list);
9746 mutex_unlock(&ctx->uring_lock);
9747 }
9748 tctx->last = ctx;
9749 return 0;
9750}
9751
/*
 * Note that this task has used io_uring. We use it for cancelation purposes.
 */
9755static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9756{
9757 struct io_uring_task *tctx = current->io_uring;
9758
9759 if (likely(tctx && tctx->last == ctx))
9760 return 0;
9761 return __io_uring_add_tctx_node(ctx);
9762}
9763
/*
 * Remove this io_uring_file -> task mapping.
 */
9767static __cold void io_uring_del_tctx_node(unsigned long index)
9768{
9769 struct io_uring_task *tctx = current->io_uring;
9770 struct io_tctx_node *node;
9771
9772 if (!tctx)
9773 return;
9774 node = xa_erase(&tctx->xa, index);
9775 if (!node)
9776 return;
9777
9778 WARN_ON_ONCE(current != node->task);
9779 WARN_ON_ONCE(list_empty(&node->ctx_node));
9780
9781 mutex_lock(&node->ctx->uring_lock);
9782 list_del(&node->ctx_node);
9783 mutex_unlock(&node->ctx->uring_lock);
9784
9785 if (tctx->last == node->ctx)
9786 tctx->last = NULL;
9787 kfree(node);
9788}
9789
9790static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
9791{
9792 struct io_wq *wq = tctx->io_wq;
9793 struct io_tctx_node *node;
9794 unsigned long index;
9795
9796 xa_for_each(&tctx->xa, index, node) {
9797 io_uring_del_tctx_node(index);
9798 cond_resched();
9799 }
9800 if (wq) {
		/*
		 * Must be after io_uring_del_tctx_node() (removes nodes under
		 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
		 */
9805 io_wq_put_and_exit(wq);
9806 tctx->io_wq = NULL;
9807 }
9808}
9809
9810static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
9811{
9812 if (tracked)
9813 return atomic_read(&tctx->inflight_tracked);
9814 return percpu_counter_sum(&tctx->inflight);
9815}
9816
9817static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
9818{
9819 struct io_uring_task *tctx = task->io_uring;
9820 unsigned int refs = tctx->cached_refs;
9821
9822 if (refs) {
9823 tctx->cached_refs = 0;
9824 percpu_counter_sub(&tctx->inflight, refs);
9825 put_task_struct_many(task, refs);
9826 }
9827}
9828
/*
 * Find any io_uring ctx that this task has registered or done IO on, and
 * cancel requests. @sqd should be not-null IFF it's an SQPOLL thread
 * cancellation.
 */
9833static __cold void io_uring_cancel_generic(bool cancel_all,
9834 struct io_sq_data *sqd)
9835{
9836 struct io_uring_task *tctx = current->io_uring;
9837 struct io_ring_ctx *ctx;
9838 s64 inflight;
9839 DEFINE_WAIT(wait);
9840
9841 WARN_ON_ONCE(sqd && sqd->thread != current);
9842
9843 if (!current->io_uring)
9844 return;
9845 if (tctx->io_wq)
9846 io_wq_exit_start(tctx->io_wq);
9847
9848 atomic_inc(&tctx->in_idle);
9849 do {
9850 io_uring_drop_tctx_refs(current);
		/* read completions before cancelations */
9852 inflight = tctx_inflight(tctx, !cancel_all);
9853 if (!inflight)
9854 break;
9855
9856 if (!sqd) {
9857 struct io_tctx_node *node;
9858 unsigned long index;
9859
9860 xa_for_each(&tctx->xa, index, node) {
				/* sqpoll task will cancel all its requests */
9862 if (node->ctx->sq_data)
9863 continue;
9864 io_uring_try_cancel_requests(node->ctx, current,
9865 cancel_all);
9866 }
9867 } else {
9868 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
9869 io_uring_try_cancel_requests(ctx, current,
9870 cancel_all);
9871 }
9872
9873 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
9874 io_run_task_work();
9875 io_uring_drop_tctx_refs(current);
9876
		/*
		 * If we've seen completions, retry without waiting. This
		 * avoids a race where a completion comes in before we did
		 * prepare_to_wait().
		 */
9882 if (inflight == tctx_inflight(tctx, !cancel_all))
9883 schedule();
9884 finish_wait(&tctx->wait, &wait);
9885 } while (1);
9886 atomic_dec(&tctx->in_idle);
9887
9888 io_uring_clean_tctx(tctx);
9889 if (cancel_all) {
		/* for exec all current's requests should be gone, kill tctx */
9891 __io_uring_free(current);
9892 }
9893}
9894
9895void __io_uring_cancel(bool cancel_all)
9896{
9897 io_uring_cancel_generic(cancel_all, NULL);
9898}
9899
9900static void *io_uring_validate_mmap_request(struct file *file,
9901 loff_t pgoff, size_t sz)
9902{
9903 struct io_ring_ctx *ctx = file->private_data;
9904 loff_t offset = pgoff << PAGE_SHIFT;
9905 struct page *page;
9906 void *ptr;
9907
9908 switch (offset) {
9909 case IORING_OFF_SQ_RING:
9910 case IORING_OFF_CQ_RING:
9911 ptr = ctx->rings;
9912 break;
9913 case IORING_OFF_SQES:
9914 ptr = ctx->sq_sqes;
9915 break;
9916 default:
9917 return ERR_PTR(-EINVAL);
9918 }
9919
9920 page = virt_to_head_page(ptr);
9921 if (sz > page_size(page))
9922 return ERR_PTR(-EINVAL);
9923
9924 return ptr;
9925}
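
/*
 * Example (userspace sketch): the pgoff values validated above are the only
 * legal mmap(2) offsets on a ring fd. The mapping sizes are derived from the
 * sq_off/cq_off fields that io_uring_create() reports back in the params
 * struct "p"; see the fuller setup sketch near io_uring_setup() below.
 *
 *	#include <sys/mman.h>
 *	#include <linux/io_uring.h>
 *
 *	sq_ring = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *		       PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		       ring_fd, IORING_OFF_SQ_RING);
 *	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
 *		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		    ring_fd, IORING_OFF_SQES);
 */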
9926
9927#ifdef CONFIG_MMU
9928
9929static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9930{
9931 size_t sz = vma->vm_end - vma->vm_start;
9932 unsigned long pfn;
9933 void *ptr;
9934
9935 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9936 if (IS_ERR(ptr))
9937 return PTR_ERR(ptr);
9938
9939 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9940 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9941}
9942
9943#else
9944
9945static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9946{
9947 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9948}
9949
9950static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9951{
9952 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9953}
9954
9955static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9956 unsigned long addr, unsigned long len,
9957 unsigned long pgoff, unsigned long flags)
9958{
9959 void *ptr;
9960
9961 ptr = io_uring_validate_mmap_request(file, pgoff, len);
9962 if (IS_ERR(ptr))
9963 return PTR_ERR(ptr);
9964
9965 return (unsigned long) ptr;
9966}
9967
9968#endif
9969
9970static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
9971{
9972 DEFINE_WAIT(wait);
9973
9974 do {
9975 if (!io_sqring_full(ctx))
9976 break;
9977 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9978
9979 if (!io_sqring_full(ctx))
9980 break;
9981 schedule();
9982 } while (!signal_pending(current));
9983
9984 finish_wait(&ctx->sqo_sq_wait, &wait);
9985 return 0;
9986}
9987
9988static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9989 struct __kernel_timespec __user **ts,
9990 const sigset_t __user **sig)
9991{
9992 struct io_uring_getevents_arg arg;
9993
	/*
	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
	 * is just a pointer to the sigset_t.
	 */
9998 if (!(flags & IORING_ENTER_EXT_ARG)) {
9999 *sig = (const sigset_t __user *) argp;
10000 *ts = NULL;
10001 return 0;
10002 }
10003
	/*
	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
	 * timespec and sigset_t pointers if good.
	 */
10008 if (*argsz != sizeof(arg))
10009 return -EINVAL;
10010 if (copy_from_user(&arg, argp, sizeof(arg)))
10011 return -EFAULT;
10012 *sig = u64_to_user_ptr(arg.sigmask);
10013 *argsz = arg.sigmask_sz;
10014 *ts = u64_to_user_ptr(arg.ts);
10015 return 0;
10016}
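
/*
 * Example (userspace sketch): waiting with a timeout via
 * IORING_ENTER_EXT_ARG, matching the layout io_get_ext_arg() expects - argp
 * points at struct io_uring_getevents_arg and argsz is its size. A zero
 * sigmask leaves the signal mask untouched. "ring_fd" is assumed.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *	#include <linux/time_types.h>
 *
 *	struct __kernel_timespec ts = { .tv_sec = 1 };
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask	= 0,
 *		.sigmask_sz	= 0,
 *		.ts		= (unsigned long)&ts,
 *	};
 *
 *	syscall(__NR_io_uring_enter, ring_fd, 0, 1,
 *		IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *		&arg, sizeof(arg));
 */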
10017
10018SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
10019 u32, min_complete, u32, flags, const void __user *, argp,
10020 size_t, argsz)
10021{
10022 struct io_ring_ctx *ctx;
10023 int submitted = 0;
10024 struct fd f;
10025 long ret;
10026
10027 io_run_task_work();
10028
10029 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
10030 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
10031 return -EINVAL;
10032
10033 f = fdget(fd);
10034 if (unlikely(!f.file))
10035 return -EBADF;
10036
10037 ret = -EOPNOTSUPP;
10038 if (unlikely(f.file->f_op != &io_uring_fops))
10039 goto out_fput;
10040
10041 ret = -ENXIO;
10042 ctx = f.file->private_data;
10043 if (unlikely(!percpu_ref_tryget(&ctx->refs)))
10044 goto out_fput;
10045
10046 ret = -EBADFD;
10047 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
10048 goto out;
10049
	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
10055 ret = 0;
10056 if (ctx->flags & IORING_SETUP_SQPOLL) {
10057 io_cqring_overflow_flush(ctx);
10058
10059 if (unlikely(ctx->sq_data->thread == NULL)) {
10060 ret = -EOWNERDEAD;
10061 goto out;
10062 }
10063 if (flags & IORING_ENTER_SQ_WAKEUP)
10064 wake_up(&ctx->sq_data->wait);
10065 if (flags & IORING_ENTER_SQ_WAIT) {
10066 ret = io_sqpoll_wait_sq(ctx);
10067 if (ret)
10068 goto out;
10069 }
10070 submitted = to_submit;
10071 } else if (to_submit) {
10072 ret = io_uring_add_tctx_node(ctx);
10073 if (unlikely(ret))
10074 goto out;
10075 mutex_lock(&ctx->uring_lock);
10076 submitted = io_submit_sqes(ctx, to_submit);
10077 mutex_unlock(&ctx->uring_lock);
10078
10079 if (submitted != to_submit)
10080 goto out;
10081 }
10082 if (flags & IORING_ENTER_GETEVENTS) {
10083 const sigset_t __user *sig;
10084 struct __kernel_timespec __user *ts;
10085
10086 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
10087 if (unlikely(ret))
10088 goto out;
10089
10090 min_complete = min(min_complete, ctx->cq_entries);
10091
		/*
		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
		 * space applications don't need to do io completion events
		 * polling again, they can rely on io_sq_thread to do polling
		 * work, which can reduce cpu usage and uring_lock contention.
		 */
10098 if (ctx->flags & IORING_SETUP_IOPOLL &&
10099 !(ctx->flags & IORING_SETUP_SQPOLL)) {
10100 ret = io_iopoll_check(ctx, min_complete);
10101 } else {
10102 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
10103 }
10104 }
10105
10106out:
10107 percpu_ref_put(&ctx->refs);
10108out_fput:
10109 fdput(f);
10110 return submitted ? submitted : ret;
10111}
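
/*
 * Example (userspace sketch): the common submit-and-wait pattern against the
 * syscall above - push whatever sits in the SQ ring and block for at least
 * one completion. Under IORING_SETUP_SQPOLL, userspace instead only enters
 * with IORING_ENTER_SQ_WAKEUP when sq_flags shows IORING_SQ_NEED_WAKEUP, or
 * with IORING_ENTER_SQ_WAIT to wait for SQ space (io_sqpoll_wait_sq() above).
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int submit_and_wait(int ring_fd, unsigned int to_submit)
 *	{
 *		return syscall(__NR_io_uring_enter, ring_fd, to_submit,
 *			       1, IORING_ENTER_GETEVENTS, NULL, 0);
 *	}
 */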
10112
10113#ifdef CONFIG_PROC_FS
10114static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
10115 const struct cred *cred)
10116{
10117 struct user_namespace *uns = seq_user_ns(m);
10118 struct group_info *gi;
10119 kernel_cap_t cap;
10120 unsigned __capi;
10121 int g;
10122
10123 seq_printf(m, "%5d\n", id);
10124 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10125 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10126 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10127 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10128 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10129 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10130 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10131 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10132 seq_puts(m, "\n\tGroups:\t");
10133 gi = cred->group_info;
10134 for (g = 0; g < gi->ngroups; g++) {
10135 seq_put_decimal_ull(m, g ? " " : "",
10136 from_kgid_munged(uns, gi->gid[g]));
10137 }
10138 seq_puts(m, "\n\tCapEff:\t");
10139 cap = cred->cap_effective;
10140 CAP_FOR_EACH_U32(__capi)
10141 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10142 seq_putc(m, '\n');
10143 return 0;
10144}
10145
10146static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10147 struct seq_file *m)
10148{
10149 struct io_sq_data *sq = NULL;
10150 struct io_overflow_cqe *ocqe;
10151 struct io_rings *r = ctx->rings;
10152 unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
10153 unsigned int sq_head = READ_ONCE(r->sq.head);
10154 unsigned int sq_tail = READ_ONCE(r->sq.tail);
10155 unsigned int cq_head = READ_ONCE(r->cq.head);
10156 unsigned int cq_tail = READ_ONCE(r->cq.tail);
10157 unsigned int sq_entries, cq_entries;
10158 bool has_lock;
10159 unsigned int i;
10160
	/*
	 * We may get imprecise sqe and cqe info if the ring is actively
	 * running, since we get cached_sq_head and cached_cq_tail without
	 * uring_lock and sq_tail and cq_head are changed by userspace. But
	 * that's OK, this info is usually only consulted when it is stuck.
	 */
10167 seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
10168 seq_printf(m, "SqHead:\t%u\n", sq_head);
10169 seq_printf(m, "SqTail:\t%u\n", sq_tail);
10170 seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
10171 seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
10172 seq_printf(m, "CqHead:\t%u\n", cq_head);
10173 seq_printf(m, "CqTail:\t%u\n", cq_tail);
10174 seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
10175 seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
10176 sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
10177 for (i = 0; i < sq_entries; i++) {
10178 unsigned int entry = i + sq_head;
10179 unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
10180 struct io_uring_sqe *sqe;
10181
10182 if (sq_idx > sq_mask)
10183 continue;
10184 sqe = &ctx->sq_sqes[sq_idx];
10185 seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10186 sq_idx, sqe->opcode, sqe->fd, sqe->flags,
10187 sqe->user_data);
10188 }
10189 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
10190 cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
10191 for (i = 0; i < cq_entries; i++) {
10192 unsigned int entry = i + cq_head;
10193 struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
10194
10195 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10196 entry & cq_mask, cqe->user_data, cqe->res,
10197 cqe->flags);
10198 }
10199
	/*
	 * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
	 * since fdinfo case grabs it in the opposite direction of normal use
	 * cases. If we fail to get the lock, we just don't iterate any
	 * structures that could be going away outside the io_uring mutex.
	 */
10206 has_lock = mutex_trylock(&ctx->uring_lock);
10207
10208 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
10209 sq = ctx->sq_data;
10210 if (!sq->thread)
10211 sq = NULL;
10212 }
10213
10214 seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10215 seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
10216 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
10217 for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
10218 struct file *f = io_file_from_index(ctx, i);
10219
10220 if (f)
10221 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10222 else
10223 seq_printf(m, "%5u: <none>\n", i);
10224 }
10225 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
10226 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
10227 struct io_mapped_ubuf *buf = ctx->user_bufs[i];
10228 unsigned int len = buf->ubuf_end - buf->ubuf;
10229
10230 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
10231 }
10232 if (has_lock && !xa_empty(&ctx->personalities)) {
10233 unsigned long index;
10234 const struct cred *cred;
10235
10236 seq_printf(m, "Personalities:\n");
10237 xa_for_each(&ctx->personalities, index, cred)
10238 io_uring_show_cred(m, index, cred);
10239 }
10240 if (has_lock)
10241 mutex_unlock(&ctx->uring_lock);
10242
10243 seq_puts(m, "PollList:\n");
10244 spin_lock(&ctx->completion_lock);
10245 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10246 struct hlist_head *list = &ctx->cancel_hash[i];
10247 struct io_kiocb *req;
10248
10249 hlist_for_each_entry(req, list, hash_node)
10250 seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
10251 req->task->task_works != NULL);
10252 }
10253
10254 seq_puts(m, "CqOverflowList:\n");
10255 list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10256 struct io_uring_cqe *cqe = &ocqe->cqe;
10257
10258 seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
10259 cqe->user_data, cqe->res, cqe->flags);
10260
10261 }
10262
10263 spin_unlock(&ctx->completion_lock);
10264}
10265
10266static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
10267{
10268 struct io_ring_ctx *ctx = f->private_data;
10269
10270 if (percpu_ref_tryget(&ctx->refs)) {
10271 __io_uring_show_fdinfo(ctx, m);
10272 percpu_ref_put(&ctx->refs);
10273 }
10274}
10275#endif
10276
10277static const struct file_operations io_uring_fops = {
10278 .release = io_uring_release,
10279 .mmap = io_uring_mmap,
10280#ifndef CONFIG_MMU
10281 .get_unmapped_area = io_uring_nommu_get_unmapped_area,
10282 .mmap_capabilities = io_uring_nommu_mmap_capabilities,
10283#endif
10284 .poll = io_uring_poll,
10285#ifdef CONFIG_PROC_FS
10286 .show_fdinfo = io_uring_show_fdinfo,
10287#endif
10288};
10289
10290static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10291 struct io_uring_params *p)
10292{
10293 struct io_rings *rings;
10294 size_t size, sq_array_offset;
10295
	/* make sure these are sane, as we already accounted them */
10297 ctx->sq_entries = p->sq_entries;
10298 ctx->cq_entries = p->cq_entries;
10299
10300 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10301 if (size == SIZE_MAX)
10302 return -EOVERFLOW;
10303
10304 rings = io_mem_alloc(size);
10305 if (!rings)
10306 return -ENOMEM;
10307
10308 ctx->rings = rings;
10309 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10310 rings->sq_ring_mask = p->sq_entries - 1;
10311 rings->cq_ring_mask = p->cq_entries - 1;
10312 rings->sq_ring_entries = p->sq_entries;
10313 rings->cq_ring_entries = p->cq_entries;
10314
10315 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
10316 if (size == SIZE_MAX) {
10317 io_mem_free(ctx->rings);
10318 ctx->rings = NULL;
10319 return -EOVERFLOW;
10320 }
10321
10322 ctx->sq_sqes = io_mem_alloc(size);
10323 if (!ctx->sq_sqes) {
10324 io_mem_free(ctx->rings);
10325 ctx->rings = NULL;
10326 return -ENOMEM;
10327 }
10328
10329 return 0;
10330}
10331
10332static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10333{
10334 int ret, fd;
10335
10336 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10337 if (fd < 0)
10338 return fd;
10339
10340 ret = io_uring_add_tctx_node(ctx);
10341 if (ret) {
10342 put_unused_fd(fd);
10343 return ret;
10344 }
10345 fd_install(fd, file);
10346 return fd;
10347}
10348
/*
 * Allocate an anonymous fd, this is what constitutes the application
 * visible backing of an io_uring instance. The application mmaps this
 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
 * we have to tie this fd to a socket for file garbage collection purposes.
 */
10355static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
10356{
10357 struct file *file;
10358#if defined(CONFIG_UNIX)
10359 int ret;
10360
10361 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10362 &ctx->ring_sock);
10363 if (ret)
10364 return ERR_PTR(ret);
10365#endif
10366
10367 file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
10368 O_RDWR | O_CLOEXEC, NULL);
10369#if defined(CONFIG_UNIX)
10370 if (IS_ERR(file)) {
10371 sock_release(ctx->ring_sock);
10372 ctx->ring_sock = NULL;
10373 } else {
10374 ctx->ring_sock->file = file;
10375 }
10376#endif
10377 return file;
10378}
10379
10380static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
10381 struct io_uring_params __user *params)
10382{
10383 struct io_ring_ctx *ctx;
10384 struct file *file;
10385 int ret;
10386
10387 if (!entries)
10388 return -EINVAL;
10389 if (entries > IORING_MAX_ENTRIES) {
10390 if (!(p->flags & IORING_SETUP_CLAMP))
10391 return -EINVAL;
10392 entries = IORING_MAX_ENTRIES;
10393 }
10394
	/*
	 * Use twice as many entries for the CQ ring. It's possible for the
	 * application to drive a higher depth than the size of the SQ ring,
	 * since the sqes are only used at submission time. This allows for
	 * some flexibility in overcommitting a bit. If the application has
	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
	 * of CQ ring entries manually.
	 */
10403 p->sq_entries = roundup_pow_of_two(entries);
10404 if (p->flags & IORING_SETUP_CQSIZE) {
		/*
		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
		 * to a power-of-two, if it isn't already. We do NOT impose
		 * any cq vs sq ring sizing.
		 */
10410 if (!p->cq_entries)
10411 return -EINVAL;
10412 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
10413 if (!(p->flags & IORING_SETUP_CLAMP))
10414 return -EINVAL;
10415 p->cq_entries = IORING_MAX_CQ_ENTRIES;
10416 }
10417 p->cq_entries = roundup_pow_of_two(p->cq_entries);
10418 if (p->cq_entries < p->sq_entries)
10419 return -EINVAL;
10420 } else {
10421 p->cq_entries = 2 * p->sq_entries;
10422 }
10423
10424 ctx = io_ring_ctx_alloc(p);
10425 if (!ctx)
10426 return -ENOMEM;
10427 ctx->compat = in_compat_syscall();
10428 if (!capable(CAP_IPC_LOCK))
10429 ctx->user = get_uid(current_user());
10430
	/*
	 * This is just grabbed for accounting purposes. When a process exits,
	 * the mm is exited and dropped before the files, hence we need to hang
	 * on to this mm purely for the purposes of being able to unaccount
	 * memory (locked/pinned vm). It's not used for anything else.
	 */
10437 mmgrab(current->mm);
10438 ctx->mm_account = current->mm;
10439
10440 ret = io_allocate_scq_urings(ctx, p);
10441 if (ret)
10442 goto err;
10443
10444 ret = io_sq_offload_create(ctx, p);
10445 if (ret)
10446 goto err;
10447
10448 ret = io_rsrc_node_switch_start(ctx);
10449 if (ret)
10450 goto err;
10451 io_rsrc_node_switch(ctx, NULL);
10452
10453 memset(&p->sq_off, 0, sizeof(p->sq_off));
10454 p->sq_off.head = offsetof(struct io_rings, sq.head);
10455 p->sq_off.tail = offsetof(struct io_rings, sq.tail);
10456 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
10457 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
10458 p->sq_off.flags = offsetof(struct io_rings, sq_flags);
10459 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
10460 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
10461
10462 memset(&p->cq_off, 0, sizeof(p->cq_off));
10463 p->cq_off.head = offsetof(struct io_rings, cq.head);
10464 p->cq_off.tail = offsetof(struct io_rings, cq.tail);
10465 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
10466 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
10467 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
10468 p->cq_off.cqes = offsetof(struct io_rings, cqes);
10469 p->cq_off.flags = offsetof(struct io_rings, cq_flags);
10470
10471 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
10472 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
10473 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
10474 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
10475 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
10476 IORING_FEAT_RSRC_TAGS;
10477
10478 if (copy_to_user(params, p, sizeof(*p))) {
10479 ret = -EFAULT;
10480 goto err;
10481 }
10482
10483 file = io_uring_get_file(ctx);
10484 if (IS_ERR(file)) {
10485 ret = PTR_ERR(file);
10486 goto err;
10487 }
10488
	/*
	 * Install ring fd as the very last thing, so we don't risk someone
	 * having closed it before we finish setup
	 */
10493 ret = io_uring_install_fd(ctx, file);
10494 if (ret < 0) {
		/* fput will clean it up */
10496 fput(file);
10497 return ret;
10498 }
10499
10500 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
10501 return ret;
10502err:
10503 io_ring_ctx_wait_and_kill(ctx);
10504 return ret;
10505}
10506
/*
 * Sets up an io_uring context, and returns the fd. The application asks for
 * a ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
 */
10512static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10513{
10514 struct io_uring_params p;
10515 int i;
10516
10517 if (copy_from_user(&p, params, sizeof(p)))
10518 return -EFAULT;
10519 for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10520 if (p.resv[i])
10521 return -EINVAL;
10522 }
10523
10524 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
10525 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
10526 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10527 IORING_SETUP_R_DISABLED))
10528 return -EINVAL;
10529
10530 return io_uring_create(entries, &p, params);
10531}
10532
10533SYSCALL_DEFINE2(io_uring_setup, u32, entries,
10534 struct io_uring_params __user *, params)
10535{
10536 return io_uring_setup(entries, params);
10537}
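
/*
 * Example (userspace sketch, raw syscall form - liburing wraps all of this):
 * creating a ring and mapping the regions whose offsets are filled in by
 * io_uring_create() above. With IORING_FEAT_SINGLE_MMAP the SQ and CQ rings
 * share one mapping. Error handling is trimmed for brevity.
 *
 *	#include <string.h>
 *	#include <sys/mman.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static int setup_ring(unsigned int entries, struct io_uring_params *p,
 *			      void **sq_ring, void **cq_ring, void **sqes)
 *	{
 *		int fd;
 *
 *		memset(p, 0, sizeof(*p));
 *		fd = syscall(__NR_io_uring_setup, entries, p);
 *		if (fd < 0)
 *			return fd;
 *
 *		*sq_ring = mmap(NULL, p->sq_off.array +
 *				p->sq_entries * sizeof(__u32),
 *				PROT_READ | PROT_WRITE,
 *				MAP_SHARED | MAP_POPULATE,
 *				fd, IORING_OFF_SQ_RING);
 *		if (p->features & IORING_FEAT_SINGLE_MMAP)
 *			*cq_ring = *sq_ring;
 *		else
 *			*cq_ring = mmap(NULL, p->cq_off.cqes +
 *					p->cq_entries * sizeof(struct io_uring_cqe),
 *					PROT_READ | PROT_WRITE,
 *					MAP_SHARED | MAP_POPULATE,
 *					fd, IORING_OFF_CQ_RING);
 *		*sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
 *			     PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE,
 *			     fd, IORING_OFF_SQES);
 *		return fd;
 *	}
 */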
10538
10539static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
10540 unsigned nr_args)
10541{
10542 struct io_uring_probe *p;
10543 size_t size;
10544 int i, ret;
10545
10546 size = struct_size(p, ops, nr_args);
10547 if (size == SIZE_MAX)
10548 return -EOVERFLOW;
10549 p = kzalloc(size, GFP_KERNEL);
10550 if (!p)
10551 return -ENOMEM;
10552
10553 ret = -EFAULT;
10554 if (copy_from_user(p, arg, size))
10555 goto out;
10556 ret = -EINVAL;
10557 if (memchr_inv(p, 0, size))
10558 goto out;
10559
10560 p->last_op = IORING_OP_LAST - 1;
10561 if (nr_args > IORING_OP_LAST)
10562 nr_args = IORING_OP_LAST;
10563
10564 for (i = 0; i < nr_args; i++) {
10565 p->ops[i].op = i;
10566 if (!io_op_defs[i].not_supported)
10567 p->ops[i].flags = IO_URING_OP_SUPPORTED;
10568 }
10569 p->ops_len = i;
10570
10571 ret = 0;
10572 if (copy_to_user(arg, p, size))
10573 ret = -EFAULT;
10574out:
10575 kfree(p);
10576 return ret;
10577}
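
/*
 * Example (userspace sketch): probing which opcodes this kernel implements.
 * io_probe() above fills ops[] and sets IO_URING_OP_SUPPORTED per opcode.
 * "ring_fd" is assumed.
 *
 *	#include <stdlib.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	struct io_uring_probe *probe;
 *	size_t len = sizeof(*probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *	int ret, supported;
 *
 *	probe = calloc(1, len);
 *	ret = syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
 *		      probe, IORING_OP_LAST);
 *	supported = !ret &&
 *		(probe->ops[IORING_OP_OPENAT2].flags & IO_URING_OP_SUPPORTED);
 *	free(probe);
 */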
10578
10579static int io_register_personality(struct io_ring_ctx *ctx)
10580{
10581 const struct cred *creds;
10582 u32 id;
10583 int ret;
10584
10585 creds = get_current_cred();
10586
10587 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10588 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
10589 if (ret < 0) {
10590 put_cred(creds);
10591 return ret;
10592 }
10593 return id;
10594}
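
/*
 * Example (userspace sketch): snapshotting the caller's credentials as a
 * personality. The returned id goes into sqe->personality; on unregister,
 * nr_args carries the id again. "ring_fd" and "sqe" are assumed.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	int id = syscall(__NR_io_uring_register, ring_fd,
 *			 IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	if (id >= 0)
 *		sqe->personality = id;	// sqe taken from the mapped SQE array
 */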
10595
10596static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
10597 void __user *arg, unsigned int nr_args)
10598{
10599 struct io_uring_restriction *res;
10600 size_t size;
10601 int i, ret;
10602
	/* Restrictions allowed only if rings started disabled */
10604 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10605 return -EBADFD;
10606
	/* We allow only a single restrictions registration */
10608 if (ctx->restrictions.registered)
10609 return -EBUSY;
10610
10611 if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10612 return -EINVAL;
10613
10614 size = array_size(nr_args, sizeof(*res));
10615 if (size == SIZE_MAX)
10616 return -EOVERFLOW;
10617
10618 res = memdup_user(arg, size);
10619 if (IS_ERR(res))
10620 return PTR_ERR(res);
10621
10622 ret = 0;
10623
10624 for (i = 0; i < nr_args; i++) {
10625 switch (res[i].opcode) {
10626 case IORING_RESTRICTION_REGISTER_OP:
10627 if (res[i].register_op >= IORING_REGISTER_LAST) {
10628 ret = -EINVAL;
10629 goto out;
10630 }
10631
10632 __set_bit(res[i].register_op,
10633 ctx->restrictions.register_op);
10634 break;
10635 case IORING_RESTRICTION_SQE_OP:
10636 if (res[i].sqe_op >= IORING_OP_LAST) {
10637 ret = -EINVAL;
10638 goto out;
10639 }
10640
10641 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10642 break;
10643 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10644 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10645 break;
10646 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10647 ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10648 break;
10649 default:
10650 ret = -EINVAL;
10651 goto out;
10652 }
10653 }
10654
10655out:
	/* Reset all restrictions if an error happened */
10657 if (ret != 0)
10658 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10659 else
10660 ctx->restrictions.registered = true;
10661
10662 kfree(res);
10663 return ret;
10664}
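
/*
 * Example (userspace sketch): locking down a ring that was created with
 * IORING_SETUP_R_DISABLED before enabling it - here only readv/writev sqes
 * and the enable-rings register op remain allowed.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	struct io_uring_restriction res[3] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READV },
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_WRITEV },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_ENABLE_RINGS },
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_RESTRICTIONS,
 *		res, 3);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_ENABLE_RINGS,
 *		NULL, 0);
 */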
10665
10666static int io_register_enable_rings(struct io_ring_ctx *ctx)
10667{
10668 if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10669 return -EBADFD;
10670
10671 if (ctx->restrictions.registered)
10672 ctx->restricted = 1;
10673
10674 ctx->flags &= ~IORING_SETUP_R_DISABLED;
10675 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10676 wake_up(&ctx->sq_data->wait);
10677 return 0;
10678}
10679
10680static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
10681 struct io_uring_rsrc_update2 *up,
10682 unsigned nr_args)
10683{
10684 __u32 tmp;
10685 int err;
10686
10687 if (up->resv)
10688 return -EINVAL;
10689 if (check_add_overflow(up->offset, nr_args, &tmp))
10690 return -EOVERFLOW;
10691 err = io_rsrc_node_switch_start(ctx);
10692 if (err)
10693 return err;
10694
10695 switch (type) {
10696 case IORING_RSRC_FILE:
10697 return __io_sqe_files_update(ctx, up, nr_args);
10698 case IORING_RSRC_BUFFER:
10699 return __io_sqe_buffers_update(ctx, up, nr_args);
10700 }
10701 return -EINVAL;
10702}
10703
10704static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10705 unsigned nr_args)
10706{
10707 struct io_uring_rsrc_update2 up;
10708
10709 if (!nr_args)
10710 return -EINVAL;
10711 memset(&up, 0, sizeof(up));
10712 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10713 return -EFAULT;
10714 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10715}
10716
10717static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
10718 unsigned size, unsigned type)
10719{
10720 struct io_uring_rsrc_update2 up;
10721
10722 if (size != sizeof(up))
10723 return -EINVAL;
10724 if (copy_from_user(&up, arg, sizeof(up)))
10725 return -EFAULT;
10726 if (!up.nr || up.resv)
10727 return -EINVAL;
10728 return __io_register_rsrc_update(ctx, type, &up, up.nr);
10729}
10730
10731static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
10732 unsigned int size, unsigned int type)
10733{
10734 struct io_uring_rsrc_register rr;
10735
	/* keep it extendible */
10737 if (size != sizeof(rr))
10738 return -EINVAL;
10739
10740 memset(&rr, 0, sizeof(rr));
10741 if (copy_from_user(&rr, arg, size))
10742 return -EFAULT;
10743 if (!rr.nr || rr.resv || rr.resv2)
10744 return -EINVAL;
10745
10746 switch (type) {
10747 case IORING_RSRC_FILE:
10748 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10749 rr.nr, u64_to_user_ptr(rr.tags));
10750 case IORING_RSRC_BUFFER:
10751 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10752 rr.nr, u64_to_user_ptr(rr.tags));
10753 }
10754 return -EINVAL;
10755}
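
/*
 * Example (userspace sketch): the v2 ABI parsed above, registering one buffer
 * together with a 64-bit tag. Once the kernel fully releases a tagged
 * resource (after an update or unregister), a CQE carrying the tag as
 * user_data is posted. Note that for the v2 opcodes, nr_args carries the
 * struct size. "ring_fd" is assumed.
 *
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	static char buf[4096];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	__u64 tag = 0x1234;
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 1,
 *		.data	= (unsigned long)&iov,
 *		.tags	= (unsigned long)&tag,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS2,
 *		&rr, sizeof(rr));
 */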
10756
10757static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
10758 void __user *arg, unsigned len)
10759{
10760 struct io_uring_task *tctx = current->io_uring;
10761 cpumask_var_t new_mask;
10762 int ret;
10763
10764 if (!tctx || !tctx->io_wq)
10765 return -EINVAL;
10766
10767 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10768 return -ENOMEM;
10769
10770 cpumask_clear(new_mask);
10771 if (len > cpumask_size())
10772 len = cpumask_size();
10773
10774 if (copy_from_user(new_mask, arg, len)) {
10775 free_cpumask_var(new_mask);
10776 return -EFAULT;
10777 }
10778
10779 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10780 free_cpumask_var(new_mask);
10781 return ret;
10782}
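
/*
 * Example (userspace sketch): pinning this task's io-wq workers to CPUs 0-1.
 * arg is a cpumask-style bitmap and nr_args its size in bytes; a plain
 * cpu_set_t works on typical configurations. "ring_fd" is assumed.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_IOWQ_AFF,
 *		&mask, sizeof(mask));
 */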
10783
10784static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10785{
10786 struct io_uring_task *tctx = current->io_uring;
10787
10788 if (!tctx || !tctx->io_wq)
10789 return -EINVAL;
10790
10791 return io_wq_cpu_affinity(tctx->io_wq, NULL);
10792}
10793
10794static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
10795 void __user *arg)
10796 __must_hold(&ctx->uring_lock)
10797{
10798 struct io_tctx_node *node;
10799 struct io_uring_task *tctx = NULL;
10800 struct io_sq_data *sqd = NULL;
10801 __u32 new_count[2];
10802 int i, ret;
10803
10804 if (copy_from_user(new_count, arg, sizeof(new_count)))
10805 return -EFAULT;
10806 for (i = 0; i < ARRAY_SIZE(new_count); i++)
10807 if (new_count[i] > INT_MAX)
10808 return -EINVAL;
10809
10810 if (ctx->flags & IORING_SETUP_SQPOLL) {
10811 sqd = ctx->sq_data;
10812 if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
10818 refcount_inc(&sqd->refs);
10819 mutex_unlock(&ctx->uring_lock);
10820 mutex_lock(&sqd->lock);
10821 mutex_lock(&ctx->uring_lock);
10822 if (sqd->thread)
10823 tctx = sqd->thread->io_uring;
10824 }
10825 } else {
10826 tctx = current->io_uring;
10827 }
10828
10829 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
10830
10831 for (i = 0; i < ARRAY_SIZE(new_count); i++)
10832 if (new_count[i])
10833 ctx->iowq_limits[i] = new_count[i];
10834 ctx->iowq_limits_set = true;
10835
10836 if (tctx && tctx->io_wq) {
10837 ret = io_wq_max_workers(tctx->io_wq, new_count);
10838 if (ret)
10839 goto err;
10840 } else {
10841 memset(new_count, 0, sizeof(new_count));
10842 }
10843
10844 if (sqd) {
10845 mutex_unlock(&sqd->lock);
10846 io_put_sq_data(sqd);
10847 }
10848
10849 if (copy_to_user(arg, new_count, sizeof(new_count)))
10850 return -EFAULT;
10851
	/* that's it for SQPOLL, only the SQPOLL task creates requests */
10853 if (sqd)
10854 return 0;
10855
	/* now propagate the restriction to all registered users */
10857 list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10858 struct io_uring_task *tctx = node->task->io_uring;
10859
10860 if (WARN_ON_ONCE(!tctx->io_wq))
10861 continue;
10862
10863 for (i = 0; i < ARRAY_SIZE(new_count); i++)
10864 new_count[i] = ctx->iowq_limits[i];
10865
10866 (void)io_wq_max_workers(tctx->io_wq, new_count);
10867 }
10868 return 0;
10869err:
10870 if (sqd) {
10871 mutex_unlock(&sqd->lock);
10872 io_put_sq_data(sqd);
10873 }
10874 return ret;
10875}
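
/*
 * Example (userspace sketch): capping io-wq worker counts via the op handled
 * above. Index 0 limits bounded workers, index 1 unbounded ones; a zero
 * leaves the current limit untouched, and the kernel writes the previous
 * values back into the array. "ring_fd" is assumed.
 *
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <linux/io_uring.h>
 *
 *	unsigned int counts[2] = { 8, 4 };	// 8 bounded, 4 unbounded
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 */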
10876
10877static bool io_register_op_must_quiesce(int op)
10878{
10879 switch (op) {
10880 case IORING_REGISTER_BUFFERS:
10881 case IORING_UNREGISTER_BUFFERS:
10882 case IORING_REGISTER_FILES:
10883 case IORING_UNREGISTER_FILES:
10884 case IORING_REGISTER_FILES_UPDATE:
10885 case IORING_REGISTER_PROBE:
10886 case IORING_REGISTER_PERSONALITY:
10887 case IORING_UNREGISTER_PERSONALITY:
10888 case IORING_REGISTER_FILES2:
10889 case IORING_REGISTER_FILES_UPDATE2:
10890 case IORING_REGISTER_BUFFERS2:
10891 case IORING_REGISTER_BUFFERS_UPDATE:
10892 case IORING_REGISTER_IOWQ_AFF:
10893 case IORING_UNREGISTER_IOWQ_AFF:
10894 case IORING_REGISTER_IOWQ_MAX_WORKERS:
10895 return false;
10896 default:
10897 return true;
10898 }
10899}
10900
10901static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
10902{
10903 long ret;
10904
10905 percpu_ref_kill(&ctx->refs);
10906
	/*
	 * Drop uring mutex before waiting for references to exit. If another
	 * thread is currently inside io_uring_enter() it might need to grab
	 * the uring_lock to make progress. If we hold it here across the
	 * drain wait, then we can deadlock. It's safe to drop the mutex here,
	 * since no new references will come in after we've killed the percpu
	 * ref.
	 */
10914 mutex_unlock(&ctx->uring_lock);
10915 do {
10916 ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
10917 if (ret) {
10918 ret = min(0L, ret);
10919 break;
10920 }
10921
10922 ret = io_run_task_work_sig();
10923 io_req_caches_free(ctx);
10924 } while (ret >= 0);
10925 mutex_lock(&ctx->uring_lock);
10926
10927 if (ret)
10928 io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10929 return ret;
10930}
10931
10932static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10933 void __user *arg, unsigned nr_args)
10934 __releases(ctx->uring_lock)
10935 __acquires(ctx->uring_lock)
10936{
10937 int ret;
10938
	/*
	 * We're inside the ring mutex, if the ref is already dying, then
	 * someone is killing the ring - check it and refuse to register if
	 * it's dying.
	 */
10944 if (percpu_ref_is_dying(&ctx->refs))
10945 return -ENXIO;
10946
10947 if (ctx->restricted) {
10948 if (opcode >= IORING_REGISTER_LAST)
10949 return -EINVAL;
10950 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10951 if (!test_bit(opcode, ctx->restrictions.register_op))
10952 return -EACCES;
10953 }
10954
10955 if (io_register_op_must_quiesce(opcode)) {
10956 ret = io_ctx_quiesce(ctx);
10957 if (ret)
10958 return ret;
10959 }
10960
10961 switch (opcode) {
10962 case IORING_REGISTER_BUFFERS:
10963 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10964 break;
10965 case IORING_UNREGISTER_BUFFERS:
10966 ret = -EINVAL;
10967 if (arg || nr_args)
10968 break;
10969 ret = io_sqe_buffers_unregister(ctx);
10970 break;
10971 case IORING_REGISTER_FILES:
10972 ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10973 break;
10974 case IORING_UNREGISTER_FILES:
10975 ret = -EINVAL;
10976 if (arg || nr_args)
10977 break;
10978 ret = io_sqe_files_unregister(ctx);
10979 break;
10980 case IORING_REGISTER_FILES_UPDATE:
10981 ret = io_register_files_update(ctx, arg, nr_args);
10982 break;
10983 case IORING_REGISTER_EVENTFD:
10984 case IORING_REGISTER_EVENTFD_ASYNC:
10985 ret = -EINVAL;
10986 if (nr_args != 1)
10987 break;
10988 ret = io_eventfd_register(ctx, arg);
10989 if (ret)
10990 break;
10991 if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10992 ctx->eventfd_async = 1;
10993 else
10994 ctx->eventfd_async = 0;
10995 break;
10996 case IORING_UNREGISTER_EVENTFD:
10997 ret = -EINVAL;
10998 if (arg || nr_args)
10999 break;
11000 ret = io_eventfd_unregister(ctx);
11001 break;
11002 case IORING_REGISTER_PROBE:
11003 ret = -EINVAL;
11004 if (!arg || nr_args > 256)
11005 break;
11006 ret = io_probe(ctx, arg, nr_args);
11007 break;
11008 case IORING_REGISTER_PERSONALITY:
11009 ret = -EINVAL;
11010 if (arg || nr_args)
11011 break;
11012 ret = io_register_personality(ctx);
11013 break;
11014 case IORING_UNREGISTER_PERSONALITY:
11015 ret = -EINVAL;
11016 if (arg)
11017 break;
11018 ret = io_unregister_personality(ctx, nr_args);
11019 break;
11020 case IORING_REGISTER_ENABLE_RINGS:
11021 ret = -EINVAL;
11022 if (arg || nr_args)
11023 break;
11024 ret = io_register_enable_rings(ctx);
11025 break;
11026 case IORING_REGISTER_RESTRICTIONS:
11027 ret = io_register_restrictions(ctx, arg, nr_args);
11028 break;
11029 case IORING_REGISTER_FILES2:
11030 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11031 break;
11032 case IORING_REGISTER_FILES_UPDATE2:
11033 ret = io_register_rsrc_update(ctx, arg, nr_args,
11034 IORING_RSRC_FILE);
11035 break;
11036 case IORING_REGISTER_BUFFERS2:
11037 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
11038 break;
11039 case IORING_REGISTER_BUFFERS_UPDATE:
11040 ret = io_register_rsrc_update(ctx, arg, nr_args,
11041 IORING_RSRC_BUFFER);
11042 break;
11043 case IORING_REGISTER_IOWQ_AFF:
11044 ret = -EINVAL;
11045 if (!arg || !nr_args)
11046 break;
11047 ret = io_register_iowq_aff(ctx, arg, nr_args);
11048 break;
11049 case IORING_UNREGISTER_IOWQ_AFF:
11050 ret = -EINVAL;
11051 if (arg || nr_args)
11052 break;
11053 ret = io_unregister_iowq_aff(ctx);
11054 break;
11055 case IORING_REGISTER_IOWQ_MAX_WORKERS:
11056 ret = -EINVAL;
11057 if (!arg || nr_args != 2)
11058 break;
11059 ret = io_register_iowq_max_workers(ctx, arg);
11060 break;
11061 default:
11062 ret = -EINVAL;
11063 break;
11064 }
11065
11066 if (io_register_op_must_quiesce(opcode)) {
		/* bring the ctx back to life */
11068 percpu_ref_reinit(&ctx->refs);
11069 reinit_completion(&ctx->ref_comp);
11070 }
11071 return ret;
11072}
11073
11074SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11075 void __user *, arg, unsigned int, nr_args)
11076{
11077 struct io_ring_ctx *ctx;
11078 long ret = -EBADF;
11079 struct fd f;
11080
11081 f = fdget(fd);
11082 if (!f.file)
11083 return -EBADF;
11084
11085 ret = -EOPNOTSUPP;
11086 if (f.file->f_op != &io_uring_fops)
11087 goto out_fput;
11088
11089 ctx = f.file->private_data;
11090
11091 io_run_task_work();
11092
11093 mutex_lock(&ctx->uring_lock);
11094 ret = __io_uring_register(ctx, opcode, arg, nr_args);
11095 mutex_unlock(&ctx->uring_lock);
11096 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
11097 ctx->cq_ev_fd != NULL, ret);
11098out_fput:
11099 fdput(f);
11100 return ret;
11101}
11102
11103static int __init io_uring_init(void)
11104{
11105#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11106 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11107 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11108} while (0)
11109
11110#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11111 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11112 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11113 BUILD_BUG_SQE_ELEM(0, __u8, opcode);
11114 BUILD_BUG_SQE_ELEM(1, __u8, flags);
11115 BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
11116 BUILD_BUG_SQE_ELEM(4, __s32, fd);
11117 BUILD_BUG_SQE_ELEM(8, __u64, off);
11118 BUILD_BUG_SQE_ELEM(8, __u64, addr2);
11119 BUILD_BUG_SQE_ELEM(16, __u64, addr);
11120 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
11121 BUILD_BUG_SQE_ELEM(24, __u32, len);
11122 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
11123 BUILD_BUG_SQE_ELEM(28, int, rw_flags);
11124 BUILD_BUG_SQE_ELEM(28, __u32, rw_flags);
11125 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
11126 BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
11127 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
11128 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
11129 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
11130 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
11131 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
11132 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
11133 BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
11134 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
11135 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
11136 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
11137 BUILD_BUG_SQE_ELEM(32, __u64, user_data);
11138 BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
11139 BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
11140 BUILD_BUG_SQE_ELEM(42, __u16, personality);
11141 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
11142 BUILD_BUG_SQE_ELEM(44, __u32, file_index);
11143
11144 BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11145 sizeof(struct io_uring_rsrc_update));
11146 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11147 sizeof(struct io_uring_rsrc_update2));
11148
	/* ->buf_index is u16 */
11150 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11151
	/* should fit into one byte */
11153 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
11154 BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11155 BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
11156
11157 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
11158 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
11159
11160 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11161 SLAB_ACCOUNT);
11162 return 0;
11163};
11164__initcall(io_uring_init);
11165