linux/fs/io_uring.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Shared application/kernel submission and completion ring pairs, for
   4 * supporting fast/efficient IO.
   5 *
   6 * A note on the read/write ordering memory barriers that are matched between
   7 * the application and kernel side.
   8 *
   9 * After the application reads the CQ ring tail, it must use an
  10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11 * before writing the tail (using smp_load_acquire to read the tail will
  12 * do). It also needs a smp_mb() before updating CQ head (ordering the
  13 * entry load(s) with the head store), pairing with an implicit barrier
  14 * through a control-dependency in io_get_cqe (smp_store_release to
  15 * store head will do). Failure to do so could lead to reading invalid
  16 * CQ entries.
  17 *
  18 * Likewise, the application must use an appropriate smp_wmb() before
  19 * writing the SQ tail (ordering SQ entry stores with the tail store),
  20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21 * to store the tail will do). And it needs a barrier ordering the SQ
  22 * head load before writing new SQ entries (smp_load_acquire to read
  23 * head will do).
  24 *
  25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27 * updating the SQ tail; a full memory barrier smp_mb() is needed
  28 * between.
  29 *
  30 * Also see the examples in the liburing library:
  31 *
  32 *      git://git.kernel.dk/liburing
  33 *
  34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
   35 * on data shared between the kernel and application. This is done both
   36 * for ordering purposes and to ensure that once a value is loaded from
  37 * data that the application could potentially modify, it remains stable.
  38 *
  39 * Copyright (C) 2018-2019 Jens Axboe
  40 * Copyright (c) 2018-2019 Christoph Hellwig
  41 */
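/*
 * Illustrative sketch, not part of the kernel sources: how the application
 * side might honour the pairing described above. It assumes liburing-style
 * mmap'ed pointers (cq_khead, cq_ktail, cq_kring_mask, cqes, sqes, sq_ktail,
 * sq_kflags); the names and helpers are hypothetical, only the barrier
 * placement matters, and the sq_array indirection is omitted for brevity:
 *
 *	// reap one CQE, if available
 *	unsigned head = *cq_khead;
 *	if (head != smp_load_acquire(cq_ktail)) {	// pairs with kernel tail store
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_kring_mask];
 *		consume_cqe(cqe);			// hypothetical consumer
 *		smp_store_release(cq_khead, head + 1);	// CQE loads before head store
 *	}
 *
 *	// publish one SQE and, for SQPOLL, wake the thread if needed
 *	fill_sqe(&sqes[*sq_ktail & *sq_kring_mask]);	// hypothetical producer
 *	smp_store_release(sq_ktail, *sq_ktail + 1);	// SQE stores before tail store
 *	smp_mb();					// tail store before flags load
 *	if (READ_ONCE(*sq_kflags) & IORING_SQ_NEED_WAKEUP)
 *		io_uring_enter(ring_fd, 0, 0, IORING_ENTER_SQ_WAKEUP, NULL);
 */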
  42#include <linux/kernel.h>
  43#include <linux/init.h>
  44#include <linux/errno.h>
  45#include <linux/syscalls.h>
  46#include <linux/compat.h>
  47#include <net/compat.h>
  48#include <linux/refcount.h>
  49#include <linux/uio.h>
  50#include <linux/bits.h>
  51
  52#include <linux/sched/signal.h>
  53#include <linux/fs.h>
  54#include <linux/file.h>
  55#include <linux/fdtable.h>
  56#include <linux/mm.h>
  57#include <linux/mman.h>
  58#include <linux/percpu.h>
  59#include <linux/slab.h>
  60#include <linux/blkdev.h>
  61#include <linux/bvec.h>
  62#include <linux/net.h>
  63#include <net/sock.h>
  64#include <net/af_unix.h>
  65#include <net/scm.h>
  66#include <linux/anon_inodes.h>
  67#include <linux/sched/mm.h>
  68#include <linux/uaccess.h>
  69#include <linux/nospec.h>
  70#include <linux/sizes.h>
  71#include <linux/hugetlb.h>
  72#include <linux/highmem.h>
  73#include <linux/namei.h>
  74#include <linux/fsnotify.h>
  75#include <linux/fadvise.h>
  76#include <linux/eventpoll.h>
  77#include <linux/splice.h>
  78#include <linux/task_work.h>
  79#include <linux/pagemap.h>
  80#include <linux/io_uring.h>
  81#include <linux/tracehook.h>
  82#include <linux/audit.h>
  83#include <linux/security.h>
  84
  85#define CREATE_TRACE_POINTS
  86#include <trace/events/io_uring.h>
  87
  88#include <uapi/linux/io_uring.h>
  89
  90#include "internal.h"
  91#include "io-wq.h"
  92
  93#define IORING_MAX_ENTRIES      32768
  94#define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  95#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
  96
  97/* only define max */
  98#define IORING_MAX_FIXED_FILES  (1U << 15)
  99#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 100                                 IORING_REGISTER_LAST + IORING_OP_LAST)
 101
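/*
 * Each second-level tag table block holds one page worth of u64 tags, i.e.
 * 1 << (PAGE_SHIFT - 3) == PAGE_SIZE / sizeof(u64) entries.
 */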
 102#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
 103#define IO_RSRC_TAG_TABLE_MAX   (1U << IO_RSRC_TAG_TABLE_SHIFT)
 104#define IO_RSRC_TAG_TABLE_MASK  (IO_RSRC_TAG_TABLE_MAX - 1)
 105
 106#define IORING_MAX_REG_BUFFERS  (1U << 14)
 107
 108#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 109                          IOSQE_IO_HARDLINK | IOSQE_ASYNC)
 110
 111#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN)
 112
 113#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
 114                                REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
 115                                REQ_F_ASYNC_DATA)
 116
 117#define IO_TCTX_REFS_CACHE_NR   (1U << 10)
 118
 119struct io_uring {
 120        u32 head ____cacheline_aligned_in_smp;
 121        u32 tail ____cacheline_aligned_in_smp;
 122};
 123
 124/*
 125 * This data is shared with the application through the mmap at offsets
 126 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 127 *
 128 * The offsets to the member fields are published through struct
 129 * io_sqring_offsets when calling io_uring_setup.
 130 */
 131struct io_rings {
 132        /*
 133         * Head and tail offsets into the ring; the offsets need to be
 134         * masked to get valid indices.
 135         *
  136         * The kernel controls the head of the sq ring and the tail of the
  137         * cq ring, and the application controls the tail of the sq ring and
  138         * the head of the cq ring.
 139         */
 140        struct io_uring         sq, cq;
 141        /*
 142         * Bitmasks to apply to head and tail offsets (constant, equals
 143         * ring_entries - 1)
 144         */
 145        u32                     sq_ring_mask, cq_ring_mask;
 146        /* Ring sizes (constant, power of 2) */
 147        u32                     sq_ring_entries, cq_ring_entries;
 148        /*
  149         * Number of invalid entries dropped by the kernel due to an
  150         * invalid index being stored in the array.
 151         *
 152         * Written by the kernel, shouldn't be modified by the
 153         * application (i.e. get number of "new events" by comparing to
 154         * cached value).
 155         *
  156         * After the application has read a new SQ head value, this
  157         * counter includes all submissions that were dropped up to
  158         * that new SQ head (and possibly more).
 159         */
 160        u32                     sq_dropped;
 161        /*
 162         * Runtime SQ flags
 163         *
 164         * Written by the kernel, shouldn't be modified by the
 165         * application.
 166         *
 167         * The application needs a full memory barrier before checking
 168         * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 169         */
 170        u32                     sq_flags;
 171        /*
 172         * Runtime CQ flags
 173         *
 174         * Written by the application, shouldn't be modified by the
 175         * kernel.
 176         */
 177        u32                     cq_flags;
 178        /*
 179         * Number of completion events lost because the queue was full;
 180         * this should be avoided by the application by making sure
 181         * there are not more requests pending than there is space in
 182         * the completion queue.
 183         *
 184         * Written by the kernel, shouldn't be modified by the
 185         * application (i.e. get number of "new events" by comparing to
 186         * cached value).
 187         *
 188         * As completion events come in out of order this counter is not
 189         * ordered with any other data.
 190         */
 191        u32                     cq_overflow;
 192        /*
 193         * Ring buffer of completion events.
 194         *
 195         * The kernel writes completion events fresh every time they are
 196         * produced, so the application is allowed to modify pending
 197         * entries.
 198         */
 199        struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 200};
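/*
 * Illustrative sketch, not kernel code: the "compare against a cached value"
 * pattern the sq_dropped/cq_overflow comments above describe, as an
 * application might apply it ('app' and its fields are hypothetical):
 *
 *	u32 overflow = READ_ONCE(rings->cq_overflow);
 *	u32 newly_lost = overflow - app->cached_cq_overflow;	// wrap-safe delta
 *	app->cached_cq_overflow = overflow;
 *
 * Both counters only ever increase, so unsigned subtraction gives the delta
 * even across wrap-around.
 */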
 201
 202enum io_uring_cmd_flags {
 203        IO_URING_F_COMPLETE_DEFER       = 1,
 204        IO_URING_F_UNLOCKED             = 2,
 205        /* int's last bit, sign checks are usually faster than a bit test */
 206        IO_URING_F_NONBLOCK             = INT_MIN,
 207};
 208
 209struct io_mapped_ubuf {
 210        u64             ubuf;
 211        u64             ubuf_end;
 212        unsigned int    nr_bvecs;
 213        unsigned long   acct_pages;
 214        struct bio_vec  bvec[];
 215};
 216
 217struct io_ring_ctx;
 218
 219struct io_overflow_cqe {
 220        struct io_uring_cqe cqe;
 221        struct list_head list;
 222};
 223
 224struct io_fixed_file {
 225        /* file * with additional FFS_* flags */
 226        unsigned long file_ptr;
 227};
 228
 229struct io_rsrc_put {
 230        struct list_head list;
 231        u64 tag;
 232        union {
 233                void *rsrc;
 234                struct file *file;
 235                struct io_mapped_ubuf *buf;
 236        };
 237};
 238
 239struct io_file_table {
 240        struct io_fixed_file *files;
 241};
 242
 243struct io_rsrc_node {
 244        struct percpu_ref               refs;
 245        struct list_head                node;
 246        struct list_head                rsrc_list;
 247        struct io_rsrc_data             *rsrc_data;
 248        struct llist_node               llist;
 249        bool                            done;
 250};
 251
 252typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
 253
 254struct io_rsrc_data {
 255        struct io_ring_ctx              *ctx;
 256
 257        u64                             **tags;
 258        unsigned int                    nr;
 259        rsrc_put_fn                     *do_put;
 260        atomic_t                        refs;
 261        struct completion               done;
 262        bool                            quiesce;
 263};
 264
 265struct io_buffer {
 266        struct list_head list;
 267        __u64 addr;
 268        __u32 len;
 269        __u16 bid;
 270};
 271
 272struct io_restriction {
 273        DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 274        DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 275        u8 sqe_flags_allowed;
 276        u8 sqe_flags_required;
 277        bool registered;
 278};
 279
 280enum {
 281        IO_SQ_THREAD_SHOULD_STOP = 0,
 282        IO_SQ_THREAD_SHOULD_PARK,
 283};
 284
 285struct io_sq_data {
 286        refcount_t              refs;
 287        atomic_t                park_pending;
 288        struct mutex            lock;
 289
 290        /* ctx's that are using this sqd */
 291        struct list_head        ctx_list;
 292
 293        struct task_struct      *thread;
 294        struct wait_queue_head  wait;
 295
 296        unsigned                sq_thread_idle;
 297        int                     sq_cpu;
 298        pid_t                   task_pid;
 299        pid_t                   task_tgid;
 300
 301        unsigned long           state;
 302        struct completion       exited;
 303};
 304
 305#define IO_COMPL_BATCH                  32
 306#define IO_REQ_CACHE_SIZE               32
 307#define IO_REQ_ALLOC_BATCH              8
 308
 309struct io_submit_link {
 310        struct io_kiocb         *head;
 311        struct io_kiocb         *last;
 312};
 313
 314struct io_submit_state {
 315        /* inline/task_work completion list, under ->uring_lock */
 316        struct io_wq_work_node  free_list;
 317        /* batch completion logic */
 318        struct io_wq_work_list  compl_reqs;
 319        struct io_submit_link   link;
 320
 321        bool                    plug_started;
 322        bool                    need_plug;
 323        unsigned short          submit_nr;
 324        struct blk_plug         plug;
 325};
 326
 327struct io_ring_ctx {
 328        /* const or read-mostly hot data */
 329        struct {
 330                struct percpu_ref       refs;
 331
 332                struct io_rings         *rings;
 333                unsigned int            flags;
 334                unsigned int            compat: 1;
 335                unsigned int            drain_next: 1;
 336                unsigned int            eventfd_async: 1;
 337                unsigned int            restricted: 1;
 338                unsigned int            off_timeout_used: 1;
 339                unsigned int            drain_active: 1;
 340        } ____cacheline_aligned_in_smp;
 341
 342        /* submission data */
 343        struct {
 344                struct mutex            uring_lock;
 345
 346                /*
 347                 * Ring buffer of indices into array of io_uring_sqe, which is
 348                 * mmapped by the application using the IORING_OFF_SQES offset.
 349                 *
 350                 * This indirection could e.g. be used to assign fixed
 351                 * io_uring_sqe entries to operations and only submit them to
 352                 * the queue when needed.
 353                 *
 354                 * The kernel modifies neither the indices array nor the entries
 355                 * array.
 356                 */
 357                u32                     *sq_array;
 358                struct io_uring_sqe     *sq_sqes;
 359                unsigned                cached_sq_head;
 360                unsigned                sq_entries;
 361                struct list_head        defer_list;
 362
 363                /*
 364                 * Fixed resources fast path, should be accessed only under
 365                 * uring_lock, and updated through io_uring_register(2)
 366                 */
 367                struct io_rsrc_node     *rsrc_node;
 368                int                     rsrc_cached_refs;
 369                struct io_file_table    file_table;
 370                unsigned                nr_user_files;
 371                unsigned                nr_user_bufs;
 372                struct io_mapped_ubuf   **user_bufs;
 373
 374                struct io_submit_state  submit_state;
 375                struct list_head        timeout_list;
 376                struct list_head        ltimeout_list;
 377                struct list_head        cq_overflow_list;
 378                struct xarray           io_buffers;
 379                struct xarray           personalities;
 380                u32                     pers_next;
 381                unsigned                sq_thread_idle;
 382        } ____cacheline_aligned_in_smp;
 383
 384        /* IRQ completion list, under ->completion_lock */
 385        struct io_wq_work_list  locked_free_list;
 386        unsigned int            locked_free_nr;
 387
 388        const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
 389        struct io_sq_data       *sq_data;       /* if using sq thread polling */
 390
 391        struct wait_queue_head  sqo_sq_wait;
 392        struct list_head        sqd_list;
 393
 394        unsigned long           check_cq_overflow;
 395
 396        struct {
 397                unsigned                cached_cq_tail;
 398                unsigned                cq_entries;
 399                struct eventfd_ctx      *cq_ev_fd;
 400                struct wait_queue_head  cq_wait;
 401                unsigned                cq_extra;
 402                atomic_t                cq_timeouts;
 403                unsigned                cq_last_tm_flush;
 404        } ____cacheline_aligned_in_smp;
 405
 406        struct {
 407                spinlock_t              completion_lock;
 408
 409                spinlock_t              timeout_lock;
 410
 411                /*
 412                 * ->iopoll_list is protected by the ctx->uring_lock for
 413                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
 414                 * For SQPOLL, only the single threaded io_sq_thread() will
 415                 * manipulate the list, hence no extra locking is needed there.
 416                 */
 417                struct io_wq_work_list  iopoll_list;
 418                struct hlist_head       *cancel_hash;
 419                unsigned                cancel_hash_bits;
 420                bool                    poll_multi_queue;
 421        } ____cacheline_aligned_in_smp;
 422
 423        struct io_restriction           restrictions;
 424
  425        /* slow path rsrc auxiliary data, used by update/register */
 426        struct {
 427                struct io_rsrc_node             *rsrc_backup_node;
 428                struct io_mapped_ubuf           *dummy_ubuf;
 429                struct io_rsrc_data             *file_data;
 430                struct io_rsrc_data             *buf_data;
 431
 432                struct delayed_work             rsrc_put_work;
 433                struct llist_head               rsrc_put_llist;
 434                struct list_head                rsrc_ref_list;
 435                spinlock_t                      rsrc_ref_lock;
 436        };
 437
 438        /* Keep this last, we don't need it for the fast path */
 439        struct {
 440                #if defined(CONFIG_UNIX)
 441                        struct socket           *ring_sock;
 442                #endif
 443                /* hashed buffered write serialization */
 444                struct io_wq_hash               *hash_map;
 445
 446                /* Only used for accounting purposes */
 447                struct user_struct              *user;
 448                struct mm_struct                *mm_account;
 449
 450                /* ctx exit and cancelation */
 451                struct llist_head               fallback_llist;
 452                struct delayed_work             fallback_work;
 453                struct work_struct              exit_work;
 454                struct list_head                tctx_list;
 455                struct completion               ref_comp;
 456                u32                             iowq_limits[2];
 457                bool                            iowq_limits_set;
 458        };
 459};
 460
 461struct io_uring_task {
 462        /* submission side */
 463        int                     cached_refs;
 464        struct xarray           xa;
 465        struct wait_queue_head  wait;
 466        const struct io_ring_ctx *last;
 467        struct io_wq            *io_wq;
 468        struct percpu_counter   inflight;
 469        atomic_t                inflight_tracked;
 470        atomic_t                in_idle;
 471
 472        spinlock_t              task_lock;
 473        struct io_wq_work_list  task_list;
 474        struct callback_head    task_work;
 475        bool                    task_running;
 476};
 477
 478/*
 479 * First field must be the file pointer in all the
 480 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 481 */
 482struct io_poll_iocb {
 483        struct file                     *file;
 484        struct wait_queue_head          *head;
 485        __poll_t                        events;
 486        bool                            done;
 487        bool                            canceled;
 488        struct wait_queue_entry         wait;
 489};
 490
 491struct io_poll_update {
 492        struct file                     *file;
 493        u64                             old_user_data;
 494        u64                             new_user_data;
 495        __poll_t                        events;
 496        bool                            update_events;
 497        bool                            update_user_data;
 498};
 499
 500struct io_close {
 501        struct file                     *file;
 502        int                             fd;
 503        u32                             file_slot;
 504};
 505
 506struct io_timeout_data {
 507        struct io_kiocb                 *req;
 508        struct hrtimer                  timer;
 509        struct timespec64               ts;
 510        enum hrtimer_mode               mode;
 511        u32                             flags;
 512};
 513
 514struct io_accept {
 515        struct file                     *file;
 516        struct sockaddr __user          *addr;
 517        int __user                      *addr_len;
 518        int                             flags;
 519        u32                             file_slot;
 520        unsigned long                   nofile;
 521};
 522
 523struct io_sync {
 524        struct file                     *file;
 525        loff_t                          len;
 526        loff_t                          off;
 527        int                             flags;
 528        int                             mode;
 529};
 530
 531struct io_cancel {
 532        struct file                     *file;
 533        u64                             addr;
 534};
 535
 536struct io_timeout {
 537        struct file                     *file;
 538        u32                             off;
 539        u32                             target_seq;
 540        struct list_head                list;
 541        /* head of the link, used by linked timeouts only */
 542        struct io_kiocb                 *head;
 543        /* for linked completions */
 544        struct io_kiocb                 *prev;
 545};
 546
 547struct io_timeout_rem {
 548        struct file                     *file;
 549        u64                             addr;
 550
 551        /* timeout update */
 552        struct timespec64               ts;
 553        u32                             flags;
 554        bool                            ltimeout;
 555};
 556
 557struct io_rw {
 558        /* NOTE: kiocb has the file as the first member, so don't do it here */
 559        struct kiocb                    kiocb;
 560        u64                             addr;
 561        u64                             len;
 562};
 563
 564struct io_connect {
 565        struct file                     *file;
 566        struct sockaddr __user          *addr;
 567        int                             addr_len;
 568};
 569
 570struct io_sr_msg {
 571        struct file                     *file;
 572        union {
 573                struct compat_msghdr __user     *umsg_compat;
 574                struct user_msghdr __user       *umsg;
 575                void __user                     *buf;
 576        };
 577        int                             msg_flags;
 578        int                             bgid;
 579        size_t                          len;
 580};
 581
 582struct io_open {
 583        struct file                     *file;
 584        int                             dfd;
 585        u32                             file_slot;
 586        struct filename                 *filename;
 587        struct open_how                 how;
 588        unsigned long                   nofile;
 589};
 590
 591struct io_rsrc_update {
 592        struct file                     *file;
 593        u64                             arg;
 594        u32                             nr_args;
 595        u32                             offset;
 596};
 597
 598struct io_fadvise {
 599        struct file                     *file;
 600        u64                             offset;
 601        u32                             len;
 602        u32                             advice;
 603};
 604
 605struct io_madvise {
 606        struct file                     *file;
 607        u64                             addr;
 608        u32                             len;
 609        u32                             advice;
 610};
 611
 612struct io_epoll {
 613        struct file                     *file;
 614        int                             epfd;
 615        int                             op;
 616        int                             fd;
 617        struct epoll_event              event;
 618};
 619
 620struct io_splice {
 621        struct file                     *file_out;
 622        struct file                     *file_in;
 623        loff_t                          off_out;
 624        loff_t                          off_in;
 625        u64                             len;
 626        unsigned int                    flags;
 627};
 628
 629struct io_provide_buf {
 630        struct file                     *file;
 631        __u64                           addr;
 632        __u32                           len;
 633        __u32                           bgid;
 634        __u16                           nbufs;
 635        __u16                           bid;
 636};
 637
 638struct io_statx {
 639        struct file                     *file;
 640        int                             dfd;
 641        unsigned int                    mask;
 642        unsigned int                    flags;
 643        const char __user               *filename;
 644        struct statx __user             *buffer;
 645};
 646
 647struct io_shutdown {
 648        struct file                     *file;
 649        int                             how;
 650};
 651
 652struct io_rename {
 653        struct file                     *file;
 654        int                             old_dfd;
 655        int                             new_dfd;
 656        struct filename                 *oldpath;
 657        struct filename                 *newpath;
 658        int                             flags;
 659};
 660
 661struct io_unlink {
 662        struct file                     *file;
 663        int                             dfd;
 664        int                             flags;
 665        struct filename                 *filename;
 666};
 667
 668struct io_mkdir {
 669        struct file                     *file;
 670        int                             dfd;
 671        umode_t                         mode;
 672        struct filename                 *filename;
 673};
 674
 675struct io_symlink {
 676        struct file                     *file;
 677        int                             new_dfd;
 678        struct filename                 *oldpath;
 679        struct filename                 *newpath;
 680};
 681
 682struct io_hardlink {
 683        struct file                     *file;
 684        int                             old_dfd;
 685        int                             new_dfd;
 686        struct filename                 *oldpath;
 687        struct filename                 *newpath;
 688        int                             flags;
 689};
 690
 691struct io_async_connect {
 692        struct sockaddr_storage         address;
 693};
 694
 695struct io_async_msghdr {
 696        struct iovec                    fast_iov[UIO_FASTIOV];
 697        /* points to an allocated iov, if NULL we use fast_iov instead */
 698        struct iovec                    *free_iov;
 699        struct sockaddr __user          *uaddr;
 700        struct msghdr                   msg;
 701        struct sockaddr_storage         addr;
 702};
 703
 704struct io_rw_state {
 705        struct iov_iter                 iter;
 706        struct iov_iter_state           iter_state;
 707        struct iovec                    fast_iov[UIO_FASTIOV];
 708};
 709
 710struct io_async_rw {
 711        struct io_rw_state              s;
 712        const struct iovec              *free_iovec;
 713        size_t                          bytes_done;
 714        struct wait_page_queue          wpq;
 715};
 716
 717enum {
 718        REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 719        REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 720        REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 721        REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 722        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 723        REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 724
 725        /* first byte is taken by user flags, shift it to not overlap */
 726        REQ_F_FAIL_BIT          = 8,
 727        REQ_F_INFLIGHT_BIT,
 728        REQ_F_CUR_POS_BIT,
 729        REQ_F_NOWAIT_BIT,
 730        REQ_F_LINK_TIMEOUT_BIT,
 731        REQ_F_NEED_CLEANUP_BIT,
 732        REQ_F_POLLED_BIT,
 733        REQ_F_BUFFER_SELECTED_BIT,
 734        REQ_F_COMPLETE_INLINE_BIT,
 735        REQ_F_REISSUE_BIT,
 736        REQ_F_CREDS_BIT,
 737        REQ_F_REFCOUNT_BIT,
 738        REQ_F_ARM_LTIMEOUT_BIT,
 739        REQ_F_ASYNC_DATA_BIT,
 740        /* keep async read/write and isreg together and in order */
 741        REQ_F_SUPPORT_NOWAIT_BIT,
 742        REQ_F_ISREG_BIT,
 743
 744        /* not a real bit, just to check we're not overflowing the space */
 745        __REQ_F_LAST_BIT,
 746};
 747
 748enum {
 749        /* ctx owns file */
 750        REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 751        /* drain existing IO first */
 752        REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 753        /* linked sqes */
 754        REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 755        /* doesn't sever on completion < 0 */
 756        REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 757        /* IOSQE_ASYNC */
 758        REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 759        /* IOSQE_BUFFER_SELECT */
 760        REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 761
 762        /* fail rest of links */
 763        REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
 764        /* on inflight list, should be cancelled and waited on exit reliably */
 765        REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 766        /* read/write uses file position */
 767        REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 768        /* must not punt to workers */
 769        REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 770        /* has or had linked timeout */
 771        REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 772        /* needs cleanup */
 773        REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 774        /* already went through poll handler */
 775        REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 776        /* buffer already selected */
 777        REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 778        /* completion is deferred through io_comp_state */
 779        REQ_F_COMPLETE_INLINE   = BIT(REQ_F_COMPLETE_INLINE_BIT),
 780        /* caller should reissue async */
 781        REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
 782        /* supports async reads/writes */
 783        REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
 784        /* regular file */
 785        REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 786        /* has creds assigned */
 787        REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
 788        /* skip refcounting if not set */
 789        REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
 790        /* there is a linked timeout that has to be armed */
 791        REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
 792        /* ->async_data allocated */
 793        REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
 794};
 795
 796struct async_poll {
 797        struct io_poll_iocb     poll;
 798        struct io_poll_iocb     *double_poll;
 799};
 800
 801typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 802
 803struct io_task_work {
 804        union {
 805                struct io_wq_work_node  node;
 806                struct llist_node       fallback_node;
 807        };
 808        io_req_tw_func_t                func;
 809};
 810
 811enum {
 812        IORING_RSRC_FILE                = 0,
 813        IORING_RSRC_BUFFER              = 1,
 814};
 815
 816/*
 817 * NOTE! Each of the iocb union members has the file pointer
 818 * as the first entry in their struct definition. So you can
 819 * access the file pointer through any of the sub-structs,
 820 * or directly as just 'ki_filp' in this struct.
 821 */
 822struct io_kiocb {
 823        union {
 824                struct file             *file;
 825                struct io_rw            rw;
 826                struct io_poll_iocb     poll;
 827                struct io_poll_update   poll_update;
 828                struct io_accept        accept;
 829                struct io_sync          sync;
 830                struct io_cancel        cancel;
 831                struct io_timeout       timeout;
 832                struct io_timeout_rem   timeout_rem;
 833                struct io_connect       connect;
 834                struct io_sr_msg        sr_msg;
 835                struct io_open          open;
 836                struct io_close         close;
 837                struct io_rsrc_update   rsrc_update;
 838                struct io_fadvise       fadvise;
 839                struct io_madvise       madvise;
 840                struct io_epoll         epoll;
 841                struct io_splice        splice;
 842                struct io_provide_buf   pbuf;
 843                struct io_statx         statx;
 844                struct io_shutdown      shutdown;
 845                struct io_rename        rename;
 846                struct io_unlink        unlink;
 847                struct io_mkdir         mkdir;
 848                struct io_symlink       symlink;
 849                struct io_hardlink      hardlink;
 850        };
 851
 852        u8                              opcode;
 853        /* polled IO has completed */
 854        u8                              iopoll_completed;
 855        u16                             buf_index;
 856        unsigned int                    flags;
 857
 858        u64                             user_data;
 859        u32                             result;
 860        u32                             cflags;
 861
 862        struct io_ring_ctx              *ctx;
 863        struct task_struct              *task;
 864
 865        struct percpu_ref               *fixed_rsrc_refs;
 866        /* store used ubuf, so we can prevent reloading */
 867        struct io_mapped_ubuf           *imu;
 868
 869        /* used by request caches, completion batching and iopoll */
 870        struct io_wq_work_node          comp_list;
 871        atomic_t                        refs;
 872        struct io_kiocb                 *link;
 873        struct io_task_work             io_task_work;
 874        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 875        struct hlist_node               hash_node;
 876        /* internal polling, see IORING_FEAT_FAST_POLL */
 877        struct async_poll               *apoll;
 878        /* opcode allocated if it needs to store data for async defer */
 879        void                            *async_data;
 880        struct io_wq_work               work;
 881        /* custom credentials, valid IFF REQ_F_CREDS is set */
 882        const struct cred               *creds;
 883        /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 884        struct io_buffer                *kbuf;
 885};
 886
 887struct io_tctx_node {
 888        struct list_head        ctx_node;
 889        struct task_struct      *task;
 890        struct io_ring_ctx      *ctx;
 891};
 892
 893struct io_defer_entry {
 894        struct list_head        list;
 895        struct io_kiocb         *req;
 896        u32                     seq;
 897};
 898
 899struct io_op_def {
 900        /* needs req->file assigned */
 901        unsigned                needs_file : 1;
 902        /* should block plug */
 903        unsigned                plug : 1;
 904        /* hash wq insertion if file is a regular file */
 905        unsigned                hash_reg_file : 1;
 906        /* unbound wq insertion if file is a non-regular file */
 907        unsigned                unbound_nonreg_file : 1;
 908        /* set if opcode supports polled "wait" */
 909        unsigned                pollin : 1;
 910        unsigned                pollout : 1;
 911        /* op supports buffer selection */
 912        unsigned                buffer_select : 1;
  913        /* do async prep if the request is going to be punted */
 914        unsigned                needs_async_setup : 1;
 915        /* opcode is not supported by this kernel */
 916        unsigned                not_supported : 1;
 917        /* skip auditing */
 918        unsigned                audit_skip : 1;
 919        /* size of async data needed, if any */
 920        unsigned short          async_size;
 921};
 922
 923static const struct io_op_def io_op_defs[] = {
 924        [IORING_OP_NOP] = {},
 925        [IORING_OP_READV] = {
 926                .needs_file             = 1,
 927                .unbound_nonreg_file    = 1,
 928                .pollin                 = 1,
 929                .buffer_select          = 1,
 930                .needs_async_setup      = 1,
 931                .plug                   = 1,
 932                .audit_skip             = 1,
 933                .async_size             = sizeof(struct io_async_rw),
 934        },
 935        [IORING_OP_WRITEV] = {
 936                .needs_file             = 1,
 937                .hash_reg_file          = 1,
 938                .unbound_nonreg_file    = 1,
 939                .pollout                = 1,
 940                .needs_async_setup      = 1,
 941                .plug                   = 1,
 942                .audit_skip             = 1,
 943                .async_size             = sizeof(struct io_async_rw),
 944        },
 945        [IORING_OP_FSYNC] = {
 946                .needs_file             = 1,
 947                .audit_skip             = 1,
 948        },
 949        [IORING_OP_READ_FIXED] = {
 950                .needs_file             = 1,
 951                .unbound_nonreg_file    = 1,
 952                .pollin                 = 1,
 953                .plug                   = 1,
 954                .audit_skip             = 1,
 955                .async_size             = sizeof(struct io_async_rw),
 956        },
 957        [IORING_OP_WRITE_FIXED] = {
 958                .needs_file             = 1,
 959                .hash_reg_file          = 1,
 960                .unbound_nonreg_file    = 1,
 961                .pollout                = 1,
 962                .plug                   = 1,
 963                .audit_skip             = 1,
 964                .async_size             = sizeof(struct io_async_rw),
 965        },
 966        [IORING_OP_POLL_ADD] = {
 967                .needs_file             = 1,
 968                .unbound_nonreg_file    = 1,
 969                .audit_skip             = 1,
 970        },
 971        [IORING_OP_POLL_REMOVE] = {
 972                .audit_skip             = 1,
 973        },
 974        [IORING_OP_SYNC_FILE_RANGE] = {
 975                .needs_file             = 1,
 976                .audit_skip             = 1,
 977        },
 978        [IORING_OP_SENDMSG] = {
 979                .needs_file             = 1,
 980                .unbound_nonreg_file    = 1,
 981                .pollout                = 1,
 982                .needs_async_setup      = 1,
 983                .async_size             = sizeof(struct io_async_msghdr),
 984        },
 985        [IORING_OP_RECVMSG] = {
 986                .needs_file             = 1,
 987                .unbound_nonreg_file    = 1,
 988                .pollin                 = 1,
 989                .buffer_select          = 1,
 990                .needs_async_setup      = 1,
 991                .async_size             = sizeof(struct io_async_msghdr),
 992        },
 993        [IORING_OP_TIMEOUT] = {
 994                .audit_skip             = 1,
 995                .async_size             = sizeof(struct io_timeout_data),
 996        },
 997        [IORING_OP_TIMEOUT_REMOVE] = {
 998                /* used by timeout updates' prep() */
 999                .audit_skip             = 1,
1000        },
1001        [IORING_OP_ACCEPT] = {
1002                .needs_file             = 1,
1003                .unbound_nonreg_file    = 1,
1004                .pollin                 = 1,
1005        },
1006        [IORING_OP_ASYNC_CANCEL] = {
1007                .audit_skip             = 1,
1008        },
1009        [IORING_OP_LINK_TIMEOUT] = {
1010                .audit_skip             = 1,
1011                .async_size             = sizeof(struct io_timeout_data),
1012        },
1013        [IORING_OP_CONNECT] = {
1014                .needs_file             = 1,
1015                .unbound_nonreg_file    = 1,
1016                .pollout                = 1,
1017                .needs_async_setup      = 1,
1018                .async_size             = sizeof(struct io_async_connect),
1019        },
1020        [IORING_OP_FALLOCATE] = {
1021                .needs_file             = 1,
1022        },
1023        [IORING_OP_OPENAT] = {},
1024        [IORING_OP_CLOSE] = {},
1025        [IORING_OP_FILES_UPDATE] = {
1026                .audit_skip             = 1,
1027        },
1028        [IORING_OP_STATX] = {
1029                .audit_skip             = 1,
1030        },
1031        [IORING_OP_READ] = {
1032                .needs_file             = 1,
1033                .unbound_nonreg_file    = 1,
1034                .pollin                 = 1,
1035                .buffer_select          = 1,
1036                .plug                   = 1,
1037                .audit_skip             = 1,
1038                .async_size             = sizeof(struct io_async_rw),
1039        },
1040        [IORING_OP_WRITE] = {
1041                .needs_file             = 1,
1042                .hash_reg_file          = 1,
1043                .unbound_nonreg_file    = 1,
1044                .pollout                = 1,
1045                .plug                   = 1,
1046                .audit_skip             = 1,
1047                .async_size             = sizeof(struct io_async_rw),
1048        },
1049        [IORING_OP_FADVISE] = {
1050                .needs_file             = 1,
1051                .audit_skip             = 1,
1052        },
1053        [IORING_OP_MADVISE] = {},
1054        [IORING_OP_SEND] = {
1055                .needs_file             = 1,
1056                .unbound_nonreg_file    = 1,
1057                .pollout                = 1,
1058                .audit_skip             = 1,
1059        },
1060        [IORING_OP_RECV] = {
1061                .needs_file             = 1,
1062                .unbound_nonreg_file    = 1,
1063                .pollin                 = 1,
1064                .buffer_select          = 1,
1065                .audit_skip             = 1,
1066        },
1067        [IORING_OP_OPENAT2] = {
1068        },
1069        [IORING_OP_EPOLL_CTL] = {
1070                .unbound_nonreg_file    = 1,
1071                .audit_skip             = 1,
1072        },
1073        [IORING_OP_SPLICE] = {
1074                .needs_file             = 1,
1075                .hash_reg_file          = 1,
1076                .unbound_nonreg_file    = 1,
1077                .audit_skip             = 1,
1078        },
1079        [IORING_OP_PROVIDE_BUFFERS] = {
1080                .audit_skip             = 1,
1081        },
1082        [IORING_OP_REMOVE_BUFFERS] = {
1083                .audit_skip             = 1,
1084        },
1085        [IORING_OP_TEE] = {
1086                .needs_file             = 1,
1087                .hash_reg_file          = 1,
1088                .unbound_nonreg_file    = 1,
1089                .audit_skip             = 1,
1090        },
1091        [IORING_OP_SHUTDOWN] = {
1092                .needs_file             = 1,
1093        },
1094        [IORING_OP_RENAMEAT] = {},
1095        [IORING_OP_UNLINKAT] = {},
1096        [IORING_OP_MKDIRAT] = {},
1097        [IORING_OP_SYMLINKAT] = {},
1098        [IORING_OP_LINKAT] = {},
1099};
1100
1101/* requests with any of those set should undergo io_disarm_next() */
1102#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
1103
1104static bool io_disarm_next(struct io_kiocb *req);
1105static void io_uring_del_tctx_node(unsigned long index);
1106static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1107                                         struct task_struct *task,
1108                                         bool cancel_all);
1109static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1110
1111static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1112                                 s32 res, u32 cflags);
1113static void io_put_req(struct io_kiocb *req);
1114static void io_put_req_deferred(struct io_kiocb *req);
1115static void io_dismantle_req(struct io_kiocb *req);
1116static void io_queue_linked_timeout(struct io_kiocb *req);
1117static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
1118                                     struct io_uring_rsrc_update2 *up,
1119                                     unsigned nr_args);
1120static void io_clean_op(struct io_kiocb *req);
1121static struct file *io_file_get(struct io_ring_ctx *ctx,
1122                                struct io_kiocb *req, int fd, bool fixed);
1123static void __io_queue_sqe(struct io_kiocb *req);
1124static void io_rsrc_put_work(struct work_struct *work);
1125
1126static void io_req_task_queue(struct io_kiocb *req);
1127static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
1128static int io_req_prep_async(struct io_kiocb *req);
1129
1130static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1131                                 unsigned int issue_flags, u32 slot_index);
1132static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
1133
1134static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
1135
1136static struct kmem_cache *req_cachep;
1137
1138static const struct file_operations io_uring_fops;
1139
1140struct sock *io_uring_get_socket(struct file *file)
1141{
1142#if defined(CONFIG_UNIX)
1143        if (file->f_op == &io_uring_fops) {
1144                struct io_ring_ctx *ctx = file->private_data;
1145
1146                return ctx->ring_sock->sk;
1147        }
1148#endif
1149        return NULL;
1150}
1151EXPORT_SYMBOL(io_uring_get_socket);
1152
1153static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1154{
1155        if (!*locked) {
1156                mutex_lock(&ctx->uring_lock);
1157                *locked = true;
1158        }
1159}
1160
1161#define io_for_each_link(pos, head) \
1162        for (pos = (head); pos; pos = pos->link)
1163
1164/*
1165 * Shamelessly stolen from the mm implementation of page reference checking,
1166 * see commit f958d7b528b1 for details.
1167 */
1168#define req_ref_zero_or_close_to_overflow(req)  \
1169        ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
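/*
 * With unsigned wrap-around, "x + 127u <= 127u" is true only for x == 0 or
 * for x in the top 127 values of the range, i.e. a refcount that is zero,
 * has underflowed, or is about to overflow. That is what the WARN_ON_ONCE()
 * checks below are guarding against.
 */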
1170
1171static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1172{
1173        WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1174        return atomic_inc_not_zero(&req->refs);
1175}
1176
1177static inline bool req_ref_put_and_test(struct io_kiocb *req)
1178{
1179        if (likely(!(req->flags & REQ_F_REFCOUNT)))
1180                return true;
1181
1182        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1183        return atomic_dec_and_test(&req->refs);
1184}
1185
1186static inline void req_ref_put(struct io_kiocb *req)
1187{
1188        WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1189        WARN_ON_ONCE(req_ref_put_and_test(req));
1190}
1191
1192static inline void req_ref_get(struct io_kiocb *req)
1193{
1194        WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1195        WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1196        atomic_inc(&req->refs);
1197}
1198
1199static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
1200{
1201        if (!wq_list_empty(&ctx->submit_state.compl_reqs))
1202                __io_submit_flush_completions(ctx);
1203}
1204
1205static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
1206{
1207        if (!(req->flags & REQ_F_REFCOUNT)) {
1208                req->flags |= REQ_F_REFCOUNT;
1209                atomic_set(&req->refs, nr);
1210        }
1211}
1212
1213static inline void io_req_set_refcount(struct io_kiocb *req)
1214{
1215        __io_req_set_refcount(req, 1);
1216}
1217
1218#define IO_RSRC_REF_BATCH       100
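/*
 * Requests pin the current rsrc node through its percpu ref. To avoid a
 * percpu ref operation per request, the ctx keeps a cache of pre-taken
 * references (rsrc_cached_refs, under uring_lock): io_req_set_rsrc_node()
 * hands one out, io_req_put_rsrc_locked() returns it to the cache when the
 * request still points at the current node, io_rsrc_refs_refill() tops the
 * cache up by IO_RSRC_REF_BATCH when it runs dry, and io_rsrc_refs_drop()
 * gives any cached surplus back to the percpu ref.
 */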
1219
1220static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
1221                                          struct io_ring_ctx *ctx)
1222        __must_hold(&ctx->uring_lock)
1223{
1224        struct percpu_ref *ref = req->fixed_rsrc_refs;
1225
1226        if (ref) {
1227                if (ref == &ctx->rsrc_node->refs)
1228                        ctx->rsrc_cached_refs++;
1229                else
1230                        percpu_ref_put(ref);
1231        }
1232}
1233
1234static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx)
1235{
1236        if (req->fixed_rsrc_refs)
1237                percpu_ref_put(req->fixed_rsrc_refs);
1238}
1239
1240static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
1241        __must_hold(&ctx->uring_lock)
1242{
1243        if (ctx->rsrc_cached_refs) {
1244                percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs);
1245                ctx->rsrc_cached_refs = 0;
1246        }
1247}
1248
1249static void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
1250        __must_hold(&ctx->uring_lock)
1251{
1252        ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
1253        percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
1254}
1255
1256static inline void io_req_set_rsrc_node(struct io_kiocb *req,
1257                                        struct io_ring_ctx *ctx)
1258{
1259        if (!req->fixed_rsrc_refs) {
1260                req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1261                ctx->rsrc_cached_refs--;
1262                if (unlikely(ctx->rsrc_cached_refs < 0))
1263                        io_rsrc_refs_refill(ctx);
1264        }
1265}
1266
1267static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1268{
1269        bool got = percpu_ref_tryget(ref);
1270
1271        /* already at zero, wait for ->release() */
1272        if (!got)
1273                wait_for_completion(compl);
1274        percpu_ref_resurrect(ref);
1275        if (got)
1276                percpu_ref_put(ref);
1277}
1278
1279static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1280                          bool cancel_all)
1281        __must_hold(&req->ctx->timeout_lock)
1282{
1283        struct io_kiocb *req;
1284
1285        if (task && head->task != task)
1286                return false;
1287        if (cancel_all)
1288                return true;
1289
1290        io_for_each_link(req, head) {
1291                if (req->flags & REQ_F_INFLIGHT)
1292                        return true;
1293        }
1294        return false;
1295}
1296
1297static bool io_match_linked(struct io_kiocb *head)
1298{
1299        struct io_kiocb *req;
1300
1301        io_for_each_link(req, head) {
1302                if (req->flags & REQ_F_INFLIGHT)
1303                        return true;
1304        }
1305        return false;
1306}
1307
1308/*
1309 * As io_match_task() but protected against racing with linked timeouts.
1310 * User must not hold timeout_lock.
1311 */
1312static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
1313                               bool cancel_all)
1314{
1315        bool matched;
1316
1317        if (task && head->task != task)
1318                return false;
1319        if (cancel_all)
1320                return true;
1321
1322        if (head->flags & REQ_F_LINK_TIMEOUT) {
1323                struct io_ring_ctx *ctx = head->ctx;
1324
1325                /* protect against races with linked timeouts */
1326                spin_lock_irq(&ctx->timeout_lock);
1327                matched = io_match_linked(head);
1328                spin_unlock_irq(&ctx->timeout_lock);
1329        } else {
1330                matched = io_match_linked(head);
1331        }
1332        return matched;
1333}
1334
1335static inline bool req_has_async_data(struct io_kiocb *req)
1336{
1337        return req->flags & REQ_F_ASYNC_DATA;
1338}
1339
1340static inline void req_set_fail(struct io_kiocb *req)
1341{
1342        req->flags |= REQ_F_FAIL;
1343}
1344
1345static inline void req_fail_link_node(struct io_kiocb *req, int res)
1346{
1347        req_set_fail(req);
1348        req->result = res;
1349}
1350
1351static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
1352{
1353        struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1354
1355        complete(&ctx->ref_comp);
1356}
1357
1358static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1359{
1360        return !req->timeout.off;
1361}
1362
1363static __cold void io_fallback_req_func(struct work_struct *work)
1364{
1365        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1366                                                fallback_work.work);
1367        struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1368        struct io_kiocb *req, *tmp;
1369        bool locked = false;
1370
1371        percpu_ref_get(&ctx->refs);
1372        llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1373                req->io_task_work.func(req, &locked);
1374
1375        if (locked) {
1376                io_submit_flush_completions(ctx);
1377                mutex_unlock(&ctx->uring_lock);
1378        }
1379        percpu_ref_put(&ctx->refs);
1380}
1381
1382static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1383{
1384        struct io_ring_ctx *ctx;
1385        int hash_bits;
1386
1387        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1388        if (!ctx)
1389                return NULL;
1390
1391        /*
 1392         * Use 5 bits fewer than the max cq entries; that should give us
 1393         * around 32 entries per hash list if totally full and uniformly spread.
1394         */
1395        hash_bits = ilog2(p->cq_entries);
1396        hash_bits -= 5;
1397        if (hash_bits <= 0)
1398                hash_bits = 1;
1399        ctx->cancel_hash_bits = hash_bits;
1400        ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1401                                        GFP_KERNEL);
1402        if (!ctx->cancel_hash)
1403                goto err;
1404        __hash_init(ctx->cancel_hash, 1U << hash_bits);
1405
1406        ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1407        if (!ctx->dummy_ubuf)
1408                goto err;
 1409        /* set an invalid range, so io_import_fixed() fails on it */
1410        ctx->dummy_ubuf->ubuf = -1UL;
1411
1412        if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1413                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1414                goto err;
1415
1416        ctx->flags = p->flags;
1417        init_waitqueue_head(&ctx->sqo_sq_wait);
1418        INIT_LIST_HEAD(&ctx->sqd_list);
1419        INIT_LIST_HEAD(&ctx->cq_overflow_list);
1420        init_completion(&ctx->ref_comp);
1421        xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1422        xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1423        mutex_init(&ctx->uring_lock);
1424        init_waitqueue_head(&ctx->cq_wait);
1425        spin_lock_init(&ctx->completion_lock);
1426        spin_lock_init(&ctx->timeout_lock);
1427        INIT_WQ_LIST(&ctx->iopoll_list);
1428        INIT_LIST_HEAD(&ctx->defer_list);
1429        INIT_LIST_HEAD(&ctx->timeout_list);
1430        INIT_LIST_HEAD(&ctx->ltimeout_list);
1431        spin_lock_init(&ctx->rsrc_ref_lock);
1432        INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1433        INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1434        init_llist_head(&ctx->rsrc_put_llist);
1435        INIT_LIST_HEAD(&ctx->tctx_list);
1436        ctx->submit_state.free_list.next = NULL;
1437        INIT_WQ_LIST(&ctx->locked_free_list);
1438        INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1439        INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
1440        return ctx;
1441err:
1442        kfree(ctx->dummy_ubuf);
1443        kfree(ctx->cancel_hash);
1444        kfree(ctx);
1445        return NULL;
1446}
1447
1448static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1449{
1450        struct io_rings *r = ctx->rings;
1451
1452        WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1453        ctx->cq_extra--;
1454}
1455
1456static bool req_need_defer(struct io_kiocb *req, u32 seq)
1457{
1458        if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1459                struct io_ring_ctx *ctx = req->ctx;
1460
1461                return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1462        }
1463
1464        return false;
1465}
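
/*
 * Illustrative sketch (not from the kernel sources): REQ_F_IO_DRAIN is set in
 * response to the IOSQE_IO_DRAIN flag on an SQE.  A hedged liburing example of
 * requesting a drained fsync; error handling is trimmed and the ring/fd are
 * assumed to be set up elsewhere.
 */
#include <liburing.h>

static int demo_drained_fsync(struct io_uring *ring, int fd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        if (!sqe)
                return -1;
        io_uring_prep_fsync(sqe, fd, 0);
        /* don't issue this SQE until all previously submitted SQEs complete */
        sqe->flags |= IOSQE_IO_DRAIN;
        return io_uring_submit(ring);
}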
1466
1467#define FFS_NOWAIT              0x1UL
1468#define FFS_ISREG               0x2UL
1469#define FFS_MASK                ~(FFS_NOWAIT|FFS_ISREG)
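
/*
 * Illustrative sketch (not from the kernel sources): these FFS_* bits are
 * packed into the low bits of the (at least 4-byte aligned) file pointers held
 * in the fixed-file table.  A minimal userspace version of the same
 * pointer-tagging trick; the DEMO_* names are made up for the example.
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_TAG_NOWAIT 0x1UL
#define DEMO_TAG_ISREG  0x2UL
#define DEMO_TAG_MASK   (~(DEMO_TAG_NOWAIT | DEMO_TAG_ISREG))

static inline uintptr_t demo_pack(void *ptr, unsigned long tags)
{
        /* alignment guarantees the two low bits are free */
        assert(((uintptr_t)ptr & ~DEMO_TAG_MASK) == 0);
        return (uintptr_t)ptr | tags;
}

static inline void *demo_unpack(uintptr_t packed, unsigned long *tags)
{
        *tags = packed & ~DEMO_TAG_MASK;
        return (void *)(packed & DEMO_TAG_MASK);
}

int main(void)
{
        static int obj;                 /* stand-in for a struct file */
        unsigned long tags;
        uintptr_t slot = demo_pack(&obj, DEMO_TAG_NOWAIT | DEMO_TAG_ISREG);

        return demo_unpack(slot, &tags) == (void *)&obj && tags == 0x3UL ? 0 : 1;
}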
1470
1471static inline bool io_req_ffs_set(struct io_kiocb *req)
1472{
1473        return req->flags & REQ_F_FIXED_FILE;
1474}
1475
1476static inline void io_req_track_inflight(struct io_kiocb *req)
1477{
1478        if (!(req->flags & REQ_F_INFLIGHT)) {
1479                req->flags |= REQ_F_INFLIGHT;
1480                atomic_inc(&current->io_uring->inflight_tracked);
1481        }
1482}
1483
1484static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1485{
1486        if (WARN_ON_ONCE(!req->link))
1487                return NULL;
1488
1489        req->flags &= ~REQ_F_ARM_LTIMEOUT;
1490        req->flags |= REQ_F_LINK_TIMEOUT;
1491
1492        /* linked timeouts should have two refs once prep'ed */
1493        io_req_set_refcount(req);
1494        __io_req_set_refcount(req->link, 2);
1495        return req->link;
1496}
1497
1498static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1499{
1500        if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
1501                return NULL;
1502        return __io_prep_linked_timeout(req);
1503}
1504
1505static void io_prep_async_work(struct io_kiocb *req)
1506{
1507        const struct io_op_def *def = &io_op_defs[req->opcode];
1508        struct io_ring_ctx *ctx = req->ctx;
1509
1510        if (!(req->flags & REQ_F_CREDS)) {
1511                req->flags |= REQ_F_CREDS;
1512                req->creds = get_current_cred();
1513        }
1514
1515        req->work.list.next = NULL;
1516        req->work.flags = 0;
1517        if (req->flags & REQ_F_FORCE_ASYNC)
1518                req->work.flags |= IO_WQ_WORK_CONCURRENT;
1519
1520        if (req->flags & REQ_F_ISREG) {
1521                if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1522                        io_wq_hash_work(&req->work, file_inode(req->file));
1523        } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1524                if (def->unbound_nonreg_file)
1525                        req->work.flags |= IO_WQ_WORK_UNBOUND;
1526        }
1527
1528        switch (req->opcode) {
1529        case IORING_OP_SPLICE:
1530        case IORING_OP_TEE:
1531                if (!S_ISREG(file_inode(req->splice.file_in)->i_mode))
1532                        req->work.flags |= IO_WQ_WORK_UNBOUND;
1533                break;
1534        }
1535}
1536
1537static void io_prep_async_link(struct io_kiocb *req)
1538{
1539        struct io_kiocb *cur;
1540
1541        if (req->flags & REQ_F_LINK_TIMEOUT) {
1542                struct io_ring_ctx *ctx = req->ctx;
1543
1544                spin_lock_irq(&ctx->timeout_lock);
1545                io_for_each_link(cur, req)
1546                        io_prep_async_work(cur);
1547                spin_unlock_irq(&ctx->timeout_lock);
1548        } else {
1549                io_for_each_link(cur, req)
1550                        io_prep_async_work(cur);
1551        }
1552}
1553
1554static inline void io_req_add_compl_list(struct io_kiocb *req)
1555{
1556        struct io_submit_state *state = &req->ctx->submit_state;
1557
1558        wq_list_add_tail(&req->comp_list, &state->compl_reqs);
1559}
1560
1561static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
1562{
1563        struct io_ring_ctx *ctx = req->ctx;
1564        struct io_kiocb *link = io_prep_linked_timeout(req);
1565        struct io_uring_task *tctx = req->task->io_uring;
1566
1567        BUG_ON(!tctx);
1568        BUG_ON(!tctx->io_wq);
1569
1570        /* init ->work of the whole link before punting */
1571        io_prep_async_link(req);
1572
1573        /*
1574         * Not expected to happen, but if we do have a bug where this _can_
1575         * happen, catch it here and ensure the request is marked as
1576         * canceled. That will make io-wq go through the usual work cancel
1577         * procedure rather than attempt to run this request (or create a new
1578         * worker for it).
1579         */
1580        if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1581                req->work.flags |= IO_WQ_WORK_CANCEL;
1582
1583        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1584                                        &req->work, req->flags);
1585        io_wq_enqueue(tctx->io_wq, &req->work);
1586        if (link)
1587                io_queue_linked_timeout(link);
1588}
1589
1590static void io_kill_timeout(struct io_kiocb *req, int status)
1591        __must_hold(&req->ctx->completion_lock)
1592        __must_hold(&req->ctx->timeout_lock)
1593{
1594        struct io_timeout_data *io = req->async_data;
1595
1596        if (hrtimer_try_to_cancel(&io->timer) != -1) {
1597                if (status)
1598                        req_set_fail(req);
1599                atomic_set(&req->ctx->cq_timeouts,
1600                        atomic_read(&req->ctx->cq_timeouts) + 1);
1601                list_del_init(&req->timeout.list);
1602                io_cqring_fill_event(req->ctx, req->user_data, status, 0);
1603                io_put_req_deferred(req);
1604        }
1605}
1606
1607static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
1608{
1609        while (!list_empty(&ctx->defer_list)) {
1610                struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1611                                                struct io_defer_entry, list);
1612
1613                if (req_need_defer(de->req, de->seq))
1614                        break;
1615                list_del_init(&de->list);
1616                io_req_task_queue(de->req);
1617                kfree(de);
1618        }
1619}
1620
1621static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
1622        __must_hold(&ctx->completion_lock)
1623{
1624        u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1625
1626        spin_lock_irq(&ctx->timeout_lock);
1627        while (!list_empty(&ctx->timeout_list)) {
1628                u32 events_needed, events_got;
1629                struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1630                                                struct io_kiocb, timeout.list);
1631
1632                if (io_is_timeout_noseq(req))
1633                        break;
1634
1635                /*
1636                 * Since seq can easily wrap around over time, subtract
1637                 * the last seq at which timeouts were flushed before comparing.
1638                 * Assuming not more than 2^31-1 events have happened since,
1639                 * these subtractions won't have wrapped, so we can check if
1640                 * target is in [last_seq, current_seq] by comparing the two.
1641                 */
1642                events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1643                events_got = seq - ctx->cq_last_tm_flush;
1644                if (events_got < events_needed)
1645                        break;
1646
1647                list_del_init(&req->timeout.list);
1648                io_kill_timeout(req, 0);
1649        }
1650        ctx->cq_last_tm_flush = seq;
1651        spin_unlock_irq(&ctx->timeout_lock);
1652}
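
/*
 * Illustrative sketch (not from the kernel sources): the wraparound-safe
 * comparison described in the comment above, in isolation.  Subtracting the
 * last flush point from both sides keeps the check correct across u32 wrap,
 * provided fewer than 2^31 events separate the two points.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* true if 'target' lies within (last_flush, current_seq], modulo 2^32 */
static bool demo_seq_reached(uint32_t target, uint32_t current_seq,
                             uint32_t last_flush)
{
        uint32_t needed = target - last_flush;
        uint32_t got = current_seq - last_flush;

        return got >= needed;
}

int main(void)
{
        /* correct even across the wrap from 0xfffffff0 to 0x10 */
        printf("%d\n", demo_seq_reached(0x5, 0x10, 0xfffffff0));        /* 1 */
        printf("%d\n", demo_seq_reached(0x20, 0x10, 0xfffffff0));       /* 0 */
        return 0;
}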
1653
1654static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
1655{
1656        if (ctx->off_timeout_used)
1657                io_flush_timeouts(ctx);
1658        if (ctx->drain_active)
1659                io_queue_deferred(ctx);
1660}
1661
1662static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1663{
1664        if (unlikely(ctx->off_timeout_used || ctx->drain_active))
1665                __io_commit_cqring_flush(ctx);
1666        /* order cqe stores with ring update */
1667        smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1668}
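
/*
 * Illustrative sketch (not from the kernel sources): how the release store of
 * the CQ tail above pairs with an acquire load on the consumer side.  A
 * userspace version using C11 atomics; struct demo_ring and its 16-entry size
 * are invented for the example.
 */
#include <stdatomic.h>
#include <stdint.h>

struct demo_ring {
        uint32_t entries[16];
        _Atomic uint32_t tail;
};

static void demo_publish(struct demo_ring *r, uint32_t cached_tail, uint32_t val)
{
        r->entries[cached_tail & 15] = val;
        /* order the entry store before the tail store, as io_commit_cqring() does */
        atomic_store_explicit(&r->tail, cached_tail + 1, memory_order_release);
}

static int demo_consume(struct demo_ring *r, uint32_t *head, uint32_t *val)
{
        /* pairs with the release store in demo_publish() */
        uint32_t tail = atomic_load_explicit(&r->tail, memory_order_acquire);

        if (tail == *head)
                return 0;
        *val = r->entries[*head & 15];
        (*head)++;
        return 1;
}

int main(void)
{
        struct demo_ring r = { .tail = 0 };
        uint32_t head = 0, val;

        demo_publish(&r, 0, 42);
        return demo_consume(&r, &head, &val) && val == 42 ? 0 : 1;
}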
1669
1670static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1671{
1672        struct io_rings *r = ctx->rings;
1673
1674        return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1675}
1676
1677static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1678{
1679        return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1680}
1681
1682static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1683{
1684        struct io_rings *rings = ctx->rings;
1685        unsigned tail, mask = ctx->cq_entries - 1;
1686
1687        /*
1688         * writes to the cq entry need to come after reading head; the
1689         * control dependency is enough as we're using WRITE_ONCE to
1690         * fill the cq entry
1691         */
1692        if (__io_cqring_events(ctx) == ctx->cq_entries)
1693                return NULL;
1694
1695        tail = ctx->cached_cq_tail++;
1696        return &rings->cqes[tail & mask];
1697}
1698
1699static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1700{
1701        if (likely(!ctx->cq_ev_fd))
1702                return false;
1703        if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1704                return false;
1705        return !ctx->eventfd_async || io_wq_current_is_worker();
1706}
1707
1708/*
1709 * This should only get called when at least one event has been posted.
1710 * Some applications rely on the eventfd notification count only changing
1711 * IFF a new CQE has been added to the CQ ring. There's no dependency on
1712 * a 1:1 relationship between how many times this function is called (and
1713 * hence the eventfd count) and the number of CQEs posted to the CQ ring.
1714 */
1715static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1716{
1717        /*
1718         * wake_up_all() may seem excessive, but io_wake_function() and
1719         * io_should_wake() handle the termination of the loop and only
1720         * wake as many waiters as we need to.
1721         */
1722        if (wq_has_sleeper(&ctx->cq_wait))
1723                wake_up_all(&ctx->cq_wait);
1724        if (io_should_trigger_evfd(ctx))
1725                eventfd_signal(ctx->cq_ev_fd, 1);
1726}
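
/*
 * Illustrative sketch (not from the kernel sources): how userspace arranges to
 * receive the eventfd_signal() above.  A hedged liburing example; error
 * handling is minimal and the ring is assumed to be initialised elsewhere.
 */
#include <liburing.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

static int demo_register_cq_eventfd(struct io_uring *ring)
{
        int efd = eventfd(0, 0);

        if (efd < 0)
                return -1;
        /* the kernel bumps this eventfd as CQEs are posted */
        if (io_uring_register_eventfd(ring, efd)) {
                close(efd);
                return -1;
        }
        return efd;
}

static void demo_wait_for_completions(int efd)
{
        uint64_t count;

        /* blocks until at least one completion notification arrives */
        if (read(efd, &count, sizeof(count)) != sizeof(count))
                return;
}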
1727
1728static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1729{
1730        /* see waitqueue_active() comment */
1731        smp_mb();
1732
1733        if (ctx->flags & IORING_SETUP_SQPOLL) {
1734                if (waitqueue_active(&ctx->cq_wait))
1735                        wake_up_all(&ctx->cq_wait);
1736        }
1737        if (io_should_trigger_evfd(ctx))
1738                eventfd_signal(ctx->cq_ev_fd, 1);
1739}
1740
1741/* Returns true if there are no backlogged entries after the flush */
1742static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1743{
1744        bool all_flushed, posted;
1745
1746        if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1747                return false;
1748
1749        posted = false;
1750        spin_lock(&ctx->completion_lock);
1751        while (!list_empty(&ctx->cq_overflow_list)) {
1752                struct io_uring_cqe *cqe = io_get_cqe(ctx);
1753                struct io_overflow_cqe *ocqe;
1754
1755                if (!cqe && !force)
1756                        break;
1757                ocqe = list_first_entry(&ctx->cq_overflow_list,
1758                                        struct io_overflow_cqe, list);
1759                if (cqe)
1760                        memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1761                else
1762                        io_account_cq_overflow(ctx);
1763
1764                posted = true;
1765                list_del(&ocqe->list);
1766                kfree(ocqe);
1767        }
1768
1769        all_flushed = list_empty(&ctx->cq_overflow_list);
1770        if (all_flushed) {
1771                clear_bit(0, &ctx->check_cq_overflow);
1772                WRITE_ONCE(ctx->rings->sq_flags,
1773                           ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
1774        }
1775
1776        if (posted)
1777                io_commit_cqring(ctx);
1778        spin_unlock(&ctx->completion_lock);
1779        if (posted)
1780                io_cqring_ev_posted(ctx);
1781        return all_flushed;
1782}
1783
1784static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1785{
1786        bool ret = true;
1787
1788        if (test_bit(0, &ctx->check_cq_overflow)) {
1789                /* iopoll syncs against uring_lock, not completion_lock */
1790                if (ctx->flags & IORING_SETUP_IOPOLL)
1791                        mutex_lock(&ctx->uring_lock);
1792                ret = __io_cqring_overflow_flush(ctx, false);
1793                if (ctx->flags & IORING_SETUP_IOPOLL)
1794                        mutex_unlock(&ctx->uring_lock);
1795        }
1796
1797        return ret;
1798}
1799
1800/* must be called somewhat shortly after putting a request */
1801static inline void io_put_task(struct task_struct *task, int nr)
1802{
1803        struct io_uring_task *tctx = task->io_uring;
1804
1805        if (likely(task == current)) {
1806                tctx->cached_refs += nr;
1807        } else {
1808                percpu_counter_sub(&tctx->inflight, nr);
1809                if (unlikely(atomic_read(&tctx->in_idle)))
1810                        wake_up(&tctx->wait);
1811                put_task_struct_many(task, nr);
1812        }
1813}
1814
1815static void io_task_refs_refill(struct io_uring_task *tctx)
1816{
1817        unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
1818
1819        percpu_counter_add(&tctx->inflight, refill);
1820        refcount_add(refill, &current->usage);
1821        tctx->cached_refs += refill;
1822}
1823
1824static inline void io_get_task_refs(int nr)
1825{
1826        struct io_uring_task *tctx = current->io_uring;
1827
1828        tctx->cached_refs -= nr;
1829        if (unlikely(tctx->cached_refs < 0))
1830                io_task_refs_refill(tctx);
1831}
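
/*
 * Illustrative sketch (not from the kernel sources): the cached-reference
 * pattern above, reduced to a userspace version.  The shared atomic counter is
 * only touched when the per-thread cache runs dry; DEMO_CACHE_NR plays the
 * role of IO_TCTX_REFS_CACHE_NR and its value is arbitrary here.
 */
#include <stdatomic.h>

#define DEMO_CACHE_NR   128

static _Atomic long demo_shared_refs;   /* contended, cross-thread */
static __thread int demo_cached_refs;   /* cheap, thread-local */

static void demo_get_refs(int nr)
{
        demo_cached_refs -= nr;
        if (demo_cached_refs < 0) {
                int refill = DEMO_CACHE_NR - demo_cached_refs;

                /* one shared update pays for many future gets */
                atomic_fetch_add(&demo_shared_refs, refill);
                demo_cached_refs += refill;
        }
}

int main(void)
{
        demo_get_refs(1);
        return atomic_load(&demo_shared_refs) == DEMO_CACHE_NR + 1 ? 0 : 1;
}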
1832
1833static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1834                                     s32 res, u32 cflags)
1835{
1836        struct io_overflow_cqe *ocqe;
1837
1838        ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1839        if (!ocqe) {
1840                /*
1841                 * If we're in ring overflow flush mode, or in task cancel mode,
1842                 * or cannot allocate an overflow entry, then we need to drop it
1843                 * on the floor.
1844                 */
1845                io_account_cq_overflow(ctx);
1846                return false;
1847        }
1848        if (list_empty(&ctx->cq_overflow_list)) {
1849                set_bit(0, &ctx->check_cq_overflow);
1850                WRITE_ONCE(ctx->rings->sq_flags,
1851                           ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1852
1853        }
1854        ocqe->cqe.user_data = user_data;
1855        ocqe->cqe.res = res;
1856        ocqe->cqe.flags = cflags;
1857        list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1858        return true;
1859}
1860
1861static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1862                                          s32 res, u32 cflags)
1863{
1864        struct io_uring_cqe *cqe;
1865
1866        trace_io_uring_complete(ctx, user_data, res, cflags);
1867
1868        /*
1869         * If we can't get a cq entry, userspace overflowed the
1870         * submission (by quite a lot). Increment the overflow count in
1871         * the ring.
1872         */
1873        cqe = io_get_cqe(ctx);
1874        if (likely(cqe)) {
1875                WRITE_ONCE(cqe->user_data, user_data);
1876                WRITE_ONCE(cqe->res, res);
1877                WRITE_ONCE(cqe->flags, cflags);
1878                return true;
1879        }
1880        return io_cqring_event_overflow(ctx, user_data, res, cflags);
1881}
1882
1883/* not hot enough to be worth bloating with inlining */
1884static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data,
1885                                          s32 res, u32 cflags)
1886{
1887        return __io_cqring_fill_event(ctx, user_data, res, cflags);
1888}
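
/*
 * Illustrative sketch (not from the kernel sources): the user_data, res and
 * flags written by __io_cqring_fill_event() are what userspace reads back from
 * the CQ ring.  A hedged liburing example using a NOP request; error handling
 * is trimmed.
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(8, &ring, 0))
                return 1;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_nop(sqe);
        sqe->user_data = 0xcafe;        /* echoed back in cqe->user_data */
        io_uring_submit(&ring);

        if (!io_uring_wait_cqe(&ring, &cqe)) {
                printf("user_data=%llx res=%d flags=%x\n",
                       (unsigned long long)cqe->user_data, cqe->res, cqe->flags);
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return 0;
}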
1889
1890static void io_req_complete_post(struct io_kiocb *req, s32 res,
1891                                 u32 cflags)
1892{
1893        struct io_ring_ctx *ctx = req->ctx;
1894
1895        spin_lock(&ctx->completion_lock);
1896        __io_cqring_fill_event(ctx, req->user_data, res, cflags);
1897        /*
1898         * If we're the last reference to this request, add to our locked
1899         * free_list cache.
1900         */
1901        if (req_ref_put_and_test(req)) {
1902                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
1903                        if (req->flags & IO_DISARM_MASK)
1904                                io_disarm_next(req);
1905                        if (req->link) {
1906                                io_req_task_queue(req->link);
1907                                req->link = NULL;
1908                        }
1909                }
1910                io_req_put_rsrc(req, ctx);
1911                io_dismantle_req(req);
1912                io_put_task(req->task, 1);
1913                wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
1914                ctx->locked_free_nr++;
1915        }
1916        io_commit_cqring(ctx);
1917        spin_unlock(&ctx->completion_lock);
1918        io_cqring_ev_posted(ctx);
1919}
1920
1921static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
1922                                         u32 cflags)
1923{
1924        req->result = res;
1925        req->cflags = cflags;
1926        req->flags |= REQ_F_COMPLETE_INLINE;
1927}
1928
1929static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1930                                     s32 res, u32 cflags)
1931{
1932        if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1933                io_req_complete_state(req, res, cflags);
1934        else
1935                io_req_complete_post(req, res, cflags);
1936}
1937
1938static inline void io_req_complete(struct io_kiocb *req, s32 res)
1939{
1940        __io_req_complete(req, 0, res, 0);
1941}
1942
1943static void io_req_complete_failed(struct io_kiocb *req, s32 res)
1944{
1945        req_set_fail(req);
1946        io_req_complete_post(req, res, 0);
1947}
1948
1949static void io_req_complete_fail_submit(struct io_kiocb *req)
1950{
1951        /*
1952         * We won't submit; fail them all. To do that, replace hardlinks with
1953         * normal links. An extra REQ_F_LINK is tolerated.
1954         */
1955        req->flags &= ~REQ_F_HARDLINK;
1956        req->flags |= REQ_F_LINK;
1957        io_req_complete_failed(req, req->result);
1958}
1959
1960/*
1961 * Don't initialise the fields below on every allocation, but do that in
1962 * advance and keep them valid across allocations.
1963 */
1964static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1965{
1966        req->ctx = ctx;
1967        req->link = NULL;
1968        req->async_data = NULL;
1969        /* not necessary, but safer to zero */
1970        req->result = 0;
1971}
1972
1973static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1974                                        struct io_submit_state *state)
1975{
1976        spin_lock(&ctx->completion_lock);
1977        wq_list_splice(&ctx->locked_free_list, &state->free_list);
1978        ctx->locked_free_nr = 0;
1979        spin_unlock(&ctx->completion_lock);
1980}
1981
1982/* Returns true IFF there are requests in the cache */
1983static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1984{
1985        struct io_submit_state *state = &ctx->submit_state;
1986
1987        /*
1988         * If we have more than a batch's worth of requests in our IRQ side
1989         * locked cache, grab the lock and move them over to our submission
1990         * side cache.
1991         */
1992        if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
1993                io_flush_cached_locked_reqs(ctx, state);
1994        return !!state->free_list.next;
1995}
1996
1997/*
1998 * A request might get retired back into the request caches even before opcode
1999 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
2000 * Because of that, io_alloc_req() should be called only under ->uring_lock
2001 * and with extra caution to not get a request that is still being worked on.
2002 */
2003static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
2004        __must_hold(&ctx->uring_lock)
2005{
2006        struct io_submit_state *state = &ctx->submit_state;
2007        gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
2008        void *reqs[IO_REQ_ALLOC_BATCH];
2009        struct io_kiocb *req;
2010        int ret, i;
2011
2012        if (likely(state->free_list.next || io_flush_cached_reqs(ctx)))
2013                return true;
2014
2015        ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
2016
2017        /*
2018         * Bulk alloc is all-or-nothing. If we fail to get a batch,
2019         * retry single alloc to be on the safe side.
2020         */
2021        if (unlikely(ret <= 0)) {
2022                reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2023                if (!reqs[0])
2024                        return false;
2025                ret = 1;
2026        }
2027
2028        percpu_ref_get_many(&ctx->refs, ret);
2029        for (i = 0; i < ret; i++) {
2030                req = reqs[i];
2031
2032                io_preinit_req(req, ctx);
2033                wq_stack_add_head(&req->comp_list, &state->free_list);
2034        }
2035        return true;
2036}
2037
2038static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
2039{
2040        if (unlikely(!ctx->submit_state.free_list.next))
2041                return __io_alloc_req_refill(ctx);
2042        return true;
2043}
2044
2045static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
2046{
2047        struct io_wq_work_node *node;
2048
2049        node = wq_stack_extract(&ctx->submit_state.free_list);
2050        return container_of(node, struct io_kiocb, comp_list);
2051}
2052
2053static inline void io_put_file(struct file *file)
2054{
2055        if (file)
2056                fput(file);
2057}
2058
2059static inline void io_dismantle_req(struct io_kiocb *req)
2060{
2061        unsigned int flags = req->flags;
2062
2063        if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
2064                io_clean_op(req);
2065        if (!(flags & REQ_F_FIXED_FILE))
2066                io_put_file(req->file);
2067}
2068
2069static __cold void __io_free_req(struct io_kiocb *req)
2070{
2071        struct io_ring_ctx *ctx = req->ctx;
2072
2073        io_req_put_rsrc(req, ctx);
2074        io_dismantle_req(req);
2075        io_put_task(req->task, 1);
2076
2077        spin_lock(&ctx->completion_lock);
2078        wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
2079        ctx->locked_free_nr++;
2080        spin_unlock(&ctx->completion_lock);
2081}
2082
2083static inline void io_remove_next_linked(struct io_kiocb *req)
2084{
2085        struct io_kiocb *nxt = req->link;
2086
2087        req->link = nxt->link;
2088        nxt->link = NULL;
2089}
2090
2091static bool io_kill_linked_timeout(struct io_kiocb *req)
2092        __must_hold(&req->ctx->completion_lock)
2093        __must_hold(&req->ctx->timeout_lock)
2094{
2095        struct io_kiocb *link = req->link;
2096
2097        if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2098                struct io_timeout_data *io = link->async_data;
2099
2100                io_remove_next_linked(req);
2101                link->timeout.head = NULL;
2102                if (hrtimer_try_to_cancel(&io->timer) != -1) {
2103                        list_del(&link->timeout.list);
2104                        io_cqring_fill_event(link->ctx, link->user_data,
2105                                             -ECANCELED, 0);
2106                        io_put_req_deferred(link);
2107                        return true;
2108                }
2109        }
2110        return false;
2111}
2112
2113static void io_fail_links(struct io_kiocb *req)
2114        __must_hold(&req->ctx->completion_lock)
2115{
2116        struct io_kiocb *nxt, *link = req->link;
2117
2118        req->link = NULL;
2119        while (link) {
2120                long res = -ECANCELED;
2121
2122                if (link->flags & REQ_F_FAIL)
2123                        res = link->result;
2124
2125                nxt = link->link;
2126                link->link = NULL;
2127
2128                trace_io_uring_fail_link(req, link);
2129                io_cqring_fill_event(link->ctx, link->user_data, res, 0);
2130                io_put_req_deferred(link);
2131                link = nxt;
2132        }
2133}
2134
2135static bool io_disarm_next(struct io_kiocb *req)
2136        __must_hold(&req->ctx->completion_lock)
2137{
2138        bool posted = false;
2139
2140        if (req->flags & REQ_F_ARM_LTIMEOUT) {
2141                struct io_kiocb *link = req->link;
2142
2143                req->flags &= ~REQ_F_ARM_LTIMEOUT;
2144                if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2145                        io_remove_next_linked(req);
2146                        io_cqring_fill_event(link->ctx, link->user_data,
2147                                             -ECANCELED, 0);
2148                        io_put_req_deferred(link);
2149                        posted = true;
2150                }
2151        } else if (req->flags & REQ_F_LINK_TIMEOUT) {
2152                struct io_ring_ctx *ctx = req->ctx;
2153
2154                spin_lock_irq(&ctx->timeout_lock);
2155                posted = io_kill_linked_timeout(req);
2156                spin_unlock_irq(&ctx->timeout_lock);
2157        }
2158        if (unlikely((req->flags & REQ_F_FAIL) &&
2159                     !(req->flags & REQ_F_HARDLINK))) {
2160                posted |= (req->link != NULL);
2161                io_fail_links(req);
2162        }
2163        return posted;
2164}
2165
2166static void __io_req_find_next_prep(struct io_kiocb *req)
2167{
2168        struct io_ring_ctx *ctx = req->ctx;
2169        bool posted;
2170
2171        spin_lock(&ctx->completion_lock);
2172        posted = io_disarm_next(req);
2173        if (posted)
2174                io_commit_cqring(req->ctx);
2175        spin_unlock(&ctx->completion_lock);
2176        if (posted)
2177                io_cqring_ev_posted(ctx);
2178}
2179
2180static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2181{
2182        struct io_kiocb *nxt;
2183
2184        if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2185                return NULL;
2186        /*
2187         * If LINK is set, we have dependent requests in this chain. If we
2188         * didn't fail this request, queue the first one up, moving any other
2189         * dependencies to the next request. In case of failure, fail the rest
2190         * of the chain.
2191         */
2192        if (unlikely(req->flags & IO_DISARM_MASK))
2193                __io_req_find_next_prep(req);
2194        nxt = req->link;
2195        req->link = NULL;
2196        return nxt;
2197}
2198
2199static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2200{
2201        if (!ctx)
2202                return;
2203        if (*locked) {
2204                io_submit_flush_completions(ctx);
2205                mutex_unlock(&ctx->uring_lock);
2206                *locked = false;
2207        }
2208        percpu_ref_put(&ctx->refs);
2209}
2210
2211static void tctx_task_work(struct callback_head *cb)
2212{
2213        bool locked = false;
2214        struct io_ring_ctx *ctx = NULL;
2215        struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2216                                                  task_work);
2217
2218        while (1) {
2219                struct io_wq_work_node *node;
2220
2221                if (!tctx->task_list.first && locked)
2222                        io_submit_flush_completions(ctx);
2223
2224                spin_lock_irq(&tctx->task_lock);
2225                node = tctx->task_list.first;
2226                INIT_WQ_LIST(&tctx->task_list);
2227                if (!node)
2228                        tctx->task_running = false;
2229                spin_unlock_irq(&tctx->task_lock);
2230                if (!node)
2231                        break;
2232
2233                do {
2234                        struct io_wq_work_node *next = node->next;
2235                        struct io_kiocb *req = container_of(node, struct io_kiocb,
2236                                                            io_task_work.node);
2237
2238                        if (req->ctx != ctx) {
2239                                ctx_flush_and_put(ctx, &locked);
2240                                ctx = req->ctx;
2241                                /* if not contended, grab and improve batching */
2242                                locked = mutex_trylock(&ctx->uring_lock);
2243                                percpu_ref_get(&ctx->refs);
2244                        }
2245                        req->io_task_work.func(req, &locked);
2246                        node = next;
2247                } while (node);
2248
2249                cond_resched();
2250        }
2251
2252        ctx_flush_and_put(ctx, &locked);
2253}
2254
2255static void io_req_task_work_add(struct io_kiocb *req)
2256{
2257        struct task_struct *tsk = req->task;
2258        struct io_uring_task *tctx = tsk->io_uring;
2259        enum task_work_notify_mode notify;
2260        struct io_wq_work_node *node;
2261        unsigned long flags;
2262        bool running;
2263
2264        WARN_ON_ONCE(!tctx);
2265
2266        spin_lock_irqsave(&tctx->task_lock, flags);
2267        wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2268        running = tctx->task_running;
2269        if (!running)
2270                tctx->task_running = true;
2271        spin_unlock_irqrestore(&tctx->task_lock, flags);
2272
2273        /* task_work already pending, we're done */
2274        if (running)
2275                return;
2276
2277        /*
2278         * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2279         * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2280         * processing task_work. There's no reliable way to tell if TWA_RESUME
2281         * will do the job.
2282         */
2283        notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
2284        if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
2285                if (notify == TWA_NONE)
2286                        wake_up_process(tsk);
2287                return;
2288        }
2289
2290        spin_lock_irqsave(&tctx->task_lock, flags);
2291        tctx->task_running = false;
2292        node = tctx->task_list.first;
2293        INIT_WQ_LIST(&tctx->task_list);
2294        spin_unlock_irqrestore(&tctx->task_lock, flags);
2295
2296        while (node) {
2297                req = container_of(node, struct io_kiocb, io_task_work.node);
2298                node = node->next;
2299                if (llist_add(&req->io_task_work.fallback_node,
2300                              &req->ctx->fallback_llist))
2301                        schedule_delayed_work(&req->ctx->fallback_work, 1);
2302        }
2303}
2304
2305static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
2306{
2307        struct io_ring_ctx *ctx = req->ctx;
2308
2309        /* not needed for normal modes, but SQPOLL depends on it */
2310        io_tw_lock(ctx, locked);
2311        io_req_complete_failed(req, req->result);
2312}
2313
2314static void io_req_task_submit(struct io_kiocb *req, bool *locked)
2315{
2316        struct io_ring_ctx *ctx = req->ctx;
2317
2318        io_tw_lock(ctx, locked);
2319        /* req->task == current here, checking PF_EXITING is safe */
2320        if (likely(!(req->task->flags & PF_EXITING)))
2321                __io_queue_sqe(req);
2322        else
2323                io_req_complete_failed(req, -EFAULT);
2324}
2325
2326static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2327{
2328        req->result = ret;
2329        req->io_task_work.func = io_req_task_cancel;
2330        io_req_task_work_add(req);
2331}
2332
2333static void io_req_task_queue(struct io_kiocb *req)
2334{
2335        req->io_task_work.func = io_req_task_submit;
2336        io_req_task_work_add(req);
2337}
2338
2339static void io_req_task_queue_reissue(struct io_kiocb *req)
2340{
2341        req->io_task_work.func = io_queue_async_work;
2342        io_req_task_work_add(req);
2343}
2344
2345static inline void io_queue_next(struct io_kiocb *req)
2346{
2347        struct io_kiocb *nxt = io_req_find_next(req);
2348
2349        if (nxt)
2350                io_req_task_queue(nxt);
2351}
2352
2353static void io_free_req(struct io_kiocb *req)
2354{
2355        io_queue_next(req);
2356        __io_free_req(req);
2357}
2358
2359static void io_free_req_work(struct io_kiocb *req, bool *locked)
2360{
2361        io_free_req(req);
2362}
2363
2364static void io_free_batch_list(struct io_ring_ctx *ctx,
2365                                struct io_wq_work_node *node)
2366        __must_hold(&ctx->uring_lock)
2367{
2368        struct task_struct *task = NULL;
2369        int task_refs = 0;
2370
2371        do {
2372                struct io_kiocb *req = container_of(node, struct io_kiocb,
2373                                                    comp_list);
2374
2375                if (unlikely(req->flags & REQ_F_REFCOUNT)) {
2376                        node = req->comp_list.next;
2377                        if (!req_ref_put_and_test(req))
2378                                continue;
2379                }
2380
2381                io_req_put_rsrc_locked(req, ctx);
2382                io_queue_next(req);
2383                io_dismantle_req(req);
2384
2385                if (req->task != task) {
2386                        if (task)
2387                                io_put_task(task, task_refs);
2388                        task = req->task;
2389                        task_refs = 0;
2390                }
2391                task_refs++;
2392                node = req->comp_list.next;
2393                wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
2394        } while (node);
2395
2396        if (task)
2397                io_put_task(task, task_refs);
2398}
2399
2400static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
2401        __must_hold(&ctx->uring_lock)
2402{
2403        struct io_wq_work_node *node, *prev;
2404        struct io_submit_state *state = &ctx->submit_state;
2405
2406        spin_lock(&ctx->completion_lock);
2407        wq_list_for_each(node, prev, &state->compl_reqs) {
2408                struct io_kiocb *req = container_of(node, struct io_kiocb,
2409                                                    comp_list);
2410
2411                __io_cqring_fill_event(ctx, req->user_data, req->result,
2412                                        req->cflags);
2413        }
2414        io_commit_cqring(ctx);
2415        spin_unlock(&ctx->completion_lock);
2416        io_cqring_ev_posted(ctx);
2417
2418        io_free_batch_list(ctx, state->compl_reqs.first);
2419        INIT_WQ_LIST(&state->compl_reqs);
2420}
2421
2422/*
2423 * Drop reference to request, return next in chain (if there is one) if this
2424 * was the last reference to this request.
2425 */
2426static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2427{
2428        struct io_kiocb *nxt = NULL;
2429
2430        if (req_ref_put_and_test(req)) {
2431                nxt = io_req_find_next(req);
2432                __io_free_req(req);
2433        }
2434        return nxt;
2435}
2436
2437static inline void io_put_req(struct io_kiocb *req)
2438{
2439        if (req_ref_put_and_test(req))
2440                io_free_req(req);
2441}
2442
2443static inline void io_put_req_deferred(struct io_kiocb *req)
2444{
2445        if (req_ref_put_and_test(req)) {
2446                req->io_task_work.func = io_free_req_work;
2447                io_req_task_work_add(req);
2448        }
2449}
2450
2451static unsigned io_cqring_events(struct io_ring_ctx *ctx)
2452{
2453        /* See comment at the top of this file */
2454        smp_rmb();
2455        return __io_cqring_events(ctx);
2456}
2457
2458static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2459{
2460        struct io_rings *rings = ctx->rings;
2461
2462        /* make sure SQ entry isn't read before tail */
2463        return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2464}
2465
2466static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2467{
2468        unsigned int cflags;
2469
2470        cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2471        cflags |= IORING_CQE_F_BUFFER;
2472        req->flags &= ~REQ_F_BUFFER_SELECTED;
2473        kfree(kbuf);
2474        return cflags;
2475}
2476
2477static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2478{
2479        if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
2480                return 0;
2481        return io_put_kbuf(req, req->kbuf);
2482}
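
/*
 * Illustrative sketch (not from the kernel sources): decoding, on the
 * userspace side, the cflags that io_put_kbuf() encodes above.  Only uapi
 * constants are used; the helper name is made up.
 */
#include <linux/io_uring.h>

/* returns the provided-buffer id, or -1 if no buffer was selected */
static int demo_cqe_buffer_id(const struct io_uring_cqe *cqe)
{
        if (!(cqe->flags & IORING_CQE_F_BUFFER))
                return -1;
        return cqe->flags >> IORING_CQE_BUFFER_SHIFT;
}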
2483
2484static inline bool io_run_task_work(void)
2485{
2486        if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
2487                __set_current_state(TASK_RUNNING);
2488                tracehook_notify_signal();
2489                return true;
2490        }
2491
2492        return false;
2493}
2494
2495static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
2496{
2497        struct io_wq_work_node *pos, *start, *prev;
2498        unsigned int poll_flags = BLK_POLL_NOSLEEP;
2499        DEFINE_IO_COMP_BATCH(iob);
2500        int nr_events = 0;
2501
2502        /*
2503         * Only spin for completions if we don't have multiple devices hanging
2504         * off our complete list.
2505         */
2506        if (ctx->poll_multi_queue || force_nonspin)
2507                poll_flags |= BLK_POLL_ONESHOT;
2508
2509        wq_list_for_each(pos, start, &ctx->iopoll_list) {
2510                struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2511                struct kiocb *kiocb = &req->rw.kiocb;
2512                int ret;
2513
2514                /*
2515                 * Move completed and retryable entries to our local lists.
2516                 * If we find a request that requires polling, break out
2517                 * and complete those lists first, if we have entries there.
2518                 */
2519                if (READ_ONCE(req->iopoll_completed))
2520                        break;
2521
2522                ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags);
2523                if (unlikely(ret < 0))
2524                        return ret;
2525                else if (ret)
2526                        poll_flags |= BLK_POLL_ONESHOT;
2527
2528                /* iopoll may have completed current req */
2529                if (!rq_list_empty(iob.req_list) ||
2530                    READ_ONCE(req->iopoll_completed))
2531                        break;
2532        }
2533
2534        if (!rq_list_empty(iob.req_list))
2535                iob.complete(&iob);
2536        else if (!pos)
2537                return 0;
2538
2539        prev = start;
2540        wq_list_for_each_resume(pos, prev) {
2541                struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
2542
2543                /* order with io_complete_rw_iopoll(), e.g. ->result updates */
2544                if (!smp_load_acquire(&req->iopoll_completed))
2545                        break;
2546                __io_cqring_fill_event(ctx, req->user_data, req->result,
2547                                        io_put_rw_kbuf(req));
2548                nr_events++;
2549        }
2550
2551        if (unlikely(!nr_events))
2552                return 0;
2553
2554        io_commit_cqring(ctx);
2555        io_cqring_ev_posted_iopoll(ctx);
2556        pos = start ? start->next : ctx->iopoll_list.first;
2557        wq_list_cut(&ctx->iopoll_list, prev, start);
2558        io_free_batch_list(ctx, pos);
2559        return nr_events;
2560}
2561
2562/*
2563 * We can't just wait for polled events to come to us, we have to actively
2564 * find and complete them.
2565 */
2566static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2567{
2568        if (!(ctx->flags & IORING_SETUP_IOPOLL))
2569                return;
2570
2571        mutex_lock(&ctx->uring_lock);
2572        while (!wq_list_empty(&ctx->iopoll_list)) {
2573                /* let it sleep and repeat later if we can't complete a request */
2574                if (io_do_iopoll(ctx, true) == 0)
2575                        break;
2576                /*
2577                 * Ensure we allow local-to-the-cpu processing to take place;
2578                 * in this case we need to ensure that we reap all events.
2579                 * Also let task_work, etc. progress by releasing the mutex.
2580                 */
2581                if (need_resched()) {
2582                        mutex_unlock(&ctx->uring_lock);
2583                        cond_resched();
2584                        mutex_lock(&ctx->uring_lock);
2585                }
2586        }
2587        mutex_unlock(&ctx->uring_lock);
2588}
2589
2590static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2591{
2592        unsigned int nr_events = 0;
2593        int ret = 0;
2594
2595        /*
2596         * We disallow the app entering submit/complete with polling, but we
2597         * still need to lock the ring to prevent racing with polled issue
2598         * that got punted to a workqueue.
2599         */
2600        mutex_lock(&ctx->uring_lock);
2601        /*
2602         * Don't enter poll loop if we already have events pending.
2603         * If we do, we can potentially be spinning for commands that
2604         * already triggered a CQE (eg in error).
2605         */
2606        if (test_bit(0, &ctx->check_cq_overflow))
2607                __io_cqring_overflow_flush(ctx, false);
2608        if (io_cqring_events(ctx))
2609                goto out;
2610        do {
2611                /*
2612                 * If a submit got punted to a workqueue, we can have the
2613                 * application entering polling for a command before it gets
2614                 * issued. That app will hold the uring_lock for the duration
2615                 * of the poll right here, so we need to take a breather every
2616                 * now and then to ensure that the issue has a chance to add
2617                 * the poll to the issued list. Otherwise we can spin here
2618                 * forever, while the workqueue is stuck trying to acquire the
2619                 * very same mutex.
2620                 */
2621                if (wq_list_empty(&ctx->iopoll_list)) {
2622                        u32 tail = ctx->cached_cq_tail;
2623
2624                        mutex_unlock(&ctx->uring_lock);
2625                        io_run_task_work();
2626                        mutex_lock(&ctx->uring_lock);
2627
2628                        /* some requests don't go through iopoll_list */
2629                        if (tail != ctx->cached_cq_tail ||
2630                            wq_list_empty(&ctx->iopoll_list))
2631                                break;
2632                }
2633                ret = io_do_iopoll(ctx, !min);
2634                if (ret < 0)
2635                        break;
2636                nr_events += ret;
2637                ret = 0;
2638        } while (nr_events < min && !need_resched());
2639out:
2640        mutex_unlock(&ctx->uring_lock);
2641        return ret;
2642}
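
/*
 * Illustrative sketch (not from the kernel sources): the polled mode serviced
 * by io_iopoll_check() is selected at ring setup time and is meant for
 * O_DIRECT I/O.  A hedged liburing example; "data.bin", the 4KiB sizes and the
 * lack of error handling are all just for the example.
 */
#define _GNU_SOURCE
#include <liburing.h>
#include <fcntl.h>
#include <stdlib.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        void *buf;
        int fd, res;

        if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL))
                return 1;
        fd = open("data.bin", O_RDONLY | O_DIRECT);
        if (fd < 0 || posix_memalign(&buf, 4096, 4096))
                return 1;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read(sqe, fd, buf, 4096, 0);
        io_uring_submit(&ring);

        /* with IOPOLL the wait actively polls the device for the completion */
        if (io_uring_wait_cqe(&ring, &cqe))
                return 1;
        res = cqe->res;
        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return res < 0;
}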
2643
2644static void kiocb_end_write(struct io_kiocb *req)
2645{
2646        /*
2647         * Tell lockdep we inherited freeze protection from submission
2648         * thread.
2649         */
2650        if (req->flags & REQ_F_ISREG) {
2651                struct super_block *sb = file_inode(req->file)->i_sb;
2652
2653                __sb_writers_acquired(sb, SB_FREEZE_WRITE);
2654                sb_end_write(sb);
2655        }
2656}
2657
2658#ifdef CONFIG_BLOCK
2659static bool io_resubmit_prep(struct io_kiocb *req)
2660{
2661        struct io_async_rw *rw = req->async_data;
2662
2663        if (!req_has_async_data(req))
2664                return !io_req_prep_async(req);
2665        iov_iter_restore(&rw->s.iter, &rw->s.iter_state);
2666        return true;
2667}
2668
2669static bool io_rw_should_reissue(struct io_kiocb *req)
2670{
2671        umode_t mode = file_inode(req->file)->i_mode;
2672        struct io_ring_ctx *ctx = req->ctx;
2673
2674        if (!S_ISBLK(mode) && !S_ISREG(mode))
2675                return false;
2676        if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2677            !(ctx->flags & IORING_SETUP_IOPOLL)))
2678                return false;
2679        /*
2680         * If ref is dying, we might be running poll reap from the exit work.
2681         * Don't attempt to reissue from that path, just let it fail with
2682         * -EAGAIN.
2683         */
2684        if (percpu_ref_is_dying(&ctx->refs))
2685                return false;
2686        /*
2687         * Play it safe and assume it's not safe to re-import and reissue if
2688         * we're not in the original thread group (or not in task context).
2689         */
2690        if (!same_thread_group(req->task, current) || !in_task())
2691                return false;
2692        return true;
2693}
2694#else
2695static bool io_resubmit_prep(struct io_kiocb *req)
2696{
2697        return false;
2698}
2699static bool io_rw_should_reissue(struct io_kiocb *req)
2700{
2701        return false;
2702}
2703#endif
2704
2705static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2706{
2707        if (req->rw.kiocb.ki_flags & IOCB_WRITE)
2708                kiocb_end_write(req);
2709        if (unlikely(res != req->result)) {
2710                if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2711                    io_rw_should_reissue(req)) {
2712                        req->flags |= REQ_F_REISSUE;
2713                        return true;
2714                }
2715                req_set_fail(req);
2716                req->result = res;
2717        }
2718        return false;
2719}
2720
2721static void io_req_task_complete(struct io_kiocb *req, bool *locked)
2722{
2723        unsigned int cflags = io_put_rw_kbuf(req);
2724        int res = req->result;
2725
2726        if (*locked) {
2727                io_req_complete_state(req, res, cflags);
2728                io_req_add_compl_list(req);
2729        } else {
2730                io_req_complete_post(req, res, cflags);
2731        }
2732}
2733
2734static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2735                             unsigned int issue_flags)
2736{
2737        if (__io_complete_rw_common(req, res))
2738                return;
2739        __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
2740}
2741
2742static void io_complete_rw(struct kiocb *kiocb, long res)
2743{
2744        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2745
2746        if (__io_complete_rw_common(req, res))
2747                return;
2748        req->result = res;
2749        req->io_task_work.func = io_req_task_complete;
2750        io_req_task_work_add(req);
2751}
2752
2753static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
2754{
2755        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2756
2757        if (kiocb->ki_flags & IOCB_WRITE)
2758                kiocb_end_write(req);
2759        if (unlikely(res != req->result)) {
2760                if (res == -EAGAIN && io_rw_should_reissue(req)) {
2761                        req->flags |= REQ_F_REISSUE;
2762                        return;
2763                }
2764                req->result = res;
2765        }
2766
2767        /* order with io_iopoll_complete() checking ->iopoll_completed */
2768        smp_store_release(&req->iopoll_completed, 1);
2769}
2770
2771/*
2772 * After the iocb has been issued, it's safe to be found on the poll list.
2773 * Adding the kiocb to the list AFTER submission ensures that we don't
2774 * find it from an io_do_iopoll() thread before the issuer is done
2775 * accessing the kiocb cookie.
2776 */
2777static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
2778{
2779        struct io_ring_ctx *ctx = req->ctx;
2780        const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
2781
2782        /* workqueue context doesn't hold uring_lock, grab it now */
2783        if (unlikely(needs_lock))
2784                mutex_lock(&ctx->uring_lock);
2785
2786        /*
2787         * Track whether we have multiple files in our lists. This will impact
2788         * how we do polling eventually, not spinning if we're on potentially
2789         * different devices.
2790         */
2791        if (wq_list_empty(&ctx->iopoll_list)) {
2792                ctx->poll_multi_queue = false;
2793        } else if (!ctx->poll_multi_queue) {
2794                struct io_kiocb *list_req;
2795
2796                list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
2797                                        comp_list);
2798                if (list_req->file != req->file)
2799                        ctx->poll_multi_queue = true;
2800        }
2801
2802        /*
2803         * For fast devices, IO may have already completed. If it has, add
2804         * it to the front so we find it first.
2805         */
2806        if (READ_ONCE(req->iopoll_completed))
2807                wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
2808        else
2809                wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
2810
2811        if (unlikely(needs_lock)) {
2812                /*
2813                 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
2814                 * in sq thread task context or in io worker task context. If
2815                 * the current task context is the sq thread, we don't need to
2816                 * check whether we should wake up the sq thread.
2817                 */
2818                if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2819                    wq_has_sleeper(&ctx->sq_data->wait))
2820                        wake_up(&ctx->sq_data->wait);
2821
2822                mutex_unlock(&ctx->uring_lock);
2823        }
2824}
2825
2826static bool io_bdev_nowait(struct block_device *bdev)
2827{
2828        return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2829}
2830
2831/*
2832 * If we tracked the file through the SCM inflight mechanism, we could support
2833 * any file. For now, just ensure that anything potentially problematic is done
2834 * inline.
2835 */
2836static bool __io_file_supports_nowait(struct file *file, umode_t mode)
2837{
2838        if (S_ISBLK(mode)) {
2839                if (IS_ENABLED(CONFIG_BLOCK) &&
2840                    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2841                        return true;
2842                return false;
2843        }
2844        if (S_ISSOCK(mode))
2845                return true;
2846        if (S_ISREG(mode)) {
2847                if (IS_ENABLED(CONFIG_BLOCK) &&
2848                    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2849                    file->f_op != &io_uring_fops)
2850                        return true;
2851                return false;
2852        }
2853
2854        /* any ->read/write should understand O_NONBLOCK */
2855        if (file->f_flags & O_NONBLOCK)
2856                return true;
2857        return file->f_mode & FMODE_NOWAIT;
2858}
2859
2865static unsigned int io_file_get_flags(struct file *file)
2866{
2867        umode_t mode = file_inode(file)->i_mode;
2868        unsigned int res = 0;
2869
2870        if (S_ISREG(mode))
2871                res |= FFS_ISREG;
2872        if (__io_file_supports_nowait(file, mode))
2873                res |= FFS_NOWAIT;
2874        return res;
2875}
2876
2877static inline bool io_file_supports_nowait(struct io_kiocb *req)
2878{
2879        return req->flags & REQ_F_SUPPORT_NOWAIT;
2880}
2881
2882static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2883{
2884        struct io_ring_ctx *ctx = req->ctx;
2885        struct kiocb *kiocb = &req->rw.kiocb;
2886        struct file *file = req->file;
2887        unsigned ioprio;
2888        int ret;
2889
2890        if (!io_req_ffs_set(req))
2891                req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
2892
2893        kiocb->ki_pos = READ_ONCE(sqe->off);
2894        if (kiocb->ki_pos == -1) {
2895                if (!(file->f_mode & FMODE_STREAM)) {
2896                        req->flags |= REQ_F_CUR_POS;
2897                        kiocb->ki_pos = file->f_pos;
2898                } else {
2899                        kiocb->ki_pos = 0;
2900                }
2901        }
2902        kiocb->ki_flags = iocb_flags(file);
2903        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2904        if (unlikely(ret))
2905                return ret;
2906
2907        /*
2908         * If the file is marked O_NONBLOCK, still allow retry for it if it
2909         * supports async. Otherwise it's impossible to use O_NONBLOCK files
2910         * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
2911         */
2912        if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2913            ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
2914                req->flags |= REQ_F_NOWAIT;
2915
2916        if (ctx->flags & IORING_SETUP_IOPOLL) {
2917                if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
2918                        return -EOPNOTSUPP;
2919
2920                kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
2921                kiocb->ki_complete = io_complete_rw_iopoll;
2922                req->iopoll_completed = 0;
2923        } else {
2924                if (kiocb->ki_flags & IOCB_HIPRI)
2925                        return -EINVAL;
2926                kiocb->ki_complete = io_complete_rw;
2927        }
2928
2929        ioprio = READ_ONCE(sqe->ioprio);
2930        if (ioprio) {
2931                ret = ioprio_check_cap(ioprio);
2932                if (ret)
2933                        return ret;
2934
2935                kiocb->ki_ioprio = ioprio;
2936        } else {
2937                kiocb->ki_ioprio = get_current_ioprio();
2938        }
2939
2940        req->imu = NULL;
2941        req->rw.addr = READ_ONCE(sqe->addr);
2942        req->rw.len = READ_ONCE(sqe->len);
2943        req->buf_index = READ_ONCE(sqe->buf_index);
2944        return 0;
2945}
2946
2947static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2948{
2949        switch (ret) {
2950        case -EIOCBQUEUED:
2951                break;
2952        case -ERESTARTSYS:
2953        case -ERESTARTNOINTR:
2954        case -ERESTARTNOHAND:
2955        case -ERESTART_RESTARTBLOCK:
2956                /*
2957                 * We can't just restart the syscall, since previously
2958                 * submitted sqes may already be in progress. Just fail this
2959                 * IO with EINTR.
2960                 */
2961                ret = -EINTR;
2962                fallthrough;
2963        default:
2964                kiocb->ki_complete(kiocb, ret);
2965        }
2966}
2967
2968static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2969                       unsigned int issue_flags)
2970{
2971        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2972        struct io_async_rw *io = req->async_data;
2973
2974        /* add previously done IO, if any */
2975        if (req_has_async_data(req) && io->bytes_done > 0) {
2976                if (ret < 0)
2977                        ret = io->bytes_done;
2978                else
2979                        ret += io->bytes_done;
2980        }
2981
2982        if (req->flags & REQ_F_CUR_POS)
2983                req->file->f_pos = kiocb->ki_pos;
2984        if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
2985                __io_complete_rw(req, ret, 0, issue_flags);
2986        else
2987                io_rw_done(kiocb, ret);
2988
2989        if (req->flags & REQ_F_REISSUE) {
2990                req->flags &= ~REQ_F_REISSUE;
2991                if (io_resubmit_prep(req)) {
2992                        io_req_task_queue_reissue(req);
2993                } else {
2994                        unsigned int cflags = io_put_rw_kbuf(req);
2995                        struct io_ring_ctx *ctx = req->ctx;
2996
2997                        req_set_fail(req);
2998                        if (issue_flags & IO_URING_F_UNLOCKED) {
2999                                mutex_lock(&ctx->uring_lock);
3000                                __io_req_complete(req, issue_flags, ret, cflags);
3001                                mutex_unlock(&ctx->uring_lock);
3002                        } else {
3003                                __io_req_complete(req, issue_flags, ret, cflags);
3004                        }
3005                }
3006        }
3007}
3008
3009static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3010                             struct io_mapped_ubuf *imu)
3011{
3012        size_t len = req->rw.len;
3013        u64 buf_end, buf_addr = req->rw.addr;
3014        size_t offset;
3015
3016        if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
3017                return -EFAULT;
3018        /* not inside the mapped region */
3019        if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
3020                return -EFAULT;
3021
3022        /*
3023         * This may not be the start of the buffer; set the size appropriately
3024         * and advance the iterator to the beginning of the requested range.
3025         */
3026        offset = buf_addr - imu->ubuf;
3027        iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
3028
3029        if (offset) {
3030                /*
3031                 * Don't use iov_iter_advance() here, as it's really slow for
3032                 * using the latter parts of a big fixed buffer - it iterates
3033                 * over each segment manually. We can cheat a bit here, because
3034                 * we know that:
3035                 *
3036                 * 1) it's a BVEC iter, we set it up
3037                 * 2) all bvecs are PAGE_SIZE in size, except potentially the
3038                 *    first and last bvec
3039                 *
3040                 * So just find our index, and adjust the iterator afterwards.
3041                 * If the offset is within the first bvec (or the whole first
3042                 * bvec), just use iov_iter_advance(). This makes it easier
3043                 * since we can just skip the first segment, which may not
3044                 * be PAGE_SIZE aligned.
3045                 */
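                    /*
                     * A worked example, assuming PAGE_SIZE == 4096, a first
                     * bvec of bv_len == 1024 and offset == 10000: we skip the
                     * head segment (offset becomes 8976), seg_skip =
                     * 1 + (8976 >> 12) = 3 and iov_offset =
                     * 8976 & ~PAGE_MASK = 784, which lands us
                     * 1024 + 2 * 4096 + 784 = 10000 bytes into the buffer.
                     */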
3046                const struct bio_vec *bvec = imu->bvec;
3047
3048                if (offset <= bvec->bv_len) {
3049                        iov_iter_advance(iter, offset);
3050                } else {
3051                        unsigned long seg_skip;
3052
3053                        /* skip first vec */
3054                        offset -= bvec->bv_len;
3055                        seg_skip = 1 + (offset >> PAGE_SHIFT);
3056
3057                        iter->bvec = bvec + seg_skip;
3058                        iter->nr_segs -= seg_skip;
3059                        iter->count -= bvec->bv_len + offset;
3060                        iter->iov_offset = offset & ~PAGE_MASK;
3061                }
3062        }
3063
3064        return 0;
3065}
3066
3067static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
3068{
3069        struct io_mapped_ubuf *imu = req->imu;
3070        u16 index, buf_index = req->buf_index;
3071
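            /* resolve the registered buffer lazily and cache it on the request */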
3072        if (likely(!imu)) {
3073                struct io_ring_ctx *ctx = req->ctx;
3074
3075                if (unlikely(buf_index >= ctx->nr_user_bufs))
3076                        return -EFAULT;
3077                io_req_set_rsrc_node(req, ctx);
3078                index = array_index_nospec(buf_index, ctx->nr_user_bufs);
3079                imu = READ_ONCE(ctx->user_bufs[index]);
3080                req->imu = imu;
3081        }
3082        return __io_import_fixed(req, rw, iter, imu);
3083}
3084
3085static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
3086{
3087        if (needs_lock)
3088                mutex_unlock(&ctx->uring_lock);
3089}
3090
3091static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
3092{
3093        /*
3094         * "Normal" inline submissions always hold the uring_lock, since we
3095         * grab it from the system call. Same is true for the SQPOLL offload.
3096         * The only exception is when we've detached the request and issue it
3097         * from an async worker thread; grab the lock for that case.
3098         */
3099        if (needs_lock)
3100                mutex_lock(&ctx->uring_lock);
3101}
3102
3103static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3104                                          int bgid, unsigned int issue_flags)
3105{
3106        struct io_buffer *kbuf = req->kbuf;
3107        struct io_buffer *head;
3108        bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
3109
3110        if (req->flags & REQ_F_BUFFER_SELECTED)
3111                return kbuf;
3112
3113        io_ring_submit_lock(req->ctx, needs_lock);
3114
3115        lockdep_assert_held(&req->ctx->uring_lock);
3116
3117        head = xa_load(&req->ctx->io_buffers, bgid);
3118        if (head) {
3119                if (!list_empty(&head->list)) {
3120                        kbuf = list_last_entry(&head->list, struct io_buffer,
3121                                                        list);
3122                        list_del(&kbuf->list);
3123                } else {
3124                        kbuf = head;
3125                        xa_erase(&req->ctx->io_buffers, bgid);
3126                }
3127                if (*len > kbuf->len)
3128                        *len = kbuf->len;
3129                req->flags |= REQ_F_BUFFER_SELECTED;
3130                req->kbuf = kbuf;
3131        } else {
3132                kbuf = ERR_PTR(-ENOBUFS);
3133        }
3134
3135        io_ring_submit_unlock(req->ctx, needs_lock);
3136        return kbuf;
3137}
3138
3139static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3140                                        unsigned int issue_flags)
3141{
3142        struct io_buffer *kbuf;
3143        u16 bgid;
3144
3145        bgid = req->buf_index;
3146        kbuf = io_buffer_select(req, len, bgid, issue_flags);
3147        if (IS_ERR(kbuf))
3148                return kbuf;
3149        return u64_to_user_ptr(kbuf->addr);
3150}
3151
3152#ifdef CONFIG_COMPAT
3153static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3154                                unsigned int issue_flags)
3155{
3156        struct compat_iovec __user *uiov;
3157        compat_ssize_t clen;
3158        void __user *buf;
3159        ssize_t len;
3160
3161        uiov = u64_to_user_ptr(req->rw.addr);
3162        if (!access_ok(uiov, sizeof(*uiov)))
3163                return -EFAULT;
3164        if (__get_user(clen, &uiov->iov_len))
3165                return -EFAULT;
3166        if (clen < 0)
3167                return -EINVAL;
3168
3169        len = clen;
3170        buf = io_rw_buffer_select(req, &len, issue_flags);
3171        if (IS_ERR(buf))
3172                return PTR_ERR(buf);
3173        iov[0].iov_base = buf;
3174        iov[0].iov_len = (compat_size_t) len;
3175        return 0;
3176}
3177#endif
3178
3179static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3180                                      unsigned int issue_flags)
3181{
3182        struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3183        void __user *buf;
3184        ssize_t len;
3185
3186        if (copy_from_user(iov, uiov, sizeof(*uiov)))
3187                return -EFAULT;
3188
3189        len = iov[0].iov_len;
3190        if (len < 0)
3191                return -EINVAL;
3192        buf = io_rw_buffer_select(req, &len, issue_flags);
3193        if (IS_ERR(buf))
3194                return PTR_ERR(buf);
3195        iov[0].iov_base = buf;
3196        iov[0].iov_len = len;
3197        return 0;
3198}
3199
3200static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3201                                    unsigned int issue_flags)
3202{
3203        if (req->flags & REQ_F_BUFFER_SELECTED) {
3204                struct io_buffer *kbuf = req->kbuf;
3205
3206                iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3207                iov[0].iov_len = kbuf->len;
3208                return 0;
3209        }
3210        if (req->rw.len != 1)
3211                return -EINVAL;
3212
3213#ifdef CONFIG_COMPAT
3214        if (req->ctx->compat)
3215                return io_compat_import(req, iov, issue_flags);
3216#endif
3217
3218        return __io_iov_buffer_select(req, iov, issue_flags);
3219}
3220
3221static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
3222                                       struct io_rw_state *s,
3223                                       unsigned int issue_flags)
3224{
3225        struct iov_iter *iter = &s->iter;
3226        u8 opcode = req->opcode;
3227        struct iovec *iovec;
3228        void __user *buf;
3229        size_t sqe_len;
3230        ssize_t ret;
3231
3232        BUILD_BUG_ON(ERR_PTR(0) != NULL);
3233
3234        if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED)
3235                return ERR_PTR(io_import_fixed(req, rw, iter));
3236
3237        /* buffer index only valid with fixed read/write, or buffer select  */
3238        if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
3239                return ERR_PTR(-EINVAL);
3240
3241        buf = u64_to_user_ptr(req->rw.addr);
3242        sqe_len = req->rw.len;
3243
3244        if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3245                if (req->flags & REQ_F_BUFFER_SELECT) {
3246                        buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
3247                        if (IS_ERR(buf))
3248                                return ERR_CAST(buf);
3249                        req->rw.len = sqe_len;
3250                }
3251
3252                ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter);
3253                return ERR_PTR(ret);
3254        }
3255
3256        iovec = s->fast_iov;
3257        if (req->flags & REQ_F_BUFFER_SELECT) {
3258                ret = io_iov_buffer_select(req, iovec, issue_flags);
3259                if (!ret)
3260                        iov_iter_init(iter, rw, iovec, 1, iovec->iov_len);
3261                return ERR_PTR(ret);
3262        }
3263
3264        ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
3265                              req->ctx->compat);
3266        if (unlikely(ret < 0))
3267                return ERR_PTR(ret);
3268        return iovec;
3269}
3270
3271static inline int io_import_iovec(int rw, struct io_kiocb *req,
3272                                  struct iovec **iovec, struct io_rw_state *s,
3273                                  unsigned int issue_flags)
3274{
3275        *iovec = __io_import_iovec(rw, req, s, issue_flags);
3276        if (unlikely(IS_ERR(*iovec)))
3277                return PTR_ERR(*iovec);
3278
3279        iov_iter_save_state(&s->iter, &s->iter_state);
3280        return 0;
3281}
3282
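    /* stream-like files have no file position, so hand back a NULL ppos */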
3283static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3284{
3285        return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3286}
3287
3288/*
3289 * For files that don't have ->read_iter() and ->write_iter(), handle them
3290 * by looping over ->read() or ->write() manually.
3291 */
3292static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3293{
3294        struct kiocb *kiocb = &req->rw.kiocb;
3295        struct file *file = req->file;
3296        ssize_t ret = 0;
3297
3298        /*
3299         * Don't support polled IO through this interface, and we can't
3300         * support non-blocking either. For the latter, this just causes
3301         * the kiocb to be handled from an async context.
3302         */
3303        if (kiocb->ki_flags & IOCB_HIPRI)
3304                return -EOPNOTSUPP;
3305        if ((kiocb->ki_flags & IOCB_NOWAIT) &&
3306            !(kiocb->ki_filp->f_flags & O_NONBLOCK))
3307                return -EAGAIN;
3308
3309        while (iov_iter_count(iter)) {
3310                struct iovec iovec;
3311                ssize_t nr;
3312
3313                if (!iov_iter_is_bvec(iter)) {
3314                        iovec = iov_iter_iovec(iter);
3315                } else {
3316                        iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3317                        iovec.iov_len = req->rw.len;
3318                }
3319
3320                if (rw == READ) {
3321                        nr = file->f_op->read(file, iovec.iov_base,
3322                                              iovec.iov_len, io_kiocb_ppos(kiocb));
3323                } else {
3324                        nr = file->f_op->write(file, iovec.iov_base,
3325                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3326                }
3327
3328                if (nr < 0) {
3329                        if (!ret)
3330                                ret = nr;
3331                        break;
3332                }
3333                if (!iov_iter_is_bvec(iter)) {
3334                        iov_iter_advance(iter, nr);
3335                } else {
3336                        req->rw.len -= nr;
3337                        req->rw.addr += nr;
3338                }
3339                ret += nr;
3340                if (nr != iovec.iov_len)
3341                        break;
3342        }
3343
3344        return ret;
3345}
3346
3347static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3348                          const struct iovec *fast_iov, struct iov_iter *iter)
3349{
3350        struct io_async_rw *rw = req->async_data;
3351
3352        memcpy(&rw->s.iter, iter, sizeof(*iter));
3353        rw->free_iovec = iovec;
3354        rw->bytes_done = 0;
3355        /* can only be fixed buffers, no need to do anything */
3356        if (iov_iter_is_bvec(iter))
3357                return;
3358        if (!iovec) {
3359                unsigned iov_off = 0;
3360
3361                rw->s.iter.iov = rw->s.fast_iov;
3362                if (iter->iov != fast_iov) {
3363                        iov_off = iter->iov - fast_iov;
3364                        rw->s.iter.iov += iov_off;
3365                }
3366                if (rw->s.fast_iov != fast_iov)
3367                        memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off,
3368                               sizeof(struct iovec) * iter->nr_segs);
3369        } else {
3370                req->flags |= REQ_F_NEED_CLEANUP;
3371        }
3372}
3373
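    /* returns true if allocating the async data failed, false on success */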
3374static inline bool io_alloc_async_data(struct io_kiocb *req)
3375{
3376        WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3377        req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3378        if (req->async_data) {
3379                req->flags |= REQ_F_ASYNC_DATA;
3380                return false;
3381        }
3382        return true;
3383}
3384
3385static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3386                             struct io_rw_state *s, bool force)
3387{
3388        if (!force && !io_op_defs[req->opcode].needs_async_setup)
3389                return 0;
3390        if (!req_has_async_data(req)) {
3391                struct io_async_rw *iorw;
3392
3393                if (io_alloc_async_data(req)) {
3394                        kfree(iovec);
3395                        return -ENOMEM;
3396                }
3397
3398                io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
3399                iorw = req->async_data;
3400                /* we've copied and mapped the iter, ensure state is saved */
3401                iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
3402        }
3403        return 0;
3404}
3405
3406static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3407{
3408        struct io_async_rw *iorw = req->async_data;
3409        struct iovec *iov;
3410        int ret;
3411
3412        /* submission path, ->uring_lock should already be taken */
3413        ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
3414        if (unlikely(ret < 0))
3415                return ret;
3416
3417        iorw->bytes_done = 0;
3418        iorw->free_iovec = iov;
3419        if (iov)
3420                req->flags |= REQ_F_NEED_CLEANUP;
3421        return 0;
3422}
3423
3424static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3425{
3426        if (unlikely(!(req->file->f_mode & FMODE_READ)))
3427                return -EBADF;
3428        return io_prep_rw(req, sqe);
3429}
3430
3431/*
3432 * This is our waitqueue callback handler, registered through __folio_lock_async()
3433 * when we initially tried to do the IO with the iocb and armed our waitqueue.
3434 * This gets called when the page is unlocked, and we generally expect that to
3435 * happen when the page IO is completed and the page is now uptodate. This will
3436 * queue a task_work based retry of the operation, attempting to copy the data
3437 * again. If the latter fails because the page was NOT uptodate, then we will
3438 * do a thread based blocking retry of the operation. That's the unexpected
3439 * slow path.
3440 */
3441static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3442                             int sync, void *arg)
3443{
3444        struct wait_page_queue *wpq;
3445        struct io_kiocb *req = wait->private;
3446        struct wait_page_key *key = arg;
3447
3448        wpq = container_of(wait, struct wait_page_queue, wait);
3449
3450        if (!wake_page_match(wpq, key))
3451                return 0;
3452
3453        req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3454        list_del_init(&wait->entry);
3455        io_req_task_queue(req);
3456        return 1;
3457}
3458
3459/*
3460 * This controls whether a given IO request should be armed for async page
3461 * based retry. If we return false here, the request is handed to the async
3462 * worker threads for retry. If we're doing buffered reads on a regular file,
3463 * we prepare a private wait_page_queue entry and retry the operation. This
3464 * will either succeed because the page is now uptodate and unlocked, or it
3465 * will register a callback when the page is unlocked at IO completion. Through
3466 * that callback, io_uring uses task_work to setup a retry of the operation.
3467 * That retry will attempt the buffered read again. The retry will generally
3468 * succeed, or in rare cases where it fails, we then fall back to using the
3469 * async worker threads for a blocking retry.
3470 */
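    /*
     * Roughly: when a buffered read comes back short or with -EAGAIN,
     * io_read() uses io_rw_should_retry() to arm IOCB_WAITQ with
     * io_async_buf_func() as the wait callback; the next read attempt then
     * returns -EIOCBQUEUED, and the callback later queues task_work to run
     * the read again once the page is unlocked.
     */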
3471static bool io_rw_should_retry(struct io_kiocb *req)
3472{
3473        struct io_async_rw *rw = req->async_data;
3474        struct wait_page_queue *wait = &rw->wpq;
3475        struct kiocb *kiocb = &req->rw.kiocb;
3476
3477        /* never retry for NOWAIT, we just complete with -EAGAIN */
3478        if (req->flags & REQ_F_NOWAIT)
3479                return false;
3480
3481        /* Only for buffered IO */
3482        if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3483                return false;
3484
3485        /*
3486         * just use poll if we can, and don't attempt if the fs doesn't
3487         * support callback based unlocks
3488         */
3489        if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3490                return false;
3491
3492        wait->wait.func = io_async_buf_func;
3493        wait->wait.private = req;
3494        wait->wait.flags = 0;
3495        INIT_LIST_HEAD(&wait->wait.entry);
3496        kiocb->ki_flags |= IOCB_WAITQ;
3497        kiocb->ki_flags &= ~IOCB_NOWAIT;
3498        kiocb->ki_waitq = wait;
3499        return true;
3500}
3501
3502static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3503{
3504        if (likely(req->file->f_op->read_iter))
3505                return call_read_iter(req->file, &req->rw.kiocb, iter);
3506        else if (req->file->f_op->read)
3507                return loop_rw_iter(READ, req, iter);
3508        else
3509                return -EINVAL;
3510}
3511
3512static bool need_read_all(struct io_kiocb *req)
3513{
3514        return req->flags & REQ_F_ISREG ||
3515                S_ISBLK(file_inode(req->file)->i_mode);
3516}
3517
3518static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3519{
3520        struct io_rw_state __s, *s = &__s;
3521        struct iovec *iovec;
3522        struct kiocb *kiocb = &req->rw.kiocb;
3523        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3524        struct io_async_rw *rw;
3525        ssize_t ret, ret2;
3526
3527        if (!req_has_async_data(req)) {
3528                ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
3529                if (unlikely(ret < 0))
3530                        return ret;
3531        } else {
3532                rw = req->async_data;
3533                s = &rw->s;
3534                /*
3535                 * We come here from an earlier attempt, restore our state to
3536                 * match in case it doesn't. It's cheap enough that we don't
3537                 * need to make this conditional.
3538                 */
3539                iov_iter_restore(&s->iter, &s->iter_state);
3540                iovec = NULL;
3541        }
3542        req->result = iov_iter_count(&s->iter);
3543
3544        if (force_nonblock) {
3545                /* If the file doesn't support async, just async punt */
3546                if (unlikely(!io_file_supports_nowait(req))) {
3547                        ret = io_setup_async_rw(req, iovec, s, true);
3548                        return ret ?: -EAGAIN;
3549                }
3550                kiocb->ki_flags |= IOCB_NOWAIT;
3551        } else {
3552                /* Ensure we clear previously set non-block flag */
3553                kiocb->ki_flags &= ~IOCB_NOWAIT;
3554        }
3555
3556        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
3557        if (unlikely(ret)) {
3558                kfree(iovec);
3559                return ret;
3560        }
3561
3562        ret = io_iter_do_read(req, &s->iter);
3563
3564        if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3565                req->flags &= ~REQ_F_REISSUE;
3566                /* IOPOLL retry should happen for io-wq threads */
3567                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3568                        goto done;
3569                /* no retry on NONBLOCK nor RWF_NOWAIT */
3570                if (req->flags & REQ_F_NOWAIT)
3571                        goto done;
3572                ret = 0;
3573        } else if (ret == -EIOCBQUEUED) {
3574                goto out_free;
3575        } else if (ret == req->result || ret <= 0 || !force_nonblock ||
3576                   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
3577                /* read all, failed, already did sync or don't want to retry */
3578                goto done;
3579        }
3580
3581        /*
3582         * Don't depend on the iter state matching what was consumed, or being
3583         * untouched in case of error. Restore it and we'll advance it
3584         * manually if we need to.
3585         */
3586        iov_iter_restore(&s->iter, &s->iter_state);
3587
3588        ret2 = io_setup_async_rw(req, iovec, s, true);
3589        if (ret2)
3590                return ret2;
3591
3592        iovec = NULL;
3593        rw = req->async_data;
3594        s = &rw->s;
3595        /*
3596         * Now use our persistent iterator and state, if we aren't already.
3597         * We've restored and mapped the iter to match.
3598         */
3599
3600        do {
3601                /*
3602                 * We end up here because of a partial read, either from
3603                 * above or inside this loop. Advance the iter by the bytes
3604                 * that were consumed.
3605                 */
3606                iov_iter_advance(&s->iter, ret);
3607                if (!iov_iter_count(&s->iter))
3608                        break;
3609                rw->bytes_done += ret;
3610                iov_iter_save_state(&s->iter, &s->iter_state);
3611
3612                /* if we can retry, do so with the callbacks armed */
3613                if (!io_rw_should_retry(req)) {
3614                        kiocb->ki_flags &= ~IOCB_WAITQ;
3615                        return -EAGAIN;
3616                }
3617
3618                /*
3619                 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3620                 * we get -EIOCBQUEUED, then we'll get a notification when the
3621                 * desired page gets unlocked. We can also get a partial read
3622                 * here, and if we do, then just retry at the new offset.
3623                 */
3624                ret = io_iter_do_read(req, &s->iter);
3625                if (ret == -EIOCBQUEUED)
3626                        return 0;
3627                /* we got some bytes, but not all. retry. */
3628                kiocb->ki_flags &= ~IOCB_WAITQ;
3629                iov_iter_restore(&s->iter, &s->iter_state);
3630        } while (ret > 0);
3631done:
3632        kiocb_done(kiocb, ret, issue_flags);
3633out_free:
3634        /* it's faster to check here than to delegate to kfree */
3635        if (iovec)
3636                kfree(iovec);
3637        return 0;
3638}
3639
3640static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3641{
3642        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3643                return -EBADF;
3644        req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
3645        return io_prep_rw(req, sqe);
3646}
3647
3648static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3649{
3650        struct io_rw_state __s, *s = &__s;
3651        struct iovec *iovec;
3652        struct kiocb *kiocb = &req->rw.kiocb;
3653        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3654        ssize_t ret, ret2;
3655
3656        if (!req_has_async_data(req)) {
3657                ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
3658                if (unlikely(ret < 0))
3659                        return ret;
3660        } else {
3661                struct io_async_rw *rw = req->async_data;
3662
3663                s = &rw->s;
3664                iov_iter_restore(&s->iter, &s->iter_state);
3665                iovec = NULL;
3666        }
3667        req->result = iov_iter_count(&s->iter);
3668
3669        if (force_nonblock) {
3670                /* If the file doesn't support async, just async punt */
3671                if (unlikely(!io_file_supports_nowait(req)))
3672                        goto copy_iov;
3673
3674                /* file path doesn't support NOWAIT for buffered (non-direct) IO */
3675                if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3676                    (req->flags & REQ_F_ISREG))
3677                        goto copy_iov;
3678
3679                kiocb->ki_flags |= IOCB_NOWAIT;
3680        } else {
3681                /* Ensure we clear previously set non-block flag */
3682                kiocb->ki_flags &= ~IOCB_NOWAIT;
3683        }
3684
3685        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
3686        if (unlikely(ret))
3687                goto out_free;
3688
3689        /*
3690         * Open-code file_start_write here to grab freeze protection,
3691         * which will be released by another thread in
3692         * io_complete_rw().  Fool lockdep by telling it the lock got
3693         * released so that it doesn't complain about the held lock when
3694         * we return to userspace.
3695         */
3696        if (req->flags & REQ_F_ISREG) {
3697                sb_start_write(file_inode(req->file)->i_sb);
3698                __sb_writers_release(file_inode(req->file)->i_sb,
3699                                        SB_FREEZE_WRITE);
3700        }
3701        kiocb->ki_flags |= IOCB_WRITE;
3702
3703        if (likely(req->file->f_op->write_iter))
3704                ret2 = call_write_iter(req->file, kiocb, &s->iter);
3705        else if (req->file->f_op->write)
3706                ret2 = loop_rw_iter(WRITE, req, &s->iter);
3707        else
3708                ret2 = -EINVAL;
3709
3710        if (req->flags & REQ_F_REISSUE) {
3711                req->flags &= ~REQ_F_REISSUE;
3712                ret2 = -EAGAIN;
3713        }
3714
3715        /*
3716         * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3717         * retry them without IOCB_NOWAIT.
3718         */
3719        if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3720                ret2 = -EAGAIN;
3721        /* no retry on NONBLOCK nor RWF_NOWAIT */
3722        if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3723                goto done;
3724        if (!force_nonblock || ret2 != -EAGAIN) {
3725                /* IOPOLL retry should happen for io-wq threads */
3726                if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
3727                        goto copy_iov;
3728done:
3729                kiocb_done(kiocb, ret2, issue_flags);
3730        } else {
3731copy_iov:
3732                iov_iter_restore(&s->iter, &s->iter_state);
3733                ret = io_setup_async_rw(req, iovec, s, false);
3734                return ret ?: -EAGAIN;
3735        }
3736out_free:
3737        /* it's reportedly faster than delegating the null check to kfree() */
3738        if (iovec)
3739                kfree(iovec);
3740        return ret;
3741}
3742
3743static int io_renameat_prep(struct io_kiocb *req,
3744                            const struct io_uring_sqe *sqe)
3745{
3746        struct io_rename *ren = &req->rename;
3747        const char __user *oldf, *newf;
3748
3749        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3750                return -EINVAL;
3751        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3752                return -EINVAL;
3753        if (unlikely(req->flags & REQ_F_FIXED_FILE))
3754                return -EBADF;
3755
3756        ren->old_dfd = READ_ONCE(sqe->fd);
3757        oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3758        newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3759        ren->new_dfd = READ_ONCE(sqe->len);
3760        ren->flags = READ_ONCE(sqe->rename_flags);
3761
3762        ren->oldpath = getname(oldf);
3763        if (IS_ERR(ren->oldpath))
3764                return PTR_ERR(ren->oldpath);
3765
3766        ren->newpath = getname(newf);
3767        if (IS_ERR(ren->newpath)) {
3768                putname(ren->oldpath);
3769                return PTR_ERR(ren->newpath);
3770        }
3771
3772        req->flags |= REQ_F_NEED_CLEANUP;
3773        return 0;
3774}
3775
3776static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3777{
3778        struct io_rename *ren = &req->rename;
3779        int ret;
3780
3781        if (issue_flags & IO_URING_F_NONBLOCK)
3782                return -EAGAIN;
3783
3784        ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3785                                ren->newpath, ren->flags);
3786
3787        req->flags &= ~REQ_F_NEED_CLEANUP;
3788        if (ret < 0)
3789                req_set_fail(req);
3790        io_req_complete(req, ret);
3791        return 0;
3792}
3793
3794static int io_unlinkat_prep(struct io_kiocb *req,
3795                            const struct io_uring_sqe *sqe)
3796{
3797        struct io_unlink *un = &req->unlink;
3798        const char __user *fname;
3799
3800        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3801                return -EINVAL;
3802        if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3803            sqe->splice_fd_in)
3804                return -EINVAL;
3805        if (unlikely(req->flags & REQ_F_FIXED_FILE))
3806                return -EBADF;
3807
3808        un->dfd = READ_ONCE(sqe->fd);
3809
3810        un->flags = READ_ONCE(sqe->unlink_flags);
3811        if (un->flags & ~AT_REMOVEDIR)
3812                return -EINVAL;
3813
3814        fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3815        un->filename = getname(fname);
3816        if (IS_ERR(un->filename))
3817                return PTR_ERR(un->filename);
3818
3819        req->flags |= REQ_F_NEED_CLEANUP;
3820        return 0;
3821}
3822
3823static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3824{
3825        struct io_unlink *un = &req->unlink;
3826        int ret;
3827
3828        if (issue_flags & IO_URING_F_NONBLOCK)
3829                return -EAGAIN;
3830
3831        if (un->flags & AT_REMOVEDIR)
3832                ret = do_rmdir(un->dfd, un->filename);
3833        else
3834                ret = do_unlinkat(un->dfd, un->filename);
3835
3836        req->flags &= ~REQ_F_NEED_CLEANUP;
3837        if (ret < 0)
3838                req_set_fail(req);
3839        io_req_complete(req, ret);
3840        return 0;
3841}
3842
3843static int io_mkdirat_prep(struct io_kiocb *req,
3844                            const struct io_uring_sqe *sqe)
3845{
3846        struct io_mkdir *mkd = &req->mkdir;
3847        const char __user *fname;
3848
3849        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3850                return -EINVAL;
3851        if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
3852            sqe->splice_fd_in)
3853                return -EINVAL;
3854        if (unlikely(req->flags & REQ_F_FIXED_FILE))
3855                return -EBADF;
3856
3857        mkd->dfd = READ_ONCE(sqe->fd);
3858        mkd->mode = READ_ONCE(sqe->len);
3859
3860        fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3861        mkd->filename = getname(fname);
3862        if (IS_ERR(mkd->filename))
3863                return PTR_ERR(mkd->filename);
3864
3865        req->flags |= REQ_F_NEED_CLEANUP;
3866        return 0;
3867}
3868
3869static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
3870{
3871        struct io_mkdir *mkd = &req->mkdir;
3872        int ret;
3873
3874        if (issue_flags & IO_URING_F_NONBLOCK)
3875                return -EAGAIN;
3876
3877        ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
3878
3879        req->flags &= ~REQ_F_NEED_CLEANUP;
3880        if (ret < 0)
3881                req_set_fail(req);
3882        io_req_complete(req, ret);
3883        return 0;
3884}
3885
3886static int io_symlinkat_prep(struct io_kiocb *req,
3887                            const struct io_uring_sqe *sqe)
3888{
3889        struct io_symlink *sl = &req->symlink;
3890        const char __user *oldpath, *newpath;
3891
3892        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3893                return -EINVAL;
3894        if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
3895            sqe->splice_fd_in)
3896                return -EINVAL;
3897        if (unlikely(req->flags & REQ_F_FIXED_FILE))
3898                return -EBADF;
3899
3900        sl->new_dfd = READ_ONCE(sqe->fd);
3901        oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr));
3902        newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3903
3904        sl->oldpath = getname(oldpath);
3905        if (IS_ERR(sl->oldpath))
3906                return PTR_ERR(sl->oldpath);
3907
3908        sl->newpath = getname(newpath);
3909        if (IS_ERR(sl->newpath)) {
3910                putname(sl->oldpath);
3911                return PTR_ERR(sl->newpath);
3912        }
3913
3914        req->flags |= REQ_F_NEED_CLEANUP;
3915        return 0;
3916}
3917
3918static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
3919{
3920        struct io_symlink *sl = &req->symlink;
3921        int ret;
3922
3923        if (issue_flags & IO_URING_F_NONBLOCK)
3924                return -EAGAIN;
3925
3926        ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
3927
3928        req->flags &= ~REQ_F_NEED_CLEANUP;
3929        if (ret < 0)
3930                req_set_fail(req);
3931        io_req_complete(req, ret);
3932        return 0;
3933}
3934
3935static int io_linkat_prep(struct io_kiocb *req,
3936                            const struct io_uring_sqe *sqe)
3937{
3938        struct io_hardlink *lnk = &req->hardlink;
3939        const char __user *oldf, *newf;
3940
3941        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3942                return -EINVAL;
3943        if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
3944                return -EINVAL;
3945        if (unlikely(req->flags & REQ_F_FIXED_FILE))
3946                return -EBADF;
3947
3948        lnk->old_dfd = READ_ONCE(sqe->fd);
3949        lnk->new_dfd = READ_ONCE(sqe->len);
3950        oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3951        newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3952        lnk->flags = READ_ONCE(sqe->hardlink_flags);
3953
3954        lnk->oldpath = getname(oldf);
3955        if (IS_ERR(lnk->oldpath))
3956                return PTR_ERR(lnk->oldpath);
3957
3958        lnk->newpath = getname(newf);
3959        if (IS_ERR(lnk->newpath)) {
3960                putname(lnk->oldpath);
3961                return PTR_ERR(lnk->newpath);
3962        }
3963
3964        req->flags |= REQ_F_NEED_CLEANUP;
3965        return 0;
3966}
3967
3968static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
3969{
3970        struct io_hardlink *lnk = &req->hardlink;
3971        int ret;
3972
3973        if (issue_flags & IO_URING_F_NONBLOCK)
3974                return -EAGAIN;
3975
3976        ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
3977                                lnk->newpath, lnk->flags);
3978
3979        req->flags &= ~REQ_F_NEED_CLEANUP;
3980        if (ret < 0)
3981                req_set_fail(req);
3982        io_req_complete(req, ret);
3983        return 0;
3984}
3985
3986static int io_shutdown_prep(struct io_kiocb *req,
3987                            const struct io_uring_sqe *sqe)
3988{
3989#if defined(CONFIG_NET)
3990        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3991                return -EINVAL;
3992        if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3993                     sqe->buf_index || sqe->splice_fd_in))
3994                return -EINVAL;
3995
3996        req->shutdown.how = READ_ONCE(sqe->len);
3997        return 0;
3998#else
3999        return -EOPNOTSUPP;
4000#endif
4001}
4002
4003static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
4004{
4005#if defined(CONFIG_NET)
4006        struct socket *sock;
4007        int ret;
4008
4009        if (issue_flags & IO_URING_F_NONBLOCK)
4010                return -EAGAIN;
4011
4012        sock = sock_from_file(req->file);
4013        if (unlikely(!sock))
4014                return -ENOTSOCK;
4015
4016        ret = __sys_shutdown_sock(sock, req->shutdown.how);
4017        if (ret < 0)
4018                req_set_fail(req);
4019        io_req_complete(req, ret);
4020        return 0;
4021#else
4022        return -EOPNOTSUPP;
4023#endif
4024}
4025
4026static int __io_splice_prep(struct io_kiocb *req,
4027                            const struct io_uring_sqe *sqe)
4028{
4029        struct io_splice *sp = &req->splice;
4030        unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
4031
4032        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4033                return -EINVAL;
4034
4035        sp->file_in = NULL;
4036        sp->len = READ_ONCE(sqe->len);
4037        sp->flags = READ_ONCE(sqe->splice_flags);
4038
4039        if (unlikely(sp->flags & ~valid_flags))
4040                return -EINVAL;
4041
4042        sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in),
4043                                  (sp->flags & SPLICE_F_FD_IN_FIXED));
4044        if (!sp->file_in)
4045                return -EBADF;
4046        req->flags |= REQ_F_NEED_CLEANUP;
4047        return 0;
4048}
4049
4050static int io_tee_prep(struct io_kiocb *req,
4051                       const struct io_uring_sqe *sqe)
4052{
4053        if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
4054                return -EINVAL;
4055        return __io_splice_prep(req, sqe);
4056}
4057
4058static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
4059{
4060        struct io_splice *sp = &req->splice;
4061        struct file *in = sp->file_in;
4062        struct file *out = sp->file_out;
4063        unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4064        long ret = 0;
4065
4066        if (issue_flags & IO_URING_F_NONBLOCK)
4067                return -EAGAIN;
4068        if (sp->len)
4069                ret = do_tee(in, out, sp->len, flags);
4070
4071        if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4072                io_put_file(in);
4073        req->flags &= ~REQ_F_NEED_CLEANUP;
4074
4075        if (ret != sp->len)
4076                req_set_fail(req);
4077        io_req_complete(req, ret);
4078        return 0;
4079}
4080
4081static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4082{
4083        struct io_splice *sp = &req->splice;
4084
4085        sp->off_in = READ_ONCE(sqe->splice_off_in);
4086        sp->off_out = READ_ONCE(sqe->off);
4087        return __io_splice_prep(req, sqe);
4088}
4089
4090static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
4091{
4092        struct io_splice *sp = &req->splice;
4093        struct file *in = sp->file_in;
4094        struct file *out = sp->file_out;
4095        unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4096        loff_t *poff_in, *poff_out;
4097        long ret = 0;
4098
4099        if (issue_flags & IO_URING_F_NONBLOCK)
4100                return -EAGAIN;
4101
4102        poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4103        poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
4104
4105        if (sp->len)
4106                ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
4107
4108        if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4109                io_put_file(in);
4110        req->flags &= ~REQ_F_NEED_CLEANUP;
4111
4112        if (ret != sp->len)
4113                req_set_fail(req);
4114        io_req_complete(req, ret);
4115        return 0;
4116}
4117
4118/*
4119 * IORING_OP_NOP just posts a completion event, nothing else.
4120 */
4121static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
4122{
4123        struct io_ring_ctx *ctx = req->ctx;
4124
4125        if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4126                return -EINVAL;
4127
4128        __io_req_complete(req, issue_flags, 0, 0);
4129        return 0;
4130}
4131
4132static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4133{
4134        struct io_ring_ctx *ctx = req->ctx;
4135
4136        if (!req->file)
4137                return -EBADF;
4138
4139        if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4140                return -EINVAL;
4141        if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4142                     sqe->splice_fd_in))
4143                return -EINVAL;
4144
4145        req->sync.flags = READ_ONCE(sqe->fsync_flags);
4146        if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4147                return -EINVAL;
4148
4149        req->sync.off = READ_ONCE(sqe->off);
4150        req->sync.len = READ_ONCE(sqe->len);
4151        return 0;
4152}
4153
4154static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
4155{
4156        loff_t end = req->sync.off + req->sync.len;
4157        int ret;
4158
4159        /* fsync always requires a blocking context */
4160        if (issue_flags & IO_URING_F_NONBLOCK)
4161                return -EAGAIN;
4162
4163        ret = vfs_fsync_range(req->file, req->sync.off,
4164                                end > 0 ? end : LLONG_MAX,
4165                                req->sync.flags & IORING_FSYNC_DATASYNC);
4166        if (ret < 0)
4167                req_set_fail(req);
4168        io_req_complete(req, ret);
4169        return 0;
4170}
4171
4172static int io_fallocate_prep(struct io_kiocb *req,
4173                             const struct io_uring_sqe *sqe)
4174{
4175        if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4176            sqe->splice_fd_in)
4177                return -EINVAL;
4178        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4179                return -EINVAL;
4180
4181        req->sync.off = READ_ONCE(sqe->off);
4182        req->sync.len = READ_ONCE(sqe->addr);
4183        req->sync.mode = READ_ONCE(sqe->len);
4184        return 0;
4185}
4186
4187static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
4188{
4189        int ret;
4190
4191        /* fallocate always requires a blocking context */
4192        if (issue_flags & IO_URING_F_NONBLOCK)
4193                return -EAGAIN;
4194        ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4195                                req->sync.len);
4196        if (ret < 0)
4197                req_set_fail(req);
4198        io_req_complete(req, ret);
4199        return 0;
4200}
4201
4202static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4203{
4204        const char __user *fname;
4205        int ret;
4206
4207        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4208                return -EINVAL;
4209        if (unlikely(sqe->ioprio || sqe->buf_index))
4210                return -EINVAL;
4211        if (unlikely(req->flags & REQ_F_FIXED_FILE))
4212                return -EBADF;
4213
4214        /* open.how should be already initialised */
4215        if (!(req->open.how.flags & O_PATH) && force_o_largefile())
4216                req->open.how.flags |= O_LARGEFILE;
4217
4218        req->open.dfd = READ_ONCE(sqe->fd);
4219        fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4220        req->open.filename = getname(fname);
4221        if (IS_ERR(req->open.filename)) {
4222                ret = PTR_ERR(req->open.filename);
4223                req->open.filename = NULL;
4224                return ret;
4225        }
4226
4227        req->open.file_slot = READ_ONCE(sqe->file_index);
4228        if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4229                return -EINVAL;
4230
4231        req->open.nofile = rlimit(RLIMIT_NOFILE);
4232        req->flags |= REQ_F_NEED_CLEANUP;
4233        return 0;
4234}
4235
4236static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4237{
4238        u64 mode = READ_ONCE(sqe->len);
4239        u64 flags = READ_ONCE(sqe->open_flags);
4240
4241        req->open.how = build_open_how(flags, mode);
4242        return __io_openat_prep(req, sqe);
4243}
4244
4245static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4246{
4247        struct open_how __user *how;
4248        size_t len;
4249        int ret;
4250
4251        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4252        len = READ_ONCE(sqe->len);
4253        if (len < OPEN_HOW_SIZE_VER0)
4254                return -EINVAL;
4255
4256        ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4257                                        len);
4258        if (ret)
4259                return ret;
4260
4261        return __io_openat_prep(req, sqe);
4262}
4263
4264static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
4265{
4266        struct open_flags op;
4267        struct file *file;
4268        bool resolve_nonblock, nonblock_set;
4269        bool fixed = !!req->open.file_slot;
4270        int ret;
4271
4272        ret = build_open_flags(&req->open.how, &op);
4273        if (ret)
4274                goto err;
4275        nonblock_set = op.open_flag & O_NONBLOCK;
4276        resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
4277        if (issue_flags & IO_URING_F_NONBLOCK) {
4278                /*
4279                 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4280                 * it'll always -EAGAIN
4281                 */
4282                if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
4283                        return -EAGAIN;
4284                op.lookup_flags |= LOOKUP_CACHED;
4285                op.open_flag |= O_NONBLOCK;
4286        }
4287
4288        if (!fixed) {
4289                ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4290                if (ret < 0)
4291                        goto err;
4292        }
4293
4294        file = do_filp_open(req->open.dfd, req->open.filename, &op);
4295        if (IS_ERR(file)) {
4296                /*
4297                 * We could hang on to this 'fd' on retrying, but seems like
4298                 * marginal gain for something that is now known to be a slower
4299                 * path. So just put it, and we'll get a new one when we retry.
4300                 */
4301                if (!fixed)
4302                        put_unused_fd(ret);
4303
4304                ret = PTR_ERR(file);
4305                /* only retry if RESOLVE_CACHED wasn't already set by application */
4306                if (ret == -EAGAIN &&
4307                    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4308                        return -EAGAIN;
4309                goto err;
4310        }
4311
4312        if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4313                file->f_flags &= ~O_NONBLOCK;
4314        fsnotify_open(file);
4315
4316        if (!fixed)
4317                fd_install(ret, file);
4318        else
4319                ret = io_install_fixed_file(req, file, issue_flags,
4320                                            req->open.file_slot - 1);
4321err:
4322        putname(req->open.filename);
4323        req->flags &= ~REQ_F_NEED_CLEANUP;
4324        if (ret < 0)
4325                req_set_fail(req);
4326        __io_req_complete(req, issue_flags, ret, 0);
4327        return 0;
4328}
4329
4330static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
4331{
4332        return io_openat2(req, issue_flags);
4333}
4334
4335static int io_remove_buffers_prep(struct io_kiocb *req,
4336                                  const struct io_uring_sqe *sqe)
4337{
4338        struct io_provide_buf *p = &req->pbuf;
4339        u64 tmp;
4340
4341        if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4342            sqe->splice_fd_in)
4343                return -EINVAL;
4344
4345        tmp = READ_ONCE(sqe->fd);
4346        if (!tmp || tmp > USHRT_MAX)
4347                return -EINVAL;
4348
4349        memset(p, 0, sizeof(*p));
4350        p->nbufs = tmp;
4351        p->bgid = READ_ONCE(sqe->buf_group);
4352        return 0;
4353}
4354
4355static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4356                               int bgid, unsigned nbufs)
4357{
4358        unsigned i = 0;
4359
4360        /* shouldn't happen */
4361        if (!nbufs)
4362                return 0;
4363
4364        /* the head kbuf is the list itself */
4365        while (!list_empty(&buf->list)) {
4366                struct io_buffer *nxt;
4367
4368                nxt = list_first_entry(&buf->list, struct io_buffer, list);
4369                list_del(&nxt->list);
4370                kfree(nxt);
4371                if (++i == nbufs)
4372                        return i;
4373                cond_resched();
4374        }
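            /* group list exhausted: the head node is the final buffer to free */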
4375        i++;
4376        kfree(buf);
4377        xa_erase(&ctx->io_buffers, bgid);
4378
4379        return i;
4380}
4381
4382static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4383{
4384        struct io_provide_buf *p = &req->pbuf;
4385        struct io_ring_ctx *ctx = req->ctx;
4386        struct io_buffer *head;
4387        int ret = 0;
4388        bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
4389
4390        io_ring_submit_lock(ctx, needs_lock);
4391
4392        lockdep_assert_held(&ctx->uring_lock);
4393
4394        ret = -ENOENT;
4395        head = xa_load(&ctx->io_buffers, p->bgid);
4396        if (head)
4397                ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
4398        if (ret < 0)
4399                req_set_fail(req);
4400
4401        /* complete before unlock, IOPOLL may need the lock */
4402        __io_req_complete(req, issue_flags, ret, 0);
4403        io_ring_submit_unlock(ctx, needs_lock);
4404        return 0;
4405}
4406
4407static int io_provide_buffers_prep(struct io_kiocb *req,
4408                                   const struct io_uring_sqe *sqe)
4409{
4410        unsigned long size, tmp_check;
4411        struct io_provide_buf *p = &req->pbuf;
4412        u64 tmp;
4413
4414        if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
4415                return -EINVAL;
4416
4417        tmp = READ_ONCE(sqe->fd);
4418        if (!tmp || tmp > USHRT_MAX)
4419                return -E2BIG;
4420        p->nbufs = tmp;
4421        p->addr = READ_ONCE(sqe->addr);
4422        p->len = READ_ONCE(sqe->len);
4423
4424        if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4425                                &size))
4426                return -EOVERFLOW;
4427        if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4428                return -EOVERFLOW;
4429
4430        size = (unsigned long)p->len * p->nbufs;
4431        if (!access_ok(u64_to_user_ptr(p->addr), size))
4432                return -EFAULT;
4433
4434        p->bgid = READ_ONCE(sqe->buf_group);
4435        tmp = READ_ONCE(sqe->off);
4436        if (tmp > USHRT_MAX)
4437                return -E2BIG;
4438        p->bid = tmp;
4439        return 0;
4440}
4441
4442static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4443{
4444        struct io_buffer *buf;
4445        u64 addr = pbuf->addr;
4446        int i, bid = pbuf->bid;
4447
4448        for (i = 0; i < pbuf->nbufs; i++) {
4449                buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
4450                if (!buf)
4451                        break;
4452
4453                buf->addr = addr;
4454                buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4455                buf->bid = bid;
4456                addr += pbuf->len;
4457                bid++;
4458                if (!*head) {
4459                        INIT_LIST_HEAD(&buf->list);
4460                        *head = buf;
4461                } else {
4462                        list_add_tail(&buf->list, &(*head)->list);
4463                }
4464        }
4465
4466        return i ? i : -ENOMEM;
4467}
4468
4469static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4470{
4471        struct io_provide_buf *p = &req->pbuf;
4472        struct io_ring_ctx *ctx = req->ctx;
4473        struct io_buffer *head, *list;
4474        int ret = 0;
4475        bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
4476
4477        io_ring_submit_lock(ctx, needs_lock);
4478
4479        lockdep_assert_held(&ctx->uring_lock);
4480
4481        list = head = xa_load(&ctx->io_buffers, p->bgid);
4482
4483        ret = io_add_buffers(p, &head);
4484        if (ret >= 0 && !list) {
4485                ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
4486                if (ret < 0)
4487                        __io_remove_buffers(ctx, head, p->bgid, -1U);
4488        }
4489        if (ret < 0)
4490                req_set_fail(req);
4491        /* complete before unlock, IOPOLL may need the lock */
4492        __io_req_complete(req, issue_flags, ret, 0);
4493        io_ring_submit_unlock(ctx, needs_lock);
4494        return 0;
4495}
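    /*
     * A rough userspace sketch, assuming the liburing helpers (see its
     * documentation for the exact API): buffers are published with
     * io_uring_prep_provide_buffers(sqe, bufs, buf_len, nr_bufs, bgid, 0)
     * and consumed by submitting e.g. a read with IOSQE_BUFFER_SELECT set
     * and sqe->buf_group = bgid; the kernel reports the chosen buffer ID in
     * cqe->flags (IORING_CQE_F_BUFFER, shifted by IORING_CQE_BUFFER_SHIFT).
     */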
4496
4497static int io_epoll_ctl_prep(struct io_kiocb *req,
4498                             const struct io_uring_sqe *sqe)
4499{
4500#if defined(CONFIG_EPOLL)
4501        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4502                return -EINVAL;
4503        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4504                return -EINVAL;
4505
4506        req->epoll.epfd = READ_ONCE(sqe->fd);
4507        req->epoll.op = READ_ONCE(sqe->len);
4508        req->epoll.fd = READ_ONCE(sqe->off);
4509
4510        if (ep_op_has_event(req->epoll.op)) {
4511                struct epoll_event __user *ev;
4512
4513                ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4514                if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4515                        return -EFAULT;
4516        }
4517
4518        return 0;
4519#else
4520        return -EOPNOTSUPP;
4521#endif
4522}
4523
4524static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4525{
4526#if defined(CONFIG_EPOLL)
4527        struct io_epoll *ie = &req->epoll;
4528        int ret;
4529        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4530
4531        ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4532        if (force_nonblock && ret == -EAGAIN)
4533                return -EAGAIN;
4534
4535        if (ret < 0)
4536                req_set_fail(req);
4537        __io_req_complete(req, issue_flags, ret, 0);
4538        return 0;
4539#else
4540        return -EOPNOTSUPP;
4541#endif
4542}
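
/*
 * Illustrative sketch (not from this file): SQE layout for IORING_OP_EPOLL_CTL
 * as consumed by io_epoll_ctl_prep() above. "epfd", "sockfd" and "ev" (a
 * struct epoll_event in userspace) are assumed application objects.
 *
 *      sqe->opcode = IORING_OP_EPOLL_CTL;
 *      sqe->fd     = epfd;                     // epoll instance
 *      sqe->len    = EPOLL_CTL_ADD;            // epoll_ctl() op
 *      sqe->off    = sockfd;                   // target fd
 *      sqe->addr   = (__u64)(uintptr_t)&ev;    // struct epoll_event *
 */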
4543
4544static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4545{
4546#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4547        if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4548                return -EINVAL;
4549        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4550                return -EINVAL;
4551
4552        req->madvise.addr = READ_ONCE(sqe->addr);
4553        req->madvise.len = READ_ONCE(sqe->len);
4554        req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4555        return 0;
4556#else
4557        return -EOPNOTSUPP;
4558#endif
4559}
4560
4561static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4562{
4563#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4564        struct io_madvise *ma = &req->madvise;
4565        int ret;
4566
4567        if (issue_flags & IO_URING_F_NONBLOCK)
4568                return -EAGAIN;
4569
4570        ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4571        if (ret < 0)
4572                req_set_fail(req);
4573        io_req_complete(req, ret);
4574        return 0;
4575#else
4576        return -EOPNOTSUPP;
4577#endif
4578}
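
/*
 * Illustrative sketch (not kernel code): SQE layout for IORING_OP_MADVISE as
 * consumed by io_madvise_prep() above; "buf" is an assumed mmap()ed region
 * owned by the application.
 *
 *      sqe->opcode         = IORING_OP_MADVISE;
 *      sqe->addr           = (__u64)(uintptr_t)buf;    // start of the range
 *      sqe->len            = 1 << 20;                  // length of the range
 *      sqe->fadvise_advice = MADV_DONTNEED;            // madvise() advice value
 */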
4579
4580static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4581{
4582        if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4583                return -EINVAL;
4584        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4585                return -EINVAL;
4586
4587        req->fadvise.offset = READ_ONCE(sqe->off);
4588        req->fadvise.len = READ_ONCE(sqe->len);
4589        req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4590        return 0;
4591}
4592
4593static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4594{
4595        struct io_fadvise *fa = &req->fadvise;
4596        int ret;
4597
4598        if (issue_flags & IO_URING_F_NONBLOCK) {
4599                switch (fa->advice) {
4600                case POSIX_FADV_NORMAL:
4601                case POSIX_FADV_RANDOM:
4602                case POSIX_FADV_SEQUENTIAL:
4603                        break;
4604                default:
4605                        return -EAGAIN;
4606                }
4607        }
4608
4609        ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4610        if (ret < 0)
4611                req_set_fail(req);
4612        __io_req_complete(req, issue_flags, ret, 0);
4613        return 0;
4614}
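
/*
 * Illustrative sketch (not kernel code): SQE layout for IORING_OP_FADVISE as
 * consumed by io_fadvise_prep() above; "fd" is an assumed open descriptor.
 * POSIX_FADV_SEQUENTIAL is one of the advice values served inline even for
 * nonblocking issue, per the switch above.
 *
 *      sqe->opcode         = IORING_OP_FADVISE;
 *      sqe->fd             = fd;
 *      sqe->off            = 0;                // start of the range
 *      sqe->len            = 0;                // 0 means "to end of file"
 *      sqe->fadvise_advice = POSIX_FADV_SEQUENTIAL;
 */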
4615
4616static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4617{
4618        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4619                return -EINVAL;
4620        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4621                return -EINVAL;
4622        if (req->flags & REQ_F_FIXED_FILE)
4623                return -EBADF;
4624
4625        req->statx.dfd = READ_ONCE(sqe->fd);
4626        req->statx.mask = READ_ONCE(sqe->len);
4627        req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4628        req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4629        req->statx.flags = READ_ONCE(sqe->statx_flags);
4630
4631        return 0;
4632}
4633
4634static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4635{
4636        struct io_statx *ctx = &req->statx;
4637        int ret;
4638
4639        if (issue_flags & IO_URING_F_NONBLOCK)
4640                return -EAGAIN;
4641
4642        ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4643                       ctx->buffer);
4644
4645        if (ret < 0)
4646                req_set_fail(req);
4647        io_req_complete(req, ret);
4648        return 0;
4649}
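
/*
 * Illustrative sketch (not from this file): SQE layout for IORING_OP_STATX as
 * consumed by io_statx_prep() above. "path" and "stx" (a struct statx) are
 * assumed userspace objects that must stay valid until completion.
 *
 *      sqe->opcode      = IORING_OP_STATX;
 *      sqe->fd          = AT_FDCWD;                    // dfd for relative paths
 *      sqe->addr        = (__u64)(uintptr_t)path;      // const char *pathname
 *      sqe->len         = STATX_BASIC_STATS;           // request mask
 *      sqe->addr2       = (__u64)(uintptr_t)&stx;      // struct statx *buffer
 *      sqe->statx_flags = AT_STATX_SYNC_AS_STAT;
 */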
4650
4651static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4652{
4653        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4654                return -EINVAL;
4655        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4656            sqe->rw_flags || sqe->buf_index)
4657                return -EINVAL;
4658        if (req->flags & REQ_F_FIXED_FILE)
4659                return -EBADF;
4660
4661        req->close.fd = READ_ONCE(sqe->fd);
4662        req->close.file_slot = READ_ONCE(sqe->file_index);
4663        if (req->close.file_slot && req->close.fd)
4664                return -EINVAL;
4665
4666        return 0;
4667}
4668
4669static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4670{
4671        struct files_struct *files = current->files;
4672        struct io_close *close = &req->close;
4673        struct fdtable *fdt;
4674        struct file *file = NULL;
4675        int ret = -EBADF;
4676
4677        if (req->close.file_slot) {
4678                ret = io_close_fixed(req, issue_flags);
4679                goto err;
4680        }
4681
4682        spin_lock(&files->file_lock);
4683        fdt = files_fdtable(files);
4684        if (close->fd >= fdt->max_fds) {
4685                spin_unlock(&files->file_lock);
4686                goto err;
4687        }
4688        file = fdt->fd[close->fd];
4689        if (!file || file->f_op == &io_uring_fops) {
4690                spin_unlock(&files->file_lock);
4691                file = NULL;
4692                goto err;
4693        }
4694
4695        /* if the file has a flush method, be safe and punt to async */
4696        if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4697                spin_unlock(&files->file_lock);
4698                return -EAGAIN;
4699        }
4700
4701        ret = __close_fd_get_file(close->fd, &file);
4702        spin_unlock(&files->file_lock);
4703        if (ret < 0) {
4704                if (ret == -ENOENT)
4705                        ret = -EBADF;
4706                goto err;
4707        }
4708
4709        /* No ->flush() or already async, safely close from here */
4710        ret = filp_close(file, current->files);
4711err:
4712        if (ret < 0)
4713                req_set_fail(req);
4714        if (file)
4715                fput(file);
4716        __io_req_complete(req, issue_flags, ret, 0);
4717        return 0;
4718}
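
/*
 * Illustrative sketch (not kernel code): SQE layouts for IORING_OP_CLOSE as
 * consumed by io_close_prep() above. A request closes either a regular fd or
 * a fixed file slot, never both; "fd" and "slot" are assumed values, and
 * fixed slots are passed offset by one (0 means "no fixed slot").
 *
 *      sqe->opcode     = IORING_OP_CLOSE;
 *      sqe->fd         = fd;                   // close a normal descriptor
 *
 *      sqe->opcode     = IORING_OP_CLOSE;
 *      sqe->file_index = slot + 1;             // close fixed file table slot "slot"
 */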
4719
4720static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4721{
4722        struct io_ring_ctx *ctx = req->ctx;
4723
4724        if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4725                return -EINVAL;
4726        if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4727                     sqe->splice_fd_in))
4728                return -EINVAL;
4729
4730        req->sync.off = READ_ONCE(sqe->off);
4731        req->sync.len = READ_ONCE(sqe->len);
4732        req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4733        return 0;
4734}
4735
4736static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4737{
4738        int ret;
4739
4740        /* sync_file_range always requires a blocking context */
4741        if (issue_flags & IO_URING_F_NONBLOCK)
4742                return -EAGAIN;
4743
4744        ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4745                                req->sync.flags);
4746        if (ret < 0)
4747                req_set_fail(req);
4748        io_req_complete(req, ret);
4749        return 0;
4750}
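
/*
 * Illustrative sketch (not from this file): SQE layout for
 * IORING_OP_SYNC_FILE_RANGE as consumed by io_sfr_prep() above; "fd" is an
 * assumed open descriptor.
 *
 *      sqe->opcode           = IORING_OP_SYNC_FILE_RANGE;
 *      sqe->fd               = fd;
 *      sqe->off              = 0;              // start offset
 *      sqe->len              = 1 << 20;        // number of bytes (0 = to EOF)
 *      sqe->sync_range_flags = SYNC_FILE_RANGE_WRITE;
 */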
4751
4752#if defined(CONFIG_NET)
4753static int io_setup_async_msg(struct io_kiocb *req,
4754                              struct io_async_msghdr *kmsg)
4755{
4756        struct io_async_msghdr *async_msg = req->async_data;
4757
4758        if (async_msg)
4759                return -EAGAIN;
4760        if (io_alloc_async_data(req)) {
4761                kfree(kmsg->free_iov);
4762                return -ENOMEM;
4763        }
4764        async_msg = req->async_data;
4765        req->flags |= REQ_F_NEED_CLEANUP;
4766        memcpy(async_msg, kmsg, sizeof(*kmsg));
4767        async_msg->msg.msg_name = &async_msg->addr;
4768        /* if we're using fast_iov, set it to the new one */
4769        if (!async_msg->free_iov)
4770                async_msg->msg.msg_iter.iov = async_msg->fast_iov;
4771
4772        return -EAGAIN;
4773}
4774
4775static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4776                               struct io_async_msghdr *iomsg)
4777{
4778        iomsg->msg.msg_name = &iomsg->addr;
4779        iomsg->free_iov = iomsg->fast_iov;
4780        return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4781                                   req->sr_msg.msg_flags, &iomsg->free_iov);
4782}
4783
4784static int io_sendmsg_prep_async(struct io_kiocb *req)
4785{
4786        int ret;
4787
4788        ret = io_sendmsg_copy_hdr(req, req->async_data);
4789        if (!ret)
4790                req->flags |= REQ_F_NEED_CLEANUP;
4791        return ret;
4792}
4793
4794static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4795{
4796        struct io_sr_msg *sr = &req->sr_msg;
4797
4798        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4799                return -EINVAL;
4800
4801        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4802        sr->len = READ_ONCE(sqe->len);
4803        sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4804        if (sr->msg_flags & MSG_DONTWAIT)
4805                req->flags |= REQ_F_NOWAIT;
4806
4807#ifdef CONFIG_COMPAT
4808        if (req->ctx->compat)
4809                sr->msg_flags |= MSG_CMSG_COMPAT;
4810#endif
4811        return 0;
4812}
4813
4814static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4815{
4816        struct io_async_msghdr iomsg, *kmsg;
4817        struct socket *sock;
4818        unsigned flags;
4819        int min_ret = 0;
4820        int ret;
4821
4822        sock = sock_from_file(req->file);
4823        if (unlikely(!sock))
4824                return -ENOTSOCK;
4825
4826        if (req_has_async_data(req)) {
4827                kmsg = req->async_data;
4828        } else {
4829                ret = io_sendmsg_copy_hdr(req, &iomsg);
4830                if (ret)
4831                        return ret;
4832                kmsg = &iomsg;
4833        }
4834
4835        flags = req->sr_msg.msg_flags;
4836        if (issue_flags & IO_URING_F_NONBLOCK)
4837                flags |= MSG_DONTWAIT;
4838        if (flags & MSG_WAITALL)
4839                min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4840
4841        ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4842        if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4843                return io_setup_async_msg(req, kmsg);
4844        if (ret == -ERESTARTSYS)
4845                ret = -EINTR;
4846
4847        /* fast path, check for non-NULL to avoid function call */
4848        if (kmsg->free_iov)
4849                kfree(kmsg->free_iov);
4850        req->flags &= ~REQ_F_NEED_CLEANUP;
4851        if (ret < min_ret)
4852                req_set_fail(req);
4853        __io_req_complete(req, issue_flags, ret, 0);
4854        return 0;
4855}
4856
4857static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4858{
4859        struct io_sr_msg *sr = &req->sr_msg;
4860        struct msghdr msg;
4861        struct iovec iov;
4862        struct socket *sock;
4863        unsigned flags;
4864        int min_ret = 0;
4865        int ret;
4866
4867        sock = sock_from_file(req->file);
4868        if (unlikely(!sock))
4869                return -ENOTSOCK;
4870
4871        ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4872        if (unlikely(ret))
4873                return ret;
4874
4875        msg.msg_name = NULL;
4876        msg.msg_control = NULL;
4877        msg.msg_controllen = 0;
4878        msg.msg_namelen = 0;
4879
4880        flags = req->sr_msg.msg_flags;
4881        if (issue_flags & IO_URING_F_NONBLOCK)
4882                flags |= MSG_DONTWAIT;
4883        if (flags & MSG_WAITALL)
4884                min_ret = iov_iter_count(&msg.msg_iter);
4885
4886        msg.msg_flags = flags;
4887        ret = sock_sendmsg(sock, &msg);
4888        if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN)
4889                return -EAGAIN;
4890        if (ret == -ERESTARTSYS)
4891                ret = -EINTR;
4892
4893        if (ret < min_ret)
4894                req_set_fail(req);
4895        __io_req_complete(req, issue_flags, ret, 0);
4896        return 0;
4897}
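
/*
 * Illustrative sketch (not kernel code): SQE layout for IORING_OP_SEND,
 * matching io_sendmsg_prep() above (which is shared with IORING_OP_SENDMSG,
 * where ->addr points at a struct msghdr instead). "sockfd", "buf" and
 * "buflen" are assumed application values.
 *
 *      sqe->opcode    = IORING_OP_SEND;
 *      sqe->fd        = sockfd;
 *      sqe->addr      = (__u64)(uintptr_t)buf; // data to send
 *      sqe->len       = buflen;
 *      sqe->msg_flags = 0;                     // MSG_NOSIGNAL is added by prep
 */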
4898
4899static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4900                                 struct io_async_msghdr *iomsg)
4901{
4902        struct io_sr_msg *sr = &req->sr_msg;
4903        struct iovec __user *uiov;
4904        size_t iov_len;
4905        int ret;
4906
4907        ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4908                                        &iomsg->uaddr, &uiov, &iov_len);
4909        if (ret)
4910                return ret;
4911
4912        if (req->flags & REQ_F_BUFFER_SELECT) {
4913                if (iov_len > 1)
4914                        return -EINVAL;
4915                if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4916                        return -EFAULT;
4917                sr->len = iomsg->fast_iov[0].iov_len;
4918                iomsg->free_iov = NULL;
4919        } else {
4920                iomsg->free_iov = iomsg->fast_iov;
4921                ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4922                                     &iomsg->free_iov, &iomsg->msg.msg_iter,
4923                                     false);
4924                if (ret > 0)
4925                        ret = 0;
4926        }
4927
4928        return ret;
4929}
4930
4931#ifdef CONFIG_COMPAT
4932static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4933                                        struct io_async_msghdr *iomsg)
4934{
4935        struct io_sr_msg *sr = &req->sr_msg;
4936        struct compat_iovec __user *uiov;
4937        compat_uptr_t ptr;
4938        compat_size_t len;
4939        int ret;
4940
4941        ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4942                                  &ptr, &len);
4943        if (ret)
4944                return ret;
4945
4946        uiov = compat_ptr(ptr);
4947        if (req->flags & REQ_F_BUFFER_SELECT) {
4948                compat_ssize_t clen;
4949
4950                if (len > 1)
4951                        return -EINVAL;
4952                if (!access_ok(uiov, sizeof(*uiov)))
4953                        return -EFAULT;
4954                if (__get_user(clen, &uiov->iov_len))
4955                        return -EFAULT;
4956                if (clen < 0)
4957                        return -EINVAL;
4958                sr->len = clen;
4959                iomsg->free_iov = NULL;
4960        } else {
4961                iomsg->free_iov = iomsg->fast_iov;
4962                ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4963                                   UIO_FASTIOV, &iomsg->free_iov,
4964                                   &iomsg->msg.msg_iter, true);
4965                if (ret < 0)
4966                        return ret;
4967        }
4968
4969        return 0;
4970}
4971#endif
4972
4973static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4974                               struct io_async_msghdr *iomsg)
4975{
4976        iomsg->msg.msg_name = &iomsg->addr;
4977
4978#ifdef CONFIG_COMPAT
4979        if (req->ctx->compat)
4980                return __io_compat_recvmsg_copy_hdr(req, iomsg);
4981#endif
4982
4983        return __io_recvmsg_copy_hdr(req, iomsg);
4984}
4985
4986static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4987                                               unsigned int issue_flags)
4988{
4989        struct io_sr_msg *sr = &req->sr_msg;
4990
4991        return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
4992}
4993
4994static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4995{
4996        return io_put_kbuf(req, req->kbuf);
4997}
4998
4999static int io_recvmsg_prep_async(struct io_kiocb *req)
5000{
5001        int ret;
5002
5003        ret = io_recvmsg_copy_hdr(req, req->async_data);
5004        if (!ret)
5005                req->flags |= REQ_F_NEED_CLEANUP;
5006        return ret;
5007}
5008
5009static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5010{
5011        struct io_sr_msg *sr = &req->sr_msg;
5012
5013        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5014                return -EINVAL;
5015
5016        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
5017        sr->len = READ_ONCE(sqe->len);
5018        sr->bgid = READ_ONCE(sqe->buf_group);
5019        sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
5020        if (sr->msg_flags & MSG_DONTWAIT)
5021                req->flags |= REQ_F_NOWAIT;
5022
5023#ifdef CONFIG_COMPAT
5024        if (req->ctx->compat)
5025                sr->msg_flags |= MSG_CMSG_COMPAT;
5026#endif
5027        return 0;
5028}
5029
5030static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
5031{
5032        struct io_async_msghdr iomsg, *kmsg;
5033        struct socket *sock;
5034        struct io_buffer *kbuf;
5035        unsigned flags;
5036        int min_ret = 0;
5037        int ret, cflags = 0;
5038        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5039
5040        sock = sock_from_file(req->file);
5041        if (unlikely(!sock))
5042                return -ENOTSOCK;
5043
5044        if (req_has_async_data(req)) {
5045                kmsg = req->async_data;
5046        } else {
5047                ret = io_recvmsg_copy_hdr(req, &iomsg);
5048                if (ret)
5049                        return ret;
5050                kmsg = &iomsg;
5051        }
5052
5053        if (req->flags & REQ_F_BUFFER_SELECT) {
5054                kbuf = io_recv_buffer_select(req, issue_flags);
5055                if (IS_ERR(kbuf))
5056                        return PTR_ERR(kbuf);
5057                kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5058                kmsg->fast_iov[0].iov_len = req->sr_msg.len;
5059                iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
5060                                1, req->sr_msg.len);
5061        }
5062
5063        flags = req->sr_msg.msg_flags;
5064        if (force_nonblock)
5065                flags |= MSG_DONTWAIT;
5066        if (flags & MSG_WAITALL)
5067                min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5068
5069        ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
5070                                        kmsg->uaddr, flags);
5071        if (force_nonblock && ret == -EAGAIN)
5072                return io_setup_async_msg(req, kmsg);
5073        if (ret == -ERESTARTSYS)
5074                ret = -EINTR;
5075
5076        if (req->flags & REQ_F_BUFFER_SELECTED)
5077                cflags = io_put_recv_kbuf(req);
5078        /* fast path, check for non-NULL to avoid function call */
5079        if (kmsg->free_iov)
5080                kfree(kmsg->free_iov);
5081        req->flags &= ~REQ_F_NEED_CLEANUP;
5082        if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
5083                req_set_fail(req);
5084        __io_req_complete(req, issue_flags, ret, cflags);
5085        return 0;
5086}
5087
5088static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
5089{
5090        struct io_buffer *kbuf;
5091        struct io_sr_msg *sr = &req->sr_msg;
5092        struct msghdr msg;
5093        void __user *buf = sr->buf;
5094        struct socket *sock;
5095        struct iovec iov;
5096        unsigned flags;
5097        int min_ret = 0;
5098        int ret, cflags = 0;
5099        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5100
5101        sock = sock_from_file(req->file);
5102        if (unlikely(!sock))
5103                return -ENOTSOCK;
5104
5105        if (req->flags & REQ_F_BUFFER_SELECT) {
5106                kbuf = io_recv_buffer_select(req, issue_flags);
5107                if (IS_ERR(kbuf))
5108                        return PTR_ERR(kbuf);
5109                buf = u64_to_user_ptr(kbuf->addr);
5110        }
5111
5112        ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
5113        if (unlikely(ret))
5114                goto out_free;
5115
5116        msg.msg_name = NULL;
5117        msg.msg_control = NULL;
5118        msg.msg_controllen = 0;
5119        msg.msg_namelen = 0;
5120        msg.msg_iocb = NULL;
5121        msg.msg_flags = 0;
5122
5123        flags = req->sr_msg.msg_flags;
5124        if (force_nonblock)
5125                flags |= MSG_DONTWAIT;
5126        if (flags & MSG_WAITALL)
5127                min_ret = iov_iter_count(&msg.msg_iter);
5128
5129        ret = sock_recvmsg(sock, &msg, flags);
5130        if (force_nonblock && ret == -EAGAIN)
5131                return -EAGAIN;
5132        if (ret == -ERESTARTSYS)
5133                ret = -EINTR;
5134out_free:
5135        if (req->flags & REQ_F_BUFFER_SELECTED)
5136                cflags = io_put_recv_kbuf(req);
5137        if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))))
5138                req_set_fail(req);
5139        __io_req_complete(req, issue_flags, ret, cflags);
5140        return 0;
5141}
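
/*
 * Illustrative sketch (not from this file): SQE layout for IORING_OP_RECV
 * with provided buffers, matching io_recvmsg_prep() (shared with RECV) and
 * the REQ_F_BUFFER_SELECT path above. The buffer group is assumed to have
 * been populated earlier via IORING_OP_PROVIDE_BUFFERS; "sockfd" is an
 * assumed socket.
 *
 *      sqe->opcode    = IORING_OP_RECV;
 *      sqe->fd        = sockfd;
 *      sqe->addr      = 0;             // kernel picks a buffer from the group
 *      sqe->len       = 4096;          // max to receive, capped to buffer size
 *      sqe->buf_group = 7;             // bgid used with PROVIDE_BUFFERS
 *      sqe->flags     = IOSQE_BUFFER_SELECT;
 *
 * On completion the chosen buffer ID is reported in the CQE: cqe->flags has
 * IORING_CQE_F_BUFFER set and the bid in the bits above
 * IORING_CQE_BUFFER_SHIFT.
 */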
5142
5143static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5144{
5145        struct io_accept *accept = &req->accept;
5146
5147        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5148                return -EINVAL;
5149        if (sqe->ioprio || sqe->len || sqe->buf_index)
5150                return -EINVAL;
5151
5152        accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5153        accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5154        accept->flags = READ_ONCE(sqe->accept_flags);
5155        accept->nofile = rlimit(RLIMIT_NOFILE);
5156
5157        accept->file_slot = READ_ONCE(sqe->file_index);
5158        if (accept->file_slot &&
5159            (accept->flags & SOCK_CLOEXEC))
5160                return -EINVAL;
5161        if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5162                return -EINVAL;
5163        if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5164                accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
5165        return 0;
5166}
5167
5168static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
5169{
5170        struct io_accept *accept = &req->accept;
5171        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5172        unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
5173        bool fixed = !!accept->file_slot;
5174        struct file *file;
5175        int ret, fd;
5176
5177        if (req->file->f_flags & O_NONBLOCK)
5178                req->flags |= REQ_F_NOWAIT;
5179
5180        if (!fixed) {
5181                fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5182                if (unlikely(fd < 0))
5183                        return fd;
5184        }
5185        file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5186                         accept->flags);
5187        if (IS_ERR(file)) {
5188                if (!fixed)
5189                        put_unused_fd(fd);
5190                ret = PTR_ERR(file);
5191                if (ret == -EAGAIN && force_nonblock)
5192                        return -EAGAIN;
5193                if (ret == -ERESTARTSYS)
5194                        ret = -EINTR;
5195                req_set_fail(req);
5196        } else if (!fixed) {
5197                fd_install(fd, file);
5198                ret = fd;
5199        } else {
5200                ret = io_install_fixed_file(req, file, issue_flags,
5201                                            accept->file_slot - 1);
5202        }
5203        __io_req_complete(req, issue_flags, ret, 0);
5204        return 0;
5205}
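
/*
 * Illustrative sketch (not kernel code): SQE layout for IORING_OP_ACCEPT as
 * consumed by io_accept_prep() above. "listen_fd" is an assumed listening
 * socket; the address pointers may be left at 0 if the peer address is not
 * needed.
 *
 *      sqe->opcode       = IORING_OP_ACCEPT;
 *      sqe->fd           = listen_fd;
 *      sqe->addr         = 0;                  // struct sockaddr * (optional)
 *      sqe->addr2        = 0;                  // socklen_t * (optional)
 *      sqe->accept_flags = SOCK_NONBLOCK;
 *
 * Setting sqe->file_index to slot + 1 instead installs the accepted file
 * directly into the fixed file table (the "fixed" branch above); SOCK_CLOEXEC
 * cannot be combined with that.
 */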
5206
5207static int io_connect_prep_async(struct io_kiocb *req)
5208{
5209        struct io_async_connect *io = req->async_data;
5210        struct io_connect *conn = &req->connect;
5211
5212        return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5213}
5214
5215static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5216{
5217        struct io_connect *conn = &req->connect;
5218
5219        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5220                return -EINVAL;
5221        if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5222            sqe->splice_fd_in)
5223                return -EINVAL;
5224
5225        conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5226        conn->addr_len = READ_ONCE(sqe->addr2);
5227        return 0;
5228}
5229
5230static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
5231{
5232        struct io_async_connect __io, *io;
5233        unsigned file_flags;
5234        int ret;
5235        bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5236
5237        if (req_has_async_data(req)) {
5238                io = req->async_data;
5239        } else {
5240                ret = move_addr_to_kernel(req->connect.addr,
5241                                                req->connect.addr_len,
5242                                                &__io.address);
5243                if (ret)
5244                        goto out;
5245                io = &__io;
5246        }
5247
5248        file_flags = force_nonblock ? O_NONBLOCK : 0;
5249
5250        ret = __sys_connect_file(req->file, &io->address,
5251                                        req->connect.addr_len, file_flags);
5252        if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
5253                if (req_has_async_data(req))
5254                        return -EAGAIN;
5255                if (io_alloc_async_data(req)) {
5256                        ret = -ENOMEM;
5257                        goto out;
5258                }
5259                memcpy(req->async_data, &__io, sizeof(__io));
5260                return -EAGAIN;
5261        }
5262        if (ret == -ERESTARTSYS)
5263                ret = -EINTR;
5264out:
5265        if (ret < 0)
5266                req_set_fail(req);
5267        __io_req_complete(req, issue_flags, ret, 0);
5268        return 0;
5269}
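
/*
 * Illustrative sketch (not from this file): SQE layout for IORING_OP_CONNECT
 * as consumed by io_connect_prep() above. "sockfd" and "addr" (a struct
 * sockaddr_in that must stay valid until completion) are assumed application
 * objects.
 *
 *      sqe->opcode = IORING_OP_CONNECT;
 *      sqe->fd     = sockfd;
 *      sqe->addr   = (__u64)(uintptr_t)&addr;
 *      sqe->addr2  = sizeof(addr);             // address length value, not a pointer
 */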
5270#else /* !CONFIG_NET */
5271#define IO_NETOP_FN(op)                                                 \
5272static int io_##op(struct io_kiocb *req, unsigned int issue_flags)      \
5273{                                                                       \
5274        return -EOPNOTSUPP;                                             \
5275}
5276
5277#define IO_NETOP_PREP(op)                                               \
5278IO_NETOP_FN(op)                                                         \
5279static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5280{                                                                       \
5281        return -EOPNOTSUPP;                                             \
5282}                                                                       \
5283
5284#define IO_NETOP_PREP_ASYNC(op)                                         \
5285IO_NETOP_PREP(op)                                                       \
5286static int io_##op##_prep_async(struct io_kiocb *req)                   \
5287{                                                                       \
5288        return -EOPNOTSUPP;                                             \
5289}
5290
5291IO_NETOP_PREP_ASYNC(sendmsg);
5292IO_NETOP_PREP_ASYNC(recvmsg);
5293IO_NETOP_PREP_ASYNC(connect);
5294IO_NETOP_PREP(accept);
5295IO_NETOP_FN(send);
5296IO_NETOP_FN(recv);
5297#endif /* CONFIG_NET */
5298
5299struct io_poll_table {
5300        struct poll_table_struct pt;
5301        struct io_kiocb *req;
5302        int nr_entries;
5303        int error;
5304};
5305
5306static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
5307                           __poll_t mask, io_req_tw_func_t func)
5308{
5309        /* for instances that support it, check for an event match first: */
5310        if (mask && !(mask & poll->events))
5311                return 0;
5312
5313        trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
5314
5315        list_del_init(&poll->wait.entry);
5316
5317        req->result = mask;
5318        req->io_task_work.func = func;
5319
5320        /*
5321         * If this fails, then the task is exiting. When a task exits, the
5322         * work gets canceled, so just cancel this request as well instead
5323         * of executing it. We can't safely execute it anyway, as we may
5324         * no longer have the state it needs.
5325         */
5326        io_req_task_work_add(req);
5327        return 1;
5328}
5329
5330static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
5331        __acquires(&req->ctx->completion_lock)
5332{
5333        struct io_ring_ctx *ctx = req->ctx;
5334
5335        /* req->task == current here, checking PF_EXITING is safe */
5336        if (unlikely(req->task->flags & PF_EXITING))
5337                WRITE_ONCE(poll->canceled, true);
5338
5339        if (!req->result && !READ_ONCE(poll->canceled)) {
5340                struct poll_table_struct pt = { ._key = poll->events };
5341
5342                req->result = vfs_poll(req->file, &pt) & poll->events;
5343        }
5344
5345        spin_lock(&ctx->completion_lock);
5346        if (!req->result && !READ_ONCE(poll->canceled)) {
5347                add_wait_queue(poll->head, &poll->wait);
5348                return true;
5349        }
5350
5351        return false;
5352}
5353
5354static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5355{
5356        /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
5357        if (req->opcode == IORING_OP_POLL_ADD)
5358                return req->async_data;
5359        return req->apoll->double_poll;
5360}
5361
5362static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5363{
5364        if (req->opcode == IORING_OP_POLL_ADD)
5365                return &req->poll;
5366        return &req->apoll->poll;
5367}
5368
5369static void io_poll_remove_double(struct io_kiocb *req)
5370        __must_hold(&req->ctx->completion_lock)
5371{
5372        struct io_poll_iocb *poll = io_poll_get_double(req);
5373
5374        lockdep_assert_held(&req->ctx->completion_lock);
5375
5376        if (poll && poll->head) {
5377                struct wait_queue_head *head = poll->head;
5378
5379                spin_lock_irq(&head->lock);
5380                list_del_init(&poll->wait.entry);
5381                if (poll->wait.private)
5382                        req_ref_put(req);
5383                poll->head = NULL;
5384                spin_unlock_irq(&head->lock);
5385        }
5386}
5387
5388static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
5389        __must_hold(&req->ctx->completion_lock)
5390{
5391        struct io_ring_ctx *ctx = req->ctx;
5392        unsigned flags = IORING_CQE_F_MORE;
5393        int error;
5394
5395        if (READ_ONCE(req->poll.canceled)) {
5396                error = -ECANCELED;
5397                req->poll.events |= EPOLLONESHOT;
5398        } else {
5399                error = mangle_poll(mask);
5400        }
5401        if (req->poll.events & EPOLLONESHOT)
5402                flags = 0;
5403        if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) {
5404                req->poll.events |= EPOLLONESHOT;
5405                flags = 0;
5406        }
5407        if (flags & IORING_CQE_F_MORE)
5408                ctx->cq_extra++;
5409
5410        return !(flags & IORING_CQE_F_MORE);
5411}
5412
5413static void io_poll_task_func(struct io_kiocb *req, bool *locked)
5414{
5415        struct io_ring_ctx *ctx = req->ctx;
5416        struct io_kiocb *nxt;
5417
5418        if (io_poll_rewait(req, &req->poll)) {
5419                spin_unlock(&ctx->completion_lock);
5420        } else {
5421                bool done;
5422
5423                if (req->poll.done) {
5424                        spin_unlock(&ctx->completion_lock);
5425                        return;
5426                }
5427                done = __io_poll_complete(req, req->result);
5428                if (done) {
5429                        io_poll_remove_double(req);
5430                        hash_del(&req->hash_node);
5431                        req->poll.done = true;
5432                } else {
5433                        req->result = 0;
5434                        add_wait_queue(req->poll.head, &req->poll.wait);
5435                }
5436                io_commit_cqring(ctx);
5437                spin_unlock(&ctx->completion_lock);
5438                io_cqring_ev_posted(ctx);
5439
5440                if (done) {
5441                        nxt = io_put_req_find_next(req);
5442                        if (nxt)
5443                                io_req_task_submit(nxt, locked);
5444                }
5445        }
5446}
5447
5448static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
5449                               int sync, void *key)
5450{
5451        struct io_kiocb *req = wait->private;
5452        struct io_poll_iocb *poll = io_poll_get_single(req);
5453        __poll_t mask = key_to_poll(key);
5454        unsigned long flags;
5455
5456        /* for instances that support it, check for an event match first: */
5457        if (mask && !(mask & poll->events))
5458                return 0;
5459        if (!(poll->events & EPOLLONESHOT))
5460                return poll->wait.func(&poll->wait, mode, sync, key);
5461
5462        list_del_init(&wait->entry);
5463
5464        if (poll->head) {
5465                bool done;
5466
5467                spin_lock_irqsave(&poll->head->lock, flags);
5468                done = list_empty(&poll->wait.entry);
5469                if (!done)
5470                        list_del_init(&poll->wait.entry);
5471                /* make sure double remove sees this as being gone */
5472                wait->private = NULL;
5473                spin_unlock_irqrestore(&poll->head->lock, flags);
5474                if (!done) {
5475                        /* use the wait func handler, so it matches the request type */
5476                        poll->wait.func(&poll->wait, mode, sync, key);
5477                }
5478        }
5479        req_ref_put(req);
5480        return 1;
5481}
5482
5483static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5484                              wait_queue_func_t wake_func)
5485{
5486        poll->head = NULL;
5487        poll->done = false;
5488        poll->canceled = false;
5489#define IO_POLL_UNMASK  (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5490        /* mask in events that we always want/need */
5491        poll->events = events | IO_POLL_UNMASK;
5492        INIT_LIST_HEAD(&poll->wait.entry);
5493        init_waitqueue_func_entry(&poll->wait, wake_func);
5494}
5495
5496static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
5497                            struct wait_queue_head *head,
5498                            struct io_poll_iocb **poll_ptr)
5499{
5500        struct io_kiocb *req = pt->req;
5501
5502        /*
5503         * The file being polled uses multiple waitqueues for poll handling
5504         * (e.g. one for read, one for write). Set up a separate io_poll_iocb
5505         * if this happens.
5506         */
5507        if (unlikely(pt->nr_entries)) {
5508                struct io_poll_iocb *poll_one = poll;
5509
5510                /* double add on the same waitqueue head, ignore */
5511                if (poll_one->head == head)
5512                        return;
5513                /* already have a 2nd entry, fail a third attempt */
5514                if (*poll_ptr) {
5515                        if ((*poll_ptr)->head == head)
5516                                return;
5517                        pt->error = -EINVAL;
5518                        return;
5519                }
5520                /*
5521                 * Can't handle multishot for double wait for now, turn it
5522                 * into one-shot mode.
5523                 */
5524                if (!(poll_one->events & EPOLLONESHOT))
5525                        poll_one->events |= EPOLLONESHOT;
5526                poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5527                if (!poll) {
5528                        pt->error = -ENOMEM;
5529                        return;
5530                }
5531                io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
5532                req_ref_get(req);
5533                poll->wait.private = req;
5534
5535                *poll_ptr = poll;
5536                if (req->opcode == IORING_OP_POLL_ADD)
5537                        req->flags |= REQ_F_ASYNC_DATA;
5538        }
5539
5540        pt->nr_entries++;
5541        poll->head = head;
5542
5543        if (poll->events & EPOLLEXCLUSIVE)
5544                add_wait_queue_exclusive(head, &poll->wait);
5545        else
5546                add_wait_queue(head, &poll->wait);
5547}
5548
5549static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5550                               struct poll_table_struct *p)
5551{
5552        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5553        struct async_poll *apoll = pt->req->apoll;
5554
5555        __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5556}
5557
5558static void io_async_task_func(struct io_kiocb *req, bool *locked)
5559{
5560        struct async_poll *apoll = req->apoll;
5561        struct io_ring_ctx *ctx = req->ctx;
5562
5563        trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data);
5564
5565        if (io_poll_rewait(req, &apoll->poll)) {
5566                spin_unlock(&ctx->completion_lock);
5567                return;
5568        }
5569
5570        hash_del(&req->hash_node);
5571        io_poll_remove_double(req);
5572        apoll->poll.done = true;
5573        spin_unlock(&ctx->completion_lock);
5574
5575        if (!READ_ONCE(apoll->poll.canceled))
5576                io_req_task_submit(req, locked);
5577        else
5578                io_req_complete_failed(req, -ECANCELED);
5579}
5580
5581static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5582                        void *key)
5583{
5584        struct io_kiocb *req = wait->private;
5585        struct io_poll_iocb *poll = &req->apoll->poll;
5586
5587        trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5588                                        key_to_poll(key));
5589
5590        return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5591}
5592
5593static void io_poll_req_insert(struct io_kiocb *req)
5594{
5595        struct io_ring_ctx *ctx = req->ctx;
5596        struct hlist_head *list;
5597
5598        list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5599        hlist_add_head(&req->hash_node, list);
5600}
5601
5602static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5603                                      struct io_poll_iocb *poll,
5604                                      struct io_poll_table *ipt, __poll_t mask,
5605                                      wait_queue_func_t wake_func)
5606        __acquires(&ctx->completion_lock)
5607{
5608        struct io_ring_ctx *ctx = req->ctx;
5609        bool cancel = false;
5610
5611        INIT_HLIST_NODE(&req->hash_node);
5612        io_init_poll_iocb(poll, mask, wake_func);
5613        poll->file = req->file;
5614        poll->wait.private = req;
5615
5616        ipt->pt._key = mask;
5617        ipt->req = req;
5618        ipt->error = 0;
5619        ipt->nr_entries = 0;
5620
5621        mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5622        if (unlikely(!ipt->nr_entries) && !ipt->error)
5623                ipt->error = -EINVAL;
5624
5625        spin_lock(&ctx->completion_lock);
5626        if (ipt->error || (mask && (poll->events & EPOLLONESHOT)))
5627                io_poll_remove_double(req);
5628        if (likely(poll->head)) {
5629                spin_lock_irq(&poll->head->lock);
5630                if (unlikely(list_empty(&poll->wait.entry))) {
5631                        if (ipt->error)
5632                                cancel = true;
5633                        ipt->error = 0;
5634                        mask = 0;
5635                }
5636                if ((mask && (poll->events & EPOLLONESHOT)) || ipt->error)
5637                        list_del_init(&poll->wait.entry);
5638                else if (cancel)
5639                        WRITE_ONCE(poll->canceled, true);
5640                else if (!poll->done) /* actually waiting for an event */
5641                        io_poll_req_insert(req);
5642                spin_unlock_irq(&poll->head->lock);
5643        }
5644
5645        return mask;
5646}
5647
5648enum {
5649        IO_APOLL_OK,
5650        IO_APOLL_ABORTED,
5651        IO_APOLL_READY
5652};
5653
5654static int io_arm_poll_handler(struct io_kiocb *req)
5655{
5656        const struct io_op_def *def = &io_op_defs[req->opcode];
5657        struct io_ring_ctx *ctx = req->ctx;
5658        struct async_poll *apoll;
5659        struct io_poll_table ipt;
5660        __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI;
5661
5662        if (!def->pollin && !def->pollout)
5663                return IO_APOLL_ABORTED;
5664        if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED))
5665                return IO_APOLL_ABORTED;
5666
5667        if (def->pollin) {
5668                mask |= POLLIN | POLLRDNORM;
5669
5670                /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5671                if ((req->opcode == IORING_OP_RECVMSG) &&
5672                    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5673                        mask &= ~POLLIN;
5674        } else {
5675                mask |= POLLOUT | POLLWRNORM;
5676        }
5677
5678        apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5679        if (unlikely(!apoll))
5680                return IO_APOLL_ABORTED;
5681        apoll->double_poll = NULL;
5682        req->apoll = apoll;
5683        req->flags |= REQ_F_POLLED;
5684        ipt.pt._qproc = io_async_queue_proc;
5685        io_req_set_refcount(req);
5686
5687        ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5688                                        io_async_wake);
5689        spin_unlock(&ctx->completion_lock);
5690        if (ret || ipt.error)
5691                return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
5692
5693        trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5694                                mask, apoll->poll.events);
5695        return IO_APOLL_OK;
5696}
5697
5698static bool __io_poll_remove_one(struct io_kiocb *req,
5699                                 struct io_poll_iocb *poll, bool do_cancel)
5700        __must_hold(&req->ctx->completion_lock)
5701{
5702        bool do_complete = false;
5703
5704        if (!poll->head)
5705                return false;
5706        spin_lock_irq(&poll->head->lock);
5707        if (do_cancel)
5708                WRITE_ONCE(poll->canceled, true);
5709        if (!list_empty(&poll->wait.entry)) {
5710                list_del_init(&poll->wait.entry);
5711                do_complete = true;
5712        }
5713        spin_unlock_irq(&poll->head->lock);
5714        hash_del(&req->hash_node);
5715        return do_complete;
5716}
5717
5718static bool io_poll_remove_one(struct io_kiocb *req)
5719        __must_hold(&req->ctx->completion_lock)
5720{
5721        bool do_complete;
5722
5723        io_poll_remove_double(req);
5724        do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true);
5725
5726        if (do_complete) {
5727                io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0);
5728                io_commit_cqring(req->ctx);
5729                req_set_fail(req);
5730                io_put_req_deferred(req);
5731        }
5732        return do_complete;
5733}
5734
5735/*
5736 * Returns true if we found and killed one or more poll requests
5737 */
5738static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx,
5739                                      struct task_struct *tsk, bool cancel_all)
5740{
5741        struct hlist_node *tmp;
5742        struct io_kiocb *req;
5743        int posted = 0, i;
5744
5745        spin_lock(&ctx->completion_lock);
5746        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5747                struct hlist_head *list;
5748
5749                list = &ctx->cancel_hash[i];
5750                hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5751                        if (io_match_task_safe(req, tsk, cancel_all))
5752                                posted += io_poll_remove_one(req);
5753                }
5754        }
5755        spin_unlock(&ctx->completion_lock);
5756
5757        if (posted)
5758                io_cqring_ev_posted(ctx);
5759
5760        return posted != 0;
5761}
5762
5763static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5764                                     bool poll_only)
5765        __must_hold(&ctx->completion_lock)
5766{
5767        struct hlist_head *list;
5768        struct io_kiocb *req;
5769
5770        list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5771        hlist_for_each_entry(req, list, hash_node) {
5772                if (sqe_addr != req->user_data)
5773                        continue;
5774                if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5775                        continue;
5776                return req;
5777        }
5778        return NULL;
5779}
5780
5781static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5782                          bool poll_only)
5783        __must_hold(&ctx->completion_lock)
5784{
5785        struct io_kiocb *req;
5786
5787        req = io_poll_find(ctx, sqe_addr, poll_only);
5788        if (!req)
5789                return -ENOENT;
5790        if (io_poll_remove_one(req))
5791                return 0;
5792
5793        return -EALREADY;
5794}
5795
5796static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5797                                     unsigned int flags)
5798{
5799        u32 events;
5800
5801        events = READ_ONCE(sqe->poll32_events);
5802#ifdef __BIG_ENDIAN
5803        events = swahw32(events);
5804#endif
5805        if (!(flags & IORING_POLL_ADD_MULTI))
5806                events |= EPOLLONESHOT;
5807        return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
5808}
5809
5810static int io_poll_update_prep(struct io_kiocb *req,
5811                               const struct io_uring_sqe *sqe)
5812{
5813        struct io_poll_update *upd = &req->poll_update;
5814        u32 flags;
5815
5816        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5817                return -EINVAL;
5818        if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
5819                return -EINVAL;
5820        flags = READ_ONCE(sqe->len);
5821        if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5822                      IORING_POLL_ADD_MULTI))
5823                return -EINVAL;
5824        /* meaningless without update */
5825        if (flags == IORING_POLL_ADD_MULTI)
5826                return -EINVAL;
5827
5828        upd->old_user_data = READ_ONCE(sqe->addr);
5829        upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5830        upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
5831
5832        upd->new_user_data = READ_ONCE(sqe->off);
5833        if (!upd->update_user_data && upd->new_user_data)
5834                return -EINVAL;
5835        if (upd->update_events)
5836                upd->events = io_poll_parse_events(sqe, flags);
5837        else if (sqe->poll32_events)
5838                return -EINVAL;
5839
5840        return 0;
5841}
5842
5843static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5844                        void *key)
5845{
5846        struct io_kiocb *req = wait->private;
5847        struct io_poll_iocb *poll = &req->poll;
5848
5849        return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5850}
5851
5852static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5853                               struct poll_table_struct *p)
5854{
5855        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5856
5857        __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5858}
5859
5860static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5861{
5862        struct io_poll_iocb *poll = &req->poll;
5863        u32 flags;
5864
5865        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5866                return -EINVAL;
5867        if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
5868                return -EINVAL;
5869        flags = READ_ONCE(sqe->len);
5870        if (flags & ~IORING_POLL_ADD_MULTI)
5871                return -EINVAL;
5872
5873        io_req_set_refcount(req);
5874        poll->events = io_poll_parse_events(sqe, flags);
5875        return 0;
5876}
5877
5878static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5879{
5880        struct io_poll_iocb *poll = &req->poll;
5881        struct io_ring_ctx *ctx = req->ctx;
5882        struct io_poll_table ipt;
5883        __poll_t mask;
5884        bool done;
5885
5886        ipt.pt._qproc = io_poll_queue_proc;
5887
5888        mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5889                                        io_poll_wake);
5890
5891        if (mask) { /* no async, we'd stolen it */
5892                ipt.error = 0;
5893                done = __io_poll_complete(req, mask);
5894                io_commit_cqring(req->ctx);
5895        }
5896        spin_unlock(&ctx->completion_lock);
5897
5898        if (mask) {
5899                io_cqring_ev_posted(ctx);
5900                if (done)
5901                        io_put_req(req);
5902        }
5903        return ipt.error;
5904}
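
/*
 * Illustrative sketch (not kernel code): SQE layout for IORING_OP_POLL_ADD as
 * consumed by io_poll_add_prep() above; "fd" is an assumed pollable
 * descriptor. On big-endian hosts poll32_events must be pre-swapped to match
 * the swahw32() in io_poll_parse_events().
 *
 *      sqe->opcode        = IORING_OP_POLL_ADD;
 *      sqe->fd            = fd;
 *      sqe->poll32_events = POLLIN;
 *      sqe->len           = IORING_POLL_ADD_MULTI;     // multishot; 0 for one-shot
 *
 * A multishot poll posts a CQE with IORING_CQE_F_MORE set each time the file
 * becomes ready, until it is removed or terminates with an error.
 */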
5905
5906static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
5907{
5908        struct io_ring_ctx *ctx = req->ctx;
5909        struct io_kiocb *preq;
5910        bool completing;
5911        int ret;
5912
5913        spin_lock(&ctx->completion_lock);
5914        preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
5915        if (!preq) {
5916                ret = -ENOENT;
5917                goto err;
5918        }
5919
5920        if (!req->poll_update.update_events && !req->poll_update.update_user_data) {
5921                completing = true;
5922                ret = io_poll_remove_one(preq) ? 0 : -EALREADY;
5923                goto err;
5924        }
5925
5926        /*
5927         * Don't allow racy completion with singleshot, as we cannot safely
5928         * update those. For multishot, if we're racing with completion, just
5929         * let completion re-add it.
5930         */
5931        completing = !__io_poll_remove_one(preq, &preq->poll, false);
5932        if (completing && (preq->poll.events & EPOLLONESHOT)) {
5933                ret = -EALREADY;
5934                goto err;
5935        }
5936        /* we now have a detached poll request. reissue. */
5937        ret = 0;
5938err:
5939        if (ret < 0) {
5940                spin_unlock(&ctx->completion_lock);
5941                req_set_fail(req);
5942                io_req_complete(req, ret);
5943                return 0;
5944        }
5945        /* only update the event mask, keep the behavior flags */
5946        if (req->poll_update.update_events) {
5947                preq->poll.events &= ~0xffff;
5948                preq->poll.events |= req->poll_update.events & 0xffff;
5949                preq->poll.events |= IO_POLL_UNMASK;
5950        }
5951        if (req->poll_update.update_user_data)
5952                preq->user_data = req->poll_update.new_user_data;
5953        spin_unlock(&ctx->completion_lock);
5954
5955        /* complete update request, we're done with it */
5956        io_req_complete(req, ret);
5957
5958        if (!completing) {
5959                ret = io_poll_add(preq, issue_flags);
5960                if (ret < 0) {
5961                        req_set_fail(preq);
5962                        io_req_complete(preq, ret);
5963                }
5964        }
5965        return 0;
5966}
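
/*
 * Illustrative sketch (not from this file): SQE layout for updating an armed
 * poll request via IORING_OP_POLL_REMOVE, matching io_poll_update_prep()
 * above. "old_udata" and "new_udata" are the assumed user_data values that
 * identify the original and the updated request.
 *
 *      sqe->opcode        = IORING_OP_POLL_REMOVE;
 *      sqe->addr          = old_udata;         // user_data of the armed poll
 *      sqe->len           = IORING_POLL_UPDATE_EVENTS |
 *                           IORING_POLL_UPDATE_USER_DATA;
 *      sqe->poll32_events = POLLOUT;           // new event mask
 *      sqe->off           = new_udata;         // new user_data
 *
 * With sqe->len left at 0 the request is a plain poll removal.
 */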
5967
5968static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
5969{
5970        struct io_timeout_data *data = req->async_data;
5971
5972        if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS))
5973                req_set_fail(req);
5974        io_req_complete_post(req, -ETIME, 0);
5975}
5976
5977static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5978{
5979        struct io_timeout_data *data = container_of(timer,
5980                                                struct io_timeout_data, timer);
5981        struct io_kiocb *req = data->req;
5982        struct io_ring_ctx *ctx = req->ctx;
5983        unsigned long flags;
5984
5985        spin_lock_irqsave(&ctx->timeout_lock, flags);
5986        list_del_init(&req->timeout.list);
5987        atomic_set(&req->ctx->cq_timeouts,
5988                atomic_read(&req->ctx->cq_timeouts) + 1);
5989        spin_unlock_irqrestore(&ctx->timeout_lock, flags);
5990
5991        req->io_task_work.func = io_req_task_timeout;
5992        io_req_task_work_add(req);
5993        return HRTIMER_NORESTART;
5994}
5995
5996static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
5997                                           __u64 user_data)
5998        __must_hold(&ctx->timeout_lock)
5999{
6000        struct io_timeout_data *io;
6001        struct io_kiocb *req;
6002        bool found = false;
6003
6004        list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
6005                found = user_data == req->user_data;
6006                if (found)
6007                        break;
6008        }
6009        if (!found)
6010                return ERR_PTR(-ENOENT);
6011
6012        io = req->async_data;
6013        if (hrtimer_try_to_cancel(&io->timer) == -1)
6014                return ERR_PTR(-EALREADY);
6015        list_del_init(&req->timeout.list);
6016        return req;
6017}
6018
6019static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
6020        __must_hold(&ctx->completion_lock)
6021        __must_hold(&ctx->timeout_lock)
6022{
6023        struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6024
6025        if (IS_ERR(req))
6026                return PTR_ERR(req);
6027
6028        req_set_fail(req);
6029        io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0);
6030        io_put_req_deferred(req);
6031        return 0;
6032}
6033
6034static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
6035{
6036        switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
6037        case IORING_TIMEOUT_BOOTTIME:
6038                return CLOCK_BOOTTIME;
6039        case IORING_TIMEOUT_REALTIME:
6040                return CLOCK_REALTIME;
6041        default:
6042                /* can't happen, vetted at prep time */
6043                WARN_ON_ONCE(1);
6044                fallthrough;
6045        case 0:
6046                return CLOCK_MONOTONIC;
6047        }
6048}
6049
6050static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6051                                    struct timespec64 *ts, enum hrtimer_mode mode)
6052        __must_hold(&ctx->timeout_lock)
6053{
6054        struct io_timeout_data *io;
6055        struct io_kiocb *req;
6056        bool found = false;
6057
6058        list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
6059                found = user_data == req->user_data;
6060                if (found)
6061                        break;
6062        }
6063        if (!found)
6064                return -ENOENT;
6065
6066        io = req->async_data;
6067        if (hrtimer_try_to_cancel(&io->timer) == -1)
6068                return -EALREADY;
6069        hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
6070        io->timer.function = io_link_timeout_fn;
6071        hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6072        return 0;
6073}
6074
6075static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6076                             struct timespec64 *ts, enum hrtimer_mode mode)
6077        __must_hold(&ctx->timeout_lock)
6078{
6079        struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6080        struct io_timeout_data *data;
6081
6082        if (IS_ERR(req))
6083                return PTR_ERR(req);
6084
6085        req->timeout.off = 0; /* noseq */
6086        data = req->async_data;
6087        list_add_tail(&req->timeout.list, &ctx->timeout_list);
6088        hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
6089        data->timer.function = io_timeout_fn;
6090        hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6091        return 0;
6092}
6093
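/*
 * Prep for IORING_OP_TIMEOUT_REMOVE. A rough sketch of the SQE fields
 * consumed below (placeholders in <>):
 *
 *	sqe->addr          = <user_data of the timeout to act on>;
 *	sqe->timeout_flags = 0;				/- plain removal -/
 *
 * or, to update the timeout instead of removing it:
 *
 *	sqe->timeout_flags = IORING_TIMEOUT_UPDATE;	/- or IORING_LINK_TIMEOUT_UPDATE -/
 *	sqe->addr2         = <pointer to a struct timespec64 with the new value>;
 *
 * For updates, IORING_TIMEOUT_ABS may be or'ed in to treat the new value as
 * an absolute time; no other flags are accepted.
 */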
6094static int io_timeout_remove_prep(struct io_kiocb *req,
6095                                  const struct io_uring_sqe *sqe)
6096{
6097        struct io_timeout_rem *tr = &req->timeout_rem;
6098
6099        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6100                return -EINVAL;
6101        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6102                return -EINVAL;
6103        if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
6104                return -EINVAL;
6105
6106        tr->ltimeout = false;
6107        tr->addr = READ_ONCE(sqe->addr);
6108        tr->flags = READ_ONCE(sqe->timeout_flags);
6109        if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6110                if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6111                        return -EINVAL;
6112                if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6113                        tr->ltimeout = true;
6114                if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
6115                        return -EINVAL;
6116                if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6117                        return -EFAULT;
6118        } else if (tr->flags) {
6119                /* timeout removal doesn't support flags */
6120                return -EINVAL;
6121        }
6122
6123        return 0;
6124}
6125
6126static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6127{
6128        return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6129                                            : HRTIMER_MODE_REL;
6130}
6131
6132/*
6133 * Remove or update an existing timeout command
6134 */
6135static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
6136{
6137        struct io_timeout_rem *tr = &req->timeout_rem;
6138        struct io_ring_ctx *ctx = req->ctx;
6139        int ret;
6140
6141        if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6142                spin_lock(&ctx->completion_lock);
6143                spin_lock_irq(&ctx->timeout_lock);
6144                ret = io_timeout_cancel(ctx, tr->addr);
6145                spin_unlock_irq(&ctx->timeout_lock);
6146                spin_unlock(&ctx->completion_lock);
6147        } else {
6148                enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6149
6150                spin_lock_irq(&ctx->timeout_lock);
6151                if (tr->ltimeout)
6152                        ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6153                else
6154                        ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
6155                spin_unlock_irq(&ctx->timeout_lock);
6156        }
6157
6158        if (ret < 0)
6159                req_set_fail(req);
6160        io_req_complete_post(req, ret, 0);
6161        return 0;
6162}
6163
6164static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6165                           bool is_timeout_link)
6166{
6167        struct io_timeout_data *data;
6168        unsigned flags;
6169        u32 off = READ_ONCE(sqe->off);
6170
6171        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6172                return -EINVAL;
6173        if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6174            sqe->splice_fd_in)
6175                return -EINVAL;
6176        if (off && is_timeout_link)
6177                return -EINVAL;
6178        flags = READ_ONCE(sqe->timeout_flags);
6179        if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
6180                      IORING_TIMEOUT_ETIME_SUCCESS))
6181                return -EINVAL;
6182        /* more than one clock specified is invalid, obviously */
6183        if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6184                return -EINVAL;
6185
6186        INIT_LIST_HEAD(&req->timeout.list);
6187        req->timeout.off = off;
6188        if (unlikely(off && !req->ctx->off_timeout_used))
6189                req->ctx->off_timeout_used = true;
6190
6191        if (WARN_ON_ONCE(req_has_async_data(req)))
6192                return -EFAULT;
6193        if (io_alloc_async_data(req))
6194                return -ENOMEM;
6195
6196        data = req->async_data;
6197        data->req = req;
6198        data->flags = flags;
6199
6200        if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
6201                return -EFAULT;
6202
6203        if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0)
6204                return -EINVAL;
6205
6206        data->mode = io_translate_timeout_mode(flags);
6207        hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
6208
6209        if (is_timeout_link) {
6210                struct io_submit_link *link = &req->ctx->submit_state.link;
6211
6212                if (!link->head)
6213                        return -EINVAL;
6214                if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6215                        return -EINVAL;
6216                req->timeout.head = link->last;
6217                link->last->flags |= REQ_F_ARM_LTIMEOUT;
6218        }
6219        return 0;
6220}
6221
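/*
 * Queue up a fully prepped timeout. For a sequenced timeout the target is
 * expressed relative to the current CQ tail (minus timeout completions):
 * e.g. with off == 2 and a current tail of 10, target_seq becomes 12, so
 * the timeout is satisfied once two more "real" completions are posted, or
 * when the timer expires, whichever happens first. The entry is insertion
 * sorted into ->timeout_list so that flushing can stop at the first target
 * that hasn't been reached yet.
 */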
6222static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
6223{
6224        struct io_ring_ctx *ctx = req->ctx;
6225        struct io_timeout_data *data = req->async_data;
6226        struct list_head *entry;
6227        u32 tail, off = req->timeout.off;
6228
6229        spin_lock_irq(&ctx->timeout_lock);
6230
6231        /*
6232         * sqe->off holds how many events need to occur for this
6233         * timeout event to be satisfied. If it isn't set, then this is
6234         * a pure timeout request and the sequence isn't used.
6235         */
6236        if (io_is_timeout_noseq(req)) {
6237                entry = ctx->timeout_list.prev;
6238                goto add;
6239        }
6240
6241        tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6242        req->timeout.target_seq = tail + off;
6243
6244        /* Update the last seq here in case io_flush_timeouts() hasn't.
6245         * This is safe because ->completion_lock is held, and submissions
6246         * and completions are never mixed in the same ->completion_lock section.
6247         */
6248        ctx->cq_last_tm_flush = tail;
6249
6250        /*
6251         * Insertion sort, ensuring the first entry in the list is always
6252         * the one we need first.
6253         */
6254        list_for_each_prev(entry, &ctx->timeout_list) {
6255                struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6256                                                  timeout.list);
6257
6258                if (io_is_timeout_noseq(nxt))
6259                        continue;
6260                /* nxt.seq is behind @tail, otherwise would've been completed */
6261                if (off >= nxt->timeout.target_seq - tail)
6262                        break;
6263        }
6264add:
6265        list_add(&req->timeout.list, entry);
6266        data->timer.function = io_timeout_fn;
6267        hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
6268        spin_unlock_irq(&ctx->timeout_lock);
6269        return 0;
6270}
6271
6272struct io_cancel_data {
6273        struct io_ring_ctx *ctx;
6274        u64 user_data;
6275};
6276
6277static bool io_cancel_cb(struct io_wq_work *work, void *data)
6278{
6279        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6280        struct io_cancel_data *cd = data;
6281
6282        return req->ctx == cd->ctx && req->user_data == cd->user_data;
6283}
6284
6285static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6286                               struct io_ring_ctx *ctx)
6287{
6288        struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
6289        enum io_wq_cancel cancel_ret;
6290        int ret = 0;
6291
6292        if (!tctx || !tctx->io_wq)
6293                return -ENOENT;
6294
6295        cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
6296        switch (cancel_ret) {
6297        case IO_WQ_CANCEL_OK:
6298                ret = 0;
6299                break;
6300        case IO_WQ_CANCEL_RUNNING:
6301                ret = -EALREADY;
6302                break;
6303        case IO_WQ_CANCEL_NOTFOUND:
6304                ret = -ENOENT;
6305                break;
6306        }
6307
6308        return ret;
6309}
6310
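/*
 * Try to cancel a request by user_data, escalating through the places it
 * may be parked: first the task's io-wq (matched by ctx + user_data via
 * io_cancel_cb), then armed timeouts, then poll requests. Returns 0 on
 * success, -EALREADY if the target is already running and couldn't be
 * cancelled, or -ENOENT if nothing matched.
 */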
6311static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
6312{
6313        struct io_ring_ctx *ctx = req->ctx;
6314        int ret;
6315
6316        WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
6317
6318        ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
6319        if (ret != -ENOENT)
6320                return ret;
6321
6322        spin_lock(&ctx->completion_lock);
6323        spin_lock_irq(&ctx->timeout_lock);
6324        ret = io_timeout_cancel(ctx, sqe_addr);
6325        spin_unlock_irq(&ctx->timeout_lock);
6326        if (ret != -ENOENT)
6327                goto out;
6328        ret = io_poll_cancel(ctx, sqe_addr, false);
6329out:
6330        spin_unlock(&ctx->completion_lock);
6331        return ret;
6332}
6333
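/*
 * IORING_OP_ASYNC_CANCEL: the SQE only carries the user_data of the request
 * to cancel in sqe->addr (a rough userspace sketch would just set
 * sqe->opcode = IORING_OP_ASYNC_CANCEL and sqe->addr = target_user_data).
 * All other interesting fields must be zero, or prep fails with -EINVAL.
 */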
6334static int io_async_cancel_prep(struct io_kiocb *req,
6335                                const struct io_uring_sqe *sqe)
6336{
6337        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6338                return -EINVAL;
6339        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6340                return -EINVAL;
6341        if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6342            sqe->splice_fd_in)
6343                return -EINVAL;
6344
6345        req->cancel.addr = READ_ONCE(sqe->addr);
6346        return 0;
6347}
6348
6349static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
6350{
6351        struct io_ring_ctx *ctx = req->ctx;
6352        u64 sqe_addr = req->cancel.addr;
6353        bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
6354        struct io_tctx_node *node;
6355        int ret;
6356
6357        ret = io_try_cancel_userdata(req, sqe_addr);
6358        if (ret != -ENOENT)
6359                goto done;
6360
6361        /* slow path, try all io-wq's */
6362        io_ring_submit_lock(ctx, needs_lock);
6363        ret = -ENOENT;
6364        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6365                struct io_uring_task *tctx = node->task->io_uring;
6366
6367                ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6368                if (ret != -ENOENT)
6369                        break;
6370        }
6371        io_ring_submit_unlock(ctx, needs_lock);
6372done:
6373        if (ret < 0)
6374                req_set_fail(req);
6375        io_req_complete_post(req, ret, 0);
6376        return 0;
6377}
6378
6379static int io_rsrc_update_prep(struct io_kiocb *req,
6380                                const struct io_uring_sqe *sqe)
6381{
6382        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6383                return -EINVAL;
6384        if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
6385                return -EINVAL;
6386
6387        req->rsrc_update.offset = READ_ONCE(sqe->off);
6388        req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6389        if (!req->rsrc_update.nr_args)
6390                return -EINVAL;
6391        req->rsrc_update.arg = READ_ONCE(sqe->addr);
6392        return 0;
6393}
6394
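/*
 * IORING_OP_FILES_UPDATE is essentially the SQE form of
 * IORING_REGISTER_FILES_UPDATE. A rough sketch of the fields consumed by
 * the prep above:
 *
 *	sqe->off  = <first slot in the registered file table to update>;
 *	sqe->len  = <number of slots>;		/- must be non-zero -/
 *	sqe->addr = <pointer to an array of file descriptors>;
 *
 * The heavy lifting is done by __io_register_rsrc_update() under the
 * uring_lock.
 */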
6395static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
6396{
6397        struct io_ring_ctx *ctx = req->ctx;
6398        bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
6399        struct io_uring_rsrc_update2 up;
6400        int ret;
6401
6402        up.offset = req->rsrc_update.offset;
6403        up.data = req->rsrc_update.arg;
6404        up.nr = 0;
6405        up.tags = 0;
6406        up.resv = 0;
6407
6408        io_ring_submit_lock(ctx, needs_lock);
6409        ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
6410                                        &up, req->rsrc_update.nr_args);
6411        io_ring_submit_unlock(ctx, needs_lock);
6412
6413        if (ret < 0)
6414                req_set_fail(req);
6415        __io_req_complete(req, issue_flags, ret, 0);
6416        return 0;
6417}
6418
6419static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6420{
6421        switch (req->opcode) {
6422        case IORING_OP_NOP:
6423                return 0;
6424        case IORING_OP_READV:
6425        case IORING_OP_READ_FIXED:
6426        case IORING_OP_READ:
6427                return io_read_prep(req, sqe);
6428        case IORING_OP_WRITEV:
6429        case IORING_OP_WRITE_FIXED:
6430        case IORING_OP_WRITE:
6431                return io_write_prep(req, sqe);
6432        case IORING_OP_POLL_ADD:
6433                return io_poll_add_prep(req, sqe);
6434        case IORING_OP_POLL_REMOVE:
6435                return io_poll_update_prep(req, sqe);
6436        case IORING_OP_FSYNC:
6437                return io_fsync_prep(req, sqe);
6438        case IORING_OP_SYNC_FILE_RANGE:
6439                return io_sfr_prep(req, sqe);
6440        case IORING_OP_SENDMSG:
6441        case IORING_OP_SEND:
6442                return io_sendmsg_prep(req, sqe);
6443        case IORING_OP_RECVMSG:
6444        case IORING_OP_RECV:
6445                return io_recvmsg_prep(req, sqe);
6446        case IORING_OP_CONNECT:
6447                return io_connect_prep(req, sqe);
6448        case IORING_OP_TIMEOUT:
6449                return io_timeout_prep(req, sqe, false);
6450        case IORING_OP_TIMEOUT_REMOVE:
6451                return io_timeout_remove_prep(req, sqe);
6452        case IORING_OP_ASYNC_CANCEL:
6453                return io_async_cancel_prep(req, sqe);
6454        case IORING_OP_LINK_TIMEOUT:
6455                return io_timeout_prep(req, sqe, true);
6456        case IORING_OP_ACCEPT:
6457                return io_accept_prep(req, sqe);
6458        case IORING_OP_FALLOCATE:
6459                return io_fallocate_prep(req, sqe);
6460        case IORING_OP_OPENAT:
6461                return io_openat_prep(req, sqe);
6462        case IORING_OP_CLOSE:
6463                return io_close_prep(req, sqe);
6464        case IORING_OP_FILES_UPDATE:
6465                return io_rsrc_update_prep(req, sqe);
6466        case IORING_OP_STATX:
6467                return io_statx_prep(req, sqe);
6468        case IORING_OP_FADVISE:
6469                return io_fadvise_prep(req, sqe);
6470        case IORING_OP_MADVISE:
6471                return io_madvise_prep(req, sqe);
6472        case IORING_OP_OPENAT2:
6473                return io_openat2_prep(req, sqe);
6474        case IORING_OP_EPOLL_CTL:
6475                return io_epoll_ctl_prep(req, sqe);
6476        case IORING_OP_SPLICE:
6477                return io_splice_prep(req, sqe);
6478        case IORING_OP_PROVIDE_BUFFERS:
6479                return io_provide_buffers_prep(req, sqe);
6480        case IORING_OP_REMOVE_BUFFERS:
6481                return io_remove_buffers_prep(req, sqe);
6482        case IORING_OP_TEE:
6483                return io_tee_prep(req, sqe);
6484        case IORING_OP_SHUTDOWN:
6485                return io_shutdown_prep(req, sqe);
6486        case IORING_OP_RENAMEAT:
6487                return io_renameat_prep(req, sqe);
6488        case IORING_OP_UNLINKAT:
6489                return io_unlinkat_prep(req, sqe);
6490        case IORING_OP_MKDIRAT:
6491                return io_mkdirat_prep(req, sqe);
6492        case IORING_OP_SYMLINKAT:
6493                return io_symlinkat_prep(req, sqe);
6494        case IORING_OP_LINKAT:
6495                return io_linkat_prep(req, sqe);
6496        }
6497
6498        printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6499                        req->opcode);
6500        return -EINVAL;
6501}
6502
6503static int io_req_prep_async(struct io_kiocb *req)
6504{
6505        if (!io_op_defs[req->opcode].needs_async_setup)
6506                return 0;
6507        if (WARN_ON_ONCE(req_has_async_data(req)))
6508                return -EFAULT;
6509        if (io_alloc_async_data(req))
6510                return -EAGAIN;
6511
6512        switch (req->opcode) {
6513        case IORING_OP_READV:
6514                return io_rw_prep_async(req, READ);
6515        case IORING_OP_WRITEV:
6516                return io_rw_prep_async(req, WRITE);
6517        case IORING_OP_SENDMSG:
6518                return io_sendmsg_prep_async(req);
6519        case IORING_OP_RECVMSG:
6520                return io_recvmsg_prep_async(req);
6521        case IORING_OP_CONNECT:
6522                return io_connect_prep_async(req);
6523        }
6524        printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6525                    req->opcode);
6526        return -EFAULT;
6527}
6528
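/*
 * Compute the submission sequence of a (possibly linked) request for drain
 * accounting. cached_sq_head has already been advanced once per SQE, so
 * walking the link and decrementing recovers the position before this
 * request was consumed: for a lone request this is simply cached_sq_head - 1.
 */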
6529static u32 io_get_sequence(struct io_kiocb *req)
6530{
6531        u32 seq = req->ctx->cached_sq_head;
6532
6533        /* need original cached_sq_head, but it was increased for each req */
6534        io_for_each_link(req, req)
6535                seq--;
6536        return seq;
6537}
6538
6539static __cold void io_drain_req(struct io_kiocb *req)
6540{
6541        struct io_ring_ctx *ctx = req->ctx;
6542        struct io_defer_entry *de;
6543        int ret;
6544        u32 seq = io_get_sequence(req);
6545
6546        /* Still need to defer if there are pending requests in the defer list. */
6547        if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
6548queue:
6549                ctx->drain_active = false;
6550                io_req_task_queue(req);
6551                return;
6552        }
6553
6554        ret = io_req_prep_async(req);
6555        if (ret) {
6556fail:
6557                io_req_complete_failed(req, ret);
6558                return;
6559        }
6560        io_prep_async_link(req);
6561        de = kmalloc(sizeof(*de), GFP_KERNEL);
6562        if (!de) {
6563                ret = -ENOMEM;
6564                goto fail;
6565        }
6566
6567        spin_lock(&ctx->completion_lock);
6568        if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
6569                spin_unlock(&ctx->completion_lock);
6570                kfree(de);
6571                goto queue;
6572        }
6573
6574        trace_io_uring_defer(ctx, req, req->user_data);
6575        de->req = req;
6576        de->seq = seq;
6577        list_add_tail(&de->list, &ctx->defer_list);
6578        spin_unlock(&ctx->completion_lock);
6579}
6580
6581static void io_clean_op(struct io_kiocb *req)
6582{
6583        if (req->flags & REQ_F_BUFFER_SELECTED) {
6584                kfree(req->kbuf);
6585                req->kbuf = NULL;
6586        }
6587
6588        if (req->flags & REQ_F_NEED_CLEANUP) {
6589                switch (req->opcode) {
6590                case IORING_OP_READV:
6591                case IORING_OP_READ_FIXED:
6592                case IORING_OP_READ:
6593                case IORING_OP_WRITEV:
6594                case IORING_OP_WRITE_FIXED:
6595                case IORING_OP_WRITE: {
6596                        struct io_async_rw *io = req->async_data;
6597
6598                        kfree(io->free_iovec);
6599                        break;
6600                        }
6601                case IORING_OP_RECVMSG:
6602                case IORING_OP_SENDMSG: {
6603                        struct io_async_msghdr *io = req->async_data;
6604
6605                        kfree(io->free_iov);
6606                        break;
6607                        }
6608                case IORING_OP_SPLICE:
6609                case IORING_OP_TEE:
6610                        if (!(req->splice.flags & SPLICE_F_FD_IN_FIXED))
6611                                io_put_file(req->splice.file_in);
6612                        break;
6613                case IORING_OP_OPENAT:
6614                case IORING_OP_OPENAT2:
6615                        if (req->open.filename)
6616                                putname(req->open.filename);
6617                        break;
6618                case IORING_OP_RENAMEAT:
6619                        putname(req->rename.oldpath);
6620                        putname(req->rename.newpath);
6621                        break;
6622                case IORING_OP_UNLINKAT:
6623                        putname(req->unlink.filename);
6624                        break;
6625                case IORING_OP_MKDIRAT:
6626                        putname(req->mkdir.filename);
6627                        break;
6628                case IORING_OP_SYMLINKAT:
6629                        putname(req->symlink.oldpath);
6630                        putname(req->symlink.newpath);
6631                        break;
6632                case IORING_OP_LINKAT:
6633                        putname(req->hardlink.oldpath);
6634                        putname(req->hardlink.newpath);
6635                        break;
6636                }
6637        }
6638        if ((req->flags & REQ_F_POLLED) && req->apoll) {
6639                kfree(req->apoll->double_poll);
6640                kfree(req->apoll);
6641                req->apoll = NULL;
6642        }
6643        if (req->flags & REQ_F_INFLIGHT) {
6644                struct io_uring_task *tctx = req->task->io_uring;
6645
6646                atomic_dec(&tctx->inflight_tracked);
6647        }
6648        if (req->flags & REQ_F_CREDS)
6649                put_cred(req->creds);
6650        if (req->flags & REQ_F_ASYNC_DATA) {
6651                kfree(req->async_data);
6652                req->async_data = NULL;
6653        }
6654        req->flags &= ~IO_REQ_CLEAN_FLAGS;
6655}
6656
6657static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
6658{
6659        const struct cred *creds = NULL;
6660        int ret;
6661
6662        if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
6663                creds = override_creds(req->creds);
6664
6665        if (!io_op_defs[req->opcode].audit_skip)
6666                audit_uring_entry(req->opcode);
6667
6668        switch (req->opcode) {
6669        case IORING_OP_NOP:
6670                ret = io_nop(req, issue_flags);
6671                break;
6672        case IORING_OP_READV:
6673        case IORING_OP_READ_FIXED:
6674        case IORING_OP_READ:
6675                ret = io_read(req, issue_flags);
6676                break;
6677        case IORING_OP_WRITEV:
6678        case IORING_OP_WRITE_FIXED:
6679        case IORING_OP_WRITE:
6680                ret = io_write(req, issue_flags);
6681                break;
6682        case IORING_OP_FSYNC:
6683                ret = io_fsync(req, issue_flags);
6684                break;
6685        case IORING_OP_POLL_ADD:
6686                ret = io_poll_add(req, issue_flags);
6687                break;
6688        case IORING_OP_POLL_REMOVE:
6689                ret = io_poll_update(req, issue_flags);
6690                break;
6691        case IORING_OP_SYNC_FILE_RANGE:
6692                ret = io_sync_file_range(req, issue_flags);
6693                break;
6694        case IORING_OP_SENDMSG:
6695                ret = io_sendmsg(req, issue_flags);
6696                break;
6697        case IORING_OP_SEND:
6698                ret = io_send(req, issue_flags);
6699                break;
6700        case IORING_OP_RECVMSG:
6701                ret = io_recvmsg(req, issue_flags);
6702                break;
6703        case IORING_OP_RECV:
6704                ret = io_recv(req, issue_flags);
6705                break;
6706        case IORING_OP_TIMEOUT:
6707                ret = io_timeout(req, issue_flags);
6708                break;
6709        case IORING_OP_TIMEOUT_REMOVE:
6710                ret = io_timeout_remove(req, issue_flags);
6711                break;
6712        case IORING_OP_ACCEPT:
6713                ret = io_accept(req, issue_flags);
6714                break;
6715        case IORING_OP_CONNECT:
6716                ret = io_connect(req, issue_flags);
6717                break;
6718        case IORING_OP_ASYNC_CANCEL:
6719                ret = io_async_cancel(req, issue_flags);
6720                break;
6721        case IORING_OP_FALLOCATE:
6722                ret = io_fallocate(req, issue_flags);
6723                break;
6724        case IORING_OP_OPENAT:
6725                ret = io_openat(req, issue_flags);
6726                break;
6727        case IORING_OP_CLOSE:
6728                ret = io_close(req, issue_flags);
6729                break;
6730        case IORING_OP_FILES_UPDATE:
6731                ret = io_files_update(req, issue_flags);
6732                break;
6733        case IORING_OP_STATX:
6734                ret = io_statx(req, issue_flags);
6735                break;
6736        case IORING_OP_FADVISE:
6737                ret = io_fadvise(req, issue_flags);
6738                break;
6739        case IORING_OP_MADVISE:
6740                ret = io_madvise(req, issue_flags);
6741                break;
6742        case IORING_OP_OPENAT2:
6743                ret = io_openat2(req, issue_flags);
6744                break;
6745        case IORING_OP_EPOLL_CTL:
6746                ret = io_epoll_ctl(req, issue_flags);
6747                break;
6748        case IORING_OP_SPLICE:
6749                ret = io_splice(req, issue_flags);
6750                break;
6751        case IORING_OP_PROVIDE_BUFFERS:
6752                ret = io_provide_buffers(req, issue_flags);
6753                break;
6754        case IORING_OP_REMOVE_BUFFERS:
6755                ret = io_remove_buffers(req, issue_flags);
6756                break;
6757        case IORING_OP_TEE:
6758                ret = io_tee(req, issue_flags);
6759                break;
6760        case IORING_OP_SHUTDOWN:
6761                ret = io_shutdown(req, issue_flags);
6762                break;
6763        case IORING_OP_RENAMEAT:
6764                ret = io_renameat(req, issue_flags);
6765                break;
6766        case IORING_OP_UNLINKAT:
6767                ret = io_unlinkat(req, issue_flags);
6768                break;
6769        case IORING_OP_MKDIRAT:
6770                ret = io_mkdirat(req, issue_flags);
6771                break;
6772        case IORING_OP_SYMLINKAT:
6773                ret = io_symlinkat(req, issue_flags);
6774                break;
6775        case IORING_OP_LINKAT:
6776                ret = io_linkat(req, issue_flags);
6777                break;
6778        default:
6779                ret = -EINVAL;
6780                break;
6781        }
6782
6783        if (!io_op_defs[req->opcode].audit_skip)
6784                audit_uring_exit(!ret, ret);
6785
6786        if (creds)
6787                revert_creds(creds);
6788        if (ret)
6789                return ret;
6790        /* If the op doesn't have a file, we're not polling for it */
6791        if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6792                io_iopoll_req_issued(req, issue_flags);
6793
6794        return 0;
6795}
6796
6797static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6798{
6799        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6800
6801        req = io_put_req_find_next(req);
6802        return req ? &req->work : NULL;
6803}
6804
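/*
 * io-wq worker entry point. The request gets two references (one is dropped
 * by io_wq_free_work() above once the worker is done with it), any linked
 * timeout is armed, and then the sqe is issued in a loop: -EAGAIN normally
 * just means "retry blocking", unless the opcode is pollable and forced
 * async, in which case we first try to arm a poll handler instead of
 * blocking a worker on it.
 */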
6805static void io_wq_submit_work(struct io_wq_work *work)
6806{
6807        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6808        unsigned int issue_flags = IO_URING_F_UNLOCKED;
6809        bool needs_poll = false;
6810        struct io_kiocb *timeout;
6811        int ret = 0;
6812
6813        /* one will be dropped by ->io_free_work() after returning to io-wq */
6814        if (!(req->flags & REQ_F_REFCOUNT))
6815                __io_req_set_refcount(req, 2);
6816        else
6817                req_ref_get(req);
6818
6819        timeout = io_prep_linked_timeout(req);
6820        if (timeout)
6821                io_queue_linked_timeout(timeout);
6822
6823        /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
6824        if (work->flags & IO_WQ_WORK_CANCEL) {
6825                io_req_task_queue_fail(req, -ECANCELED);
6826                return;
6827        }
6828
6829        if (req->flags & REQ_F_FORCE_ASYNC) {
6830                const struct io_op_def *def = &io_op_defs[req->opcode];
6831                bool opcode_poll = def->pollin || def->pollout;
6832
6833                if (opcode_poll && file_can_poll(req->file)) {
6834                        needs_poll = true;
6835                        issue_flags |= IO_URING_F_NONBLOCK;
6836                }
6837        }
6838
6839        do {
6840                ret = io_issue_sqe(req, issue_flags);
6841                if (ret != -EAGAIN)
6842                        break;
6843                /*
6844                 * We can get EAGAIN for iopolled IO even though we're
6845                 * forcing a sync submission from here, since we can't
6846                 * wait for request slots on the block side.
6847                 */
6848                if (!needs_poll) {
6849                        cond_resched();
6850                        continue;
6851                }
6852
6853                if (io_arm_poll_handler(req) == IO_APOLL_OK)
6854                        return;
6855                /* aborted or ready, in either case retry blocking */
6856                needs_poll = false;
6857                issue_flags &= ~IO_URING_F_NONBLOCK;
6858        } while (1);
6859
6860        /* avoid locking problems by failing it from a clean context */
6861        if (ret)
6862                io_req_task_queue_fail(req, ret);
6863}
6864
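/*
 * Fixed file table entries pack per-file flag bits into the unused low bits
 * of the struct file pointer: io_fixed_file_set() or's in io_file_get_flags(),
 * and readers recover the pointer with "& FFS_MASK". io_file_get_fixed()
 * shifts the remaining flag bits directly into req->flags, where they line
 * up with the REQ_F_* bits starting at REQ_F_SUPPORT_NOWAIT_BIT.
 */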
6865static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
6866                                                       unsigned i)
6867{
6868        return &table->files[i];
6869}
6870
6871static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6872                                              int index)
6873{
6874        struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
6875
6876        return (struct file *) (slot->file_ptr & FFS_MASK);
6877}
6878
6879static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
6880{
6881        unsigned long file_ptr = (unsigned long) file;
6882
6883        file_ptr |= io_file_get_flags(file);
6884        file_slot->file_ptr = file_ptr;
6885}
6886
6887static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6888                                             struct io_kiocb *req, int fd)
6889{
6890        struct file *file;
6891        unsigned long file_ptr;
6892
6893        if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6894                return NULL;
6895        fd = array_index_nospec(fd, ctx->nr_user_files);
6896        file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6897        file = (struct file *) (file_ptr & FFS_MASK);
6898        file_ptr &= ~FFS_MASK;
6899        /* mask in overlapping REQ_F and FFS bits */
6900        req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
6901        io_req_set_rsrc_node(req, ctx);
6902        return file;
6903}
6904
6905static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
6906                                       struct io_kiocb *req, int fd)
6907{
6908        struct file *file = fget(fd);
6909
6910        trace_io_uring_file_get(ctx, fd);
6911
6912        /* we don't allow fixed io_uring files */
6913        if (file && unlikely(file->f_op == &io_uring_fops))
6914                io_req_track_inflight(req);
6915        return file;
6916}
6917
6918static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6919                                       struct io_kiocb *req, int fd, bool fixed)
6920{
6921        if (fixed)
6922                return io_file_get_fixed(ctx, req, fd);
6923        else
6924                return io_file_get_normal(ctx, req, fd);
6925}
6926
6927static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
6928{
6929        struct io_kiocb *prev = req->timeout.prev;
6930        int ret = -ENOENT;
6931
6932        if (prev) {
6933                if (!(req->task->flags & PF_EXITING))
6934                        ret = io_try_cancel_userdata(req, prev->user_data);
6935                io_req_complete_post(req, ret ?: -ETIME, 0);
6936                io_put_req(prev);
6937        } else {
6938                io_req_complete_post(req, -ETIME, 0);
6939        }
6940}
6941
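/*
 * hrtimer expiry callback for a linked timeout. Under ->timeout_lock it
 * detaches the request it was guarding (->timeout.head), takes a reference
 * if that request is still alive, and punts to task work: there,
 * io_req_task_link_timeout() tries to cancel the guarded request by
 * user_data and completes the timeout itself with -ETIME (or the
 * cancellation error).
 */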
6942static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6943{
6944        struct io_timeout_data *data = container_of(timer,
6945                                                struct io_timeout_data, timer);
6946        struct io_kiocb *prev, *req = data->req;
6947        struct io_ring_ctx *ctx = req->ctx;
6948        unsigned long flags;
6949
6950        spin_lock_irqsave(&ctx->timeout_lock, flags);
6951        prev = req->timeout.head;
6952        req->timeout.head = NULL;
6953
6954        /*
6955         * We don't expect the head to be NULL; that will only happen if we
6956         * race with the completion of the linked work.
6957         */
6958        if (prev) {
6959                io_remove_next_linked(prev);
6960                if (!req_ref_inc_not_zero(prev))
6961                        prev = NULL;
6962        }
6963        list_del(&req->timeout.list);
6964        req->timeout.prev = prev;
6965        spin_unlock_irqrestore(&ctx->timeout_lock, flags);
6966
6967        req->io_task_work.func = io_req_task_link_timeout;
6968        io_req_task_work_add(req);
6969        return HRTIMER_NORESTART;
6970}
6971
6972static void io_queue_linked_timeout(struct io_kiocb *req)
6973{
6974        struct io_ring_ctx *ctx = req->ctx;
6975
6976        spin_lock_irq(&ctx->timeout_lock);
6977        /*
6978         * If the back reference is NULL, then our linked request finished
6979         * before we got a chance to set up the timer
6980         */
6981        if (req->timeout.head) {
6982                struct io_timeout_data *data = req->async_data;
6983
6984                data->timer.function = io_link_timeout_fn;
6985                hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6986                                data->mode);
6987                list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
6988        }
6989        spin_unlock_irq(&ctx->timeout_lock);
6990        /* drop submission reference */
6991        io_put_req(req);
6992}
6993
6994static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
6995        __must_hold(&req->ctx->uring_lock)
6996{
6997        struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
6998
6999        switch (io_arm_poll_handler(req)) {
7000        case IO_APOLL_READY:
7001                io_req_task_queue(req);
7002                break;
7003        case IO_APOLL_ABORTED:
7004                /*
7005                 * Queued up for async execution, worker will release
7006                 * submit reference when the iocb is actually submitted.
7007                 */
7008                io_queue_async_work(req, NULL);
7009                break;
7010        }
7011
7012        if (linked_timeout)
7013                io_queue_linked_timeout(linked_timeout);
7014}
7015
7016static inline void __io_queue_sqe(struct io_kiocb *req)
7017        __must_hold(&req->ctx->uring_lock)
7018{
7019        struct io_kiocb *linked_timeout;
7020        int ret;
7021
7022        ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
7023
7024        if (req->flags & REQ_F_COMPLETE_INLINE) {
7025                io_req_add_compl_list(req);
7026                return;
7027        }
7028        /*
7029         * We async punt it if the file wasn't marked NOWAIT, or if the file
7030         * doesn't support non-blocking read/write attempts
7031         */
7032        if (likely(!ret)) {
7033                linked_timeout = io_prep_linked_timeout(req);
7034                if (linked_timeout)
7035                        io_queue_linked_timeout(linked_timeout);
7036        } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
7037                io_queue_sqe_arm_apoll(req);
7038        } else {
7039                io_req_complete_failed(req, ret);
7040        }
7041}
7042
7043static void io_queue_sqe_fallback(struct io_kiocb *req)
7044        __must_hold(&req->ctx->uring_lock)
7045{
7046        if (req->flags & REQ_F_FAIL) {
7047                io_req_complete_fail_submit(req);
7048        } else if (unlikely(req->ctx->drain_active)) {
7049                io_drain_req(req);
7050        } else {
7051                int ret = io_req_prep_async(req);
7052
7053                if (unlikely(ret))
7054                        io_req_complete_failed(req, ret);
7055                else
7056                        io_queue_async_work(req, NULL);
7057        }
7058}
7059
7060static inline void io_queue_sqe(struct io_kiocb *req)
7061        __must_hold(&req->ctx->uring_lock)
7062{
7063        if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))))
7064                __io_queue_sqe(req);
7065        else
7066                io_queue_sqe_fallback(req);
7067}
7068
7069/*
7070 * Check SQE restrictions (opcode and flags).
7071 *
7072 * Returns 'true' if SQE is allowed, 'false' otherwise.
7073 */
7074static inline bool io_check_restriction(struct io_ring_ctx *ctx,
7075                                        struct io_kiocb *req,
7076                                        unsigned int sqe_flags)
7077{
7078        if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
7079                return false;
7080
7081        if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
7082            ctx->restrictions.sqe_flags_required)
7083                return false;
7084
7085        if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
7086                          ctx->restrictions.sqe_flags_required))
7087                return false;
7088
7089        return true;
7090}
7091
7092static void io_init_req_drain(struct io_kiocb *req)
7093{
7094        struct io_ring_ctx *ctx = req->ctx;
7095        struct io_kiocb *head = ctx->submit_state.link.head;
7096
7097        ctx->drain_active = true;
7098        if (head) {
7099                /*
7100                 * If we need to drain a request in the middle of a link, drain
7101                 * the head request and the next request/link after the current
7102                 * link. Since links execute sequentially, this maintains
7103                 * the IOSQE_IO_DRAIN semantics for every request of our
7104                 * link.
7105                 */
7106                head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
7107                ctx->drain_next = true;
7108        }
7109}
7110
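/*
 * First half of turning an SQE into an io_kiocb: copy opcode/flags/user_data,
 * enforce forward compatibility of the flag bits and any registered
 * restrictions, handle IOSQE_IO_DRAIN, resolve the file (fixed or normal,
 * plugging block IO when worthwhile), apply a registered personality's
 * credentials, and finally hand off to the per-opcode prep handler.
 */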
7111static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7112                       const struct io_uring_sqe *sqe)
7113        __must_hold(&ctx->uring_lock)
7114{
7115        unsigned int sqe_flags;
7116        int personality;
7117        u8 opcode;
7118
7119        /* req is partially pre-initialised, see io_preinit_req() */
7120        req->opcode = opcode = READ_ONCE(sqe->opcode);
7121        /* same numerical values with corresponding REQ_F_*, safe to copy */
7122        req->flags = sqe_flags = READ_ONCE(sqe->flags);
7123        req->user_data = READ_ONCE(sqe->user_data);
7124        req->file = NULL;
7125        req->fixed_rsrc_refs = NULL;
7126        req->task = current;
7127
7128        if (unlikely(opcode >= IORING_OP_LAST)) {
7129                req->opcode = 0;
7130                return -EINVAL;
7131        }
7132        if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
7133                /* enforce forwards compatibility on users */
7134                if (sqe_flags & ~SQE_VALID_FLAGS)
7135                        return -EINVAL;
7136                if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
7137                    !io_op_defs[opcode].buffer_select)
7138                        return -EOPNOTSUPP;
7139                if (sqe_flags & IOSQE_IO_DRAIN)
7140                        io_init_req_drain(req);
7141        }
7142        if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
7143                if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
7144                        return -EACCES;
7145                /* knock it to the slow queue path, will be drained there */
7146                if (ctx->drain_active)
7147                        req->flags |= REQ_F_FORCE_ASYNC;
7148                /* if there is no link, we're at "next" request and need to drain */
7149                if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
7150                        ctx->drain_next = false;
7151                        ctx->drain_active = true;
7152                        req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC;
7153                }
7154        }
7155
7156        if (io_op_defs[opcode].needs_file) {
7157                struct io_submit_state *state = &ctx->submit_state;
7158
7159                /*
7160                 * Plug now if we have more than 2 IO left after this, and the
7161                 * target is potentially a read/write to block based storage.
7162                 */
7163                if (state->need_plug && io_op_defs[opcode].plug) {
7164                        state->plug_started = true;
7165                        state->need_plug = false;
7166                        blk_start_plug_nr_ios(&state->plug, state->submit_nr);
7167                }
7168
7169                req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
7170                                        (sqe_flags & IOSQE_FIXED_FILE));
7171                if (unlikely(!req->file))
7172                        return -EBADF;
7173        }
7174
7175        personality = READ_ONCE(sqe->personality);
7176        if (personality) {
7177                int ret;
7178
7179                req->creds = xa_load(&ctx->personalities, personality);
7180                if (!req->creds)
7181                        return -EINVAL;
7182                get_cred(req->creds);
7183                ret = security_uring_override_creds(req->creds);
7184                if (ret) {
7185                        put_cred(req->creds);
7186                        return ret;
7187                }
7188                req->flags |= REQ_F_CREDS;
7189        }
7190
7191        return io_req_prep(req, sqe);
7192}
7193
7194static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
7195                         const struct io_uring_sqe *sqe)
7196        __must_hold(&ctx->uring_lock)
7197{
7198        struct io_submit_link *link = &ctx->submit_state.link;
7199        int ret;
7200
7201        ret = io_init_req(ctx, req, sqe);
7202        if (unlikely(ret)) {
7203                trace_io_uring_req_failed(sqe, ret);
7204
7205                /* fail even hard links since we don't submit */
7206                if (link->head) {
7207                        /*
7208                         * Whether a link req failed or was cancelled can be
7209                         * judged by whether REQ_F_FAIL is set. The head is an
7210                         * exception, since it may have REQ_F_FAIL set only
7211                         * because another req in the link failed; leverage
7212                         * req->result to tell those cases apart so the
7213                         * correct return code can be set for the head. Init
7214                         * the result here to avoid affecting the normal path.
7215                         */
7216                        if (!(link->head->flags & REQ_F_FAIL))
7217                                req_fail_link_node(link->head, -ECANCELED);
7218                } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7219                        /*
7220                         * The current req is a normal req; return the error
7221                         * and thus break the submission loop.
7222                         */
7223                        io_req_complete_failed(req, ret);
7224                        return ret;
7225                }
7226                req_fail_link_node(req, ret);
7227        }
7228
7229        /* don't need @sqe from now on */
7230        trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
7231                                  req->flags, true,
7232                                  ctx->flags & IORING_SETUP_SQPOLL);
7233
7234        /*
7235         * If we already have a head request, queue this one for async
7236         * submittal once the head completes. If we don't have a head but
7237         * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7238         * submitted sync once the chain is complete. If none of those
7239         * conditions are true (normal request), then just queue it.
7240         */
7241        if (link->head) {
7242                struct io_kiocb *head = link->head;
7243
7244                if (!(req->flags & REQ_F_FAIL)) {
7245                        ret = io_req_prep_async(req);
7246                        if (unlikely(ret)) {
7247                                req_fail_link_node(req, ret);
7248                                if (!(head->flags & REQ_F_FAIL))
7249                                        req_fail_link_node(head, -ECANCELED);
7250                        }
7251                }
7252                trace_io_uring_link(ctx, req, head);
7253                link->last->link = req;
7254                link->last = req;
7255
7256                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
7257                        return 0;
7258                /* last request of a link, enqueue the link */
7259                link->head = NULL;
7260                req = head;
7261        } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7262                link->head = req;
7263                link->last = req;
7264                return 0;
7265        }
7266
7267        io_queue_sqe(req);
7268        return 0;
7269}
7270
7271/*
7272 * Batched submission is done, ensure local IO is flushed out.
7273 */
7274static void io_submit_state_end(struct io_ring_ctx *ctx)
7275{
7276        struct io_submit_state *state = &ctx->submit_state;
7277
7278        if (state->link.head)
7279                io_queue_sqe(state->link.head);
7280        /* flush only after queuing links as they can generate completions */
7281        io_submit_flush_completions(ctx);
7282        if (state->plug_started)
7283                blk_finish_plug(&state->plug);
7284}
7285
7286/*
7287 * Start submission side cache.
7288 */
7289static void io_submit_state_start(struct io_submit_state *state,
7290                                  unsigned int max_ios)
7291{
7292        state->plug_started = false;
7293        state->need_plug = max_ios > 2;
7294        state->submit_nr = max_ios;
7295        /* set only head, no need to init link_last in advance */
7296        state->link.head = NULL;
7297}
7298
7299static void io_commit_sqring(struct io_ring_ctx *ctx)
7300{
7301        struct io_rings *rings = ctx->rings;
7302
7303        /*
7304         * Ensure any loads from the SQEs are done at this point,
7305         * since once we write the new head, the application could
7306         * write new data to them.
7307         */
7308        smp_store_release(&rings->sq.head, ctx->cached_sq_head);
7309}
7310
7311/*
7312 * Fetch an sqe, if one is available. Note this returns a pointer to memory
7313 * that is mapped by userspace. This means that care needs to be taken to
7314 * ensure that reads are stable, as we cannot rely on userspace always
7315 * being a good citizen. If members of the sqe are validated and then later
7316 * used, it's important that those reads are done through READ_ONCE() to
7317 * prevent a re-load down the line.
7318 */
7319static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
7320{
7321        unsigned head, mask = ctx->sq_entries - 1;
7322        unsigned sq_idx = ctx->cached_sq_head++ & mask;
7323
7324        /*
7325         * The cached sq head (or cq tail) serves two purposes:
7326         *
7327         * 1) allows us to batch the cost of updating the user visible
7328         *    head.
7329         * 2) allows the kernel side to track the head on its own, even
7330         *    though the application is the one updating it.
7331         */
7332        head = READ_ONCE(ctx->sq_array[sq_idx]);
7333        if (likely(head < ctx->sq_entries))
7334                return &ctx->sq_sqes[head];
7335
7336        /* drop invalid entries */
7337        ctx->cq_extra--;
7338        WRITE_ONCE(ctx->rings->sq_dropped,
7339                   READ_ONCE(ctx->rings->sq_dropped) + 1);
7340        return NULL;
7341}
7342
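/*
 * Main SQE consumption loop for io_uring_enter()/SQPOLL. Requests are
 * counted as submitted as soon as an SQE has been paired with an io_kiocb,
 * since they will complete one way or another beyond that point; -EAGAIN is
 * only returned if request allocation fails before anything was submitted.
 * Unused task refs are returned if we stop short of @nr.
 */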
7343static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
7344        __must_hold(&ctx->uring_lock)
7345{
7346        unsigned int entries = io_sqring_entries(ctx);
7347        int submitted = 0;
7348
7349        if (unlikely(!entries))
7350                return 0;
7351        /* make sure SQ entry isn't read before tail */
7352        nr = min3(nr, ctx->sq_entries, entries);
7353        io_get_task_refs(nr);
7354
7355        io_submit_state_start(&ctx->submit_state, nr);
7356        do {
7357                const struct io_uring_sqe *sqe;
7358                struct io_kiocb *req;
7359
7360                if (unlikely(!io_alloc_req_refill(ctx))) {
7361                        if (!submitted)
7362                                submitted = -EAGAIN;
7363                        break;
7364                }
7365                req = io_alloc_req(ctx);
7366                sqe = io_get_sqe(ctx);
7367                if (unlikely(!sqe)) {
7368                        wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
7369                        break;
7370                }
7371                /* will complete beyond this point, count as submitted */
7372                submitted++;
7373                if (io_submit_sqe(ctx, req, sqe))
7374                        break;
7375        } while (submitted < nr);
7376
7377        if (unlikely(submitted != nr)) {
7378                int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
7379                int unused = nr - ref_used;
7380
7381                current->io_uring->cached_refs += unused;
7382        }
7383
7384        io_submit_state_end(ctx);
7385         /* Commit SQ ring head once we've consumed and submitted all SQEs */
7386        io_commit_sqring(ctx);
7387
7388        return submitted;
7389}
7390
7391static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7392{
7393        return READ_ONCE(sqd->state);
7394}
7395
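/*
 * IORING_SQ_NEED_WAKEUP handling for IORING_SETUP_SQPOLL: the poller thread
 * sets the flag in the shared sq_flags before it goes idle and clears it
 * once it is running again. When the flag is set, userspace has to call
 * io_uring_enter() with IORING_ENTER_SQ_WAKEUP to kick the thread (e.g.
 * liburing's submit helpers check for this); while the thread is busy
 * polling, no syscall is needed to submit.
 */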
7396static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7397{
7398        /* Tell userspace we may need a wakeup call */
7399        spin_lock(&ctx->completion_lock);
7400        WRITE_ONCE(ctx->rings->sq_flags,
7401                   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
7402        spin_unlock(&ctx->completion_lock);
7403}
7404
7405static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7406{
7407        spin_lock(&ctx->completion_lock);
7408        WRITE_ONCE(ctx->rings->sq_flags,
7409                   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
7410        spin_unlock(&ctx->completion_lock);
7411}
7412
7413static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
7414{
7415        unsigned int to_submit;
7416        int ret = 0;
7417
7418        to_submit = io_sqring_entries(ctx);
7419        /* if we're handling multiple rings, cap submit size for fairness */
7420        if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7421                to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
7422
7423        if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
7424                const struct cred *creds = NULL;
7425
7426                if (ctx->sq_creds != current_cred())
7427                        creds = override_creds(ctx->sq_creds);
7428
7429                mutex_lock(&ctx->uring_lock);
7430                if (!wq_list_empty(&ctx->iopoll_list))
7431                        io_do_iopoll(ctx, true);
7432
7433                /*
7434                 * Don't submit if refs are dying, good for io_uring_register(),
7435                 * but also it is relied upon by io_ring_exit_work()
7436                 */
7437                if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7438                    !(ctx->flags & IORING_SETUP_R_DISABLED))
7439                        ret = io_submit_sqes(ctx, to_submit);
7440                mutex_unlock(&ctx->uring_lock);
7441
7442                if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7443                        wake_up(&ctx->sqo_sq_wait);
7444                if (creds)
7445                        revert_creds(creds);
7446        }
7447
7448        return ret;
7449}
7450
7451static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7452{
7453        struct io_ring_ctx *ctx;
7454        unsigned sq_thread_idle = 0;
7455
7456        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7457                sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
7458        sqd->sq_thread_idle = sq_thread_idle;
7459}
7460
7461static bool io_sqd_handle_event(struct io_sq_data *sqd)
7462{
7463        bool did_sig = false;
7464        struct ksignal ksig;
7465
7466        if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7467            signal_pending(current)) {
7468                mutex_unlock(&sqd->lock);
7469                if (signal_pending(current))
7470                        did_sig = get_signal(&ksig);
7471                cond_resched();
7472                mutex_lock(&sqd->lock);
7473        }
7474        return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7475}
7476
7477static int io_sq_thread(void *data)
7478{
7479        struct io_sq_data *sqd = data;
7480        struct io_ring_ctx *ctx;
7481        unsigned long timeout = 0;
7482        char buf[TASK_COMM_LEN];
7483        DEFINE_WAIT(wait);
7484
7485        snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
7486        set_task_comm(current, buf);
7487
7488        if (sqd->sq_cpu != -1)
7489                set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
7490        else
7491                set_cpus_allowed_ptr(current, cpu_online_mask);
7492        current->flags |= PF_NO_SETAFFINITY;
7493
7494        audit_alloc_kernel(current);
7495
7496        mutex_lock(&sqd->lock);
7497        while (1) {
7498                bool cap_entries, sqt_spin = false;
7499
7500                if (io_sqd_events_pending(sqd) || signal_pending(current)) {
7501                        if (io_sqd_handle_event(sqd))
7502                                break;
7503                        timeout = jiffies + sqd->sq_thread_idle;
7504                }
7505
7506                cap_entries = !list_is_singular(&sqd->ctx_list);
7507                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7508                        int ret = __io_sq_thread(ctx, cap_entries);
7509
7510                        if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
7511                                sqt_spin = true;
7512                }
7513                if (io_run_task_work())
7514                        sqt_spin = true;
7515
7516                if (sqt_spin || !time_after(jiffies, timeout)) {
7517                        cond_resched();
7518                        if (sqt_spin)
7519                                timeout = jiffies + sqd->sq_thread_idle;
7520                        continue;
7521                }
7522
7523                prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7524                if (!io_sqd_events_pending(sqd) && !current->task_works) {
7525                        bool needs_sched = true;
7526
7527                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7528                                io_ring_set_wakeup_flag(ctx);
7529
7530                                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
7531                                    !wq_list_empty(&ctx->iopoll_list)) {
7532                                        needs_sched = false;
7533                                        break;
7534                                }
7535                                if (io_sqring_entries(ctx)) {
7536                                        needs_sched = false;
7537                                        break;
7538                                }
7539                        }
7540
7541                        if (needs_sched) {
7542                                mutex_unlock(&sqd->lock);
7543                                schedule();
7544                                mutex_lock(&sqd->lock);
7545                        }
7546                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7547                                io_ring_clear_wakeup_flag(ctx);
7548                }
7549
7550                finish_wait(&sqd->wait, &wait);
7551                timeout = jiffies + sqd->sq_thread_idle;
7552        }
7553
7554        io_uring_cancel_generic(true, sqd);
7555        sqd->thread = NULL;
7556        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7557                io_ring_set_wakeup_flag(ctx);
7558        io_run_task_work();
7559        mutex_unlock(&sqd->lock);
7560
7561        audit_free(current);
7562
7563        complete(&sqd->exited);
7564        do_exit(0);
7565}
7566
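/*
 * Per-waiter state for io_cqring_wait(). cq_tail is the CQ tail value at
 * which enough completions will be available (the CQ head when the wait
 * started plus min_events); io_should_wake() compares it against the
 * current cached tail using a signed difference so that wrap-around of the
 * u32 ring counters is handled correctly. nr_timeouts snapshots
 * ctx->cq_timeouts so that a timeout completing while we sleep also forces
 * a wakeup.
 */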
7567struct io_wait_queue {
7568        struct wait_queue_entry wq;
7569        struct io_ring_ctx *ctx;
7570        unsigned cq_tail;
7571        unsigned nr_timeouts;
7572};
7573
7574static inline bool io_should_wake(struct io_wait_queue *iowq)
7575{
7576        struct io_ring_ctx *ctx = iowq->ctx;
7577        int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
7578
7579        /*
7580         * Wake up if we have enough events, or if a timeout occurred since we
7581         * started waiting. For timeouts, we always want to return to userspace,
7582         * regardless of event count.
7583         */
7584        return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
7585}
7586
7587static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7588                            int wake_flags, void *key)
7589{
7590        struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7591                                                        wq);
7592
7593        /*
7594         * We cannot safely flush overflowed CQEs from here, so ensure we wake
7595         * up the task; the next invocation will do it.
7596         */
7597        if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
7598                return autoremove_wake_function(curr, mode, wake_flags, key);
7599        return -1;
7600}
7601
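/*
 * Returns 1 if task_work was run, 0 if there was nothing to do, and a
 * negative error (-ERESTARTSYS or -EINTR) if a signal is pending and we
 * should head back out to userspace.
 */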
7602static int io_run_task_work_sig(void)
7603{
7604        if (io_run_task_work())
7605                return 1;
7606        if (!signal_pending(current))
7607                return 0;
7608        if (test_thread_flag(TIF_NOTIFY_SIGNAL))
7609                return -ERESTARTSYS;
7610        return -EINTR;
7611}
7612
7613/* when this returns > 0, the caller should retry */
7614static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7615                                          struct io_wait_queue *iowq,
7616                                          signed long *timeout)
7617{
7618        int ret;
7619
7620        /* make sure we run task_work before checking for signals */
7621        ret = io_run_task_work_sig();
7622        if (ret || io_should_wake(iowq))
7623                return ret;
7624        /* let the caller flush overflows, retry */
7625        if (test_bit(0, &ctx->check_cq_overflow))
7626                return 1;
7627
7628        *timeout = schedule_timeout(*timeout);
7629        return !*timeout ? -ETIME : 1;
7630}
7631
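/*
 * This is the sleeping side of IORING_ENTER_GETEVENTS. An illustrative
 * userspace call that ends up here, in raw syscall form (a sketch only;
 * ring_fd is a placeholder name):
 *
 *	syscall(__NR_io_uring_enter, ring_fd, 0, min_events,
 *		IORING_ENTER_GETEVENTS, NULL, 0);
 */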
7632/*
7633 * Wait until events become available, if we don't already have some. The
7634 * application must reap them itself, as they reside on the shared cq ring.
7635 */
7636static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
7637                          const sigset_t __user *sig, size_t sigsz,
7638                          struct __kernel_timespec __user *uts)
7639{
7640        struct io_wait_queue iowq;
7641        struct io_rings *rings = ctx->rings;
7642        signed long timeout = MAX_SCHEDULE_TIMEOUT;
7643        int ret;
7644
7645        do {
7646                io_cqring_overflow_flush(ctx);
7647                if (io_cqring_events(ctx) >= min_events)
7648                        return 0;
7649                if (!io_run_task_work())
7650                        break;
7651        } while (1);
7652
7653        if (uts) {
7654                struct timespec64 ts;
7655
7656                if (get_timespec64(&ts, uts))
7657                        return -EFAULT;
7658                timeout = timespec64_to_jiffies(&ts);
7659        }
7660
7661        if (sig) {
7662#ifdef CONFIG_COMPAT
7663                if (in_compat_syscall())
7664                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
7665                                                      sigsz);
7666                else
7667#endif
7668                        ret = set_user_sigmask(sig, sigsz);
7669
7670                if (ret)
7671                        return ret;
7672        }
7673
7674        init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7675        iowq.wq.private = current;
7676        INIT_LIST_HEAD(&iowq.wq.entry);
7677        iowq.ctx = ctx;
7678        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
7679        iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
7680
7681        trace_io_uring_cqring_wait(ctx, min_events);
7682        do {
7683                /* if we can't even flush overflow, don't wait for more */
7684                if (!io_cqring_overflow_flush(ctx)) {
7685                        ret = -EBUSY;
7686                        break;
7687                }
7688                prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
7689                                                TASK_INTERRUPTIBLE);
7690                ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7691                finish_wait(&ctx->cq_wait, &iowq.wq);
7692                cond_resched();
7693        } while (ret > 0);
7694
7695        restore_saved_sigmask_unless(ret == -EINTR);
7696
7697        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
7698}
7699
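/*
 * Resource tag tables are stored as an array of page-sized chunks rather
 * than as one large allocation, so registering a big set of files or
 * buffers doesn't need a large contiguous kmalloc. io_alloc_page_table()
 * builds the chunk array, io_free_page_table() tears it down again.
 */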
7700static void io_free_page_table(void **table, size_t size)
7701{
7702        unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7703
7704        for (i = 0; i < nr_tables; i++)
7705                kfree(table[i]);
7706        kfree(table);
7707}
7708
7709static __cold void **io_alloc_page_table(size_t size)
7710{
7711        unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7712        size_t init_size = size;
7713        void **table;
7714
7715        table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
7716        if (!table)
7717                return NULL;
7718
7719        for (i = 0; i < nr_tables; i++) {
7720                unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
7721
7722                table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
7723                if (!table[i]) {
7724                        io_free_page_table(table, init_size);
7725                        return NULL;
7726                }
7727                size -= this_size;
7728        }
7729        return table;
7730}
7731
7732static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
7733{
7734        percpu_ref_exit(&ref_node->refs);
7735        kfree(ref_node);
7736}
7737
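/*
 * Called when the last percpu reference of a resource node is dropped.
 * Nodes are retired strictly in the order they were added to
 * rsrc_ref_list: the node is only marked done here, and the list is
 * drained up to the first entry that isn't done yet. Retired nodes are
 * moved to rsrc_put_llist and handled later by io_rsrc_put_work().
 */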
7738static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7739{
7740        struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7741        struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7742        unsigned long flags;
7743        bool first_add = false;
7744
7745        spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7746        node->done = true;
7747
7748        while (!list_empty(&ctx->rsrc_ref_list)) {
7749                node = list_first_entry(&ctx->rsrc_ref_list,
7750                                            struct io_rsrc_node, node);
7751                /* recycle ref nodes in order */
7752                if (!node->done)
7753                        break;
7754                list_del(&node->node);
7755                first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7756        }
7757        spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7758
7759        if (first_add)
7760                mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ);
7761}
7762
7763static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7764{
7765        struct io_rsrc_node *ref_node;
7766
7767        ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7768        if (!ref_node)
7769                return NULL;
7770
7771        if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7772                            0, GFP_KERNEL)) {
7773                kfree(ref_node);
7774                return NULL;
7775        }
7776        INIT_LIST_HEAD(&ref_node->node);
7777        INIT_LIST_HEAD(&ref_node->rsrc_list);
7778        ref_node->done = false;
7779        return ref_node;
7780}
7781
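/*
 * Retire the current rsrc node (when data_to_kill is given) and install
 * the pre-allocated backup node in its place. The backup was set up by
 * io_rsrc_node_switch_start(), so the switch itself cannot fail. The
 * retired node holds a reference on data_to_kill until all removals
 * queued on it have been processed.
 */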
7782static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7783                                struct io_rsrc_data *data_to_kill)
7784        __must_hold(&ctx->uring_lock)
7785{
7786        WARN_ON_ONCE(!ctx->rsrc_backup_node);
7787        WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
7788
7789        io_rsrc_refs_drop(ctx);
7790
7791        if (data_to_kill) {
7792                struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
7793
7794                rsrc_node->rsrc_data = data_to_kill;
7795                spin_lock_irq(&ctx->rsrc_ref_lock);
7796                list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
7797                spin_unlock_irq(&ctx->rsrc_ref_lock);
7798
7799                atomic_inc(&data_to_kill->refs);
7800                percpu_ref_kill(&rsrc_node->refs);
7801                ctx->rsrc_node = NULL;
7802        }
7803
7804        if (!ctx->rsrc_node) {
7805                ctx->rsrc_node = ctx->rsrc_backup_node;
7806                ctx->rsrc_backup_node = NULL;
7807        }
7808}
7809
7810static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
7811{
7812        if (ctx->rsrc_backup_node)
7813                return 0;
7814        ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
7815        return ctx->rsrc_backup_node ? 0 : -ENOMEM;
7816}
7817
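/*
 * Wait until no request references the given resource data any more. The
 * initial data->refs reference is dropped and we sleep on data->done with
 * ->uring_lock released so task_work and the rsrc put work can make
 * progress; if a signal interrupts the wait, the reference is re-taken and
 * the loop either retries or returns the error.
 */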
7818static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
7819                                      struct io_ring_ctx *ctx)
7820{
7821        int ret;
7822
7823        /* As we may drop ->uring_lock, another task may have started a quiesce */
7824        if (data->quiesce)
7825                return -ENXIO;
7826
7827        data->quiesce = true;
7828        do {
7829                ret = io_rsrc_node_switch_start(ctx);
7830                if (ret)
7831                        break;
7832                io_rsrc_node_switch(ctx, data);
7833
7834                /* kill initial ref, already quiesced if zero */
7835                if (atomic_dec_and_test(&data->refs))
7836                        break;
7837                mutex_unlock(&ctx->uring_lock);
7838                flush_delayed_work(&ctx->rsrc_put_work);
7839                ret = wait_for_completion_interruptible(&data->done);
7840                if (!ret) {
7841                        mutex_lock(&ctx->uring_lock);
7842                        break;
7843                }
7844
7845                atomic_inc(&data->refs);
7846                /* wait for all work potentially completing data->done */
7847                flush_delayed_work(&ctx->rsrc_put_work);
7848                reinit_completion(&data->done);
7849
7850                ret = io_run_task_work_sig();
7851                mutex_lock(&ctx->uring_lock);
7852        } while (ret >= 0);
7853        data->quiesce = false;
7854
7855        return ret;
7856}
7857
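/*
 * Look up the tag slot for resource index idx in the two-level table built
 * by io_alloc_page_table(). With 4K pages and 8-byte tags each chunk holds
 * 512 entries, so - assuming IO_RSRC_TAG_TABLE_SHIFT is derived from
 * PAGE_SHIFT to match that allocation - an idx of 1000 maps to chunk 1,
 * offset 488.
 */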
7858static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7859{
7860        unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7861        unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7862
7863        return &data->tags[table_idx][off];
7864}
7865
7866static void io_rsrc_data_free(struct io_rsrc_data *data)
7867{
7868        size_t size = data->nr * sizeof(data->tags[0][0]);
7869
7870        if (data->tags)
7871                io_free_page_table((void **)data->tags, size);
7872        kfree(data);
7873}
7874
7875static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7876                                     u64 __user *utags, unsigned nr,
7877                                     struct io_rsrc_data **pdata)
7878{
7879        struct io_rsrc_data *data;
7880        int ret = -ENOMEM;
7881        unsigned i;
7882
7883        data = kzalloc(sizeof(*data), GFP_KERNEL);
7884        if (!data)
7885                return -ENOMEM;
7886        data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
7887        if (!data->tags) {
7888                kfree(data);
7889                return -ENOMEM;
7890        }
7891
7892        data->nr = nr;
7893        data->ctx = ctx;
7894        data->do_put = do_put;
7895        if (utags) {
7896                ret = -EFAULT;
7897                for (i = 0; i < nr; i++) {
7898                        u64 *tag_slot = io_get_tag_slot(data, i);
7899
7900                        if (copy_from_user(tag_slot, &utags[i],
7901                                           sizeof(*tag_slot)))
7902                                goto fail;
7903                }
7904        }
7905
7906        atomic_set(&data->refs, 1);
7907        init_completion(&data->done);
7908        *pdata = data;
7909        return 0;
7910fail:
7911        io_rsrc_data_free(data);
7912        return ret;
7913}
7914
7915static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7916{
7917        table->files = kvcalloc(nr_files, sizeof(table->files[0]),
7918                                GFP_KERNEL_ACCOUNT);
7919        return !!table->files;
7920}
7921
7922static void io_free_file_tables(struct io_file_table *table)
7923{
7924        kvfree(table->files);
7925        table->files = NULL;
7926}
7927
7928static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7929{
7930#if defined(CONFIG_UNIX)
7931        if (ctx->ring_sock) {
7932                struct sock *sock = ctx->ring_sock->sk;
7933                struct sk_buff *skb;
7934
7935                while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7936                        kfree_skb(skb);
7937        }
7938#else
7939        int i;
7940
7941        for (i = 0; i < ctx->nr_user_files; i++) {
7942                struct file *file;
7943
7944                file = io_file_from_index(ctx, i);
7945                if (file)
7946                        fput(file);
7947        }
7948#endif
7949        io_free_file_tables(&ctx->file_table);
7950        io_rsrc_data_free(ctx->file_data);
7951        ctx->file_data = NULL;
7952        ctx->nr_user_files = 0;
7953}
7954
7955static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
7956{
7957        int ret;
7958
7959        if (!ctx->file_data)
7960                return -ENXIO;
7961        ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
7962        if (!ret)
7963                __io_sqe_files_unregister(ctx);
7964        return ret;
7965}
7966
7967static void io_sq_thread_unpark(struct io_sq_data *sqd)
7968        __releases(&sqd->lock)
7969{
7970        WARN_ON_ONCE(sqd->thread == current);
7971
7972        /*
7973         * Do the dance, but don't use a conditional clear_bit(): it'd race with
7974         * other threads incrementing park_pending and setting the bit.
7975         */
7976        clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7977        if (atomic_dec_return(&sqd->park_pending))
7978                set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7979        mutex_unlock(&sqd->lock);
7980}
7981
7982static void io_sq_thread_park(struct io_sq_data *sqd)
7983        __acquires(&sqd->lock)
7984{
7985        WARN_ON_ONCE(sqd->thread == current);
7986
7987        atomic_inc(&sqd->park_pending);
7988        set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
7989        mutex_lock(&sqd->lock);
7990        if (sqd->thread)
7991                wake_up_process(sqd->thread);
7992}
7993
7994static void io_sq_thread_stop(struct io_sq_data *sqd)
7995{
7996        WARN_ON_ONCE(sqd->thread == current);
7997        WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
7998
7999        set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8000        mutex_lock(&sqd->lock);
8001        if (sqd->thread)
8002                wake_up_process(sqd->thread);
8003        mutex_unlock(&sqd->lock);
8004        wait_for_completion(&sqd->exited);
8005}
8006
8007static void io_put_sq_data(struct io_sq_data *sqd)
8008{
8009        if (refcount_dec_and_test(&sqd->refs)) {
8010                WARN_ON_ONCE(atomic_read(&sqd->park_pending));
8011
8012                io_sq_thread_stop(sqd);
8013                kfree(sqd);
8014        }
8015}
8016
8017static void io_sq_thread_finish(struct io_ring_ctx *ctx)
8018{
8019        struct io_sq_data *sqd = ctx->sq_data;
8020
8021        if (sqd) {
8022                io_sq_thread_park(sqd);
8023                list_del_init(&ctx->sqd_list);
8024                io_sqd_update_thread_idle(sqd);
8025                io_sq_thread_unpark(sqd);
8026
8027                io_put_sq_data(sqd);
8028                ctx->sq_data = NULL;
8029        }
8030}
8031
8032static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
8033{
8034        struct io_ring_ctx *ctx_attach;
8035        struct io_sq_data *sqd;
8036        struct fd f;
8037
8038        f = fdget(p->wq_fd);
8039        if (!f.file)
8040                return ERR_PTR(-ENXIO);
8041        if (f.file->f_op != &io_uring_fops) {
8042                fdput(f);
8043                return ERR_PTR(-EINVAL);
8044        }
8045
8046        ctx_attach = f.file->private_data;
8047        sqd = ctx_attach->sq_data;
8048        if (!sqd) {
8049                fdput(f);
8050                return ERR_PTR(-EINVAL);
8051        }
8052        if (sqd->task_tgid != current->tgid) {
8053                fdput(f);
8054                return ERR_PTR(-EPERM);
8055        }
8056
8057        refcount_inc(&sqd->refs);
8058        fdput(f);
8059        return sqd;
8060}
8061
8062static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
8063                                         bool *attached)
8064{
8065        struct io_sq_data *sqd;
8066
8067        *attached = false;
8068        if (p->flags & IORING_SETUP_ATTACH_WQ) {
8069                sqd = io_attach_sq_data(p);
8070                if (!IS_ERR(sqd)) {
8071                        *attached = true;
8072                        return sqd;
8073                }
8074                /* fall through for the EPERM case, set up a new sqd/task */
8075                if (PTR_ERR(sqd) != -EPERM)
8076                        return sqd;
8077        }
8078
8079        sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
8080        if (!sqd)
8081                return ERR_PTR(-ENOMEM);
8082
8083        atomic_set(&sqd->park_pending, 0);
8084        refcount_set(&sqd->refs, 1);
8085        INIT_LIST_HEAD(&sqd->ctx_list);
8086        mutex_init(&sqd->lock);
8087        init_waitqueue_head(&sqd->wait);
8088        init_completion(&sqd->exited);
8089        return sqd;
8090}
8091
8092#if defined(CONFIG_UNIX)
8093/*
8094 * Ensure the UNIX gc is aware of our file set, so we are certain that
8095 * the io_uring can be safely unregistered on process exit, even if we have
8096 * loops in the file references.
8097 */
8098static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
8099{
8100        struct sock *sk = ctx->ring_sock->sk;
8101        struct scm_fp_list *fpl;
8102        struct sk_buff *skb;
8103        int i, nr_files;
8104
8105        fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
8106        if (!fpl)
8107                return -ENOMEM;
8108
8109        skb = alloc_skb(0, GFP_KERNEL);
8110        if (!skb) {
8111                kfree(fpl);
8112                return -ENOMEM;
8113        }
8114
8115        skb->sk = sk;
8116
8117        nr_files = 0;
8118        fpl->user = get_uid(current_user());
8119        for (i = 0; i < nr; i++) {
8120                struct file *file = io_file_from_index(ctx, i + offset);
8121
8122                if (!file)
8123                        continue;
8124                fpl->fp[nr_files] = get_file(file);
8125                unix_inflight(fpl->user, fpl->fp[nr_files]);
8126                nr_files++;
8127        }
8128
8129        if (nr_files) {
8130                fpl->max = SCM_MAX_FD;
8131                fpl->count = nr_files;
8132                UNIXCB(skb).fp = fpl;
8133                skb->destructor = unix_destruct_scm;
8134                refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8135                skb_queue_head(&sk->sk_receive_queue, skb);
8136
8137                for (i = 0; i < nr_files; i++)
8138                        fput(fpl->fp[i]);
8139        } else {
8140                kfree_skb(skb);
8141                kfree(fpl);
8142        }
8143
8144        return 0;
8145}
8146
8147/*
8148 * If UNIX sockets are enabled, fd passing can create a reference cycle,
8149 * which breaks regular reference counting. We rely on the UNIX
8150 * garbage collection to take care of this problem for us.
8151 */
8152static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8153{
8154        unsigned left, total;
8155        int ret = 0;
8156
8157        total = 0;
8158        left = ctx->nr_user_files;
8159        while (left) {
8160                unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
8161
8162                ret = __io_sqe_files_scm(ctx, this_files, total);
8163                if (ret)
8164                        break;
8165                left -= this_files;
8166                total += this_files;
8167        }
8168
8169        if (!ret)
8170                return 0;
8171
8172        while (total < ctx->nr_user_files) {
8173                struct file *file = io_file_from_index(ctx, total);
8174
8175                if (file)
8176                        fput(file);
8177                total++;
8178        }
8179
8180        return ret;
8181}
8182#else
8183static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8184{
8185        return 0;
8186}
8187#endif
8188
8189static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8190{
8191        struct file *file = prsrc->file;
8192#if defined(CONFIG_UNIX)
8193        struct sock *sock = ctx->ring_sock->sk;
8194        struct sk_buff_head list, *head = &sock->sk_receive_queue;
8195        struct sk_buff *skb;
8196        int i;
8197
8198        __skb_queue_head_init(&list);
8199
8200        /*
8201         * Find the skb that holds this file in its SCM_RIGHTS. When found,
8202         * remove this entry and rearrange the file array.
8203         */
8204        skb = skb_dequeue(head);
8205        while (skb) {
8206                struct scm_fp_list *fp;
8207
8208                fp = UNIXCB(skb).fp;
8209                for (i = 0; i < fp->count; i++) {
8210                        int left;
8211
8212                        if (fp->fp[i] != file)
8213                                continue;
8214
8215                        unix_notinflight(fp->user, fp->fp[i]);
8216                        left = fp->count - 1 - i;
8217                        if (left) {
8218                                memmove(&fp->fp[i], &fp->fp[i + 1],
8219                                                left * sizeof(struct file *));
8220                        }
8221                        fp->count--;
8222                        if (!fp->count) {
8223                                kfree_skb(skb);
8224                                skb = NULL;
8225                        } else {
8226                                __skb_queue_tail(&list, skb);
8227                        }
8228                        fput(file);
8229                        file = NULL;
8230                        break;
8231                }
8232
8233                if (!file)
8234                        break;
8235
8236                __skb_queue_tail(&list, skb);
8237
8238                skb = skb_dequeue(head);
8239        }
8240
8241        if (skb_peek(&list)) {
8242                spin_lock_irq(&head->lock);
8243                while ((skb = __skb_dequeue(&list)) != NULL)
8244                        __skb_queue_tail(head, skb);
8245                spin_unlock_irq(&head->lock);
8246        }
8247#else
8248        fput(file);
8249#endif
8250}
8251
8252static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
8253{
8254        struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
8255        struct io_ring_ctx *ctx = rsrc_data->ctx;
8256        struct io_rsrc_put *prsrc, *tmp;
8257
8258        list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8259                list_del(&prsrc->list);
8260
8261                if (prsrc->tag) {
8262                        bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
8263
8264                        io_ring_submit_lock(ctx, lock_ring);
8265                        spin_lock(&ctx->completion_lock);
8266                        io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
8267                        ctx->cq_extra++;
8268                        io_commit_cqring(ctx);
8269                        spin_unlock(&ctx->completion_lock);
8270                        io_cqring_ev_posted(ctx);
8271                        io_ring_submit_unlock(ctx, lock_ring);
8272                }
8273
8274                rsrc_data->do_put(ctx, prsrc);
8275                kfree(prsrc);
8276        }
8277
8278        io_rsrc_node_destroy(ref_node);
8279        if (atomic_dec_and_test(&rsrc_data->refs))
8280                complete(&rsrc_data->done);
8281}
8282
8283static void io_rsrc_put_work(struct work_struct *work)
8284{
8285        struct io_ring_ctx *ctx;
8286        struct llist_node *node;
8287
8288        ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8289        node = llist_del_all(&ctx->rsrc_put_llist);
8290
8291        while (node) {
8292                struct io_rsrc_node *ref_node;
8293                struct llist_node *next = node->next;
8294
8295                ref_node = llist_entry(node, struct io_rsrc_node, llist);
8296                __io_rsrc_put_work(ref_node);
8297                node = next;
8298        }
8299}
8300
8301static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
8302                                 unsigned nr_args, u64 __user *tags)
8303{
8304        __s32 __user *fds = (__s32 __user *) arg;
8305        struct file *file;
8306        int fd, ret;
8307        unsigned i;
8308
8309        if (ctx->file_data)
8310                return -EBUSY;
8311        if (!nr_args)
8312                return -EINVAL;
8313        if (nr_args > IORING_MAX_FIXED_FILES)
8314                return -EMFILE;
8315        if (nr_args > rlimit(RLIMIT_NOFILE))
8316                return -EMFILE;
8317        ret = io_rsrc_node_switch_start(ctx);
8318        if (ret)
8319                return ret;
8320        ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8321                                 &ctx->file_data);
8322        if (ret)
8323                return ret;
8324
8325        ret = -ENOMEM;
8326        if (!io_alloc_file_tables(&ctx->file_table, nr_args))
8327                goto out_free;
8328
8329        for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
8330                if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
8331                        ret = -EFAULT;
8332                        goto out_fput;
8333                }
8334                /* allow sparse sets */
8335                if (fd == -1) {
8336                        ret = -EINVAL;
8337                        if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
8338                                goto out_fput;
8339                        continue;
8340                }
8341
8342                file = fget(fd);
8343                ret = -EBADF;
8344                if (unlikely(!file))
8345                        goto out_fput;
8346
8347                /*
8348                 * Don't allow io_uring instances to be registered. If UNIX
8349                 * isn't enabled, then this causes a reference cycle and this
8350                 * instance can never get freed. If UNIX is enabled we'll
8351                 * handle it just fine, but there's still no point in allowing
8352                 * a ring fd as it doesn't support regular read/write anyway.
8353                 */
8354                if (file->f_op == &io_uring_fops) {
8355                        fput(file);
8356                        goto out_fput;
8357                }
8358                io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
8359        }
8360
8361        ret = io_sqe_files_scm(ctx);
8362        if (ret) {
8363                __io_sqe_files_unregister(ctx);
8364                return ret;
8365        }
8366
8367        io_rsrc_node_switch(ctx, NULL);
8368        return ret;
8369out_fput:
8370        for (i = 0; i < ctx->nr_user_files; i++) {
8371                file = io_file_from_index(ctx, i);
8372                if (file)
8373                        fput(file);
8374        }
8375        io_free_file_tables(&ctx->file_table);
8376        ctx->nr_user_files = 0;
8377out_free:
8378        io_rsrc_data_free(ctx->file_data);
8379        ctx->file_data = NULL;
8380        return ret;
8381}
8382
8383static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
8384                                int index)
8385{
8386#if defined(CONFIG_UNIX)
8387        struct sock *sock = ctx->ring_sock->sk;
8388        struct sk_buff_head *head = &sock->sk_receive_queue;
8389        struct sk_buff *skb;
8390
8391        /*
8392         * See if we can merge this file into an existing skb SCM_RIGHTS
8393         * file set. If there's no room, fall back to allocating a new skb
8394         * and filling it in.
8395         */
8396        spin_lock_irq(&head->lock);
8397        skb = skb_peek(head);
8398        if (skb) {
8399                struct scm_fp_list *fpl = UNIXCB(skb).fp;
8400
8401                if (fpl->count < SCM_MAX_FD) {
8402                        __skb_unlink(skb, head);
8403                        spin_unlock_irq(&head->lock);
8404                        fpl->fp[fpl->count] = get_file(file);
8405                        unix_inflight(fpl->user, fpl->fp[fpl->count]);
8406                        fpl->count++;
8407                        spin_lock_irq(&head->lock);
8408                        __skb_queue_head(head, skb);
8409                } else {
8410                        skb = NULL;
8411                }
8412        }
8413        spin_unlock_irq(&head->lock);
8414
8415        if (skb) {
8416                fput(file);
8417                return 0;
8418        }
8419
8420        return __io_sqe_files_scm(ctx, 1, index);
8421#else
8422        return 0;
8423#endif
8424}
8425
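/*
 * Queue a registered resource (file or buffer) for removal. The actual put
 * is deferred: the entry is parked on the given rsrc node and only
 * processed in __io_rsrc_put_work() once that node's references are gone,
 * so requests still using the resource keep it alive.
 */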
8426static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8427                                 struct io_rsrc_node *node, void *rsrc)
8428{
8429        struct io_rsrc_put *prsrc;
8430
8431        prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8432        if (!prsrc)
8433                return -ENOMEM;
8434
8435        prsrc->tag = *io_get_tag_slot(data, idx);
8436        prsrc->rsrc = rsrc;
8437        list_add(&prsrc->list, &node->rsrc_list);
8438        return 0;
8439}
8440
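/*
 * Install a file directly into the fixed file table at slot_index. If the
 * slot is already occupied, the old file is queued for removal through the
 * rsrc node machinery and the node is switched before returning. On
 * failure the passed-in file reference is dropped here.
 */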
8441static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8442                                 unsigned int issue_flags, u32 slot_index)
8443{
8444        struct io_ring_ctx *ctx = req->ctx;
8445        bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
8446        bool needs_switch = false;
8447        struct io_fixed_file *file_slot;
8448        int ret = -EBADF;
8449
8450        io_ring_submit_lock(ctx, needs_lock);
8451        if (file->f_op == &io_uring_fops)
8452                goto err;
8453        ret = -ENXIO;
8454        if (!ctx->file_data)
8455                goto err;
8456        ret = -EINVAL;
8457        if (slot_index >= ctx->nr_user_files)
8458                goto err;
8459
8460        slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8461        file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
8462
8463        if (file_slot->file_ptr) {
8464                struct file *old_file;
8465
8466                ret = io_rsrc_node_switch_start(ctx);
8467                if (ret)
8468                        goto err;
8469
8470                old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8471                ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8472                                            ctx->rsrc_node, old_file);
8473                if (ret)
8474                        goto err;
8475                file_slot->file_ptr = 0;
8476                needs_switch = true;
8477        }
8478
8479        *io_get_tag_slot(ctx->file_data, slot_index) = 0;
8480        io_fixed_file_set(file_slot, file);
8481        ret = io_sqe_file_register(ctx, file, slot_index);
8482        if (ret) {
8483                file_slot->file_ptr = 0;
8484                goto err;
8485        }
8486
8487        ret = 0;
8488err:
8489        if (needs_switch)
8490                io_rsrc_node_switch(ctx, ctx->file_data);
8491        io_ring_submit_unlock(ctx, needs_lock);
8492        if (ret)
8493                fput(file);
8494        return ret;
8495}
8496
8497static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8498{
8499        unsigned int offset = req->close.file_slot - 1;
8500        struct io_ring_ctx *ctx = req->ctx;
8501        bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
8502        struct io_fixed_file *file_slot;
8503        struct file *file;
8504        int ret, i;
8505
8506        io_ring_submit_lock(ctx, needs_lock);
8507        ret = -ENXIO;
8508        if (unlikely(!ctx->file_data))
8509                goto out;
8510        ret = -EINVAL;
8511        if (offset >= ctx->nr_user_files)
8512                goto out;
8513        ret = io_rsrc_node_switch_start(ctx);
8514        if (ret)
8515                goto out;
8516
8517        i = array_index_nospec(offset, ctx->nr_user_files);
8518        file_slot = io_fixed_file_slot(&ctx->file_table, i);
8519        ret = -EBADF;
8520        if (!file_slot->file_ptr)
8521                goto out;
8522
8523        file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8524        ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8525        if (ret)
8526                goto out;
8527
8528        file_slot->file_ptr = 0;
8529        io_rsrc_node_switch(ctx, ctx->file_data);
8530        ret = 0;
8531out:
8532        io_ring_submit_unlock(ctx, needs_lock);
8533        return ret;
8534}
8535
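/*
 * Back end for the fixed file table update paths: replace, remove or skip
 * entries starting at up->offset. As with other partial-update interfaces,
 * the number of entries processed is returned if any were handled, and an
 * error is only reported when nothing could be updated.
 */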
8536static int __io_sqe_files_update(struct io_ring_ctx *ctx,
8537                                 struct io_uring_rsrc_update2 *up,
8538                                 unsigned nr_args)
8539{
8540        u64 __user *tags = u64_to_user_ptr(up->tags);
8541        __s32 __user *fds = u64_to_user_ptr(up->data);
8542        struct io_rsrc_data *data = ctx->file_data;
8543        struct io_fixed_file *file_slot;
8544        struct file *file;
8545        int fd, i, err = 0;
8546        unsigned int done;
8547        bool needs_switch = false;
8548
8549        if (!ctx->file_data)
8550                return -ENXIO;
8551        if (up->offset + nr_args > ctx->nr_user_files)
8552                return -EINVAL;
8553
8554        for (done = 0; done < nr_args; done++) {
8555                u64 tag = 0;
8556
8557                if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
8558                    copy_from_user(&fd, &fds[done], sizeof(fd))) {
8559                        err = -EFAULT;
8560                        break;
8561                }
8562                if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
8563                        err = -EINVAL;
8564                        break;
8565                }
8566                if (fd == IORING_REGISTER_FILES_SKIP)
8567                        continue;
8568
8569                i = array_index_nospec(up->offset + done, ctx->nr_user_files);
8570                file_slot = io_fixed_file_slot(&ctx->file_table, i);
8571
8572                if (file_slot->file_ptr) {
8573                        file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8574                        err = io_queue_rsrc_removal(data, up->offset + done,
8575                                                    ctx->rsrc_node, file);
8576                        if (err)
8577                                break;
8578                        file_slot->file_ptr = 0;
8579                        needs_switch = true;
8580                }
8581                if (fd != -1) {
8582                        file = fget(fd);
8583                        if (!file) {
8584                                err = -EBADF;
8585                                break;
8586                        }
8587                        /*
8588                         * Don't allow io_uring instances to be registered. If
8589                         * UNIX isn't enabled, then this causes a reference
8590                         * cycle and this instance can never get freed. If UNIX
8591                         * is enabled we'll handle it just fine, but there's
8592                         * still no point in allowing a ring fd as it doesn't
8593                         * support regular read/write anyway.
8594                         */
8595                        if (file->f_op == &io_uring_fops) {
8596                                fput(file);
8597                                err = -EBADF;
8598                                break;
8599                        }
8600                        *io_get_tag_slot(data, up->offset + done) = tag;
8601                        io_fixed_file_set(file_slot, file);
8602                        err = io_sqe_file_register(ctx, file, i);
8603                        if (err) {
8604                                file_slot->file_ptr = 0;
8605                                fput(file);
8606                                break;
8607                        }
8608                }
8609        }
8610
8611        if (needs_switch)
8612                io_rsrc_node_switch(ctx, data);
8613        return done ? done : err;
8614}
8615
8616static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
8617                                        struct task_struct *task)
8618{
8619        struct io_wq_hash *hash;
8620        struct io_wq_data data;
8621        unsigned int concurrency;
8622
8623        mutex_lock(&ctx->uring_lock);
8624        hash = ctx->hash_map;
8625        if (!hash) {
8626                hash = kzalloc(sizeof(*hash), GFP_KERNEL);
8627                if (!hash) {
8628                        mutex_unlock(&ctx->uring_lock);
8629                        return ERR_PTR(-ENOMEM);
8630                }
8631                refcount_set(&hash->refs, 1);
8632                init_waitqueue_head(&hash->wait);
8633                ctx->hash_map = hash;
8634        }
8635        mutex_unlock(&ctx->uring_lock);
8636
8637        data.hash = hash;
8638        data.task = task;
8639        data.free_work = io_wq_free_work;
8640        data.do_work = io_wq_submit_work;
8641
8642        /* Use QD, or 4 * CPUS, whichever is smallest */
8643        concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
8644
8645        return io_wq_create(concurrency, &data);
8646}
8647
8648static __cold int io_uring_alloc_task_context(struct task_struct *task,
8649                                              struct io_ring_ctx *ctx)
8650{
8651        struct io_uring_task *tctx;
8652        int ret;
8653
8654        tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
8655        if (unlikely(!tctx))
8656                return -ENOMEM;
8657
8658        ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8659        if (unlikely(ret)) {
8660                kfree(tctx);
8661                return ret;
8662        }
8663
8664        tctx->io_wq = io_init_wq_offload(ctx, task);
8665        if (IS_ERR(tctx->io_wq)) {
8666                ret = PTR_ERR(tctx->io_wq);
8667                percpu_counter_destroy(&tctx->inflight);
8668                kfree(tctx);
8669                return ret;
8670        }
8671
8672        xa_init(&tctx->xa);
8673        init_waitqueue_head(&tctx->wait);
8674        atomic_set(&tctx->in_idle, 0);
8675        atomic_set(&tctx->inflight_tracked, 0);
8676        task->io_uring = tctx;
8677        spin_lock_init(&tctx->task_lock);
8678        INIT_WQ_LIST(&tctx->task_list);
8679        init_task_work(&tctx->task_work, tctx_task_work);
8680        return 0;
8681}
8682
8683void __io_uring_free(struct task_struct *tsk)
8684{
8685        struct io_uring_task *tctx = tsk->io_uring;
8686
8687        WARN_ON_ONCE(!xa_empty(&tctx->xa));
8688        WARN_ON_ONCE(tctx->io_wq);
8689        WARN_ON_ONCE(tctx->cached_refs);
8690
8691        percpu_counter_destroy(&tctx->inflight);
8692        kfree(tctx);
8693        tsk->io_uring = NULL;
8694}
8695
8696static __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
8697                                       struct io_uring_params *p)
8698{
8699        int ret;
8700
8701        /* Retain compatibility with failing for an invalid attach attempt */
8702        if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
8703                                IORING_SETUP_ATTACH_WQ) {
8704                struct fd f;
8705
8706                f = fdget(p->wq_fd);
8707                if (!f.file)
8708                        return -ENXIO;
8709                if (f.file->f_op != &io_uring_fops) {
8710                        fdput(f);
8711                        return -EINVAL;
8712                }
8713                fdput(f);
8714        }
8715        if (ctx->flags & IORING_SETUP_SQPOLL) {
8716                struct task_struct *tsk;
8717                struct io_sq_data *sqd;
8718                bool attached;
8719
8720                ret = security_uring_sqpoll();
8721                if (ret)
8722                        return ret;
8723
8724                sqd = io_get_sq_data(p, &attached);
8725                if (IS_ERR(sqd)) {
8726                        ret = PTR_ERR(sqd);
8727                        goto err;
8728                }
8729
8730                ctx->sq_creds = get_current_cred();
8731                ctx->sq_data = sqd;
8732                ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8733                if (!ctx->sq_thread_idle)
8734                        ctx->sq_thread_idle = HZ;
8735
8736                io_sq_thread_park(sqd);
8737                list_add(&ctx->sqd_list, &sqd->ctx_list);
8738                io_sqd_update_thread_idle(sqd);
8739                /* don't attach to a dying SQPOLL thread, would be racy */
8740                ret = (attached && !sqd->thread) ? -ENXIO : 0;
8741                io_sq_thread_unpark(sqd);
8742
8743                if (ret < 0)
8744                        goto err;
8745                if (attached)
8746                        return 0;
8747
8748                if (p->flags & IORING_SETUP_SQ_AFF) {
8749                        int cpu = p->sq_thread_cpu;
8750
8751                        ret = -EINVAL;
8752                        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
8753                                goto err_sqpoll;
8754                        sqd->sq_cpu = cpu;
8755                } else {
8756                        sqd->sq_cpu = -1;
8757                }
8758
8759                sqd->task_pid = current->pid;
8760                sqd->task_tgid = current->tgid;
8761                tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8762                if (IS_ERR(tsk)) {
8763                        ret = PTR_ERR(tsk);
8764                        goto err_sqpoll;
8765                }
8766
8767                sqd->thread = tsk;
8768                ret = io_uring_alloc_task_context(tsk, ctx);
8769                wake_up_new_task(tsk);
8770                if (ret)
8771                        goto err;
8772        } else if (p->flags & IORING_SETUP_SQ_AFF) {
8773                /* Can't have SQ_AFF without SQPOLL */
8774                ret = -EINVAL;
8775                goto err;
8776        }
8777
8778        return 0;
8779err_sqpoll:
8780        complete(&ctx->sq_data->exited);
8781err:
8782        io_sq_thread_finish(ctx);
8783        return ret;
8784}
8785
8786static inline void __io_unaccount_mem(struct user_struct *user,
8787                                      unsigned long nr_pages)
8788{
8789        atomic_long_sub(nr_pages, &user->locked_vm);
8790}
8791
8792static inline int __io_account_mem(struct user_struct *user,
8793                                   unsigned long nr_pages)
8794{
8795        unsigned long page_limit, cur_pages, new_pages;
8796
8797        /* Don't allow more pages than we can safely lock */
8798        page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8799
8800        do {
8801                cur_pages = atomic_long_read(&user->locked_vm);
8802                new_pages = cur_pages + nr_pages;
8803                if (new_pages > page_limit)
8804                        return -ENOMEM;
8805        } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8806                                        new_pages) != cur_pages);
8807
8808        return 0;
8809}
8810
8811static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8812{
8813        if (ctx->user)
8814                __io_unaccount_mem(ctx->user, nr_pages);
8815
8816        if (ctx->mm_account)
8817                atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
8818}
8819
8820static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8821{
8822        int ret;
8823
8824        if (ctx->user) {
8825                ret = __io_account_mem(ctx->user, nr_pages);
8826                if (ret)
8827                        return ret;
8828        }
8829
8830        if (ctx->mm_account)
8831                atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
8832
8833        return 0;
8834}
8835
8836static void io_mem_free(void *ptr)
8837{
8838        struct page *page;
8839
8840        if (!ptr)
8841                return;
8842
8843        page = virt_to_head_page(ptr);
8844        if (put_page_testzero(page))
8845                free_compound_page(page);
8846}
8847
8848static void *io_mem_alloc(size_t size)
8849{
8850        gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
8851                                __GFP_NORETRY | __GFP_ACCOUNT;
8852
8853        return (void *) __get_free_pages(gfp_flags, get_order(size));
8854}
8855
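/*
 * Size of the shared rings area: the io_rings header plus cq_entries CQEs,
 * followed (cache line aligned on SMP) by the array of sq_entries u32 SQ
 * indices. *sq_offset receives the offset of that SQ array, and SIZE_MAX
 * is returned if any of the size calculations overflow.
 */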
8856static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8857                                size_t *sq_offset)
8858{
8859        struct io_rings *rings;
8860        size_t off, sq_array_size;
8861
8862        off = struct_size(rings, cqes, cq_entries);
8863        if (off == SIZE_MAX)
8864                return SIZE_MAX;
8865
8866#ifdef CONFIG_SMP
8867        off = ALIGN(off, SMP_CACHE_BYTES);
8868        if (off == 0)
8869                return SIZE_MAX;
8870#endif
8871
8872        if (sq_offset)
8873                *sq_offset = off;
8874
8875        sq_array_size = array_size(sizeof(u32), sq_entries);
8876        if (sq_array_size == SIZE_MAX)
8877                return SIZE_MAX;
8878
8879        if (check_add_overflow(off, sq_array_size, &off))
8880                return SIZE_MAX;
8881
8882        return off;
8883}
8884
8885static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
8886{
8887        struct io_mapped_ubuf *imu = *slot;
8888        unsigned int i;
8889
8890        if (imu != ctx->dummy_ubuf) {
8891                for (i = 0; i < imu->nr_bvecs; i++)
8892                        unpin_user_page(imu->bvec[i].bv_page);
8893                if (imu->acct_pages)
8894                        io_unaccount_mem(ctx, imu->acct_pages);
8895                kvfree(imu);
8896        }
8897        *slot = NULL;
8898}
8899
8900static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8901{
8902        io_buffer_unmap(ctx, &prsrc->buf);
8903        prsrc->buf = NULL;
8904}
8905
8906static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8907{
8908        unsigned int i;
8909
8910        for (i = 0; i < ctx->nr_user_bufs; i++)
8911                io_buffer_unmap(ctx, &ctx->user_bufs[i]);
8912        kfree(ctx->user_bufs);
8913        io_rsrc_data_free(ctx->buf_data);
8914        ctx->user_bufs = NULL;
8915        ctx->buf_data = NULL;
8916        ctx->nr_user_bufs = 0;
8917}
8918
8919static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8920{
8921        int ret;
8922
8923        if (!ctx->buf_data)
8924                return -ENXIO;
8925
8926        ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8927        if (!ret)
8928                __io_sqe_buffers_unregister(ctx);
8929        return ret;
8930}
8931
8932static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8933                       void __user *arg, unsigned index)
8934{
8935        struct iovec __user *src;
8936
8937#ifdef CONFIG_COMPAT
8938        if (ctx->compat) {
8939                struct compat_iovec __user *ciovs;
8940                struct compat_iovec ciov;
8941
8942                ciovs = (struct compat_iovec __user *) arg;
8943                if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8944                        return -EFAULT;
8945
8946                dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8947                dst->iov_len = ciov.iov_len;
8948                return 0;
8949        }
8950#endif
8951        src = (struct iovec __user *) arg;
8952        if (copy_from_user(dst, &src[index], sizeof(*dst)))
8953                return -EFAULT;
8954        return 0;
8955}
8956
8957/*
8958 * Not super efficient, but this only happens at registration time. And we do cache
8959 * the last compound head, so generally we'll only do a full search if we don't
8960 * match that one.
8961 *
8962 * We check if the given compound head page has already been accounted, to
8963 * avoid double accounting it. This allows us to account the full size of the
8964 * page, not just the constituent pages of a huge page.
8965 */
8966static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8967                                  int nr_pages, struct page *hpage)
8968{
8969        int i, j;
8970
8971        /* check current page array */
8972        for (i = 0; i < nr_pages; i++) {
8973                if (!PageCompound(pages[i]))
8974                        continue;
8975                if (compound_head(pages[i]) == hpage)
8976                        return true;
8977        }
8978
8979        /* check previously registered pages */
8980        for (i = 0; i < ctx->nr_user_bufs; i++) {
8981                struct io_mapped_ubuf *imu = ctx->user_bufs[i];
8982
8983                for (j = 0; j < imu->nr_bvecs; j++) {
8984                        if (!PageCompound(imu->bvec[j].bv_page))
8985                                continue;
8986                        if (compound_head(imu->bvec[j].bv_page) == hpage)
8987                                return true;
8988                }
8989        }
8990
8991        return false;
8992}
8993
8994static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8995                                 int nr_pages, struct io_mapped_ubuf *imu,
8996                                 struct page **last_hpage)
8997{
8998        int i, ret;
8999
9000        imu->acct_pages = 0;
9001        for (i = 0; i < nr_pages; i++) {
9002                if (!PageCompound(pages[i])) {
9003                        imu->acct_pages++;
9004                } else {
9005                        struct page *hpage;
9006
9007                        hpage = compound_head(pages[i]);
9008                        if (hpage == *last_hpage)
9009                                continue;
9010                        *last_hpage = hpage;
9011                        if (headpage_already_acct(ctx, pages, i, hpage))
9012                                continue;
9013                        imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
9014                }
9015        }
9016
9017        if (!imu->acct_pages)
9018                return 0;
9019
9020        ret = io_account_mem(ctx, imu->acct_pages);
9021        if (ret)
9022                imu->acct_pages = 0;
9023        return ret;
9024}
9025
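/*
 * Pin down one registered buffer described by iov. The pages are pinned
 * with FOLL_LONGTERM and wired up as a bvec array in the io_mapped_ubuf;
 * anonymous, shmem and hugetlb backed mappings are accepted, while other
 * file-backed VMAs are rejected with -EOPNOTSUPP. A NULL iov_base maps the
 * slot to the shared dummy_ubuf, which keeps sparse registrations cheap.
 */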
9026static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
9027                                  struct io_mapped_ubuf **pimu,
9028                                  struct page **last_hpage)
9029{
9030        struct io_mapped_ubuf *imu = NULL;
9031        struct vm_area_struct **vmas = NULL;
9032        struct page **pages = NULL;
9033        unsigned long off, start, end, ubuf;
9034        size_t size;
9035        int ret, pret, nr_pages, i;
9036
9037        if (!iov->iov_base) {
9038                *pimu = ctx->dummy_ubuf;
9039                return 0;
9040        }
9041
9042        ubuf = (unsigned long) iov->iov_base;
9043        end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
9044        start = ubuf >> PAGE_SHIFT;
9045        nr_pages = end - start;
9046
9047        *pimu = NULL;
9048        ret = -ENOMEM;
9049
9050        pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
9051        if (!pages)
9052                goto done;
9053
9054        vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
9055                              GFP_KERNEL);
9056        if (!vmas)
9057                goto done;
9058
9059        imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
9060        if (!imu)
9061                goto done;
9062
9063        ret = 0;
9064        mmap_read_lock(current->mm);
9065        pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
9066                              pages, vmas);
9067        if (pret == nr_pages) {
9068                /* we don't support file-backed memory */
9069                for (i = 0; i < nr_pages; i++) {
9070                        struct vm_area_struct *vma = vmas[i];
9071
9072                        if (vma_is_shmem(vma))
9073                                continue;
9074                        if (vma->vm_file &&
9075                            !is_file_hugepages(vma->vm_file)) {
9076                                ret = -EOPNOTSUPP;
9077                                break;
9078                        }
9079                }
9080        } else {
9081                ret = pret < 0 ? pret : -EFAULT;
9082        }
9083        mmap_read_unlock(current->mm);
9084        if (ret) {
9085                /*
9086                 * If we did a partial map, or found file-backed vmas,
9087                 * release any pages we did get.
9088                 */
9089                if (pret > 0)
9090                        unpin_user_pages(pages, pret);
9091                goto done;
9092        }
9093
9094        ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
9095        if (ret) {
9096                unpin_user_pages(pages, pret);
9097                goto done;
9098        }
9099
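        /*
         * Build one bvec per pinned page: only the first entry carries the
         * sub-page offset of ubuf, and the final entry may be shorter than
         * a full page.
         */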
9100        off = ubuf & ~PAGE_MASK;
9101        size = iov->iov_len;
9102        for (i = 0; i < nr_pages; i++) {
9103                size_t vec_len;
9104
9105                vec_len = min_t(size_t, size, PAGE_SIZE - off);
9106                imu->bvec[i].bv_page = pages[i];
9107                imu->bvec[i].bv_len = vec_len;
9108                imu->bvec[i].bv_offset = off;
9109                off = 0;
9110                size -= vec_len;
9111        }
9112        /* store original address for later verification */
9113        imu->ubuf = ubuf;
9114        imu->ubuf_end = ubuf + iov->iov_len;
9115        imu->nr_bvecs = nr_pages;
9116        *pimu = imu;
9117        ret = 0;
9118done:
9119        if (ret)
9120                kvfree(imu);
9121        kvfree(pages);
9122        kvfree(vmas);
9123        return ret;
9124}
9125
9126static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
9127{
9128        ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9129        return ctx->user_bufs ? 0 : -ENOMEM;
9130}
9131
9132static int io_buffer_validate(struct iovec *iov)
9133{
9134        unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9135
9136        /*
9137         * Don't impose further limits on the size and buffer
9138         * constraints here; we'll return -EINVAL later, when the IO is
9139         * submitted, if they are wrong.
9140         */
9141        if (!iov->iov_base)
9142                return iov->iov_len ? -EFAULT : 0;
9143        if (!iov->iov_len)
9144                return -EFAULT;
9145
9146        /* arbitrary limit, but we need something */
9147        if (iov->iov_len > SZ_1G)
9148                return -EFAULT;
9149
9150        if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9151                return -EOVERFLOW;
9152
9153        return 0;
9154}
9155
9156static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
9157                                   unsigned int nr_args, u64 __user *tags)
9158{
9159        struct page *last_hpage = NULL;
9160        struct io_rsrc_data *data;
9161        int i, ret;
9162        struct iovec iov;
9163
9164        if (ctx->user_bufs)
9165                return -EBUSY;
9166        if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
9167                return -EINVAL;
9168        ret = io_rsrc_node_switch_start(ctx);
9169        if (ret)
9170                return ret;
9171        ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9172        if (ret)
9173                return ret;
9174        ret = io_buffers_map_alloc(ctx, nr_args);
9175        if (ret) {
9176                io_rsrc_data_free(data);
9177                return ret;
9178        }
9179
9180        for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
9181                ret = io_copy_iov(ctx, &iov, arg, i);
9182                if (ret)
9183                        break;
9184                ret = io_buffer_validate(&iov);
9185                if (ret)
9186                        break;
9187                if (!iov.iov_base && *io_get_tag_slot(data, i)) {
9188                        ret = -EINVAL;
9189                        break;
9190                }
9191
9192                ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9193                                             &last_hpage);
9194                if (ret)
9195                        break;
9196        }
9197
9198        WARN_ON_ONCE(ctx->buf_data);
9199
9200        ctx->buf_data = data;
9201        if (ret)
9202                __io_sqe_buffers_unregister(ctx);
9203        else
9204                io_rsrc_node_switch(ctx, NULL);
9205        return ret;
9206}
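
/*
 * Illustrative userspace sketch (not part of this file): buffers reach the
 * registration path above via io_uring_register() with
 * IORING_REGISTER_BUFFERS and an array of iovecs, roughly:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_BUFFERS,
 *		&iov, 1);
 *
 * (buf, len and ring_fd are placeholders.) Registered buffers are then
 * referenced by index from IORING_OP_READ_FIXED / IORING_OP_WRITE_FIXED
 * submissions.
 */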
9207
9208static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9209                                   struct io_uring_rsrc_update2 *up,
9210                                   unsigned int nr_args)
9211{
9212        u64 __user *tags = u64_to_user_ptr(up->tags);
9213        struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
9214        struct page *last_hpage = NULL;
9215        bool needs_switch = false;
9216        __u32 done;
9217        int i, err;
9218
9219        if (!ctx->buf_data)
9220                return -ENXIO;
9221        if (up->offset + nr_args > ctx->nr_user_bufs)
9222                return -EINVAL;
9223
9224        for (done = 0; done < nr_args; done++) {
9225                struct io_mapped_ubuf *imu;
9226                int offset = up->offset + done;
9227                u64 tag = 0;
9228
9229                err = io_copy_iov(ctx, &iov, iovs, done);
9230                if (err)
9231                        break;
9232                if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9233                        err = -EFAULT;
9234                        break;
9235                }
9236                err = io_buffer_validate(&iov);
9237                if (err)
9238                        break;
9239                if (!iov.iov_base && tag) {
9240                        err = -EINVAL;
9241                        break;
9242                }
9243                err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9244                if (err)
9245                        break;
9246
9247                i = array_index_nospec(offset, ctx->nr_user_bufs);
9248                if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
9249                        err = io_queue_rsrc_removal(ctx->buf_data, offset,
9250                                                    ctx->rsrc_node, ctx->user_bufs[i]);
9251                        if (unlikely(err)) {
9252                                io_buffer_unmap(ctx, &imu);
9253                                break;
9254                        }
9255                        ctx->user_bufs[i] = NULL;
9256                        needs_switch = true;
9257                }
9258
9259                ctx->user_bufs[i] = imu;
9260                *io_get_tag_slot(ctx->buf_data, offset) = tag;
9261        }
9262
9263        if (needs_switch)
9264                io_rsrc_node_switch(ctx, ctx->buf_data);
9265        return done ? done : err;
9266}
9267
9268static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
9269{
9270        __s32 __user *fds = arg;
9271        int fd;
9272
9273        if (ctx->cq_ev_fd)
9274                return -EBUSY;
9275
9276        if (copy_from_user(&fd, fds, sizeof(*fds)))
9277                return -EFAULT;
9278
9279        ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
9280        if (IS_ERR(ctx->cq_ev_fd)) {
9281                int ret = PTR_ERR(ctx->cq_ev_fd);
9282
9283                ctx->cq_ev_fd = NULL;
9284                return ret;
9285        }
9286
9287        return 0;
9288}
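
/*
 * Illustrative userspace sketch (not part of this file): the eventfd that
 * gets signalled when CQEs are posted is attached via io_uring_register(),
 * e.g.:
 *
 *	int efd = eventfd(0, 0);
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
 *		&efd, 1);
 */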
9289
9290static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9291{
9292        if (ctx->cq_ev_fd) {
9293                eventfd_ctx_put(ctx->cq_ev_fd);
9294                ctx->cq_ev_fd = NULL;
9295                return 0;
9296        }
9297
9298        return -ENXIO;
9299}
9300
9301static void io_destroy_buffers(struct io_ring_ctx *ctx)
9302{
9303        struct io_buffer *buf;
9304        unsigned long index;
9305
9306        xa_for_each(&ctx->io_buffers, index, buf)
9307                __io_remove_buffers(ctx, buf, index, -1U);
9308}
9309
9310static void io_req_caches_free(struct io_ring_ctx *ctx)
9311{
9312        struct io_submit_state *state = &ctx->submit_state;
9313        int nr = 0;
9314
9315        mutex_lock(&ctx->uring_lock);
9316        io_flush_cached_locked_reqs(ctx, state);
9317
9318        while (state->free_list.next) {
9319                struct io_wq_work_node *node;
9320                struct io_kiocb *req;
9321
9322                node = wq_stack_extract(&state->free_list);
9323                req = container_of(node, struct io_kiocb, comp_list);
9324                kmem_cache_free(req_cachep, req);
9325                nr++;
9326        }
9327        if (nr)
9328                percpu_ref_put_many(&ctx->refs, nr);
9329        mutex_unlock(&ctx->uring_lock);
9330}
9331
9332static void io_wait_rsrc_data(struct io_rsrc_data *data)
9333{
9334        if (data && !atomic_dec_and_test(&data->refs))
9335                wait_for_completion(&data->done);
9336}
9337
9338static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
9339{
9340        io_sq_thread_finish(ctx);
9341
9342        if (ctx->mm_account) {
9343                mmdrop(ctx->mm_account);
9344                ctx->mm_account = NULL;
9345        }
9346
9347        io_rsrc_refs_drop(ctx);
9348        /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
9349        io_wait_rsrc_data(ctx->buf_data);
9350        io_wait_rsrc_data(ctx->file_data);
9351
9352        mutex_lock(&ctx->uring_lock);
9353        if (ctx->buf_data)
9354                __io_sqe_buffers_unregister(ctx);
9355        if (ctx->file_data)
9356                __io_sqe_files_unregister(ctx);
9357        if (ctx->rings)
9358                __io_cqring_overflow_flush(ctx, true);
9359        mutex_unlock(&ctx->uring_lock);
9360        io_eventfd_unregister(ctx);
9361        io_destroy_buffers(ctx);
9362        if (ctx->sq_creds)
9363                put_cred(ctx->sq_creds);
9364
9365        /* there are no registered resources left, nobody uses it */
9366        if (ctx->rsrc_node)
9367                io_rsrc_node_destroy(ctx->rsrc_node);
9368        if (ctx->rsrc_backup_node)
9369                io_rsrc_node_destroy(ctx->rsrc_backup_node);
9370        flush_delayed_work(&ctx->rsrc_put_work);
9371        flush_delayed_work(&ctx->fallback_work);
9372
9373        WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9374        WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
9375
9376#if defined(CONFIG_UNIX)
9377        if (ctx->ring_sock) {
9378                ctx->ring_sock->file = NULL; /* so that iput() is called */
9379                sock_release(ctx->ring_sock);
9380        }
9381#endif
9382        WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
9383
9384        io_mem_free(ctx->rings);
9385        io_mem_free(ctx->sq_sqes);
9386
9387        percpu_ref_exit(&ctx->refs);
9388        free_uid(ctx->user);
9389        io_req_caches_free(ctx);
9390        if (ctx->hash_map)
9391                io_wq_put_hash(ctx->hash_map);
9392        kfree(ctx->cancel_hash);
9393        kfree(ctx->dummy_ubuf);
9394        kfree(ctx);
9395}
9396
9397static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9398{
9399        struct io_ring_ctx *ctx = file->private_data;
9400        __poll_t mask = 0;
9401
9402        poll_wait(file, &ctx->cq_wait, wait);
9403        /*
9404         * synchronizes with barrier from wq_has_sleeper call in
9405         * io_commit_cqring
9406         */
9407        smp_rmb();
9408        if (!io_sqring_full(ctx))
9409                mask |= EPOLLOUT | EPOLLWRNORM;
9410
9411        /*
9412         * Don't flush the cqring overflow list here, just do a simple check.
9413         * Otherwise there could possibly be an ABBA deadlock:
9414         *      CPU0                    CPU1
9415         *      ----                    ----
9416         * lock(&ctx->uring_lock);
9417         *                              lock(&ep->mtx);
9418         *                              lock(&ctx->uring_lock);
9419         * lock(&ep->mtx);
9420         *
9421         * Users may get EPOLLIN while seeing nothing in the cqring, which
9422         * pushes them to do the flush.
9423         */
9424        if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
9425                mask |= EPOLLIN | EPOLLRDNORM;
9426
9427        return mask;
9428}
9429
9430static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
9431{
9432        const struct cred *creds;
9433
9434        creds = xa_erase(&ctx->personalities, id);
9435        if (creds) {
9436                put_cred(creds);
9437                return 0;
9438        }
9439
9440        return -EINVAL;
9441}
9442
9443struct io_tctx_exit {
9444        struct callback_head            task_work;
9445        struct completion               completion;
9446        struct io_ring_ctx              *ctx;
9447};
9448
9449static __cold void io_tctx_exit_cb(struct callback_head *cb)
9450{
9451        struct io_uring_task *tctx = current->io_uring;
9452        struct io_tctx_exit *work;
9453
9454        work = container_of(cb, struct io_tctx_exit, task_work);
9455        /*
9456         * When @in_idle, we're in cancellation and it's racy to remove the
9457         * node. It'll be removed by the end of cancellation, just ignore it.
9458         */
9459        if (!atomic_read(&tctx->in_idle))
9460                io_uring_del_tctx_node((unsigned long)work->ctx);
9461        complete(&work->completion);
9462}
9463
9464static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
9465{
9466        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9467
9468        return req->ctx == data;
9469}
9470
9471static __cold void io_ring_exit_work(struct work_struct *work)
9472{
9473        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
9474        unsigned long timeout = jiffies + HZ * 60 * 5;
9475        unsigned long interval = HZ / 20;
9476        struct io_tctx_exit exit;
9477        struct io_tctx_node *node;
9478        int ret;
9479
9480        /*
9481         * If we're doing polled IO and end up having requests being
9482         * submitted async (out-of-line), then completions can come in while
9483         * we're waiting for refs to drop. We need to reap these manually,
9484         * as nobody else will be looking for them.
9485         */
9486        do {
9487                io_uring_try_cancel_requests(ctx, NULL, true);
9488                if (ctx->sq_data) {
9489                        struct io_sq_data *sqd = ctx->sq_data;
9490                        struct task_struct *tsk;
9491
9492                        io_sq_thread_park(sqd);
9493                        tsk = sqd->thread;
9494                        if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
9495                                io_wq_cancel_cb(tsk->io_uring->io_wq,
9496                                                io_cancel_ctx_cb, ctx, true);
9497                        io_sq_thread_unpark(sqd);
9498                }
9499
9500                io_req_caches_free(ctx);
9501
9502                if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
9503                        /* there is little hope left, don't run it too often */
9504                        interval = HZ * 60;
9505                }
9506        } while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
9507
9508        init_completion(&exit.completion);
9509        init_task_work(&exit.task_work, io_tctx_exit_cb);
9510        exit.ctx = ctx;
9511        /*
9512         * Some tasks may use the context even after all refs and requests have
9513         * been put, and they are free to do so while still holding uring_lock
9514         * or completion_lock; see io_req_task_submit(). Apart from other work,
9515         * this lock/unlock section also waits for them to finish.
9516         */
9517        mutex_lock(&ctx->uring_lock);
9518        while (!list_empty(&ctx->tctx_list)) {
9519                WARN_ON_ONCE(time_after(jiffies, timeout));
9520
9521                node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
9522                                        ctx_node);
9523                /* don't spin on a single task if cancellation failed */
9524                list_rotate_left(&ctx->tctx_list);
9525                ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
9526                if (WARN_ON_ONCE(ret))
9527                        continue;
9528
9529                mutex_unlock(&ctx->uring_lock);
9530                wait_for_completion(&exit.completion);
9531                mutex_lock(&ctx->uring_lock);
9532        }
9533        mutex_unlock(&ctx->uring_lock);
9534        spin_lock(&ctx->completion_lock);
9535        spin_unlock(&ctx->completion_lock);
9536
9537        io_ring_ctx_free(ctx);
9538}
9539
9540/* Returns true if we found and killed one or more timeouts */
9541static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx,
9542                                    struct task_struct *tsk, bool cancel_all)
9543{
9544        struct io_kiocb *req, *tmp;
9545        int canceled = 0;
9546
9547        spin_lock(&ctx->completion_lock);
9548        spin_lock_irq(&ctx->timeout_lock);
9549        list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
9550                if (io_match_task(req, tsk, cancel_all)) {
9551                        io_kill_timeout(req, -ECANCELED);
9552                        canceled++;
9553                }
9554        }
9555        spin_unlock_irq(&ctx->timeout_lock);
9556        if (canceled != 0)
9557                io_commit_cqring(ctx);
9558        spin_unlock(&ctx->completion_lock);
9559        if (canceled != 0)
9560                io_cqring_ev_posted(ctx);
9561        return canceled != 0;
9562}
9563
9564static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
9565{
9566        unsigned long index;
9567        const struct cred *creds;
9568
9569        mutex_lock(&ctx->uring_lock);
9570        percpu_ref_kill(&ctx->refs);
9571        if (ctx->rings)
9572                __io_cqring_overflow_flush(ctx, true);
9573        xa_for_each(&ctx->personalities, index, creds)
9574                io_unregister_personality(ctx, index);
9575        mutex_unlock(&ctx->uring_lock);
9576
9577        io_kill_timeouts(ctx, NULL, true);
9578        io_poll_remove_all(ctx, NULL, true);
9579
9580        /* if we failed setting up the ctx, we might not have any rings */
9581        io_iopoll_try_reap_events(ctx);
9582
9583        INIT_WORK(&ctx->exit_work, io_ring_exit_work);
9584        /*
9585         * Use system_unbound_wq to avoid spawning tons of event kworkers
9586         * if we're exiting a ton of rings at the same time. It just adds
9587         * noise and overhead; there's no discernible change in runtime
9588         * over using system_wq.
9589         */
9590        queue_work(system_unbound_wq, &ctx->exit_work);
9591}
9592
9593static int io_uring_release(struct inode *inode, struct file *file)
9594{
9595        struct io_ring_ctx *ctx = file->private_data;
9596
9597        file->private_data = NULL;
9598        io_ring_ctx_wait_and_kill(ctx);
9599        return 0;
9600}
9601
9602struct io_task_cancel {
9603        struct task_struct *task;
9604        bool all;
9605};
9606
9607static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
9608{
9609        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9610        struct io_task_cancel *cancel = data;
9611
9612        return io_match_task_safe(req, cancel->task, cancel->all);
9613}
9614
9615static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
9616                                         struct task_struct *task,
9617                                         bool cancel_all)
9618{
9619        struct io_defer_entry *de;
9620        LIST_HEAD(list);
9621
9622        spin_lock(&ctx->completion_lock);
9623        list_for_each_entry_reverse(de, &ctx->defer_list, list) {
9624                if (io_match_task_safe(de->req, task, cancel_all)) {
9625                        list_cut_position(&list, &ctx->defer_list, &de->list);
9626                        break;
9627                }
9628        }
9629        spin_unlock(&ctx->completion_lock);
9630        if (list_empty(&list))
9631                return false;
9632
9633        while (!list_empty(&list)) {
9634                de = list_first_entry(&list, struct io_defer_entry, list);
9635                list_del_init(&de->list);
9636                io_req_complete_failed(de->req, -ECANCELED);
9637                kfree(de);
9638        }
9639        return true;
9640}
9641
9642static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
9643{
9644        struct io_tctx_node *node;
9645        enum io_wq_cancel cret;
9646        bool ret = false;
9647
9648        mutex_lock(&ctx->uring_lock);
9649        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
9650                struct io_uring_task *tctx = node->task->io_uring;
9651
9652                /*
9653                 * io_wq will stay alive while we hold uring_lock, because it's
9654                 * killed after the ctx nodes, which requires taking the lock.
9655                 */
9656                if (!tctx || !tctx->io_wq)
9657                        continue;
9658                cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
9659                ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9660        }
9661        mutex_unlock(&ctx->uring_lock);
9662
9663        return ret;
9664}
9665
9666static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
9667                                                struct task_struct *task,
9668                                                bool cancel_all)
9669{
9670        struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
9671        struct io_uring_task *tctx = task ? task->io_uring : NULL;
9672
9673        while (1) {
9674                enum io_wq_cancel cret;
9675                bool ret = false;
9676
9677                if (!task) {
9678                        ret |= io_uring_try_cancel_iowq(ctx);
9679                } else if (tctx && tctx->io_wq) {
9680                        /*
9681                         * Cancels requests of all rings, not only @ctx, but
9682                         * it's fine as the task is in exit/exec.
9683                         */
9684                        cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9685                                               &cancel, true);
9686                        ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9687                }
9688
9689                /* SQPOLL thread does its own polling */
9690                if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
9691                    (ctx->sq_data && ctx->sq_data->thread == current)) {
9692                        while (!wq_list_empty(&ctx->iopoll_list)) {
9693                                io_iopoll_try_reap_events(ctx);
9694                                ret = true;
9695                        }
9696                }
9697
9698                ret |= io_cancel_defer_files(ctx, task, cancel_all);
9699                ret |= io_poll_remove_all(ctx, task, cancel_all);
9700                ret |= io_kill_timeouts(ctx, task, cancel_all);
9701                if (task)
9702                        ret |= io_run_task_work();
9703                if (!ret)
9704                        break;
9705                cond_resched();
9706        }
9707}
9708
9709static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9710{
9711        struct io_uring_task *tctx = current->io_uring;
9712        struct io_tctx_node *node;
9713        int ret;
9714
9715        if (unlikely(!tctx)) {
9716                ret = io_uring_alloc_task_context(current, ctx);
9717                if (unlikely(ret))
9718                        return ret;
9719
9720                tctx = current->io_uring;
9721                if (ctx->iowq_limits_set) {
9722                        unsigned int limits[2] = { ctx->iowq_limits[0],
9723                                                   ctx->iowq_limits[1], };
9724
9725                        ret = io_wq_max_workers(tctx->io_wq, limits);
9726                        if (ret)
9727                                return ret;
9728                }
9729        }
9730        if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9731                node = kmalloc(sizeof(*node), GFP_KERNEL);
9732                if (!node)
9733                        return -ENOMEM;
9734                node->ctx = ctx;
9735                node->task = current;
9736
9737                ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9738                                        node, GFP_KERNEL));
9739                if (ret) {
9740                        kfree(node);
9741                        return ret;
9742                }
9743
9744                mutex_lock(&ctx->uring_lock);
9745                list_add(&node->ctx_node, &ctx->tctx_list);
9746                mutex_unlock(&ctx->uring_lock);
9747        }
9748        tctx->last = ctx;
9749        return 0;
9750}
9751
9752/*
9753 * Note that this task has used io_uring. We use it for cancellation purposes.
9754 */
9755static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9756{
9757        struct io_uring_task *tctx = current->io_uring;
9758
9759        if (likely(tctx && tctx->last == ctx))
9760                return 0;
9761        return __io_uring_add_tctx_node(ctx);
9762}
9763
9764/*
9765 * Remove this io_uring_file -> task mapping.
9766 */
9767static __cold void io_uring_del_tctx_node(unsigned long index)
9768{
9769        struct io_uring_task *tctx = current->io_uring;
9770        struct io_tctx_node *node;
9771
9772        if (!tctx)
9773                return;
9774        node = xa_erase(&tctx->xa, index);
9775        if (!node)
9776                return;
9777
9778        WARN_ON_ONCE(current != node->task);
9779        WARN_ON_ONCE(list_empty(&node->ctx_node));
9780
9781        mutex_lock(&node->ctx->uring_lock);
9782        list_del(&node->ctx_node);
9783        mutex_unlock(&node->ctx->uring_lock);
9784
9785        if (tctx->last == node->ctx)
9786                tctx->last = NULL;
9787        kfree(node);
9788}
9789
9790static __cold void io_uring_clean_tctx(struct io_uring_task *tctx)
9791{
9792        struct io_wq *wq = tctx->io_wq;
9793        struct io_tctx_node *node;
9794        unsigned long index;
9795
9796        xa_for_each(&tctx->xa, index, node) {
9797                io_uring_del_tctx_node(index);
9798                cond_resched();
9799        }
9800        if (wq) {
9801                /*
9802                 * Must be after io_uring_del_tctx_node() (removes nodes under
9803                 * uring_lock) to avoid race with io_uring_try_cancel_iowq().
9804                 */
9805                io_wq_put_and_exit(wq);
9806                tctx->io_wq = NULL;
9807        }
9808}
9809
9810static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
9811{
9812        if (tracked)
9813                return atomic_read(&tctx->inflight_tracked);
9814        return percpu_counter_sum(&tctx->inflight);
9815}
9816
9817static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
9818{
9819        struct io_uring_task *tctx = task->io_uring;
9820        unsigned int refs = tctx->cached_refs;
9821
9822        if (refs) {
9823                tctx->cached_refs = 0;
9824                percpu_counter_sub(&tctx->inflight, refs);
9825                put_task_struct_many(task, refs);
9826        }
9827}
9828
9829/*
9830 * Find any io_uring ctx that this task has registered or done IO on, and cancel
9831 * its requests. @sqd must be non-NULL IFF it's an SQPOLL thread cancellation.
9832 */
9833static __cold void io_uring_cancel_generic(bool cancel_all,
9834                                           struct io_sq_data *sqd)
9835{
9836        struct io_uring_task *tctx = current->io_uring;
9837        struct io_ring_ctx *ctx;
9838        s64 inflight;
9839        DEFINE_WAIT(wait);
9840
9841        WARN_ON_ONCE(sqd && sqd->thread != current);
9842
9843        if (!current->io_uring)
9844                return;
9845        if (tctx->io_wq)
9846                io_wq_exit_start(tctx->io_wq);
9847
9848        atomic_inc(&tctx->in_idle);
9849        do {
9850                io_uring_drop_tctx_refs(current);
9851                /* read completions before cancelations */
9852                inflight = tctx_inflight(tctx, !cancel_all);
9853                if (!inflight)
9854                        break;
9855
9856                if (!sqd) {
9857                        struct io_tctx_node *node;
9858                        unsigned long index;
9859
9860                        xa_for_each(&tctx->xa, index, node) {
9861                                /* sqpoll task will cancel all its requests */
9862                                if (node->ctx->sq_data)
9863                                        continue;
9864                                io_uring_try_cancel_requests(node->ctx, current,
9865                                                             cancel_all);
9866                        }
9867                } else {
9868                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
9869                                io_uring_try_cancel_requests(ctx, current,
9870                                                             cancel_all);
9871                }
9872
9873                prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
9874                io_run_task_work();
9875                io_uring_drop_tctx_refs(current);
9876
9877                /*
9878                 * If we've seen completions, retry without waiting. This
9879                 * avoids a race where a completion comes in before we did
9880                 * prepare_to_wait().
9881                 */
9882                if (inflight == tctx_inflight(tctx, !cancel_all))
9883                        schedule();
9884                finish_wait(&tctx->wait, &wait);
9885        } while (1);
9886        atomic_dec(&tctx->in_idle);
9887
9888        io_uring_clean_tctx(tctx);
9889        if (cancel_all) {
9890                /* for exec, all of current's requests should be gone; kill the tctx */
9891                __io_uring_free(current);
9892        }
9893}
9894
9895void __io_uring_cancel(bool cancel_all)
9896{
9897        io_uring_cancel_generic(cancel_all, NULL);
9898}
9899
9900static void *io_uring_validate_mmap_request(struct file *file,
9901                                            loff_t pgoff, size_t sz)
9902{
9903        struct io_ring_ctx *ctx = file->private_data;
9904        loff_t offset = pgoff << PAGE_SHIFT;
9905        struct page *page;
9906        void *ptr;
9907
9908        switch (offset) {
9909        case IORING_OFF_SQ_RING:
9910        case IORING_OFF_CQ_RING:
9911                ptr = ctx->rings;
9912                break;
9913        case IORING_OFF_SQES:
9914                ptr = ctx->sq_sqes;
9915                break;
9916        default:
9917                return ERR_PTR(-EINVAL);
9918        }
9919
9920        page = virt_to_head_page(ptr);
9921        if (sz > page_size(page))
9922                return ERR_PTR(-EINVAL);
9923
9924        return ptr;
9925}
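
/*
 * Illustrative userspace sketch (not part of this file): the offsets checked
 * above are the mmap offsets the application uses to map the rings and the
 * SQE array, e.g.:
 *
 *	sq_ring = mmap(0, sq_ring_sz, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
 *	sqes = mmap(0, sqes_sz, PROT_READ | PROT_WRITE,
 *		    MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQES);
 *
 * (ring_fd and the *_sz sizes are placeholders, derived from the offsets and
 * entry counts returned by io_uring_setup().) With IORING_FEAT_SINGLE_MMAP,
 * the SQ and CQ rings share a single mapping.
 */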
9926
9927#ifdef CONFIG_MMU
9928
9929static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9930{
9931        size_t sz = vma->vm_end - vma->vm_start;
9932        unsigned long pfn;
9933        void *ptr;
9934
9935        ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9936        if (IS_ERR(ptr))
9937                return PTR_ERR(ptr);
9938
9939        pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9940        return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9941}
9942
9943#else /* !CONFIG_MMU */
9944
9945static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9946{
9947        return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9948}
9949
9950static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9951{
9952        return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9953}
9954
9955static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9956        unsigned long addr, unsigned long len,
9957        unsigned long pgoff, unsigned long flags)
9958{
9959        void *ptr;
9960
9961        ptr = io_uring_validate_mmap_request(file, pgoff, len);
9962        if (IS_ERR(ptr))
9963                return PTR_ERR(ptr);
9964
9965        return (unsigned long) ptr;
9966}
9967
9968#endif /* !CONFIG_MMU */
9969
9970static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
9971{
9972        DEFINE_WAIT(wait);
9973
9974        do {
9975                if (!io_sqring_full(ctx))
9976                        break;
9977                prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
9978
9979                if (!io_sqring_full(ctx))
9980                        break;
9981                schedule();
9982        } while (!signal_pending(current));
9983
9984        finish_wait(&ctx->sqo_sq_wait, &wait);
9985        return 0;
9986}
9987
9988static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
9989                          struct __kernel_timespec __user **ts,
9990                          const sigset_t __user **sig)
9991{
9992        struct io_uring_getevents_arg arg;
9993
9994        /*
9995         * If EXT_ARG isn't set, then we have no timespec and the argp pointer
9996         * is just a pointer to the sigset_t.
9997         */
9998        if (!(flags & IORING_ENTER_EXT_ARG)) {
9999                *sig = (const sigset_t __user *) argp;
10000                *ts = NULL;
10001                return 0;
10002        }
10003
10004        /*
10005         * EXT_ARG is set - ensure we agree on its size, and copy in the
10006         * timespec and sigset_t pointers if everything checks out.
10007         */
10008        if (*argsz != sizeof(arg))
10009                return -EINVAL;
10010        if (copy_from_user(&arg, argp, sizeof(arg)))
10011                return -EFAULT;
10012        *sig = u64_to_user_ptr(arg.sigmask);
10013        *argsz = arg.sigmask_sz;
10014        *ts = u64_to_user_ptr(arg.ts);
10015        return 0;
10016}
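
/*
 * Illustrative userspace sketch (not part of this file): with
 * IORING_ENTER_EXT_ARG set, argp points at a struct io_uring_getevents_arg
 * rather than a bare sigset_t, roughly:
 *
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask	= (unsigned long) &mask,
 *		.sigmask_sz	= _NSIG / 8,
 *		.ts		= (unsigned long) &timeout,
 *	};
 *
 *	syscall(__NR_io_uring_enter, ring_fd, 0, 1,
 *		IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *		&arg, sizeof(arg));
 *
 * (mask, timeout and ring_fd are placeholders.)
 */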
10017
10018SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
10019                u32, min_complete, u32, flags, const void __user *, argp,
10020                size_t, argsz)
10021{
10022        struct io_ring_ctx *ctx;
10023        int submitted = 0;
10024        struct fd f;
10025        long ret;
10026
10027        io_run_task_work();
10028
10029        if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
10030                               IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
10031                return -EINVAL;
10032
10033        f = fdget(fd);
10034        if (unlikely(!f.file))
10035                return -EBADF;
10036
10037        ret = -EOPNOTSUPP;
10038        if (unlikely(f.file->f_op != &io_uring_fops))
10039                goto out_fput;
10040
10041        ret = -ENXIO;
10042        ctx = f.file->private_data;
10043        if (unlikely(!percpu_ref_tryget(&ctx->refs)))
10044                goto out_fput;
10045
10046        ret = -EBADFD;
10047        if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
10048                goto out;
10049
10050        /*
10051         * For SQ polling, the thread will do all submissions and completions.
10052         * Just return the requested submit count, and wake the thread if
10053         * we were asked to.
10054         */
10055        ret = 0;
10056        if (ctx->flags & IORING_SETUP_SQPOLL) {
10057                io_cqring_overflow_flush(ctx);
10058
10059                if (unlikely(ctx->sq_data->thread == NULL)) {
10060                        ret = -EOWNERDEAD;
10061                        goto out;
10062                }
10063                if (flags & IORING_ENTER_SQ_WAKEUP)
10064                        wake_up(&ctx->sq_data->wait);
10065                if (flags & IORING_ENTER_SQ_WAIT) {
10066                        ret = io_sqpoll_wait_sq(ctx);
10067                        if (ret)
10068                                goto out;
10069                }
10070                submitted = to_submit;
10071        } else if (to_submit) {
10072                ret = io_uring_add_tctx_node(ctx);
10073                if (unlikely(ret))
10074                        goto out;
10075                mutex_lock(&ctx->uring_lock);
10076                submitted = io_submit_sqes(ctx, to_submit);
10077                mutex_unlock(&ctx->uring_lock);
10078
10079                if (submitted != to_submit)
10080                        goto out;
10081        }
10082        if (flags & IORING_ENTER_GETEVENTS) {
10083                const sigset_t __user *sig;
10084                struct __kernel_timespec __user *ts;
10085
10086                ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
10087                if (unlikely(ret))
10088                        goto out;
10089
10090                min_complete = min(min_complete, ctx->cq_entries);
10091
10092                /*
10093                 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, userspace
10094                 * applications don't need to poll for I/O completion events
10095                 * themselves; they can rely on io_sq_thread to do the polling
10096                 * work, which reduces CPU usage and uring_lock contention.
10097                 */
10098                if (ctx->flags & IORING_SETUP_IOPOLL &&
10099                    !(ctx->flags & IORING_SETUP_SQPOLL)) {
10100                        ret = io_iopoll_check(ctx, min_complete);
10101                } else {
10102                        ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
10103                }
10104        }
10105
10106out:
10107        percpu_ref_put(&ctx->refs);
10108out_fput:
10109        fdput(f);
10110        return submitted ? submitted : ret;
10111}
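
/*
 * Illustrative userspace sketch (not part of this file): a plain
 * submit-and-wait call, after SQEs have been filled in and the SQ tail
 * advanced:
 *
 *	syscall(__NR_io_uring_enter, ring_fd, to_submit, 1,
 *		IORING_ENTER_GETEVENTS, NULL, 0);
 */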
10112
10113#ifdef CONFIG_PROC_FS
10114static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
10115                const struct cred *cred)
10116{
10117        struct user_namespace *uns = seq_user_ns(m);
10118        struct group_info *gi;
10119        kernel_cap_t cap;
10120        unsigned __capi;
10121        int g;
10122
10123        seq_printf(m, "%5d\n", id);
10124        seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10125        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10126        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10127        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10128        seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10129        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10130        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10131        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10132        seq_puts(m, "\n\tGroups:\t");
10133        gi = cred->group_info;
10134        for (g = 0; g < gi->ngroups; g++) {
10135                seq_put_decimal_ull(m, g ? " " : "",
10136                                        from_kgid_munged(uns, gi->gid[g]));
10137        }
10138        seq_puts(m, "\n\tCapEff:\t");
10139        cap = cred->cap_effective;
10140        CAP_FOR_EACH_U32(__capi)
10141                seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10142        seq_putc(m, '\n');
10143        return 0;
10144}
10145
10146static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
10147                                          struct seq_file *m)
10148{
10149        struct io_sq_data *sq = NULL;
10150        struct io_overflow_cqe *ocqe;
10151        struct io_rings *r = ctx->rings;
10152        unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
10153        unsigned int sq_head = READ_ONCE(r->sq.head);
10154        unsigned int sq_tail = READ_ONCE(r->sq.tail);
10155        unsigned int cq_head = READ_ONCE(r->cq.head);
10156        unsigned int cq_tail = READ_ONCE(r->cq.tail);
10157        unsigned int sq_entries, cq_entries;
10158        bool has_lock;
10159        unsigned int i;
10160
10161        /*
10162         * We may get imprecise sqe and cqe info if the ring is actively running,
10163         * since we read cached_sq_head and cached_cq_tail without uring_lock,
10164         * and sq_tail and cq_head are changed by userspace. But that's OK, since
10165         * we usually only use this info when the ring is stuck.
10166         */
10167        seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask);
10168        seq_printf(m, "SqHead:\t%u\n", sq_head);
10169        seq_printf(m, "SqTail:\t%u\n", sq_tail);
10170        seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
10171        seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
10172        seq_printf(m, "CqHead:\t%u\n", cq_head);
10173        seq_printf(m, "CqTail:\t%u\n", cq_tail);
10174        seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
10175        seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head);
10176        sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
10177        for (i = 0; i < sq_entries; i++) {
10178                unsigned int entry = i + sq_head;
10179                unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
10180                struct io_uring_sqe *sqe;
10181
10182                if (sq_idx > sq_mask)
10183                        continue;
10184                sqe = &ctx->sq_sqes[sq_idx];
10185                seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n",
10186                           sq_idx, sqe->opcode, sqe->fd, sqe->flags,
10187                           sqe->user_data);
10188        }
10189        seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
10190        cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
10191        for (i = 0; i < cq_entries; i++) {
10192                unsigned int entry = i + cq_head;
10193                struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
10194
10195                seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
10196                           entry & cq_mask, cqe->user_data, cqe->res,
10197                           cqe->flags);
10198        }
10199
10200        /*
10201         * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
10202         * since fdinfo case grabs it in the opposite direction of normal use
10203         * cases. If we fail to get the lock, we just don't iterate any
10204         * structures that could be going away outside the io_uring mutex.
10205         */
10206        has_lock = mutex_trylock(&ctx->uring_lock);
10207
10208        if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
10209                sq = ctx->sq_data;
10210                if (!sq->thread)
10211                        sq = NULL;
10212        }
10213
10214        seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
10215        seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
10216        seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
10217        for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
10218                struct file *f = io_file_from_index(ctx, i);
10219
10220                if (f)
10221                        seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10222                else
10223                        seq_printf(m, "%5u: <none>\n", i);
10224        }
10225        seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
10226        for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
10227                struct io_mapped_ubuf *buf = ctx->user_bufs[i];
10228                unsigned int len = buf->ubuf_end - buf->ubuf;
10229
10230                seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
10231        }
10232        if (has_lock && !xa_empty(&ctx->personalities)) {
10233                unsigned long index;
10234                const struct cred *cred;
10235
10236                seq_printf(m, "Personalities:\n");
10237                xa_for_each(&ctx->personalities, index, cred)
10238                        io_uring_show_cred(m, index, cred);
10239        }
10240        if (has_lock)
10241                mutex_unlock(&ctx->uring_lock);
10242
10243        seq_puts(m, "PollList:\n");
10244        spin_lock(&ctx->completion_lock);
10245        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10246                struct hlist_head *list = &ctx->cancel_hash[i];
10247                struct io_kiocb *req;
10248
10249                hlist_for_each_entry(req, list, hash_node)
10250                        seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
10251                                        req->task->task_works != NULL);
10252        }
10253
10254        seq_puts(m, "CqOverflowList:\n");
10255        list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
10256                struct io_uring_cqe *cqe = &ocqe->cqe;
10257
10258                seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
10259                           cqe->user_data, cqe->res, cqe->flags);
10260
10261        }
10262
10263        spin_unlock(&ctx->completion_lock);
10264}
10265
10266static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
10267{
10268        struct io_ring_ctx *ctx = f->private_data;
10269
10270        if (percpu_ref_tryget(&ctx->refs)) {
10271                __io_uring_show_fdinfo(ctx, m);
10272                percpu_ref_put(&ctx->refs);
10273        }
10274}
10275#endif
10276
10277static const struct file_operations io_uring_fops = {
10278        .release        = io_uring_release,
10279        .mmap           = io_uring_mmap,
10280#ifndef CONFIG_MMU
10281        .get_unmapped_area = io_uring_nommu_get_unmapped_area,
10282        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
10283#endif
10284        .poll           = io_uring_poll,
10285#ifdef CONFIG_PROC_FS
10286        .show_fdinfo    = io_uring_show_fdinfo,
10287#endif
10288};
10289
10290static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10291                                         struct io_uring_params *p)
10292{
10293        struct io_rings *rings;
10294        size_t size, sq_array_offset;
10295
10296        /* make sure these are sane, as we already accounted them */
10297        ctx->sq_entries = p->sq_entries;
10298        ctx->cq_entries = p->cq_entries;
10299
10300        size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10301        if (size == SIZE_MAX)
10302                return -EOVERFLOW;
10303
10304        rings = io_mem_alloc(size);
10305        if (!rings)
10306                return -ENOMEM;
10307
10308        ctx->rings = rings;
10309        ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10310        rings->sq_ring_mask = p->sq_entries - 1;
10311        rings->cq_ring_mask = p->cq_entries - 1;
10312        rings->sq_ring_entries = p->sq_entries;
10313        rings->cq_ring_entries = p->cq_entries;
10314
10315        size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
10316        if (size == SIZE_MAX) {
10317                io_mem_free(ctx->rings);
10318                ctx->rings = NULL;
10319                return -EOVERFLOW;
10320        }
10321
10322        ctx->sq_sqes = io_mem_alloc(size);
10323        if (!ctx->sq_sqes) {
10324                io_mem_free(ctx->rings);
10325                ctx->rings = NULL;
10326                return -ENOMEM;
10327        }
10328
10329        return 0;
10330}
10331
10332static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10333{
10334        int ret, fd;
10335
10336        fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10337        if (fd < 0)
10338                return fd;
10339
10340        ret = io_uring_add_tctx_node(ctx);
10341        if (ret) {
10342                put_unused_fd(fd);
10343                return ret;
10344        }
10345        fd_install(fd, file);
10346        return fd;
10347}
10348
10349/*
10350 * Allocate an anonymous fd; this is what constitutes the application
10351 * visible backing of an io_uring instance. The application mmaps this
10352 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
10353 * we have to tie this fd to a socket for file garbage collection purposes.
10354 */
10355static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
10356{
10357        struct file *file;
10358#if defined(CONFIG_UNIX)
10359        int ret;
10360
10361        ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10362                                &ctx->ring_sock);
10363        if (ret)
10364                return ERR_PTR(ret);
10365#endif
10366
10367        file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
10368                                         O_RDWR | O_CLOEXEC, NULL);
10369#if defined(CONFIG_UNIX)
10370        if (IS_ERR(file)) {
10371                sock_release(ctx->ring_sock);
10372                ctx->ring_sock = NULL;
10373        } else {
10374                ctx->ring_sock->file = file;
10375        }
10376#endif
10377        return file;
10378}
10379
10380static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
10381                                  struct io_uring_params __user *params)
10382{
10383        struct io_ring_ctx *ctx;
10384        struct file *file;
10385        int ret;
10386
10387        if (!entries)
10388                return -EINVAL;
10389        if (entries > IORING_MAX_ENTRIES) {
10390                if (!(p->flags & IORING_SETUP_CLAMP))
10391                        return -EINVAL;
10392                entries = IORING_MAX_ENTRIES;
10393        }
10394
10395        /*
10396         * Use twice as many entries for the CQ ring. It's possible for the
10397         * application to drive a higher depth than the size of the SQ ring,
10398         * since the sqes are only used at submission time. This allows for
10399         * some flexibility in overcommitting a bit. If the application has
10400         * set IORING_SETUP_CQSIZE, it will have passed in the desired number
10401         * of CQ ring entries manually.
10402         */
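        /*
         * For example, with entries == 100 and IORING_SETUP_CQSIZE not set,
         * sq_entries becomes 128 and cq_entries becomes 256.
         */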
10403        p->sq_entries = roundup_pow_of_two(entries);
10404        if (p->flags & IORING_SETUP_CQSIZE) {
10405                /*
10406                 * If IORING_SETUP_CQSIZE is set, we do the same roundup
10407                 * to a power-of-two, if it isn't already. We do NOT impose
10408                 * any cq vs sq ring sizing.
10409                 */
10410                if (!p->cq_entries)
10411                        return -EINVAL;
10412                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
10413                        if (!(p->flags & IORING_SETUP_CLAMP))
10414                                return -EINVAL;
10415                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
10416                }
10417                p->cq_entries = roundup_pow_of_two(p->cq_entries);
10418                if (p->cq_entries < p->sq_entries)
10419                        return -EINVAL;
10420        } else {
10421                p->cq_entries = 2 * p->sq_entries;
10422        }
10423
10424        ctx = io_ring_ctx_alloc(p);
10425        if (!ctx)
10426                return -ENOMEM;
10427        ctx->compat = in_compat_syscall();
10428        if (!capable(CAP_IPC_LOCK))
10429                ctx->user = get_uid(current_user());
10430
10431        /*
10432         * This is just grabbed for accounting purposes. When a process exits,
10433         * the mm is exited and dropped before the files, hence we need to hang
10434         * on to this mm purely for the purposes of being able to unaccount
10435         * memory (locked/pinned vm). It's not used for anything else.
10436         */
10437        mmgrab(current->mm);
10438        ctx->mm_account = current->mm;
10439
10440        ret = io_allocate_scq_urings(ctx, p);
10441        if (ret)
10442                goto err;
10443
10444        ret = io_sq_offload_create(ctx, p);
10445        if (ret)
10446                goto err;
10447        /* always set a rsrc node */
10448        ret = io_rsrc_node_switch_start(ctx);
10449        if (ret)
10450                goto err;
10451        io_rsrc_node_switch(ctx, NULL);
10452
10453        memset(&p->sq_off, 0, sizeof(p->sq_off));
10454        p->sq_off.head = offsetof(struct io_rings, sq.head);
10455        p->sq_off.tail = offsetof(struct io_rings, sq.tail);
10456        p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
10457        p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
10458        p->sq_off.flags = offsetof(struct io_rings, sq_flags);
10459        p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
10460        p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
10461
10462        memset(&p->cq_off, 0, sizeof(p->cq_off));
10463        p->cq_off.head = offsetof(struct io_rings, cq.head);
10464        p->cq_off.tail = offsetof(struct io_rings, cq.tail);
10465        p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
10466        p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
10467        p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
10468        p->cq_off.cqes = offsetof(struct io_rings, cqes);
10469        p->cq_off.flags = offsetof(struct io_rings, cq_flags);
10470
10471        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
10472                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
10473                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
10474                        IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
10475                        IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
10476                        IORING_FEAT_RSRC_TAGS;
10477
10478        if (copy_to_user(params, p, sizeof(*p))) {
10479                ret = -EFAULT;
10480                goto err;
10481        }
10482
10483        file = io_uring_get_file(ctx);
10484        if (IS_ERR(file)) {
10485                ret = PTR_ERR(file);
10486                goto err;
10487        }
10488
10489        /*
10490         * Install ring fd as the very last thing, so we don't risk someone
10491         * having closed it before we finish setup
10492         */
10493        ret = io_uring_install_fd(ctx, file);
10494        if (ret < 0) {
10495                /* fput will clean it up */
10496                fput(file);
10497                return ret;
10498        }
10499
10500        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
10501        return ret;
10502err:
10503        io_ring_ctx_wait_and_kill(ctx);
10504        return ret;
10505}
10506
10507/*
10508 * Sets up an io_uring context, and returns the fd. The application asks for a
10509 * ring size; we return the actual sq/cq ring sizes (among other things) in the
10510 * params structure passed in.
10511 */
10512static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10513{
10514        struct io_uring_params p;
10515        int i;
10516
10517        if (copy_from_user(&p, params, sizeof(p)))
10518                return -EFAULT;
10519        for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10520                if (p.resv[i])
10521                        return -EINVAL;
10522        }
10523
10524        if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
10525                        IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
10526                        IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10527                        IORING_SETUP_R_DISABLED))
10528                return -EINVAL;
10529
10530        return  io_uring_create(entries, &p, params);
10531}
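
/*
 * Illustrative userspace sketch (not part of this file): minimal setup with
 * default flags:
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 8, &p);
 *
 * On return, p.sq_off and p.cq_off describe the ring layout to mmap, and
 * p.features reports the IORING_FEAT_* bits filled in above.
 */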
10532
10533SYSCALL_DEFINE2(io_uring_setup, u32, entries,
10534                struct io_uring_params __user *, params)
10535{
10536        return io_uring_setup(entries, params);
10537}
10538
10539static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
10540                           unsigned nr_args)
10541{
10542        struct io_uring_probe *p;
10543        size_t size;
10544        int i, ret;
10545
10546        size = struct_size(p, ops, nr_args);
10547        if (size == SIZE_MAX)
10548                return -EOVERFLOW;
10549        p = kzalloc(size, GFP_KERNEL);
10550        if (!p)
10551                return -ENOMEM;
10552
10553        ret = -EFAULT;
10554        if (copy_from_user(p, arg, size))
10555                goto out;
10556        ret = -EINVAL;
10557        if (memchr_inv(p, 0, size))
10558                goto out;
10559
10560        p->last_op = IORING_OP_LAST - 1;
10561        if (nr_args > IORING_OP_LAST)
10562                nr_args = IORING_OP_LAST;
10563
10564        for (i = 0; i < nr_args; i++) {
10565                p->ops[i].op = i;
10566                if (!io_op_defs[i].not_supported)
10567                        p->ops[i].flags = IO_URING_OP_SUPPORTED;
10568        }
10569        p->ops_len = i;
10570
10571        ret = 0;
10572        if (copy_to_user(arg, p, size))
10573                ret = -EFAULT;
10574out:
10575        kfree(p);
10576        return ret;
10577}
10578
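/*
 * A minimal userspace sketch of probing supported opcodes, assuming a raw
 * io_uring_register(2) syscall wrapper (liburing offers
 * io_uring_get_probe() for this). The buffer must be zeroed, since
 * io_probe() above rejects any nonzero input bytes.
 *
 *	size_t len = sizeof(struct io_uring_probe) +
 *		     IORING_OP_LAST * sizeof(struct io_uring_probe_op);
 *	struct io_uring_probe *probe = calloc(1, len);
 *	int have_openat2 = 0;
 *
 *	if (!io_uring_register(ring_fd, IORING_REGISTER_PROBE, probe,
 *			       IORING_OP_LAST) &&
 *	    probe->ops_len > IORING_OP_OPENAT2)
 *		have_openat2 = probe->ops[IORING_OP_OPENAT2].flags &
 *			       IO_URING_OP_SUPPORTED;
 *	free(probe);
 */
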
10579static int io_register_personality(struct io_ring_ctx *ctx)
10580{
10581        const struct cred *creds;
10582        u32 id;
10583        int ret;
10584
10585        creds = get_current_cred();
10586
10587        ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10588                        XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
10589        if (ret < 0) {
10590                put_cred(creds);
10591                return ret;
10592        }
10593        return id;
10594}
10595
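/*
 * A minimal userspace sketch, assuming a raw io_uring_register(2) syscall
 * wrapper: the returned id is later placed in the personality field of an
 * SQE so that request runs with the credentials the task had at
 * registration time, and the same id is passed back (as nr_args) to
 * unregister it.
 *
 *	int id = io_uring_register(ring_fd, IORING_REGISTER_PERSONALITY,
 *				   NULL, 0);
 *
 *	if (id >= 0) {
 *		sqe->personality = id;	// sqe: a prepared submission entry
 *		...
 *		io_uring_register(ring_fd, IORING_UNREGISTER_PERSONALITY,
 *				  NULL, id);
 *	}
 */
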
10596static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
10597                                           void __user *arg, unsigned int nr_args)
10598{
10599        struct io_uring_restriction *res;
10600        size_t size;
10601        int i, ret;
10602
10603        /* Restrictions allowed only if rings started disabled */
10604        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10605                return -EBADFD;
10606
10607        /* We allow only a single restrictions registration */
10608        if (ctx->restrictions.registered)
10609                return -EBUSY;
10610
10611        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10612                return -EINVAL;
10613
10614        size = array_size(nr_args, sizeof(*res));
10615        if (size == SIZE_MAX)
10616                return -EOVERFLOW;
10617
10618        res = memdup_user(arg, size);
10619        if (IS_ERR(res))
10620                return PTR_ERR(res);
10621
10622        ret = 0;
10623
10624        for (i = 0; i < nr_args; i++) {
10625                switch (res[i].opcode) {
10626                case IORING_RESTRICTION_REGISTER_OP:
10627                        if (res[i].register_op >= IORING_REGISTER_LAST) {
10628                                ret = -EINVAL;
10629                                goto out;
10630                        }
10631
10632                        __set_bit(res[i].register_op,
10633                                  ctx->restrictions.register_op);
10634                        break;
10635                case IORING_RESTRICTION_SQE_OP:
10636                        if (res[i].sqe_op >= IORING_OP_LAST) {
10637                                ret = -EINVAL;
10638                                goto out;
10639                        }
10640
10641                        __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10642                        break;
10643                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10644                        ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10645                        break;
10646                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10647                        ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10648                        break;
10649                default:
10650                        ret = -EINVAL;
10651                        goto out;
10652                }
10653        }
10654
10655out:
10656        /* Reset all restrictions if an error happened */
10657        if (ret != 0)
10658                memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10659        else
10660                ctx->restrictions.registered = true;
10661
10662        kfree(res);
10663        return ret;
10664}
10665
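/*
 * A minimal userspace sketch, assuming a raw io_uring_register(2) syscall
 * wrapper: a ring created with IORING_SETUP_R_DISABLED can be locked down
 * to a chosen set of register and SQE opcodes before it is enabled; the
 * particular restrictions below are only an example.
 *
 *	struct io_uring_restriction res[2] = { };
 *
 *	res[0].opcode = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op = IORING_OP_READ;
 *	res[1].opcode = IORING_RESTRICTION_REGISTER_OP;
 *	res[1].register_op = IORING_REGISTER_BUFFERS;
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */
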
10666static int io_register_enable_rings(struct io_ring_ctx *ctx)
10667{
10668        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10669                return -EBADFD;
10670
10671        if (ctx->restrictions.registered)
10672                ctx->restricted = 1;
10673
10674        ctx->flags &= ~IORING_SETUP_R_DISABLED;
10675        if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10676                wake_up(&ctx->sq_data->wait);
10677        return 0;
10678}
10679
10680static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
10681                                     struct io_uring_rsrc_update2 *up,
10682                                     unsigned nr_args)
10683{
10684        __u32 tmp;
10685        int err;
10686
10687        if (up->resv)
10688                return -EINVAL;
10689        if (check_add_overflow(up->offset, nr_args, &tmp))
10690                return -EOVERFLOW;
10691        err = io_rsrc_node_switch_start(ctx);
10692        if (err)
10693                return err;
10694
10695        switch (type) {
10696        case IORING_RSRC_FILE:
10697                return __io_sqe_files_update(ctx, up, nr_args);
10698        case IORING_RSRC_BUFFER:
10699                return __io_sqe_buffers_update(ctx, up, nr_args);
10700        }
10701        return -EINVAL;
10702}
10703
10704static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10705                                    unsigned nr_args)
10706{
10707        struct io_uring_rsrc_update2 up;
10708
10709        if (!nr_args)
10710                return -EINVAL;
10711        memset(&up, 0, sizeof(up));
10712        if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10713                return -EFAULT;
10714        return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10715}
10716
10717static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
10718                                   unsigned size, unsigned type)
10719{
10720        struct io_uring_rsrc_update2 up;
10721
10722        if (size != sizeof(up))
10723                return -EINVAL;
10724        if (copy_from_user(&up, arg, sizeof(up)))
10725                return -EFAULT;
10726        if (!up.nr || up.resv)
10727                return -EINVAL;
10728        return __io_register_rsrc_update(ctx, type, &up, up.nr);
10729}
10730
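/*
 * A minimal userspace sketch, assuming a raw io_uring_register(2) syscall
 * wrapper and an already-open descriptor new_fd (hypothetical): the "2"
 * update variant takes sizeof(struct io_uring_rsrc_update2) as nr_args
 * and here replaces slot 3 of a registered file table.
 *
 *	__s32 fds[1] = { new_fd };
 *	struct io_uring_rsrc_update2 up = {
 *		.offset	= 3,
 *		.data	= (unsigned long)fds,
 *		.nr	= 1,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES_UPDATE2, &up,
 *			  sizeof(up));
 */
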
10731static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
10732                            unsigned int size, unsigned int type)
10733{
10734        struct io_uring_rsrc_register rr;
10735
10736        /* keep it extendable */
10737        if (size != sizeof(rr))
10738                return -EINVAL;
10739
10740        memset(&rr, 0, sizeof(rr));
10741        if (copy_from_user(&rr, arg, size))
10742                return -EFAULT;
10743        if (!rr.nr || rr.resv || rr.resv2)
10744                return -EINVAL;
10745
10746        switch (type) {
10747        case IORING_RSRC_FILE:
10748                return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10749                                             rr.nr, u64_to_user_ptr(rr.tags));
10750        case IORING_RSRC_BUFFER:
10751                return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10752                                               rr.nr, u64_to_user_ptr(rr.tags));
10753        }
10754        return -EINVAL;
10755}
10756
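/*
 * A minimal userspace sketch, assuming a raw io_uring_register(2) syscall
 * wrapper and two already-open descriptors (hypothetical): registering a
 * file table together with per-slot tags; a nonzero tag is posted back as
 * a CQE once the corresponding resource is released.
 *
 *	__s32 fds[2]  = { fd_a, fd_b };
 *	__u64 tags[2] = { 0xcafe, 0xbeef };
 *	struct io_uring_rsrc_register rr = {
 *		.nr	= 2,
 *		.data	= (unsigned long)fds,
 *		.tags	= (unsigned long)tags,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_FILES2, &rr, sizeof(rr));
 */
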
10757static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
10758                                       void __user *arg, unsigned len)
10759{
10760        struct io_uring_task *tctx = current->io_uring;
10761        cpumask_var_t new_mask;
10762        int ret;
10763
10764        if (!tctx || !tctx->io_wq)
10765                return -EINVAL;
10766
10767        if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10768                return -ENOMEM;
10769
10770        cpumask_clear(new_mask);
10771        if (len > cpumask_size())
10772                len = cpumask_size();
10773
10774        if (copy_from_user(new_mask, arg, len)) {
10775                free_cpumask_var(new_mask);
10776                return -EFAULT;
10777        }
10778
10779        ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10780        free_cpumask_var(new_mask);
10781        return ret;
10782}
10783
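/*
 * A minimal userspace sketch, assuming a raw io_uring_register(2) syscall
 * wrapper: restricting this task's io-wq workers to CPUs 0 and 1. Passing
 * a full cpu_set_t is fine, since the kernel copies at most
 * cpumask_size() bytes.
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_AFF, &mask,
 *			  sizeof(mask));
 */
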
10784static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10785{
10786        struct io_uring_task *tctx = current->io_uring;
10787
10788        if (!tctx || !tctx->io_wq)
10789                return -EINVAL;
10790
10791        return io_wq_cpu_affinity(tctx->io_wq, NULL);
10792}
10793
10794static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
10795                                               void __user *arg)
10796        __must_hold(&ctx->uring_lock)
10797{
10798        struct io_tctx_node *node;
10799        struct io_uring_task *tctx = NULL;
10800        struct io_sq_data *sqd = NULL;
10801        __u32 new_count[2];
10802        int i, ret;
10803
10804        if (copy_from_user(new_count, arg, sizeof(new_count)))
10805                return -EFAULT;
10806        for (i = 0; i < ARRAY_SIZE(new_count); i++)
10807                if (new_count[i] > INT_MAX)
10808                        return -EINVAL;
10809
10810        if (ctx->flags & IORING_SETUP_SQPOLL) {
10811                sqd = ctx->sq_data;
10812                if (sqd) {
10813                        /*
10814                         * Observe the correct sqd->lock -> ctx->uring_lock
10815                         * ordering. Fine to drop uring_lock here, we hold
10816                         * a ref to the ctx.
10817                         */
10818                        refcount_inc(&sqd->refs);
10819                        mutex_unlock(&ctx->uring_lock);
10820                        mutex_lock(&sqd->lock);
10821                        mutex_lock(&ctx->uring_lock);
10822                        if (sqd->thread)
10823                                tctx = sqd->thread->io_uring;
10824                }
10825        } else {
10826                tctx = current->io_uring;
10827        }
10828
10829        BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
10830
10831        for (i = 0; i < ARRAY_SIZE(new_count); i++)
10832                if (new_count[i])
10833                        ctx->iowq_limits[i] = new_count[i];
10834        ctx->iowq_limits_set = true;
10835
10836        if (tctx && tctx->io_wq) {
10837                ret = io_wq_max_workers(tctx->io_wq, new_count);
10838                if (ret)
10839                        goto err;
10840        } else {
10841                memset(new_count, 0, sizeof(new_count));
10842        }
10843
10844        if (sqd) {
10845                mutex_unlock(&sqd->lock);
10846                io_put_sq_data(sqd);
10847        }
10848
10849        if (copy_to_user(arg, new_count, sizeof(new_count)))
10850                return -EFAULT;
10851
10852        /* that's it for SQPOLL, only the SQPOLL task creates requests */
10853        if (sqd)
10854                return 0;
10855
10856        /* now propagate the restriction to all registered users */
10857        list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10858                struct io_uring_task *tctx = node->task->io_uring;
10859
10860                if (WARN_ON_ONCE(!tctx->io_wq))
10861                        continue;
10862
10863                for (i = 0; i < ARRAY_SIZE(new_count); i++)
10864                        new_count[i] = ctx->iowq_limits[i];
10865                /* ignore errors, it always returns zero anyway */
10866                (void)io_wq_max_workers(tctx->io_wq, new_count);
10867        }
10868        return 0;
10869err:
10870        if (sqd) {
10871                mutex_unlock(&sqd->lock);
10872                io_put_sq_data(sqd);
10873        }
10874        return ret;
10875}
10876
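/*
 * A minimal userspace sketch, assuming a raw io_uring_register(2) syscall
 * wrapper: capping the bounded ([0]) and unbounded ([1]) io-wq worker
 * pools. A zero entry leaves that limit untouched, and the previous
 * limits are copied back into the array, so passing { 0, 0 } simply
 * queries the current values.
 *
 *	__u32 counts[2] = { 4, 16 };
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_MAX_WORKERS,
 *			  counts, 2);
 */
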
10877static bool io_register_op_must_quiesce(int op)
10878{
10879        switch (op) {
10880        case IORING_REGISTER_BUFFERS:
10881        case IORING_UNREGISTER_BUFFERS:
10882        case IORING_REGISTER_FILES:
10883        case IORING_UNREGISTER_FILES:
10884        case IORING_REGISTER_FILES_UPDATE:
10885        case IORING_REGISTER_PROBE:
10886        case IORING_REGISTER_PERSONALITY:
10887        case IORING_UNREGISTER_PERSONALITY:
10888        case IORING_REGISTER_FILES2:
10889        case IORING_REGISTER_FILES_UPDATE2:
10890        case IORING_REGISTER_BUFFERS2:
10891        case IORING_REGISTER_BUFFERS_UPDATE:
10892        case IORING_REGISTER_IOWQ_AFF:
10893        case IORING_UNREGISTER_IOWQ_AFF:
10894        case IORING_REGISTER_IOWQ_MAX_WORKERS:
10895                return false;
10896        default:
10897                return true;
10898        }
10899}
10900
10901static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
10902{
10903        long ret;
10904
10905        percpu_ref_kill(&ctx->refs);
10906
10907        /*
10908         * Drop uring mutex before waiting for references to exit. If another
10909         * thread is currently inside io_uring_enter() it might need to grab the
10910         * uring_lock to make progress. If we hold it here across the drain
10911         * wait, then we can deadlock. It's safe to drop the mutex here, since
10912         * no new references will come in after we've killed the percpu ref.
10913         */
10914        mutex_unlock(&ctx->uring_lock);
10915        do {
10916                ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
10917                if (ret) {
10918                        ret = min(0L, ret);
10919                        break;
10920                }
10921
10922                ret = io_run_task_work_sig();
10923                io_req_caches_free(ctx);
10924        } while (ret >= 0);
10925        mutex_lock(&ctx->uring_lock);
10926
10927        if (ret)
10928                io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10929        return ret;
10930}
10931
10932static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10933                               void __user *arg, unsigned nr_args)
10934        __releases(ctx->uring_lock)
10935        __acquires(ctx->uring_lock)
10936{
10937        int ret;
10938
10939        /*
10940         * We're inside the ring mutex; if the ref is already dying, then
10941         * someone else has killed the ctx or is already going through
10942         * io_uring_register().
10943         */
10944        if (percpu_ref_is_dying(&ctx->refs))
10945                return -ENXIO;
10946
10947        if (ctx->restricted) {
10948                if (opcode >= IORING_REGISTER_LAST)
10949                        return -EINVAL;
10950                opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10951                if (!test_bit(opcode, ctx->restrictions.register_op))
10952                        return -EACCES;
10953        }
10954
10955        if (io_register_op_must_quiesce(opcode)) {
10956                ret = io_ctx_quiesce(ctx);
10957                if (ret)
10958                        return ret;
10959        }
10960
10961        switch (opcode) {
10962        case IORING_REGISTER_BUFFERS:
10963                ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10964                break;
10965        case IORING_UNREGISTER_BUFFERS:
10966                ret = -EINVAL;
10967                if (arg || nr_args)
10968                        break;
10969                ret = io_sqe_buffers_unregister(ctx);
10970                break;
10971        case IORING_REGISTER_FILES:
10972                ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10973                break;
10974        case IORING_UNREGISTER_FILES:
10975                ret = -EINVAL;
10976                if (arg || nr_args)
10977                        break;
10978                ret = io_sqe_files_unregister(ctx);
10979                break;
10980        case IORING_REGISTER_FILES_UPDATE:
10981                ret = io_register_files_update(ctx, arg, nr_args);
10982                break;
10983        case IORING_REGISTER_EVENTFD:
10984        case IORING_REGISTER_EVENTFD_ASYNC:
10985                ret = -EINVAL;
10986                if (nr_args != 1)
10987                        break;
10988                ret = io_eventfd_register(ctx, arg);
10989                if (ret)
10990                        break;
10991                if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10992                        ctx->eventfd_async = 1;
10993                else
10994                        ctx->eventfd_async = 0;
10995                break;
10996        case IORING_UNREGISTER_EVENTFD:
10997                ret = -EINVAL;
10998                if (arg || nr_args)
10999                        break;
11000                ret = io_eventfd_unregister(ctx);
11001                break;
11002        case IORING_REGISTER_PROBE:
11003                ret = -EINVAL;
11004                if (!arg || nr_args > 256)
11005                        break;
11006                ret = io_probe(ctx, arg, nr_args);
11007                break;
11008        case IORING_REGISTER_PERSONALITY:
11009                ret = -EINVAL;
11010                if (arg || nr_args)
11011                        break;
11012                ret = io_register_personality(ctx);
11013                break;
11014        case IORING_UNREGISTER_PERSONALITY:
11015                ret = -EINVAL;
11016                if (arg)
11017                        break;
11018                ret = io_unregister_personality(ctx, nr_args);
11019                break;
11020        case IORING_REGISTER_ENABLE_RINGS:
11021                ret = -EINVAL;
11022                if (arg || nr_args)
11023                        break;
11024                ret = io_register_enable_rings(ctx);
11025                break;
11026        case IORING_REGISTER_RESTRICTIONS:
11027                ret = io_register_restrictions(ctx, arg, nr_args);
11028                break;
11029        case IORING_REGISTER_FILES2:
11030                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11031                break;
11032        case IORING_REGISTER_FILES_UPDATE2:
11033                ret = io_register_rsrc_update(ctx, arg, nr_args,
11034                                              IORING_RSRC_FILE);
11035                break;
11036        case IORING_REGISTER_BUFFERS2:
11037                ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
11038                break;
11039        case IORING_REGISTER_BUFFERS_UPDATE:
11040                ret = io_register_rsrc_update(ctx, arg, nr_args,
11041                                              IORING_RSRC_BUFFER);
11042                break;
11043        case IORING_REGISTER_IOWQ_AFF:
11044                ret = -EINVAL;
11045                if (!arg || !nr_args)
11046                        break;
11047                ret = io_register_iowq_aff(ctx, arg, nr_args);
11048                break;
11049        case IORING_UNREGISTER_IOWQ_AFF:
11050                ret = -EINVAL;
11051                if (arg || nr_args)
11052                        break;
11053                ret = io_unregister_iowq_aff(ctx);
11054                break;
11055        case IORING_REGISTER_IOWQ_MAX_WORKERS:
11056                ret = -EINVAL;
11057                if (!arg || nr_args != 2)
11058                        break;
11059                ret = io_register_iowq_max_workers(ctx, arg);
11060                break;
11061        default:
11062                ret = -EINVAL;
11063                break;
11064        }
11065
11066        if (io_register_op_must_quiesce(opcode)) {
11067                /* bring the ctx back to life */
11068                percpu_ref_reinit(&ctx->refs);
11069                reinit_completion(&ctx->ref_comp);
11070        }
11071        return ret;
11072}
11073
11074SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11075                void __user *, arg, unsigned int, nr_args)
11076{
11077        struct io_ring_ctx *ctx;
11078        long ret = -EBADF;
11079        struct fd f;
11080
11081        f = fdget(fd);
11082        if (!f.file)
11083                return -EBADF;
11084
11085        ret = -EOPNOTSUPP;
11086        if (f.file->f_op != &io_uring_fops)
11087                goto out_fput;
11088
11089        ctx = f.file->private_data;
11090
11091        io_run_task_work();
11092
11093        mutex_lock(&ctx->uring_lock);
11094        ret = __io_uring_register(ctx, opcode, arg, nr_args);
11095        mutex_unlock(&ctx->uring_lock);
11096        trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
11097                                                        ctx->cq_ev_fd != NULL, ret);
11098out_fput:
11099        fdput(f);
11100        return ret;
11101}
11102
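/*
 * A minimal userspace sketch, assuming a raw syscall(2) invocation and
 * <sys/eventfd.h>: the io_uring_register(2) entry point above multiplexes
 * all of the opcodes handled by __io_uring_register(); registering an
 * eventfd for completion notifications, for instance, passes a pointer to
 * the fd with nr_args of 1.
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_EVENTFD,
 *		&efd, 1);
 */
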
11103static int __init io_uring_init(void)
11104{
11105#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11106        BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11107        BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11108} while (0)
11109
11110#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11111        __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11112        BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11113        BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
11114        BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
11115        BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
11116        BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
11117        BUILD_BUG_SQE_ELEM(8,  __u64,  off);
11118        BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
11119        BUILD_BUG_SQE_ELEM(16, __u64,  addr);
11120        BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
11121        BUILD_BUG_SQE_ELEM(24, __u32,  len);
11122        BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
11123        BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
11124        BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11125        BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
11126        BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
11127        BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
11128        BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
11129        BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
11130        BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
11131        BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
11132        BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
11133        BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
11134        BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
11135        BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
11136        BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
11137        BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
11138        BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
11139        BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
11140        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
11141        BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
11142        BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
11143
11144        BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11145                     sizeof(struct io_uring_rsrc_update));
11146        BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11147                     sizeof(struct io_uring_rsrc_update2));
11148
11149        /* ->buf_index is u16 */
11150        BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11151
11152        /* should fit into one byte */
11153        BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
11154        BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
11155        BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
11156
11157        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
11158        BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
11159
11160        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11161                                SLAB_ACCOUNT);
11162        return 0;
11163}
11164__initcall(io_uring_init);
11165