LXR linux/fs/io

   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Shared application/kernel submission and completion ring pairs, for
   4 * supporting fast/efficient IO.
   5 *
   6 * A note on the read/write ordering memory barriers that are matched between
   7 * the application and kernel side.
   8 *
   9 * After the application reads the CQ ring tail, it must use an
  10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
  11 * before writing the tail (using smp_load_acquire to read the tail will
  12 * do). It also needs a smp_mb() before updating CQ head (ordering the
  13 * entry load(s) with the head store), pairing with an implicit barrier
  14 * through a control-dependency in io_get_cqring (smp_store_release to
  15 * store head will do). Failure to do so could lead to reading invalid
  16 * CQ entries.
  17 *
  18 * Likewise, the application must use an appropriate smp_wmb() before
  19 * writing the SQ tail (ordering SQ entry stores with the tail store),
  20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
  21 * to store the tail will do). And it needs a barrier ordering the SQ
  22 * head load before writing new SQ entries (smp_load_acquire to read
  23 * head will do).
  24 *
  25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
  26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
  27 * updating the SQ tail; a full memory barrier smp_mb() is needed
  28 * between.
  29 *
  30 * Also see the examples in the liburing library:
  31 *
  32 *      git://git.kernel.dk/liburing
  33 *
  34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
  35 * from data shared between the kernel and application. This is done both
  36 * for ordering purposes, but also to ensure that once a value is loaded from
  37 * data that the application could potentially modify, it remains stable.
  38 *
  39 * Copyright (C) 2018-2019 Jens Axboe
  40 * Copyright (c) 2018-2019 Christoph Hellwig
  41 */
  42#include <linux/kernel.h>
  43#include <linux/init.h>
  44#include <linux/errno.h>
  45#include <linux/syscalls.h>
  46#include <linux/compat.h>
  47#include <net/compat.h>
  48#include <linux/refcount.h>
  49#include <linux/uio.h>
  50#include <linux/bits.h>
  51
  52#include <linux/sched/signal.h>
  53#include <linux/fs.h>
  54#include <linux/file.h>
  55#include <linux/fdtable.h>
  56#include <linux/mm.h>
  57#include <linux/mman.h>
  58#include <linux/percpu.h>
  59#include <linux/slab.h>
  60#include <linux/kthread.h>
  61#include <linux/blkdev.h>
  62#include <linux/bvec.h>
  63#include <linux/net.h>
  64#include <net/sock.h>
  65#include <net/af_unix.h>
  66#include <net/scm.h>
  67#include <linux/anon_inodes.h>
  68#include <linux/sched/mm.h>
  69#include <linux/uaccess.h>
  70#include <linux/nospec.h>
  71#include <linux/sizes.h>
  72#include <linux/hugetlb.h>
  73#include <linux/highmem.h>
  74#include <linux/namei.h>
  75#include <linux/fsnotify.h>
  76#include <linux/fadvise.h>
  77#include <linux/eventpoll.h>
  78#include <linux/fs_struct.h>
  79#include <linux/splice.h>
  80#include <linux/task_work.h>
  81#include <linux/pagemap.h>
  82#include <linux/io_uring.h>
  83#include <linux/blk-cgroup.h>
  84#include <linux/audit.h>
  85
  86#define CREATE_TRACE_POINTS
  87#include <trace/events/io_uring.h>
  88
  89#include <uapi/linux/io_uring.h>
  90
  91#include "internal.h"
  92#include "io-wq.h"
  93
  94#define IORING_MAX_ENTRIES      32768
  95#define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
  96
  97/*
  98 * Shift of 9 is 512 entries, or exactly one page on 64-bit archs
  99 */
 100#define IORING_FILE_TABLE_SHIFT 9
 101#define IORING_MAX_FILES_TABLE  (1U << IORING_FILE_TABLE_SHIFT)
 102#define IORING_FILE_TABLE_MASK  (IORING_MAX_FILES_TABLE - 1)
 103#define IORING_MAX_FIXED_FILES  (64 * IORING_MAX_FILES_TABLE)
 104#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
 105                                 IORING_REGISTER_LAST + IORING_OP_LAST)
 106
 107struct io_uring {
 108        u32 head ____cacheline_aligned_in_smp;
 109        u32 tail ____cacheline_aligned_in_smp;
 110};
 111
 112/*
 113 * This data is shared with the application through the mmap at offsets
 114 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
 115 *
 116 * The offsets to the member fields are published through struct
 117 * io_sqring_offsets when calling io_uring_setup.
 118 */
 119struct io_rings {
 120        /*
 121         * Head and tail offsets into the ring; the offsets need to be
 122         * masked to get valid indices.
 123         *
 124         * The kernel controls head of the sq ring and the tail of the cq ring,
 125         * and the application controls tail of the sq ring and the head of the
 126         * cq ring.
 127         */
 128        struct io_uring         sq, cq;
 129        /*
 130         * Bitmasks to apply to head and tail offsets (constant, equals
 131         * ring_entries - 1)
 132         */
 133        u32                     sq_ring_mask, cq_ring_mask;
 134        /* Ring sizes (constant, power of 2) */
 135        u32                     sq_ring_entries, cq_ring_entries;
 136        /*
 137         * Number of invalid entries dropped by the kernel due to
 138         * invalid index stored in array
 139         *
 140         * Written by the kernel, shouldn't be modified by the
 141         * application (i.e. get number of "new events" by comparing to
 142         * cached value).
 143         *
 144         * After a new SQ head value was read by the application this
 145         * counter includes all submissions that were dropped reaching
 146         * the new SQ head (and possibly more).
 147         */
 148        u32                     sq_dropped;
 149        /*
 150         * Runtime SQ flags
 151         *
 152         * Written by the kernel, shouldn't be modified by the
 153         * application.
 154         *
 155         * The application needs a full memory barrier before checking
 156         * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
 157         */
 158        u32                     sq_flags;
 159        /*
 160         * Runtime CQ flags
 161         *
 162         * Written by the application, shouldn't be modified by the
 163         * kernel.
 164         */
 165        u32                     cq_flags;
 166        /*
 167         * Number of completion events lost because the queue was full;
 168         * this should be avoided by the application by making sure
 169         * there are not more requests pending than there is space in
 170         * the completion queue.
 171         *
 172         * Written by the kernel, shouldn't be modified by the
 173         * application (i.e. get number of "new events" by comparing to
 174         * cached value).
 175         *
 176         * As completion events come in out of order this counter is not
 177         * ordered with any other data.
 178         */
 179        u32                     cq_overflow;
 180        /*
 181         * Ring buffer of completion events.
 182         *
 183         * The kernel writes completion events fresh every time they are
 184         * produced, so the application is allowed to modify pending
 185         * entries.
 186         */
 187        struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
 188};
 189
 190struct io_mapped_ubuf {
 191        u64             ubuf;
 192        size_t          len;
 193        struct          bio_vec *bvec;
 194        unsigned int    nr_bvecs;
 195        unsigned long   acct_pages;
 196};
 197
 198struct fixed_file_table {
 199        struct file             **files;
 200};
 201
 202struct fixed_file_ref_node {
 203        struct percpu_ref               refs;
 204        struct list_head                node;
 205        struct list_head                file_list;
 206        struct fixed_file_data          *file_data;
 207        struct llist_node               llist;
 208        bool                            done;
 209};
 210
 211struct fixed_file_data {
 212        struct fixed_file_table         *table;
 213        struct io_ring_ctx              *ctx;
 214
 215        struct fixed_file_ref_node      *node;
 216        struct percpu_ref               refs;
 217        struct completion               done;
 218        struct list_head                ref_list;
 219        spinlock_t                      lock;
 220};
 221
 222struct io_buffer {
 223        struct list_head list;
 224        __u64 addr;
 225        __s32 len;
 226        __u16 bid;
 227};
 228
 229struct io_restriction {
 230        DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 231        DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
 232        u8 sqe_flags_allowed;
 233        u8 sqe_flags_required;
 234        bool registered;
 235};
 236
 237struct io_sq_data {
 238        refcount_t              refs;
 239        struct mutex            lock;
 240
 241        /* ctx's that are using this sqd */
 242        struct list_head        ctx_list;
 243        struct list_head        ctx_new_list;
 244        struct mutex            ctx_lock;
 245
 246        struct task_struct      *thread;
 247        struct wait_queue_head  wait;
 248};
 249
 250struct io_ring_ctx {
 251        struct {
 252                struct percpu_ref       refs;
 253        } ____cacheline_aligned_in_smp;
 254
 255        struct {
 256                unsigned int            flags;
 257                unsigned int            compat: 1;
 258                unsigned int            limit_mem: 1;
 259                unsigned int            cq_overflow_flushed: 1;
 260                unsigned int            drain_next: 1;
 261                unsigned int            eventfd_async: 1;
 262                unsigned int            restricted: 1;
 263
 264                /*
 265                 * Ring buffer of indices into array of io_uring_sqe, which is
 266                 * mmapped by the application using the IORING_OFF_SQES offset.
 267                 *
 268                 * This indirection could e.g. be used to assign fixed
 269                 * io_uring_sqe entries to operations and only submit them to
 270                 * the queue when needed.
 271                 *
 272                 * The kernel modifies neither the indices array nor the entries
 273                 * array.
 274                 */
 275                u32                     *sq_array;
 276                unsigned                cached_sq_head;
 277                unsigned                sq_entries;
 278                unsigned                sq_mask;
 279                unsigned                sq_thread_idle;
 280                unsigned                cached_sq_dropped;
 281                unsigned                cached_cq_overflow;
 282                unsigned long           sq_check_overflow;
 283
 284                struct list_head        defer_list;
 285                struct list_head        timeout_list;
 286                struct list_head        cq_overflow_list;
 287
 288                wait_queue_head_t       inflight_wait;
 289                struct io_uring_sqe     *sq_sqes;
 290        } ____cacheline_aligned_in_smp;
 291
 292        struct io_rings *rings;
 293
 294        /* IO offload */
 295        struct io_wq            *io_wq;
 296
 297        /*
 298         * For SQPOLL usage - we hold a reference to the parent task, so we
 299         * have access to the ->files
 300         */
 301        struct task_struct      *sqo_task;
 302
 303        /* Only used for accounting purposes */
 304        struct mm_struct        *mm_account;
 305
 306#ifdef CONFIG_BLK_CGROUP
 307        struct cgroup_subsys_state      *sqo_blkcg_css;
 308#endif
 309
 310        struct io_sq_data       *sq_data;       /* if using sq thread polling */
 311
 312        struct wait_queue_head  sqo_sq_wait;
 313        struct wait_queue_entry sqo_wait_entry;
 314        struct list_head        sqd_list;
 315
 316        /*
 317         * If used, fixed file set. Writers must ensure that ->refs is dead,
 318         * readers must ensure that ->refs is alive as long as the file* is
 319         * used. Only updated through io_uring_register(2).
 320         */
 321        struct fixed_file_data  *file_data;
 322        unsigned                nr_user_files;
 323
 324        /* if used, fixed mapped user buffers */
 325        unsigned                nr_user_bufs;
 326        struct io_mapped_ubuf   *user_bufs;
 327
 328        struct user_struct      *user;
 329
 330        const struct cred       *creds;
 331
 332#ifdef CONFIG_AUDIT
 333        kuid_t                  loginuid;
 334        unsigned int            sessionid;
 335#endif
 336
 337        struct completion       ref_comp;
 338        struct completion       sq_thread_comp;
 339
 340        /* if all else fails... */
 341        struct io_kiocb         *fallback_req;
 342
 343#if defined(CONFIG_UNIX)
 344        struct socket           *ring_sock;
 345#endif
 346
 347        struct idr              io_buffer_idr;
 348
 349        struct idr              personality_idr;
 350
 351        struct {
 352                unsigned                cached_cq_tail;
 353                unsigned                cq_entries;
 354                unsigned                cq_mask;
 355                atomic_t                cq_timeouts;
 356                unsigned long           cq_check_overflow;
 357                struct wait_queue_head  cq_wait;
 358                struct fasync_struct    *cq_fasync;
 359                struct eventfd_ctx      *cq_ev_fd;
 360        } ____cacheline_aligned_in_smp;
 361
 362        struct {
 363                struct mutex            uring_lock;
 364                wait_queue_head_t       wait;
 365        } ____cacheline_aligned_in_smp;
 366
 367        struct {
 368                spinlock_t              completion_lock;
 369
 370                /*
 371                 * ->iopoll_list is protected by the ctx->uring_lock for
 372                 * io_uring instances that don't use IORING_SETUP_SQPOLL.
 373                 * For SQPOLL, only the single threaded io_sq_thread() will
 374                 * manipulate the list, hence no extra locking is needed there.
 375                 */
 376                struct list_head        iopoll_list;
 377                struct hlist_head       *cancel_hash;
 378                unsigned                cancel_hash_bits;
 379                bool                    poll_multi_file;
 380
 381                spinlock_t              inflight_lock;
 382                struct list_head        inflight_list;
 383        } ____cacheline_aligned_in_smp;
 384
 385        struct delayed_work             file_put_work;
 386        struct llist_head               file_put_llist;
 387
 388        struct work_struct              exit_work;
 389        struct io_restriction           restrictions;
 390};
 391
 392/*
 393 * First field must be the file pointer in all the
 394 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 395 */
 396struct io_poll_iocb {
 397        struct file                     *file;
 398        union {
 399                struct wait_queue_head  *head;
 400                u64                     addr;
 401        };
 402        __poll_t                        events;
 403        bool                            done;
 404        bool                            canceled;
 405        struct wait_queue_entry         wait;
 406};
 407
 408struct io_close {
 409        struct file                     *file;
 410        struct file                     *put_file;
 411        int                             fd;
 412};
 413
 414struct io_timeout_data {
 415        struct io_kiocb                 *req;
 416        struct hrtimer                  timer;
 417        struct timespec64               ts;
 418        enum hrtimer_mode               mode;
 419};
 420
 421struct io_accept {
 422        struct file                     *file;
 423        struct sockaddr __user          *addr;
 424        int __user                      *addr_len;
 425        int                             flags;
 426        unsigned long                   nofile;
 427};
 428
 429struct io_sync {
 430        struct file                     *file;
 431        loff_t                          len;
 432        loff_t                          off;
 433        int                             flags;
 434        int                             mode;
 435};
 436
 437struct io_cancel {
 438        struct file                     *file;
 439        u64                             addr;
 440};
 441
 442struct io_timeout {
 443        struct file                     *file;
 444        u32                             off;
 445        u32                             target_seq;
 446        struct list_head                list;
 447};
 448
 449struct io_timeout_rem {
 450        struct file                     *file;
 451        u64                             addr;
 452};
 453
 454struct io_rw {
 455        /* NOTE: kiocb has the file as the first member, so don't do it here */
 456        struct kiocb                    kiocb;
 457        u64                             addr;
 458        u64                             len;
 459};
 460
 461struct io_connect {
 462        struct file                     *file;
 463        struct sockaddr __user          *addr;
 464        int                             addr_len;
 465};
 466
 467struct io_sr_msg {
 468        struct file                     *file;
 469        union {
 470                struct user_msghdr __user *umsg;
 471                void __user             *buf;
 472        };
 473        int                             msg_flags;
 474        int                             bgid;
 475        size_t                          len;
 476        struct io_buffer                *kbuf;
 477};
 478
 479struct io_open {
 480        struct file                     *file;
 481        int                             dfd;
 482        bool                            ignore_nonblock;
 483        struct filename                 *filename;
 484        struct open_how                 how;
 485        unsigned long                   nofile;
 486};
 487
 488struct io_files_update {
 489        struct file                     *file;
 490        u64                             arg;
 491        u32                             nr_args;
 492        u32                             offset;
 493};
 494
 495struct io_fadvise {
 496        struct file                     *file;
 497        u64                             offset;
 498        u32                             len;
 499        u32                             advice;
 500};
 501
 502struct io_madvise {
 503        struct file                     *file;
 504        u64                             addr;
 505        u32                             len;
 506        u32                             advice;
 507};
 508
 509struct io_epoll {
 510        struct file                     *file;
 511        int                             epfd;
 512        int                             op;
 513        int                             fd;
 514        struct epoll_event              event;
 515};
 516
 517struct io_splice {
 518        struct file                     *file_out;
 519        struct file                     *file_in;
 520        loff_t                          off_out;
 521        loff_t                          off_in;
 522        u64                             len;
 523        unsigned int                    flags;
 524};
 525
 526struct io_provide_buf {
 527        struct file                     *file;
 528        __u64                           addr;
 529        __s32                           len;
 530        __u32                           bgid;
 531        __u16                           nbufs;
 532        __u16                           bid;
 533};
 534
 535struct io_statx {
 536        struct file                     *file;
 537        int                             dfd;
 538        unsigned int                    mask;
 539        unsigned int                    flags;
 540        const char __user               *filename;
 541        struct statx __user             *buffer;
 542};
 543
 544struct io_completion {
 545        struct file                     *file;
 546        struct list_head                list;
 547        int                             cflags;
 548};
 549
 550struct io_async_connect {
 551        struct sockaddr_storage         address;
 552};
 553
 554struct io_async_msghdr {
 555        struct iovec                    fast_iov[UIO_FASTIOV];
 556        struct iovec                    *iov;
 557        struct sockaddr __user          *uaddr;
 558        struct msghdr                   msg;
 559        struct sockaddr_storage         addr;
 560};
 561
 562struct io_async_rw {
 563        struct iovec                    fast_iov[UIO_FASTIOV];
 564        const struct iovec              *free_iovec;
 565        struct iov_iter                 iter;
 566        size_t                          bytes_done;
 567        struct wait_page_queue          wpq;
 568};
 569
 570enum {
 571        REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
 572        REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
 573        REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
 574        REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
 575        REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
 576        REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
 577
 578        REQ_F_LINK_HEAD_BIT,
 579        REQ_F_FAIL_LINK_BIT,
 580        REQ_F_INFLIGHT_BIT,
 581        REQ_F_CUR_POS_BIT,
 582        REQ_F_NOWAIT_BIT,
 583        REQ_F_LINK_TIMEOUT_BIT,
 584        REQ_F_ISREG_BIT,
 585        REQ_F_NEED_CLEANUP_BIT,
 586        REQ_F_POLLED_BIT,
 587        REQ_F_BUFFER_SELECTED_BIT,
 588        REQ_F_NO_FILE_TABLE_BIT,
 589        REQ_F_WORK_INITIALIZED_BIT,
 590        REQ_F_LTIMEOUT_ACTIVE_BIT,
 591
 592        /* not a real bit, just to check we're not overflowing the space */
 593        __REQ_F_LAST_BIT,
 594};
 595
 596enum {
 597        /* ctx owns file */
 598        REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
 599        /* drain existing IO first */
 600        REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
 601        /* linked sqes */
 602        REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
 603        /* doesn't sever on completion < 0 */
 604        REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
 605        /* IOSQE_ASYNC */
 606        REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
 607        /* IOSQE_BUFFER_SELECT */
 608        REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
 609
 610        /* head of a link */
 611        REQ_F_LINK_HEAD         = BIT(REQ_F_LINK_HEAD_BIT),
 612        /* fail rest of links */
 613        REQ_F_FAIL_LINK         = BIT(REQ_F_FAIL_LINK_BIT),
 614        /* on inflight list */
 615        REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
 616        /* read/write uses file position */
 617        REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
 618        /* must not punt to workers */
 619        REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
 620        /* has or had linked timeout */
 621        REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
 622        /* regular file */
 623        REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
 624        /* needs cleanup */
 625        REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
 626        /* already went through poll handler */
 627        REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
 628        /* buffer already selected */
 629        REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
 630        /* doesn't need file table for this request */
 631        REQ_F_NO_FILE_TABLE     = BIT(REQ_F_NO_FILE_TABLE_BIT),
 632        /* io_wq_work is initialized */
 633        REQ_F_WORK_INITIALIZED  = BIT(REQ_F_WORK_INITIALIZED_BIT),
 634        /* linked timeout is active, i.e. prepared by link's head */
 635        REQ_F_LTIMEOUT_ACTIVE   = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT),
 636};
 637
 638struct async_poll {
 639        struct io_poll_iocb     poll;
 640        struct io_poll_iocb     *double_poll;
 641};
 642
 643/*
 644 * NOTE! Each of the iocb union members has the file pointer
 645 * as the first entry in their struct definition. So you can
 646 * access the file pointer through any of the sub-structs,
 647 * or directly as just 'ki_filp' in this struct.
 648 */
 649struct io_kiocb {
 650        union {
 651                struct file             *file;
 652                struct io_rw            rw;
 653                struct io_poll_iocb     poll;
 654                struct io_accept        accept;
 655                struct io_sync          sync;
 656                struct io_cancel        cancel;
 657                struct io_timeout       timeout;
 658                struct io_timeout_rem   timeout_rem;
 659                struct io_connect       connect;
 660                struct io_sr_msg        sr_msg;
 661                struct io_open          open;
 662                struct io_close         close;
 663                struct io_files_update  files_update;
 664                struct io_fadvise       fadvise;
 665                struct io_madvise       madvise;
 666                struct io_epoll         epoll;
 667                struct io_splice        splice;
 668                struct io_provide_buf   pbuf;
 669                struct io_statx         statx;
 670                /* use only after cleaning per-op data, see io_clean_op() */
 671                struct io_completion    compl;
 672        };
 673
 674        /* opcode allocated if it needs to store data for async defer */
 675        void                            *async_data;
 676        u8                              opcode;
 677        /* polled IO has completed */
 678        u8                              iopoll_completed;
 679
 680        u16                             buf_index;
 681        u32                             result;
 682
 683        struct io_ring_ctx              *ctx;
 684        unsigned int                    flags;
 685        refcount_t                      refs;
 686        struct task_struct              *task;
 687        u64                             user_data;
 688
 689        struct list_head                link_list;
 690
 691        /*
 692         * 1. used with ctx->iopoll_list with reads/writes
 693         * 2. to track reqs with ->files (see io_op_def::file_table)
 694         */
 695        struct list_head                inflight_entry;
 696
 697        struct percpu_ref               *fixed_file_refs;
 698        struct callback_head            task_work;
 699        /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
 700        struct hlist_node               hash_node;
 701        struct async_poll               *apoll;
 702        struct io_wq_work               work;
 703};
 704
 705struct io_defer_entry {
 706        struct list_head        list;
 707        struct io_kiocb         *req;
 708        u32                     seq;
 709};
 710
 711#define IO_IOPOLL_BATCH                 8
 712
 713struct io_comp_state {
 714        unsigned int            nr;
 715        struct list_head        list;
 716        struct io_ring_ctx      *ctx;
 717};
 718
 719struct io_submit_state {
 720        struct blk_plug         plug;
 721
 722        /*
 723         * io_kiocb alloc cache
 724         */
 725        void                    *reqs[IO_IOPOLL_BATCH];
 726        unsigned int            free_reqs;
 727
 728        /*
 729         * Batch completion logic
 730         */
 731        struct io_comp_state    comp;
 732
 733        /*
 734         * File reference cache
 735         */
 736        struct file             *file;
 737        unsigned int            fd;
 738        unsigned int            has_refs;
 739        unsigned int            ios_left;
 740};
 741
 742struct io_op_def {
 743        /* needs req->file assigned */
 744        unsigned                needs_file : 1;
 745        /* don't fail if file grab fails */
 746        unsigned                needs_file_no_error : 1;
 747        /* hash wq insertion if file is a regular file */
 748        unsigned                hash_reg_file : 1;
 749        /* unbound wq insertion if file is a non-regular file */
 750        unsigned                unbound_nonreg_file : 1;
 751        /* opcode is not supported by this kernel */
 752        unsigned                not_supported : 1;
 753        /* set if opcode supports polled "wait" */
 754        unsigned                pollin : 1;
 755        unsigned                pollout : 1;
 756        /* op supports buffer selection */
 757        unsigned                buffer_select : 1;
 758        /* must always have async data allocated */
 759        unsigned                needs_async_data : 1;
 760        /* size of async data needed, if any */
 761        unsigned short          async_size;
 762        unsigned                work_flags;
 763};
 764
 765static const struct io_op_def io_op_defs[] = {
 766        [IORING_OP_NOP] = {},
 767        [IORING_OP_READV] = {
 768                .needs_file             = 1,
 769                .unbound_nonreg_file    = 1,
 770                .pollin                 = 1,
 771                .buffer_select          = 1,
 772                .needs_async_data       = 1,
 773                .async_size             = sizeof(struct io_async_rw),
 774                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 775        },
 776        [IORING_OP_WRITEV] = {
 777                .needs_file             = 1,
 778                .hash_reg_file          = 1,
 779                .unbound_nonreg_file    = 1,
 780                .pollout                = 1,
 781                .needs_async_data       = 1,
 782                .async_size             = sizeof(struct io_async_rw),
 783                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 784                                                IO_WQ_WORK_FSIZE,
 785        },
 786        [IORING_OP_FSYNC] = {
 787                .needs_file             = 1,
 788                .work_flags             = IO_WQ_WORK_BLKCG,
 789        },
 790        [IORING_OP_READ_FIXED] = {
 791                .needs_file             = 1,
 792                .unbound_nonreg_file    = 1,
 793                .pollin                 = 1,
 794                .async_size             = sizeof(struct io_async_rw),
 795                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
 796        },
 797        [IORING_OP_WRITE_FIXED] = {
 798                .needs_file             = 1,
 799                .hash_reg_file          = 1,
 800                .unbound_nonreg_file    = 1,
 801                .pollout                = 1,
 802                .async_size             = sizeof(struct io_async_rw),
 803                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
 804                                                IO_WQ_WORK_MM,
 805        },
 806        [IORING_OP_POLL_ADD] = {
 807                .needs_file             = 1,
 808                .unbound_nonreg_file    = 1,
 809        },
 810        [IORING_OP_POLL_REMOVE] = {},
 811        [IORING_OP_SYNC_FILE_RANGE] = {
 812                .needs_file             = 1,
 813                .work_flags             = IO_WQ_WORK_BLKCG,
 814        },
 815        [IORING_OP_SENDMSG] = {
 816                .needs_file             = 1,
 817                .unbound_nonreg_file    = 1,
 818                .pollout                = 1,
 819                .needs_async_data       = 1,
 820                .async_size             = sizeof(struct io_async_msghdr),
 821                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 822                                                IO_WQ_WORK_FS,
 823        },
 824        [IORING_OP_RECVMSG] = {
 825                .needs_file             = 1,
 826                .unbound_nonreg_file    = 1,
 827                .pollin                 = 1,
 828                .buffer_select          = 1,
 829                .needs_async_data       = 1,
 830                .async_size             = sizeof(struct io_async_msghdr),
 831                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 832                                                IO_WQ_WORK_FS,
 833        },
 834        [IORING_OP_TIMEOUT] = {
 835                .needs_async_data       = 1,
 836                .async_size             = sizeof(struct io_timeout_data),
 837                .work_flags             = IO_WQ_WORK_MM,
 838        },
 839        [IORING_OP_TIMEOUT_REMOVE] = {},
 840        [IORING_OP_ACCEPT] = {
 841                .needs_file             = 1,
 842                .unbound_nonreg_file    = 1,
 843                .pollin                 = 1,
 844                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
 845        },
 846        [IORING_OP_ASYNC_CANCEL] = {},
 847        [IORING_OP_LINK_TIMEOUT] = {
 848                .needs_async_data       = 1,
 849                .async_size             = sizeof(struct io_timeout_data),
 850                .work_flags             = IO_WQ_WORK_MM,
 851        },
 852        [IORING_OP_CONNECT] = {
 853                .needs_file             = 1,
 854                .unbound_nonreg_file    = 1,
 855                .pollout                = 1,
 856                .needs_async_data       = 1,
 857                .async_size             = sizeof(struct io_async_connect),
 858                .work_flags             = IO_WQ_WORK_MM,
 859        },
 860        [IORING_OP_FALLOCATE] = {
 861                .needs_file             = 1,
 862                .work_flags             = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE,
 863        },
 864        [IORING_OP_OPENAT] = {
 865                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG |
 866                                                IO_WQ_WORK_FS,
 867        },
 868        [IORING_OP_CLOSE] = {
 869                .needs_file             = 1,
 870                .needs_file_no_error    = 1,
 871                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG,
 872        },
 873        [IORING_OP_FILES_UPDATE] = {
 874                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM,
 875        },
 876        [IORING_OP_STATX] = {
 877                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_MM |
 878                                                IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG,
 879        },
 880        [IORING_OP_READ] = {
 881                .needs_file             = 1,
 882                .unbound_nonreg_file    = 1,
 883                .pollin                 = 1,
 884                .buffer_select          = 1,
 885                .async_size             = sizeof(struct io_async_rw),
 886                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 887        },
 888        [IORING_OP_WRITE] = {
 889                .needs_file             = 1,
 890                .unbound_nonreg_file    = 1,
 891                .pollout                = 1,
 892                .async_size             = sizeof(struct io_async_rw),
 893                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
 894                                                IO_WQ_WORK_FSIZE,
 895        },
 896        [IORING_OP_FADVISE] = {
 897                .needs_file             = 1,
 898                .work_flags             = IO_WQ_WORK_BLKCG,
 899        },
 900        [IORING_OP_MADVISE] = {
 901                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 902        },
 903        [IORING_OP_SEND] = {
 904                .needs_file             = 1,
 905                .unbound_nonreg_file    = 1,
 906                .pollout                = 1,
 907                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 908        },
 909        [IORING_OP_RECV] = {
 910                .needs_file             = 1,
 911                .unbound_nonreg_file    = 1,
 912                .pollin                 = 1,
 913                .buffer_select          = 1,
 914                .work_flags             = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
 915        },
 916        [IORING_OP_OPENAT2] = {
 917                .work_flags             = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
 918                                                IO_WQ_WORK_BLKCG,
 919        },
 920        [IORING_OP_EPOLL_CTL] = {
 921                .unbound_nonreg_file    = 1,
 922                .work_flags             = IO_WQ_WORK_FILES,
 923        },
 924        [IORING_OP_SPLICE] = {
 925                .needs_file             = 1,
 926                .hash_reg_file          = 1,
 927                .unbound_nonreg_file    = 1,
 928                .work_flags             = IO_WQ_WORK_BLKCG,
 929        },
 930        [IORING_OP_PROVIDE_BUFFERS] = {},
 931        [IORING_OP_REMOVE_BUFFERS] = {},
 932        [IORING_OP_TEE] = {
 933                .needs_file             = 1,
 934                .hash_reg_file          = 1,
 935                .unbound_nonreg_file    = 1,
 936        },
 937};
 938
 939enum io_mem_account {
 940        ACCT_LOCKED,
 941        ACCT_PINNED,
 942};
 943
 944static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 945                             struct io_comp_state *cs);
 946static void io_cqring_fill_event(struct io_kiocb *req, long res);
 947static void io_put_req(struct io_kiocb *req);
 948static void io_put_req_deferred(struct io_kiocb *req, int nr);
 949static void io_double_put_req(struct io_kiocb *req);
 950static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 951static void __io_queue_linked_timeout(struct io_kiocb *req);
 952static void io_queue_linked_timeout(struct io_kiocb *req);
 953static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 954                                 struct io_uring_files_update *ip,
 955                                 unsigned nr_args);
 956static void __io_clean_op(struct io_kiocb *req);
 957static struct file *io_file_get(struct io_submit_state *state,
 958                                struct io_kiocb *req, int fd, bool fixed);
 959static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs);
 960static void io_file_put_work(struct work_struct *work);
 961
 962static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 963                               struct iovec **iovec, struct iov_iter *iter,
 964                               bool needs_lock);
 965static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
 966                             const struct iovec *fast_iov,
 967                             struct iov_iter *iter, bool force);
 968
 969static struct kmem_cache *req_cachep;
 970
 971static const struct file_operations io_uring_fops;
 972
 973struct sock *io_uring_get_socket(struct file *file)
 974{
 975#if defined(CONFIG_UNIX)
 976        if (file->f_op == &io_uring_fops) {
 977                struct io_ring_ctx *ctx = file->private_data;
 978
 979                return ctx->ring_sock->sk;
 980        }
 981#endif
 982        return NULL;
 983}
 984EXPORT_SYMBOL(io_uring_get_socket);
 985
 986static inline void io_clean_op(struct io_kiocb *req)
 987{
 988        if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED |
 989                          REQ_F_INFLIGHT))
 990                __io_clean_op(req);
 991}
 992
 993static void io_sq_thread_drop_mm(void)
 994{
 995        struct mm_struct *mm = current->mm;
 996
 997        if (mm) {
 998                kthread_unuse_mm(mm);
 999                mmput(mm);
1000                current->mm = NULL;

1001        }
1002}
1003
1004static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
1005{
1006        struct mm_struct *mm;
1007
1008        if (current->mm)
1009                return 0;
1010
1011        /* Should never happen */
1012        if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL)))
1013                return -EFAULT;
1014
1015        task_lock(ctx->sqo_task);
1016        mm = ctx->sqo_task->mm;
1017        if (unlikely(!mm || !mmget_not_zero(mm)))
1018                mm = NULL;
1019        task_unlock(ctx->sqo_task);
1020
1021        if (mm) {
1022                kthread_use_mm(mm);
1023                return 0;
1024        }
1025
1026        return -EFAULT;
1027}
1028
1029static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
1030                                   struct io_kiocb *req)
1031{
1032        if (!(io_op_defs[req->opcode].work_flags & IO_WQ_WORK_MM))
1033                return 0;
1034        return __io_sq_thread_acquire_mm(ctx);
1035}
1036
1037static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx,
1038                                         struct cgroup_subsys_state **cur_css)
1039
1040{
1041#ifdef CONFIG_BLK_CGROUP
1042        /* puts the old one when swapping */
1043        if (*cur_css != ctx->sqo_blkcg_css) {
1044                kthread_associate_blkcg(ctx->sqo_blkcg_css);
1045                *cur_css = ctx->sqo_blkcg_css;
1046        }
1047#endif
1048}
1049
1050static void io_sq_thread_unassociate_blkcg(void)
1051{
1052#ifdef CONFIG_BLK_CGROUP
1053        kthread_associate_blkcg(NULL);
1054#endif
1055}
1056
1057static inline void req_set_fail_links(struct io_kiocb *req)
1058{
1059        if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
1060                req->flags |= REQ_F_FAIL_LINK;
1061}
1062
1063/*
1064 * None of these are dereferenced, they are simply used to check if any of
1065 * them have changed. If we're under current and check they are still the
1066 * same, we're fine to grab references to them for actual out-of-line use.
1067 */
1068static void io_init_identity(struct io_identity *id)
1069{
1070        id->files = current->files;
1071        id->mm = current->mm;
1072#ifdef CONFIG_BLK_CGROUP
1073        rcu_read_lock();
1074        id->blkcg_css = blkcg_css();
1075        rcu_read_unlock();
1076#endif
1077        id->creds = current_cred();
1078        id->nsproxy = current->nsproxy;
1079        id->fs = current->fs;
1080        id->fsize = rlimit(RLIMIT_FSIZE);
1081#ifdef CONFIG_AUDIT
1082        id->loginuid = current->loginuid;
1083        id->sessionid = current->sessionid;
1084#endif
1085        refcount_set(&id->count, 1);
1086}
1087
1088static inline void __io_req_init_async(struct io_kiocb *req)
1089{
1090        memset(&req->work, 0, sizeof(req->work));
1091        req->flags |= REQ_F_WORK_INITIALIZED;
1092}
1093
1094/*
1095 * Note: must call io_req_init_async() for the first time you
1096 * touch any members of io_wq_work.
1097 */
1098static inline void io_req_init_async(struct io_kiocb *req)
1099{
1100        struct io_uring_task *tctx = current->io_uring;
1101
1102        if (req->flags & REQ_F_WORK_INITIALIZED)
1103                return;
1104
1105        __io_req_init_async(req);
1106
1107        /* Grab a ref if this isn't our static identity */
1108        req->work.identity = tctx->identity;
1109        if (tctx->identity != &tctx->__identity)
1110                refcount_inc(&req->work.identity->count);
1111}
1112
1113static inline bool io_async_submit(struct io_ring_ctx *ctx)
1114{
1115        return ctx->flags & IORING_SETUP_SQPOLL;
1116}
1117
1118static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1119{
1120        struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1121
1122        complete(&ctx->ref_comp);
1123}
1124
1125static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1126{
1127        return !req->timeout.off;
1128}
1129
1130static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1131{
1132        struct io_ring_ctx *ctx;
1133        int hash_bits;
1134
1135        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1136        if (!ctx)
1137                return NULL;
1138
1139        ctx->fallback_req = kmem_cache_alloc(req_cachep, GFP_KERNEL);
1140        if (!ctx->fallback_req)
1141                goto err;
1142
1143        /*
1144         * Use 5 bits less than the max cq entries, that should give us around
1145         * 32 entries per hash list if totally full and uniformly spread.
1146         */
1147        hash_bits = ilog2(p->cq_entries);
1148        hash_bits -= 5;
1149        if (hash_bits <= 0)
1150                hash_bits = 1;
1151        ctx->cancel_hash_bits = hash_bits;
1152        ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1153                                        GFP_KERNEL);
1154        if (!ctx->cancel_hash)
1155                goto err;
1156        __hash_init(ctx->cancel_hash, 1U << hash_bits);
1157
1158        if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1159                            PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1160                goto err;
1161
1162        ctx->flags = p->flags;
1163        init_waitqueue_head(&ctx->sqo_sq_wait);
1164        INIT_LIST_HEAD(&ctx->sqd_list);
1165        init_waitqueue_head(&ctx->cq_wait);
1166        INIT_LIST_HEAD(&ctx->cq_overflow_list);
1167        init_completion(&ctx->ref_comp);
1168        init_completion(&ctx->sq_thread_comp);
1169        idr_init(&ctx->io_buffer_idr);
1170        idr_init(&ctx->personality_idr);
1171        mutex_init(&ctx->uring_lock);
1172        init_waitqueue_head(&ctx->wait);
1173        spin_lock_init(&ctx->completion_lock);
1174        INIT_LIST_HEAD(&ctx->iopoll_list);
1175        INIT_LIST_HEAD(&ctx->defer_list);
1176        INIT_LIST_HEAD(&ctx->timeout_list);
1177        init_waitqueue_head(&ctx->inflight_wait);
1178        spin_lock_init(&ctx->inflight_lock);
1179        INIT_LIST_HEAD(&ctx->inflight_list);
1180        INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work);
1181        init_llist_head(&ctx->file_put_llist);
1182        return ctx;
1183err:
1184        if (ctx->fallback_req)
1185                kmem_cache_free(req_cachep, ctx->fallback_req);
1186        kfree(ctx->cancel_hash);
1187        kfree(ctx);
1188        return NULL;
1189}
1190
1191static bool req_need_defer(struct io_kiocb *req, u32 seq)
1192{
1193        if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1194                struct io_ring_ctx *ctx = req->ctx;
1195
1196                return seq != ctx->cached_cq_tail
1197                                + READ_ONCE(ctx->cached_cq_overflow);
1198        }
1199
1200        return false;
1201}
1202
1203static void __io_commit_cqring(struct io_ring_ctx *ctx)
1204{
1205        struct io_rings *rings = ctx->rings;
1206
1207        /* order cqe stores with ring update */
1208        smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
1209
1210        if (wq_has_sleeper(&ctx->cq_wait)) {
1211                wake_up_interruptible(&ctx->cq_wait);
1212                kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
1213        }
1214}
1215
1216static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req)
1217{
1218        if (req->work.identity == &tctx->__identity)
1219                return;
1220        if (refcount_dec_and_test(&req->work.identity->count))
1221                kfree(req->work.identity);
1222}
1223
1224static void io_req_clean_work(struct io_kiocb *req)
1225{
1226        if (!(req->flags & REQ_F_WORK_INITIALIZED))
1227                return;
1228
1229        req->flags &= ~REQ_F_WORK_INITIALIZED;
1230
1231        if (req->work.flags & IO_WQ_WORK_MM) {
1232                mmdrop(req->work.identity->mm);
1233                req->work.flags &= ~IO_WQ_WORK_MM;
1234        }
1235#ifdef CONFIG_BLK_CGROUP
1236        if (req->work.flags & IO_WQ_WORK_BLKCG) {
1237                css_put(req->work.identity->blkcg_css);
1238                req->work.flags &= ~IO_WQ_WORK_BLKCG;
1239        }
1240#endif
1241        if (req->work.flags & IO_WQ_WORK_CREDS) {
1242                put_cred(req->work.identity->creds);
1243                req->work.flags &= ~IO_WQ_WORK_CREDS;
1244        }
1245        if (req->work.flags & IO_WQ_WORK_FS) {
1246                struct fs_struct *fs = req->work.identity->fs;
1247
1248                spin_lock(&req->work.identity->fs->lock);
1249                if (--fs->users)
1250                        fs = NULL;
1251                spin_unlock(&req->work.identity->fs->lock);
1252                if (fs)
1253                        free_fs_struct(fs);
1254                req->work.flags &= ~IO_WQ_WORK_FS;
1255        }
1256
1257        io_put_identity(req->task->io_uring, req);
1258}
1259
1260/*
1261 * Create a private copy of io_identity, since some fields don't match
1262 * the current context.
1263 */
1264static bool io_identity_cow(struct io_kiocb *req)
1265{
1266        struct io_uring_task *tctx = current->io_uring;
1267        const struct cred *creds = NULL;
1268        struct io_identity *id;
1269
1270        if (req->work.flags & IO_WQ_WORK_CREDS)
1271                creds = req->work.identity->creds;
1272
1273        id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL);
1274        if (unlikely(!id)) {
1275                req->work.flags |= IO_WQ_WORK_CANCEL;
1276                return false;
1277        }
1278
1279        /*
1280         * We can safely just re-init the creds we copied  Either the field
1281         * matches the current one, or we haven't grabbed it yet. The only
1282         * exception is ->creds, through registered personalities, so handle
1283         * that one separately.
1284         */
1285        io_init_identity(id);
1286        if (creds)
1287                id->creds = creds;
1288
1289        /* add one for this request */
1290        refcount_inc(&id->count);
1291
1292        /* drop tctx and req identity references, if needed */
1293        if (tctx->identity != &tctx->__identity &&
1294            refcount_dec_and_test(&tctx->identity->count))
1295                kfree(tctx->identity);
1296        if (req->work.identity != &tctx->__identity &&
1297            refcount_dec_and_test(&req->work.identity->count))
1298                kfree(req->work.identity);
1299
1300        req->work.identity = id;
1301        tctx->identity = id;
1302        return true;
1303}
1304
1305static bool io_grab_identity(struct io_kiocb *req)
1306{
1307        const struct io_op_def *def = &io_op_defs[req->opcode];
1308        struct io_identity *id = req->work.identity;
1309        struct io_ring_ctx *ctx = req->ctx;
1310
1311        if (def->work_flags & IO_WQ_WORK_FSIZE) {
1312                if (id->fsize != rlimit(RLIMIT_FSIZE))
1313                        return false;
1314                req->work.flags |= IO_WQ_WORK_FSIZE;
1315        }
1316#ifdef CONFIG_BLK_CGROUP
1317        if (!(req->work.flags & IO_WQ_WORK_BLKCG) &&
1318            (def->work_flags & IO_WQ_WORK_BLKCG)) {
1319                rcu_read_lock();
1320                if (id->blkcg_css != blkcg_css()) {
1321                        rcu_read_unlock();
1322                        return false;
1323                }
1324                /*
1325                 * This should be rare, either the cgroup is dying or the task
1326                 * is moving cgroups. Just punt to root for the handful of ios.
1327                 */
1328                if (css_tryget_online(id->blkcg_css))
1329                        req->work.flags |= IO_WQ_WORK_BLKCG;
1330                rcu_read_unlock();
1331        }
1332#endif
1333        if (!(req->work.flags & IO_WQ_WORK_CREDS)) {
1334                if (id->creds != current_cred())
1335                        return false;
1336                get_cred(id->creds);
1337                req->work.flags |= IO_WQ_WORK_CREDS;
1338        }
1339#ifdef CONFIG_AUDIT
1340        if (!uid_eq(current->loginuid, id->loginuid) ||
1341            current->sessionid != id->sessionid)
1342                return false;
1343#endif
1344        if (!(req->work.flags & IO_WQ_WORK_FS) &&
1345            (def->work_flags & IO_WQ_WORK_FS)) {
1346                if (current->fs != id->fs)
1347                        return false;
1348                spin_lock(&id->fs->lock);
1349                if (!id->fs->in_exec) {
1350                        id->fs->users++;
1351                        req->work.flags |= IO_WQ_WORK_FS;
1352                } else {
1353                        req->work.flags |= IO_WQ_WORK_CANCEL;
1354                }
1355                spin_unlock(&current->fs->lock);
1356        }
1357        if (!(req->work.flags & IO_WQ_WORK_FILES) &&
1358            (def->work_flags & IO_WQ_WORK_FILES) &&
1359            !(req->flags & REQ_F_NO_FILE_TABLE)) {
1360                if (id->files != current->files ||
1361                    id->nsproxy != current->nsproxy)
1362                        return false;
1363                atomic_inc(&id->files->count);
1364                get_nsproxy(id->nsproxy);
1365                req->flags |= REQ_F_INFLIGHT;
1366
1367                spin_lock_irq(&ctx->inflight_lock);
1368                list_add(&req->inflight_entry, &ctx->inflight_list);
1369                spin_unlock_irq(&ctx->inflight_lock);
1370                req->work.flags |= IO_WQ_WORK_FILES;
1371        }
1372
1373        return true;
1374}
1375
1376static void io_prep_async_work(struct io_kiocb *req)
1377{
1378        const struct io_op_def *def = &io_op_defs[req->opcode];
1379        struct io_ring_ctx *ctx = req->ctx;
1380        struct io_identity *id;
1381
1382        io_req_init_async(req);
1383        id = req->work.identity;
1384
1385        if (req->flags & REQ_F_FORCE_ASYNC)
1386                req->work.flags |= IO_WQ_WORK_CONCURRENT;
1387
1388        if (req->flags & REQ_F_ISREG) {
1389                if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1390                        io_wq_hash_work(&req->work, file_inode(req->file));
1391        } else {
1392                if (def->unbound_nonreg_file)
1393                        req->work.flags |= IO_WQ_WORK_UNBOUND;
1394        }
1395
1396        /* ->mm can never change on us */
1397        if (!(req->work.flags & IO_WQ_WORK_MM) &&
1398            (def->work_flags & IO_WQ_WORK_MM)) {
1399                mmgrab(id->mm);
1400                req->work.flags |= IO_WQ_WORK_MM;
1401        }
1402
1403        /* if we fail grabbing identity, we must COW, regrab, and retry */
1404        if (io_grab_identity(req))
1405                return;
1406
1407        if (!io_identity_cow(req))
1408                return;
1409
1410        /* can't fail at this point */
1411        if (!io_grab_identity(req))
1412                WARN_ON(1);
1413}
1414
1415static void io_prep_async_link(struct io_kiocb *req)
1416{
1417        struct io_kiocb *cur;
1418
1419        io_prep_async_work(req);
1420        if (req->flags & REQ_F_LINK_HEAD)
1421                list_for_each_entry(cur, &req->link_list, link_list)
1422                        io_prep_async_work(cur);
1423}
1424
1425static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req)
1426{
1427        struct io_ring_ctx *ctx = req->ctx;
1428        struct io_kiocb *link = io_prep_linked_timeout(req);
1429
1430        trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1431                                        &req->work, req->flags);
1432        io_wq_enqueue(ctx->io_wq, &req->work);
1433        return link;
1434}
1435
1436static void io_queue_async_work(struct io_kiocb *req)
1437{
1438        struct io_kiocb *link;
1439
1440        /* init ->work of the whole link before punting */
1441        io_prep_async_link(req);
1442        link = __io_queue_async_work(req);
1443
1444        if (link)
1445                io_queue_linked_timeout(link);
1446}
1447
1448static void io_kill_timeout(struct io_kiocb *req)
1449{
1450        struct io_timeout_data *io = req->async_data;
1451        int ret;
1452
1453        ret = hrtimer_try_to_cancel(&io->timer);
1454        if (ret != -1) {
1455                atomic_set(&req->ctx->cq_timeouts,
1456                        atomic_read(&req->ctx->cq_timeouts) + 1);
1457                list_del_init(&req->timeout.list);
1458                io_cqring_fill_event(req, 0);
1459                io_put_req_deferred(req, 1);
1460        }
1461}
1462
1463static bool io_task_match(struct io_kiocb *req, struct task_struct *tsk)
1464{
1465        struct io_ring_ctx *ctx = req->ctx;
1466
1467        if (!tsk || req->task == tsk)
1468                return true;
1469        if (ctx->flags & IORING_SETUP_SQPOLL) {
1470                if (ctx->sq_data && req->task == ctx->sq_data->thread)
1471                        return true;
1472        }
1473        return false;
1474}
1475
1476/*
1477 * Returns true if we found and killed one or more timeouts
1478 */
1479static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk)
1480{
1481        struct io_kiocb *req, *tmp;
1482        int canceled = 0;
1483
1484        spin_lock_irq(&ctx->completion_lock);
1485        list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1486                if (io_task_match(req, tsk)) {
1487                        io_kill_timeout(req);
1488                        canceled++;
1489                }
1490        }
1491        spin_unlock_irq(&ctx->completion_lock);
1492        return canceled != 0;
1493}
1494
1495static void __io_queue_deferred(struct io_ring_ctx *ctx)
1496{
1497        do {
1498                struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1499                                                struct io_defer_entry, list);
1500                struct io_kiocb *link;
1501
1502                if (req_need_defer(de->req, de->seq))
1503                        break;
1504                list_del_init(&de->list);
1505                /* punt-init is done before queueing for defer */
1506                link = __io_queue_async_work(de->req);
1507                if (link) {
1508                        __io_queue_linked_timeout(link);
1509                        /* drop submission reference */
1510                        io_put_req_deferred(link, 1);
1511                }
1512                kfree(de);
1513        } while (!list_empty(&ctx->defer_list));
1514}
1515
1516static void io_flush_timeouts(struct io_ring_ctx *ctx)
1517{
1518        while (!list_empty(&ctx->timeout_list)) {
1519                struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
1520                                                struct io_kiocb, timeout.list);
1521
1522                if (io_is_timeout_noseq(req))
1523                        break;
1524                if (req->timeout.target_seq != ctx->cached_cq_tail
1525                                        - atomic_read(&ctx->cq_timeouts))
1526                        break;
1527
1528                list_del_init(&req->timeout.list);
1529                io_kill_timeout(req);
1530        }
1531}
1532
1533static void io_commit_cqring(struct io_ring_ctx *ctx)
1534{
1535        io_flush_timeouts(ctx);
1536        __io_commit_cqring(ctx);
1537
1538        if (unlikely(!list_empty(&ctx->defer_list)))
1539                __io_queue_deferred(ctx);
1540}
1541
1542static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1543{
1544        struct io_rings *r = ctx->rings;
1545
1546        return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries;
1547}
1548
1549static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
1550{
1551        struct io_rings *rings = ctx->rings;
1552        unsigned tail;
1553
1554        tail = ctx->cached_cq_tail;
1555        /*
1556         * writes to the cq entry need to come after reading head; the
1557         * control dependency is enough as we're using WRITE_ONCE to
1558         * fill the cq entry
1559         */
1560        if (tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)
1561                return NULL;
1562
1563        ctx->cached_cq_tail++;
1564        return &rings->cqes[tail & ctx->cq_mask];
1565}
1566
1567static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1568{
1569        if (!ctx->cq_ev_fd)
1570                return false;
1571        if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1572                return false;
1573        if (!ctx->eventfd_async)
1574                return true;
1575        return io_wq_current_is_worker();
1576}
1577
1578static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1579{
1580        if (waitqueue_active(&ctx->wait))
1581                wake_up(&ctx->wait);
1582        if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1583                wake_up(&ctx->sq_data->wait);
1584        if (io_should_trigger_evfd(ctx))
1585                eventfd_signal(ctx->cq_ev_fd, 1);
1586}
1587
1588static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
1589{
1590        if (list_empty(&ctx->cq_overflow_list)) {
1591                clear_bit(0, &ctx->sq_check_overflow);
1592                clear_bit(0, &ctx->cq_check_overflow);
1593                ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
1594        }
1595}
1596
1597static inline bool __io_match_files(struct io_kiocb *req,
1598                                    struct files_struct *files)
1599{
1600        return ((req->flags & REQ_F_WORK_INITIALIZED) &&
1601                (req->work.flags & IO_WQ_WORK_FILES)) &&
1602                req->work.identity->files == files;
1603}
1604
1605static bool io_match_files(struct io_kiocb *req,
1606                           struct files_struct *files)
1607{
1608        struct io_kiocb *link;
1609
1610        if (!files)
1611                return true;
1612        if (__io_match_files(req, files))
1613                return true;
1614        if (req->flags & REQ_F_LINK_HEAD) {
1615                list_for_each_entry(link, &req->link_list, link_list) {
1616                        if (__io_match_files(link, files))
1617                                return true;
1618                }
1619        }
1620        return false;
1621}
1622
1623/* Returns true if there are no backlogged entries after the flush */
1624static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
1625                                     struct task_struct *tsk,
1626                                     struct files_struct *files)
1627{
1628        struct io_rings *rings = ctx->rings;
1629        struct io_kiocb *req, *tmp;
1630        struct io_uring_cqe *cqe;
1631        unsigned long flags;
1632        LIST_HEAD(list);
1633
1634        if (!force) {
1635                if (list_empty_careful(&ctx->cq_overflow_list))
1636                        return true;
1637                if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) ==
1638                    rings->cq_ring_entries))
1639                        return false;
1640        }
1641
1642        spin_lock_irqsave(&ctx->completion_lock, flags);
1643
1644        /* if force is set, the ring is going away. always drop after that */
1645        if (force)
1646                ctx->cq_overflow_flushed = 1;
1647
1648        cqe = NULL;
1649        list_for_each_entry_safe(req, tmp, &ctx->cq_overflow_list, compl.list) {
1650                if (tsk && req->task != tsk)
1651                        continue;
1652                if (!io_match_files(req, files))
1653                        continue;
1654
1655                cqe = io_get_cqring(ctx);
1656                if (!cqe && !force)
1657                        break;
1658
1659                list_move(&req->compl.list, &list);
1660                if (cqe) {
1661                        WRITE_ONCE(cqe->user_data, req->user_data);
1662                        WRITE_ONCE(cqe->res, req->result);
1663                        WRITE_ONCE(cqe->flags, req->compl.cflags);
1664                } else {
1665                        ctx->cached_cq_overflow++;
1666                        WRITE_ONCE(ctx->rings->cq_overflow,
1667                                   ctx->cached_cq_overflow);
1668                }
1669        }
1670
1671        io_commit_cqring(ctx);
1672        io_cqring_mark_overflow(ctx);
1673
1674        spin_unlock_irqrestore(&ctx->completion_lock, flags);
1675        io_cqring_ev_posted(ctx);
1676
1677        while (!list_empty(&list)) {
1678                req = list_first_entry(&list, struct io_kiocb, compl.list);
1679                list_del(&req->compl.list);
1680                io_put_req(req);
1681        }
1682
1683        return cqe != NULL;
1684}
1685
1686static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
1687{
1688        struct io_ring_ctx *ctx = req->ctx;
1689        struct io_uring_cqe *cqe;
1690
1691        trace_io_uring_complete(ctx, req->user_data, res);
1692
1693        /*
1694         * If we can't get a cq entry, userspace overflowed the
1695         * submission (by quite a lot). Increment the overflow count in
1696         * the ring.
1697         */
1698        cqe = io_get_cqring(ctx);
1699        if (likely(cqe)) {
1700                WRITE_ONCE(cqe->user_data, req->user_data);
1701                WRITE_ONCE(cqe->res, res);
1702                WRITE_ONCE(cqe->flags, cflags);
1703        } else if (ctx->cq_overflow_flushed ||
1704                   atomic_read(&req->task->io_uring->in_idle)) {
1705                /*
1706                 * If we're in ring overflow flush mode, or in task cancel mode,
1707                 * then we cannot store the request for later flushing, we need
1708                 * to drop it on the floor.
1709                 */
1710                ctx->cached_cq_overflow++;
1711                WRITE_ONCE(ctx->rings->cq_overflow, ctx->cached_cq_overflow);
1712        } else {
1713                if (list_empty(&ctx->cq_overflow_list)) {
1714                        set_bit(0, &ctx->sq_check_overflow);
1715                        set_bit(0, &ctx->cq_check_overflow);
1716                        ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
1717                }
1718                io_clean_op(req);
1719                req->result = res;
1720                req->compl.cflags = cflags;
1721                refcount_inc(&req->refs);
1722                list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
1723        }
1724}
1725
1726static void io_cqring_fill_event(struct io_kiocb *req, long res)
1727{
1728        __io_cqring_fill_event(req, res, 0);
1729}
1730
1731static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
1732{
1733        struct io_ring_ctx *ctx = req->ctx;
1734        unsigned long flags;
1735
1736        spin_lock_irqsave(&ctx->completion_lock, flags);
1737        __io_cqring_fill_event(req, res, cflags);
1738        io_commit_cqring(ctx);
1739        spin_unlock_irqrestore(&ctx->completion_lock, flags);
1740
1741        io_cqring_ev_posted(ctx);
1742}
1743
1744static void io_submit_flush_completions(struct io_comp_state *cs)
1745{
1746        struct io_ring_ctx *ctx = cs->ctx;
1747
1748        spin_lock_irq(&ctx->completion_lock);
1749        while (!list_empty(&cs->list)) {
1750                struct io_kiocb *req;
1751
1752                req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
1753                list_del(&req->compl.list);
1754                __io_cqring_fill_event(req, req->result, req->compl.cflags);
1755
1756                /*
1757                 * io_free_req() doesn't care about completion_lock unless one
1758                 * of these flags is set. REQ_F_WORK_INITIALIZED is in the list
1759                 * because of a potential deadlock with req->work.fs->lock
1760                 */
1761                if (req->flags & (REQ_F_FAIL_LINK|REQ_F_LINK_TIMEOUT
1762                                 |REQ_F_WORK_INITIALIZED)) {
1763                        spin_unlock_irq(&ctx->completion_lock);
1764                        io_put_req(req);
1765                        spin_lock_irq(&ctx->completion_lock);
1766                } else {
1767                        io_put_req(req);
1768                }
1769        }
1770        io_commit_cqring(ctx);
1771        spin_unlock_irq(&ctx->completion_lock);
1772
1773        io_cqring_ev_posted(ctx);
1774        cs->nr = 0;
1775}
1776
1777static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
1778                              struct io_comp_state *cs)
1779{
1780        if (!cs) {
1781                io_cqring_add_event(req, res, cflags);
1782                io_put_req(req);
1783        } else {
1784                io_clean_op(req);
1785                req->result = res;
1786                req->compl.cflags = cflags;
1787                list_add_tail(&req->compl.list, &cs->list);
1788                if (++cs->nr >= 32)
1789                        io_submit_flush_completions(cs);
1790        }
1791}
1792
1793static void io_req_complete(struct io_kiocb *req, long res)
1794{
1795        __io_req_complete(req, res, 0, NULL);
1796}
1797
1798static inline bool io_is_fallback_req(struct io_kiocb *req)
1799{
1800        return req == (struct io_kiocb *)
1801                        ((unsigned long) req->ctx->fallback_req & ~1UL);
1802}
1803
1804static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx)
1805{
1806        struct io_kiocb *req;
1807
1808        req = ctx->fallback_req;
1809        if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req))
1810                return req;
1811
1812        return NULL;
1813}
1814
1815static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
1816                                     struct io_submit_state *state)
1817{
1818        if (!state->free_reqs) {
1819                gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1820                size_t sz;
1821                int ret;
1822
1823                sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs));
1824                ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs);
1825
1826                /*
1827                 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1828                 * retry single alloc to be on the safe side.
1829                 */
1830                if (unlikely(ret <= 0)) {
1831                        state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
1832                        if (!state->reqs[0])
1833                                goto fallback;
1834                        ret = 1;
1835                }
1836                state->free_reqs = ret;
1837        }
1838
1839        state->free_reqs--;
1840        return state->reqs[state->free_reqs];
1841fallback:
1842        return io_get_fallback_req(ctx);
1843}
1844
1845static inline void io_put_file(struct io_kiocb *req, struct file *file,
1846                          bool fixed)
1847{
1848        if (fixed)
1849                percpu_ref_put(req->fixed_file_refs);
1850        else
1851                fput(file);
1852}
1853
1854static void io_dismantle_req(struct io_kiocb *req)
1855{
1856        io_clean_op(req);
1857
1858        if (req->async_data)
1859                kfree(req->async_data);
1860        if (req->file)
1861                io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
1862
1863        io_req_clean_work(req);
1864}
1865
1866static void __io_free_req(struct io_kiocb *req)
1867{
1868        struct io_uring_task *tctx = req->task->io_uring;
1869        struct io_ring_ctx *ctx = req->ctx;
1870
1871        io_dismantle_req(req);
1872
1873        percpu_counter_dec(&tctx->inflight);
1874        if (atomic_read(&tctx->in_idle))
1875                wake_up(&tctx->wait);
1876        put_task_struct(req->task);
1877
1878        if (likely(!io_is_fallback_req(req)))
1879                kmem_cache_free(req_cachep, req);
1880        else
1881                clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
1882        percpu_ref_put(&ctx->refs);
1883}
1884
1885static void io_kill_linked_timeout(struct io_kiocb *req)
1886{
1887        struct io_ring_ctx *ctx = req->ctx;
1888        struct io_kiocb *link;
1889        bool cancelled = false;
1890        unsigned long flags;
1891
1892        spin_lock_irqsave(&ctx->completion_lock, flags);
1893        link = list_first_entry_or_null(&req->link_list, struct io_kiocb,
1894                                        link_list);
1895        /*
1896         * Can happen if a linked timeout fired and link had been like
1897         * req -> link t-out -> link t-out [-> ...]
1898         */
1899        if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) {
1900                struct io_timeout_data *io = link->async_data;
1901                int ret;
1902
1903                list_del_init(&link->link_list);
1904                ret = hrtimer_try_to_cancel(&io->timer);
1905                if (ret != -1) {
1906                        io_cqring_fill_event(link, -ECANCELED);
1907                        io_commit_cqring(ctx);
1908                        cancelled = true;
1909                }
1910        }
1911        req->flags &= ~REQ_F_LINK_TIMEOUT;
1912        spin_unlock_irqrestore(&ctx->completion_lock, flags);
1913
1914        if (cancelled) {
1915                io_cqring_ev_posted(ctx);
1916                io_put_req(link);
1917        }
1918}
1919
1920static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
1921{
1922        struct io_kiocb *nxt;
1923
1924        /*
1925         * The list should never be empty when we are called here. But could
1926         * potentially happen if the chain is messed up, check to be on the
1927         * safe side.
1928         */
1929        if (unlikely(list_empty(&req->link_list)))
1930                return NULL;
1931
1932        nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
1933        list_del_init(&req->link_list);
1934        if (!list_empty(&nxt->link_list))
1935                nxt->flags |= REQ_F_LINK_HEAD;
1936        return nxt;
1937}
1938
1939/*
1940 * Called if REQ_F_LINK_HEAD is set, and we fail the head request
1941 */
1942static void io_fail_links(struct io_kiocb *req)
1943{
1944        struct io_ring_ctx *ctx = req->ctx;
1945        unsigned long flags;
1946
1947        spin_lock_irqsave(&ctx->completion_lock, flags);
1948        while (!list_empty(&req->link_list)) {
1949                struct io_kiocb *link = list_first_entry(&req->link_list,
1950                                                struct io_kiocb, link_list);
1951
1952                list_del_init(&link->link_list);
1953                trace_io_uring_fail_link(req, link);
1954
1955                io_cqring_fill_event(link, -ECANCELED);
1956
1957                /*
1958                 * It's ok to free under spinlock as they're not linked anymore,
1959                 * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on
1960                 * work.fs->lock.
1961                 */
1962                if (link->flags & REQ_F_WORK_INITIALIZED)
1963                        io_put_req_deferred(link, 2);
1964                else
1965                        io_double_put_req(link);
1966        }
1967
1968        io_commit_cqring(ctx);
1969        spin_unlock_irqrestore(&ctx->completion_lock, flags);
1970
1971        io_cqring_ev_posted(ctx);
1972}
1973
1974static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
1975{
1976        req->flags &= ~REQ_F_LINK_HEAD;
1977        if (req->flags & REQ_F_LINK_TIMEOUT)
1978                io_kill_linked_timeout(req);
1979
1980        /*
1981         * If LINK is set, we have dependent requests in this chain. If we
1982         * didn't fail this request, queue the first one up, moving any other
1983         * dependencies to the next request. In case of failure, fail the rest
1984         * of the chain.
1985         */
1986        if (likely(!(req->flags & REQ_F_FAIL_LINK)))
1987                return io_req_link_next(req);
1988        io_fail_links(req);
1989        return NULL;
1990}
1991
1992static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
1993{
1994        if (likely(!(req->flags & REQ_F_LINK_HEAD)))
1995                return NULL;
1996        return __io_req_find_next(req);
1997}
1998
1999static int io_req_task_work_add(struct io_kiocb *req, bool twa_signal_ok)
2000{

2001        struct task_struct *tsk = req->task;
2002        struct io_ring_ctx *ctx = req->ctx;
2003        enum task_work_notify_mode notify;
2004        int ret;
2005
2006        if (tsk->flags & PF_EXITING)
2007                return -ESRCH;
2008
2009        /*
2010         * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2011         * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2012         * processing task_work. There's no reliable way to tell if TWA_RESUME
2013         * will do the job.
2014         */
2015        notify = TWA_NONE;
2016        if (!(ctx->flags & IORING_SETUP_SQPOLL) && twa_signal_ok)
2017                notify = TWA_SIGNAL;
2018
2019        ret = task_work_add(tsk, &req->task_work, notify);
2020        if (!ret)
2021                wake_up_process(tsk);
2022
2023        return ret;
2024}
2025
2026static void __io_req_task_cancel(struct io_kiocb *req, int error)
2027{
2028        struct io_ring_ctx *ctx = req->ctx;
2029
2030        spin_lock_irq(&ctx->completion_lock);
2031        io_cqring_fill_event(req, error);
2032        io_commit_cqring(ctx);
2033        spin_unlock_irq(&ctx->completion_lock);
2034
2035        io_cqring_ev_posted(ctx);
2036        req_set_fail_links(req);
2037        io_double_put_req(req);
2038}
2039
2040static void io_req_task_cancel(struct callback_head *cb)
2041{
2042        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2043        struct io_ring_ctx *ctx = req->ctx;
2044
2045        __io_req_task_cancel(req, -ECANCELED);
2046        percpu_ref_put(&ctx->refs);
2047}
2048
2049static void __io_req_task_submit(struct io_kiocb *req)
2050{
2051        struct io_ring_ctx *ctx = req->ctx;
2052
2053        if (!__io_sq_thread_acquire_mm(ctx)) {
2054                mutex_lock(&ctx->uring_lock);
2055                __io_queue_sqe(req, NULL);
2056                mutex_unlock(&ctx->uring_lock);
2057        } else {
2058                __io_req_task_cancel(req, -EFAULT);
2059        }
2060}
2061
2062static void io_req_task_submit(struct callback_head *cb)
2063{
2064        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2065        struct io_ring_ctx *ctx = req->ctx;
2066
2067        __io_req_task_submit(req);
2068        percpu_ref_put(&ctx->refs);
2069}
2070
2071static void io_req_task_queue(struct io_kiocb *req)
2072{
2073        int ret;
2074
2075        init_task_work(&req->task_work, io_req_task_submit);
2076        percpu_ref_get(&req->ctx->refs);
2077
2078        ret = io_req_task_work_add(req, true);
2079        if (unlikely(ret)) {
2080                struct task_struct *tsk;
2081
2082                init_task_work(&req->task_work, io_req_task_cancel);
2083                tsk = io_wq_get_task(req->ctx->io_wq);
2084                task_work_add(tsk, &req->task_work, TWA_NONE);
2085                wake_up_process(tsk);
2086        }
2087}
2088
2089static void io_queue_next(struct io_kiocb *req)
2090{
2091        struct io_kiocb *nxt = io_req_find_next(req);
2092
2093        if (nxt)
2094                io_req_task_queue(nxt);
2095}
2096
2097static void io_free_req(struct io_kiocb *req)
2098{
2099        io_queue_next(req);
2100        __io_free_req(req);
2101}
2102
2103struct req_batch {
2104        void *reqs[IO_IOPOLL_BATCH];
2105        int to_free;
2106
2107        struct task_struct      *task;
2108        int                     task_refs;
2109};
2110
2111static inline void io_init_req_batch(struct req_batch *rb)
2112{
2113        rb->to_free = 0;
2114        rb->task_refs = 0;
2115        rb->task = NULL;
2116}
2117
2118static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
2119                                      struct req_batch *rb)
2120{
2121        kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
2122        percpu_ref_put_many(&ctx->refs, rb->to_free);
2123        rb->to_free = 0;
2124}
2125
2126static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2127                                     struct req_batch *rb)
2128{
2129        if (rb->to_free)
2130                __io_req_free_batch_flush(ctx, rb);
2131        if (rb->task) {
2132                struct io_uring_task *tctx = rb->task->io_uring;
2133
2134                percpu_counter_sub(&tctx->inflight, rb->task_refs);
2135                put_task_struct_many(rb->task, rb->task_refs);
2136                rb->task = NULL;
2137        }
2138}
2139
2140static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
2141{
2142        if (unlikely(io_is_fallback_req(req))) {
2143                io_free_req(req);
2144                return;
2145        }
2146        if (req->flags & REQ_F_LINK_HEAD)
2147                io_queue_next(req);
2148
2149        if (req->task != rb->task) {
2150                if (rb->task) {
2151                        struct io_uring_task *tctx = rb->task->io_uring;
2152
2153                        percpu_counter_sub(&tctx->inflight, rb->task_refs);
2154                        put_task_struct_many(rb->task, rb->task_refs);
2155                }
2156                rb->task = req->task;
2157                rb->task_refs = 0;
2158        }
2159        rb->task_refs++;
2160
2161        io_dismantle_req(req);
2162        rb->reqs[rb->to_free++] = req;
2163        if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
2164                __io_req_free_batch_flush(req->ctx, rb);
2165}
2166
2167/*
2168 * Drop reference to request, return next in chain (if there is one) if this
2169 * was the last reference to this request.
2170 */
2171static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2172{
2173        struct io_kiocb *nxt = NULL;
2174
2175        if (refcount_dec_and_test(&req->refs)) {
2176                nxt = io_req_find_next(req);
2177                __io_free_req(req);
2178        }
2179        return nxt;
2180}
2181
2182static void io_put_req(struct io_kiocb *req)
2183{
2184        if (refcount_dec_and_test(&req->refs))
2185                io_free_req(req);
2186}
2187
2188static void io_put_req_deferred_cb(struct callback_head *cb)
2189{
2190        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
2191
2192        io_free_req(req);
2193}
2194
2195static void io_free_req_deferred(struct io_kiocb *req)
2196{
2197        int ret;
2198
2199        init_task_work(&req->task_work, io_put_req_deferred_cb);
2200        ret = io_req_task_work_add(req, true);
2201        if (unlikely(ret)) {
2202                struct task_struct *tsk;
2203
2204                tsk = io_wq_get_task(req->ctx->io_wq);
2205                task_work_add(tsk, &req->task_work, TWA_NONE);
2206                wake_up_process(tsk);
2207        }
2208}
2209
2210static inline void io_put_req_deferred(struct io_kiocb *req, int refs)
2211{
2212        if (refcount_sub_and_test(refs, &req->refs))
2213                io_free_req_deferred(req);
2214}
2215
2216static struct io_wq_work *io_steal_work(struct io_kiocb *req)
2217{
2218        struct io_kiocb *nxt;
2219
2220        /*
2221         * A ref is owned by io-wq in which context we're. So, if that's the
2222         * last one, it's safe to steal next work. False negatives are Ok,
2223         * it just will be re-punted async in io_put_work()
2224         */
2225        if (refcount_read(&req->refs) != 1)
2226                return NULL;
2227
2228        nxt = io_req_find_next(req);
2229        return nxt ? &nxt->work : NULL;
2230}
2231
2232static void io_double_put_req(struct io_kiocb *req)
2233{
2234        /* drop both submit and complete references */
2235        if (refcount_sub_and_test(2, &req->refs))
2236                io_free_req(req);
2237}
2238
2239static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
2240{
2241        struct io_rings *rings = ctx->rings;
2242
2243        if (test_bit(0, &ctx->cq_check_overflow)) {
2244                /*
2245                 * noflush == true is from the waitqueue handler, just ensure
2246                 * we wake up the task, and the next invocation will flush the
2247                 * entries. We cannot safely to it from here.
2248                 */
2249                if (noflush && !list_empty(&ctx->cq_overflow_list))
2250                        return -1U;
2251
2252                io_cqring_overflow_flush(ctx, false, NULL, NULL);
2253        }
2254
2255        /* See comment at the top of this file */
2256        smp_rmb();
2257        return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
2258}
2259
2260static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2261{
2262        struct io_rings *rings = ctx->rings;
2263
2264        /* make sure SQ entry isn't read before tail */
2265        return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2266}
2267
2268static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2269{
2270        unsigned int cflags;
2271
2272        cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2273        cflags |= IORING_CQE_F_BUFFER;
2274        req->flags &= ~REQ_F_BUFFER_SELECTED;
2275        kfree(kbuf);
2276        return cflags;
2277}
2278
2279static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2280{
2281        struct io_buffer *kbuf;
2282
2283        kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2284        return io_put_kbuf(req, kbuf);
2285}
2286
2287static inline bool io_run_task_work(void)
2288{
2289        /*
2290         * Not safe to run on exiting task, and the task_work handling will
2291         * not add work to such a task.
2292         */
2293        if (unlikely(current->flags & PF_EXITING))
2294                return false;
2295        if (current->task_works) {
2296                __set_current_state(TASK_RUNNING);
2297                task_work_run();
2298                return true;
2299        }
2300
2301        return false;
2302}
2303
2304static void io_iopoll_queue(struct list_head *again)
2305{
2306        struct io_kiocb *req;
2307
2308        do {
2309                req = list_first_entry(again, struct io_kiocb, inflight_entry);
2310                list_del(&req->inflight_entry);
2311                __io_complete_rw(req, -EAGAIN, 0, NULL);
2312        } while (!list_empty(again));
2313}
2314
2315/*
2316 * Find and free completed poll iocbs
2317 */
2318static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2319                               struct list_head *done)
2320{
2321        struct req_batch rb;
2322        struct io_kiocb *req;
2323        LIST_HEAD(again);
2324
2325        /* order with ->result store in io_complete_rw_iopoll() */
2326        smp_rmb();
2327
2328        io_init_req_batch(&rb);
2329        while (!list_empty(done)) {
2330                int cflags = 0;
2331
2332                req = list_first_entry(done, struct io_kiocb, inflight_entry);
2333                if (READ_ONCE(req->result) == -EAGAIN) {
2334                        req->result = 0;
2335                        req->iopoll_completed = 0;
2336                        list_move_tail(&req->inflight_entry, &again);
2337                        continue;
2338                }
2339                list_del(&req->inflight_entry);
2340
2341                if (req->flags & REQ_F_BUFFER_SELECTED)
2342                        cflags = io_put_rw_kbuf(req);
2343
2344                __io_cqring_fill_event(req, req->result, cflags);
2345                (*nr_events)++;
2346
2347                if (refcount_dec_and_test(&req->refs))
2348                        io_req_free_batch(&rb, req);
2349        }
2350
2351        io_commit_cqring(ctx);
2352        if (ctx->flags & IORING_SETUP_SQPOLL)
2353                io_cqring_ev_posted(ctx);
2354        io_req_free_batch_finish(ctx, &rb);
2355
2356        if (!list_empty(&again))
2357                io_iopoll_queue(&again);
2358}
2359
2360static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2361                        long min)
2362{
2363        struct io_kiocb *req, *tmp;
2364        LIST_HEAD(done);
2365        bool spin;
2366        int ret;
2367
2368        /*
2369         * Only spin for completions if we don't have multiple devices hanging
2370         * off our complete list, and we're under the requested amount.
2371         */
2372        spin = !ctx->poll_multi_file && *nr_events < min;
2373
2374        ret = 0;
2375        list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2376                struct kiocb *kiocb = &req->rw.kiocb;
2377
2378                /*
2379                 * Move completed and retryable entries to our local lists.
2380                 * If we find a request that requires polling, break out
2381                 * and complete those lists first, if we have entries there.
2382                 */
2383                if (READ_ONCE(req->iopoll_completed)) {
2384                        list_move_tail(&req->inflight_entry, &done);
2385                        continue;
2386                }
2387                if (!list_empty(&done))
2388                        break;
2389
2390                ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2391                if (ret < 0)
2392                        break;
2393
2394                /* iopoll may have completed current req */
2395                if (READ_ONCE(req->iopoll_completed))
2396                        list_move_tail(&req->inflight_entry, &done);
2397
2398                if (ret && spin)
2399                        spin = false;
2400                ret = 0;
2401        }
2402
2403        if (!list_empty(&done))
2404                io_iopoll_complete(ctx, nr_events, &done);
2405
2406        return ret;
2407}
2408
2409/*
2410 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
2411 * non-spinning poll check - we'll still enter the driver poll loop, but only
2412 * as a non-spinning completion check.
2413 */
2414static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
2415                                long min)
2416{
2417        while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
2418                int ret;
2419
2420                ret = io_do_iopoll(ctx, nr_events, min);
2421                if (ret < 0)
2422                        return ret;
2423                if (*nr_events >= min)
2424                        return 0;
2425        }
2426
2427        return 1;
2428}
2429
2430/*
2431 * We can't just wait for polled events to come to us, we have to actively
2432 * find and complete them.
2433 */
2434static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2435{
2436        if (!(ctx->flags & IORING_SETUP_IOPOLL))
2437                return;
2438
2439        mutex_lock(&ctx->uring_lock);
2440        while (!list_empty(&ctx->iopoll_list)) {
2441                unsigned int nr_events = 0;
2442
2443                io_do_iopoll(ctx, &nr_events, 0);
2444
2445                /* let it sleep and repeat later if can't complete a request */
2446                if (nr_events == 0)
2447                        break;
2448                /*
2449                 * Ensure we allow local-to-the-cpu processing to take place,
2450                 * in this case we need to ensure that we reap all events.
2451                 * Also let task_work, etc. to progress by releasing the mutex
2452                 */
2453                if (need_resched()) {
2454                        mutex_unlock(&ctx->uring_lock);
2455                        cond_resched();
2456                        mutex_lock(&ctx->uring_lock);
2457                }
2458        }
2459        mutex_unlock(&ctx->uring_lock);
2460}
2461
2462static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2463{
2464        unsigned int nr_events = 0;
2465        int iters = 0, ret = 0;
2466
2467        /*
2468         * We disallow the app entering submit/complete with polling, but we
2469         * still need to lock the ring to prevent racing with polled issue
2470         * that got punted to a workqueue.
2471         */
2472        mutex_lock(&ctx->uring_lock);
2473        do {
2474                /*
2475                 * Don't enter poll loop if we already have events pending.
2476                 * If we do, we can potentially be spinning for commands that
2477                 * already triggered a CQE (eg in error).
2478                 */
2479                if (io_cqring_events(ctx, false))
2480                        break;
2481
2482                /*
2483                 * If a submit got punted to a workqueue, we can have the
2484                 * application entering polling for a command before it gets
2485                 * issued. That app will hold the uring_lock for the duration
2486                 * of the poll right here, so we need to take a breather every
2487                 * now and then to ensure that the issue has a chance to add
2488                 * the poll to the issued list. Otherwise we can spin here
2489                 * forever, while the workqueue is stuck trying to acquire the
2490                 * very same mutex.
2491                 */
2492                if (!(++iters & 7)) {
2493                        mutex_unlock(&ctx->uring_lock);
2494                        io_run_task_work();
2495                        mutex_lock(&ctx->uring_lock);
2496                }
2497
2498                ret = io_iopoll_getevents(ctx, &nr_events, min);
2499                if (ret <= 0)
2500                        break;
2501                ret = 0;
2502        } while (min && !nr_events && !need_resched());
2503
2504        mutex_unlock(&ctx->uring_lock);
2505        return ret;
2506}
2507
2508static void kiocb_end_write(struct io_kiocb *req)
2509{
2510        /*
2511         * Tell lockdep we inherited freeze protection from submission
2512         * thread.
2513         */
2514        if (req->flags & REQ_F_ISREG) {
2515                struct inode *inode = file_inode(req->file);
2516
2517                __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
2518        }
2519        file_end_write(req->file);
2520}
2521
2522static void io_complete_rw_common(struct kiocb *kiocb, long res,
2523                                  struct io_comp_state *cs)
2524{
2525        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2526        int cflags = 0;
2527
2528        if (kiocb->ki_flags & IOCB_WRITE)
2529                kiocb_end_write(req);
2530
2531        if (res != req->result)
2532                req_set_fail_links(req);
2533        if (req->flags & REQ_F_BUFFER_SELECTED)
2534                cflags = io_put_rw_kbuf(req);
2535        __io_req_complete(req, res, cflags, cs);
2536}
2537
2538#ifdef CONFIG_BLOCK
2539static bool io_resubmit_prep(struct io_kiocb *req, int error)
2540{
2541        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
2542        ssize_t ret = -ECANCELED;
2543        struct iov_iter iter;
2544        int rw;
2545
2546        if (error) {
2547                ret = error;
2548                goto end_req;
2549        }
2550
2551        switch (req->opcode) {
2552        case IORING_OP_READV:
2553        case IORING_OP_READ_FIXED:
2554        case IORING_OP_READ:
2555                rw = READ;
2556                break;
2557        case IORING_OP_WRITEV:
2558        case IORING_OP_WRITE_FIXED:
2559        case IORING_OP_WRITE:
2560                rw = WRITE;
2561                break;
2562        default:
2563                printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
2564                                req->opcode);
2565                goto end_req;
2566        }
2567
2568        if (!req->async_data) {
2569                ret = io_import_iovec(rw, req, &iovec, &iter, false);
2570                if (ret < 0)
2571                        goto end_req;
2572                ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
2573                if (!ret)
2574                        return true;
2575                kfree(iovec);
2576        } else {
2577                return true;
2578        }
2579end_req:
2580        req_set_fail_links(req);
2581        return false;
2582}
2583#endif
2584
2585static bool io_rw_reissue(struct io_kiocb *req, long res)
2586{
2587#ifdef CONFIG_BLOCK
2588        umode_t mode = file_inode(req->file)->i_mode;
2589        int ret;
2590
2591        if (!S_ISBLK(mode) && !S_ISREG(mode))
2592                return false;
2593        if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
2594                return false;
2595
2596        ret = io_sq_thread_acquire_mm(req->ctx, req);
2597
2598        if (io_resubmit_prep(req, ret)) {
2599                refcount_inc(&req->refs);
2600                io_queue_async_work(req);
2601                return true;
2602        }
2603
2604#endif
2605        return false;
2606}
2607
2608static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
2609                             struct io_comp_state *cs)
2610{
2611        if (!io_rw_reissue(req, res))
2612                io_complete_rw_common(&req->rw.kiocb, res, cs);
2613}
2614
2615static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2616{
2617        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2618
2619        __io_complete_rw(req, res, res2, NULL);
2620}
2621
2622static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2623{
2624        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2625
2626        if (kiocb->ki_flags & IOCB_WRITE)
2627                kiocb_end_write(req);
2628
2629        if (res != -EAGAIN && res != req->result)
2630                req_set_fail_links(req);
2631
2632        WRITE_ONCE(req->result, res);
2633        /* order with io_poll_complete() checking ->result */
2634        smp_wmb();
2635        WRITE_ONCE(req->iopoll_completed, 1);
2636}
2637
2638/*
2639 * After the iocb has been issued, it's safe to be found on the poll list.
2640 * Adding the kiocb to the list AFTER submission ensures that we don't
2641 * find it from a io_iopoll_getevents() thread before the issuer is done
2642 * accessing the kiocb cookie.
2643 */
2644static void io_iopoll_req_issued(struct io_kiocb *req)
2645{
2646        struct io_ring_ctx *ctx = req->ctx;
2647
2648        /*
2649         * Track whether we have multiple files in our lists. This will impact
2650         * how we do polling eventually, not spinning if we're on potentially
2651         * different devices.
2652         */
2653        if (list_empty(&ctx->iopoll_list)) {
2654                ctx->poll_multi_file = false;
2655        } else if (!ctx->poll_multi_file) {
2656                struct io_kiocb *list_req;
2657
2658                list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2659                                                inflight_entry);
2660                if (list_req->file != req->file)
2661                        ctx->poll_multi_file = true;
2662        }
2663
2664        /*
2665         * For fast devices, IO may have already completed. If it has, add
2666         * it to the front so we find it first.
2667         */
2668        if (READ_ONCE(req->iopoll_completed))
2669                list_add(&req->inflight_entry, &ctx->iopoll_list);
2670        else
2671                list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2672
2673        if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2674            wq_has_sleeper(&ctx->sq_data->wait))
2675                wake_up(&ctx->sq_data->wait);
2676}
2677
2678static void __io_state_file_put(struct io_submit_state *state)
2679{
2680        if (state->has_refs)
2681                fput_many(state->file, state->has_refs);
2682        state->file = NULL;
2683}
2684
2685static inline void io_state_file_put(struct io_submit_state *state)
2686{
2687        if (state->file)
2688                __io_state_file_put(state);
2689}
2690
2691/*
2692 * Get as many references to a file as we have IOs left in this submission,
2693 * assuming most submissions are for one file, or at least that each file
2694 * has more than one submission.
2695 */
2696static struct file *__io_file_get(struct io_submit_state *state, int fd)
2697{
2698        if (!state)
2699                return fget(fd);
2700
2701        if (state->file) {
2702                if (state->fd == fd) {
2703                        state->has_refs--;
2704                        return state->file;
2705                }
2706                __io_state_file_put(state);
2707        }
2708        state->file = fget_many(fd, state->ios_left);
2709        if (!state->file)
2710                return NULL;
2711
2712        state->fd = fd;
2713        state->has_refs = state->ios_left - 1;
2714        return state->file;
2715}
2716
2717static bool io_bdev_nowait(struct block_device *bdev)
2718{
2719#ifdef CONFIG_BLOCK
2720        return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2721#else
2722        return true;
2723#endif
2724}
2725
2726/*
2727 * If we tracked the file through the SCM inflight mechanism, we could support
2728 * any file. For now, just ensure that anything potentially problematic is done
2729 * inline.
2730 */
2731static bool io_file_supports_async(struct file *file, int rw)
2732{
2733        umode_t mode = file_inode(file)->i_mode;
2734
2735        if (S_ISBLK(mode)) {
2736                if (io_bdev_nowait(file->f_inode->i_bdev))
2737                        return true;
2738                return false;
2739        }
2740        if (S_ISCHR(mode) || S_ISSOCK(mode))
2741                return true;
2742        if (S_ISREG(mode)) {
2743                if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2744                    file->f_op != &io_uring_fops)
2745                        return true;
2746                return false;
2747        }
2748
2749        /* any ->read/write should understand O_NONBLOCK */
2750        if (file->f_flags & O_NONBLOCK)
2751                return true;
2752
2753        if (!(file->f_mode & FMODE_NOWAIT))
2754                return false;
2755
2756        if (rw == READ)
2757                return file->f_op->read_iter != NULL;
2758
2759        return file->f_op->write_iter != NULL;
2760}
2761
2762static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
2763{
2764        struct io_ring_ctx *ctx = req->ctx;
2765        struct kiocb *kiocb = &req->rw.kiocb;
2766        unsigned ioprio;
2767        int ret;
2768
2769        if (S_ISREG(file_inode(req->file)->i_mode))
2770                req->flags |= REQ_F_ISREG;
2771
2772        kiocb->ki_pos = READ_ONCE(sqe->off);
2773        if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
2774                req->flags |= REQ_F_CUR_POS;
2775                kiocb->ki_pos = req->file->f_pos;
2776        }
2777        kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2778        kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2779        ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2780        if (unlikely(ret))
2781                return ret;
2782
2783        ioprio = READ_ONCE(sqe->ioprio);
2784        if (ioprio) {
2785                ret = ioprio_check_cap(ioprio);
2786                if (ret)
2787                        return ret;
2788
2789                kiocb->ki_ioprio = ioprio;
2790        } else
2791                kiocb->ki_ioprio = get_current_ioprio();
2792
2793        /* don't allow async punt if RWF_NOWAIT was requested */
2794        if (kiocb->ki_flags & IOCB_NOWAIT)
2795                req->flags |= REQ_F_NOWAIT;
2796
2797        if (ctx->flags & IORING_SETUP_IOPOLL) {
2798                if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2799                    !kiocb->ki_filp->f_op->iopoll)
2800                        return -EOPNOTSUPP;
2801
2802                kiocb->ki_flags |= IOCB_HIPRI;
2803                kiocb->ki_complete = io_complete_rw_iopoll;
2804                req->iopoll_completed = 0;
2805        } else {
2806                if (kiocb->ki_flags & IOCB_HIPRI)
2807                        return -EINVAL;
2808                kiocb->ki_complete = io_complete_rw;
2809        }
2810
2811        req->rw.addr = READ_ONCE(sqe->addr);
2812        req->rw.len = READ_ONCE(sqe->len);
2813        req->buf_index = READ_ONCE(sqe->buf_index);
2814        return 0;
2815}
2816
2817static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
2818{
2819        switch (ret) {
2820        case -EIOCBQUEUED:
2821                break;
2822        case -ERESTARTSYS:
2823        case -ERESTARTNOINTR:
2824        case -ERESTARTNOHAND:
2825        case -ERESTART_RESTARTBLOCK:
2826                /*
2827                 * We can't just restart the syscall, since previously
2828                 * submitted sqes may already be in progress. Just fail this
2829                 * IO with EINTR.
2830                 */
2831                ret = -EINTR;
2832                fallthrough;
2833        default:
2834                kiocb->ki_complete(kiocb, ret, 0);
2835        }
2836}
2837
2838static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
2839                       struct io_comp_state *cs)
2840{
2841        struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2842        struct io_async_rw *io = req->async_data;
2843
2844        /* add previously done IO, if any */
2845        if (io && io->bytes_done > 0) {
2846                if (ret < 0)
2847                        ret = io->bytes_done;
2848                else
2849                        ret += io->bytes_done;
2850        }
2851
2852        if (req->flags & REQ_F_CUR_POS)
2853                req->file->f_pos = kiocb->ki_pos;
2854        if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
2855                __io_complete_rw(req, ret, 0, cs);
2856        else
2857                io_rw_done(kiocb, ret);
2858}
2859
2860static ssize_t io_import_fixed(struct io_kiocb *req, int rw,
2861                               struct iov_iter *iter)
2862{
2863        struct io_ring_ctx *ctx = req->ctx;
2864        size_t len = req->rw.len;
2865        struct io_mapped_ubuf *imu;
2866        u16 index, buf_index = req->buf_index;
2867        size_t offset;
2868        u64 buf_addr;
2869
2870        if (unlikely(buf_index >= ctx->nr_user_bufs))
2871                return -EFAULT;
2872        index = array_index_nospec(buf_index, ctx->nr_user_bufs);
2873        imu = &ctx->user_bufs[index];
2874        buf_addr = req->rw.addr;
2875
2876        /* overflow */
2877        if (buf_addr + len < buf_addr)
2878                return -EFAULT;
2879        /* not inside the mapped region */
2880        if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len)
2881                return -EFAULT;
2882
2883        /*
2884         * May not be a start of buffer, set size appropriately
2885         * and advance us to the beginning.
2886         */
2887        offset = buf_addr - imu->ubuf;
2888        iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
2889
2890        if (offset) {
2891                /*
2892                 * Don't use iov_iter_advance() here, as it's really slow for
2893                 * using the latter parts of a big fixed buffer - it iterates
2894                 * over each segment manually. We can cheat a bit here, because
2895                 * we know that:
2896                 *
2897                 * 1) it's a BVEC iter, we set it up
2898                 * 2) all bvecs are PAGE_SIZE in size, except potentially the
2899                 *    first and last bvec
2900                 *
2901                 * So just find our index, and adjust the iterator afterwards.
2902                 * If the offset is within the first bvec (or the whole first
2903                 * bvec, just use iov_iter_advance(). This makes it easier
2904                 * since we can just skip the first segment, which may not
2905                 * be PAGE_SIZE aligned.
2906                 */
2907                const struct bio_vec *bvec = imu->bvec;
2908
2909                if (offset <= bvec->bv_len) {
2910                        iov_iter_advance(iter, offset);
2911                } else {
2912                        unsigned long seg_skip;
2913
2914                        /* skip first vec */
2915                        offset -= bvec->bv_len;
2916                        seg_skip = 1 + (offset >> PAGE_SHIFT);
2917
2918                        iter->bvec = bvec + seg_skip;
2919                        iter->nr_segs -= seg_skip;
2920                        iter->count -= bvec->bv_len + offset;
2921                        iter->iov_offset = offset & ~PAGE_MASK;
2922                }
2923        }
2924
2925        return len;
2926}
2927
2928static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
2929{
2930        if (needs_lock)
2931                mutex_unlock(&ctx->uring_lock);
2932}
2933
2934static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
2935{
2936        /*
2937         * "Normal" inline submissions always hold the uring_lock, since we
2938         * grab it from the system call. Same is true for the SQPOLL offload.
2939         * The only exception is when we've detached the request and issue it
2940         * from an async worker thread, grab the lock for that case.
2941         */
2942        if (needs_lock)
2943                mutex_lock(&ctx->uring_lock);
2944}
2945
2946static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
2947                                          int bgid, struct io_buffer *kbuf,
2948                                          bool needs_lock)
2949{
2950        struct io_buffer *head;
2951
2952        if (req->flags & REQ_F_BUFFER_SELECTED)
2953                return kbuf;
2954
2955        io_ring_submit_lock(req->ctx, needs_lock);
2956
2957        lockdep_assert_held(&req->ctx->uring_lock);
2958
2959        head = idr_find(&req->ctx->io_buffer_idr, bgid);
2960        if (head) {
2961                if (!list_empty(&head->list)) {
2962                        kbuf = list_last_entry(&head->list, struct io_buffer,
2963                                                        list);
2964                        list_del(&kbuf->list);
2965                } else {
2966                        kbuf = head;
2967                        idr_remove(&req->ctx->io_buffer_idr, bgid);
2968                }
2969                if (*len > kbuf->len)
2970                        *len = kbuf->len;
2971        } else {
2972                kbuf = ERR_PTR(-ENOBUFS);
2973        }
2974
2975        io_ring_submit_unlock(req->ctx, needs_lock);
2976
2977        return kbuf;
2978}
2979
2980static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
2981                                        bool needs_lock)
2982{
2983        struct io_buffer *kbuf;
2984        u16 bgid;
2985
2986        kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2987        bgid = req->buf_index;
2988        kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
2989        if (IS_ERR(kbuf))
2990                return kbuf;
2991        req->rw.addr = (u64) (unsigned long) kbuf;
2992        req->flags |= REQ_F_BUFFER_SELECTED;
2993        return u64_to_user_ptr(kbuf->addr);
2994}
2995
2996#ifdef CONFIG_COMPAT
2997static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
2998                                bool needs_lock)
2999{
3000        struct compat_iovec __user *uiov;

3001        compat_ssize_t clen;
3002        void __user *buf;
3003        ssize_t len;
3004
3005        uiov = u64_to_user_ptr(req->rw.addr);
3006        if (!access_ok(uiov, sizeof(*uiov)))
3007                return -EFAULT;
3008        if (__get_user(clen, &uiov->iov_len))
3009                return -EFAULT;
3010        if (clen < 0)
3011                return -EINVAL;
3012
3013        len = clen;
3014        buf = io_rw_buffer_select(req, &len, needs_lock);
3015        if (IS_ERR(buf))
3016                return PTR_ERR(buf);
3017        iov[0].iov_base = buf;
3018        iov[0].iov_len = (compat_size_t) len;
3019        return 0;
3020}
3021#endif
3022
3023static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3024                                      bool needs_lock)
3025{
3026        struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3027        void __user *buf;
3028        ssize_t len;
3029
3030        if (copy_from_user(iov, uiov, sizeof(*uiov)))
3031                return -EFAULT;
3032
3033        len = iov[0].iov_len;
3034        if (len < 0)
3035                return -EINVAL;
3036        buf = io_rw_buffer_select(req, &len, needs_lock);
3037        if (IS_ERR(buf))
3038                return PTR_ERR(buf);
3039        iov[0].iov_base = buf;
3040        iov[0].iov_len = len;
3041        return 0;
3042}
3043
3044static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3045                                    bool needs_lock)
3046{
3047        if (req->flags & REQ_F_BUFFER_SELECTED) {
3048                struct io_buffer *kbuf;
3049
3050                kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3051                iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3052                iov[0].iov_len = kbuf->len;
3053                return 0;
3054        }
3055        if (!req->rw.len)
3056                return 0;
3057        else if (req->rw.len > 1)
3058                return -EINVAL;
3059
3060#ifdef CONFIG_COMPAT
3061        if (req->ctx->compat)
3062                return io_compat_import(req, iov, needs_lock);
3063#endif
3064
3065        return __io_iov_buffer_select(req, iov, needs_lock);
3066}
3067
3068static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
3069                                 struct iovec **iovec, struct iov_iter *iter,
3070                                 bool needs_lock)
3071{
3072        void __user *buf = u64_to_user_ptr(req->rw.addr);
3073        size_t sqe_len = req->rw.len;
3074        ssize_t ret;
3075        u8 opcode;
3076
3077        opcode = req->opcode;
3078        if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3079                *iovec = NULL;
3080                return io_import_fixed(req, rw, iter);
3081        }
3082
3083        /* buffer index only valid with fixed read/write, or buffer select  */
3084        if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
3085                return -EINVAL;
3086
3087        if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3088                if (req->flags & REQ_F_BUFFER_SELECT) {
3089                        buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
3090                        if (IS_ERR(buf))
3091                                return PTR_ERR(buf);
3092                        req->rw.len = sqe_len;
3093                }
3094
3095                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3096                *iovec = NULL;
3097                return ret < 0 ? ret : sqe_len;
3098        }
3099
3100        if (req->flags & REQ_F_BUFFER_SELECT) {
3101                ret = io_iov_buffer_select(req, *iovec, needs_lock);
3102                if (!ret) {
3103                        ret = (*iovec)->iov_len;
3104                        iov_iter_init(iter, rw, *iovec, 1, ret);
3105                }
3106                *iovec = NULL;
3107                return ret;
3108        }
3109
3110        return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3111                              req->ctx->compat);
3112}
3113
3114static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
3115                               struct iovec **iovec, struct iov_iter *iter,
3116                               bool needs_lock)
3117{
3118        struct io_async_rw *iorw = req->async_data;
3119
3120        if (!iorw)
3121                return __io_import_iovec(rw, req, iovec, iter, needs_lock);
3122        *iovec = NULL;
3123        return iov_iter_count(&iorw->iter);
3124}
3125
3126static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3127{
3128        return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3129}
3130
3131/*
3132 * For files that don't have ->read_iter() and ->write_iter(), handle them
3133 * by looping over ->read() or ->write() manually.
3134 */
3135static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3136{
3137        struct kiocb *kiocb = &req->rw.kiocb;
3138        struct file *file = req->file;
3139        ssize_t ret = 0;
3140
3141        /*
3142         * Don't support polled IO through this interface, and we can't
3143         * support non-blocking either. For the latter, this just causes
3144         * the kiocb to be handled from an async context.
3145         */
3146        if (kiocb->ki_flags & IOCB_HIPRI)
3147                return -EOPNOTSUPP;
3148        if (kiocb->ki_flags & IOCB_NOWAIT)
3149                return -EAGAIN;
3150
3151        while (iov_iter_count(iter)) {
3152                struct iovec iovec;
3153                ssize_t nr;
3154
3155                if (!iov_iter_is_bvec(iter)) {
3156                        iovec = iov_iter_iovec(iter);
3157                } else {
3158                        iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3159                        iovec.iov_len = req->rw.len;
3160                }
3161
3162                if (rw == READ) {
3163                        nr = file->f_op->read(file, iovec.iov_base,
3164                                              iovec.iov_len, io_kiocb_ppos(kiocb));
3165                } else {
3166                        nr = file->f_op->write(file, iovec.iov_base,
3167                                               iovec.iov_len, io_kiocb_ppos(kiocb));
3168                }
3169
3170                if (nr < 0) {
3171                        if (!ret)
3172                                ret = nr;
3173                        break;
3174                }
3175                ret += nr;
3176                if (nr != iovec.iov_len)
3177                        break;
3178                req->rw.len -= nr;
3179                req->rw.addr += nr;
3180                iov_iter_advance(iter, nr);
3181        }
3182
3183        return ret;
3184}
3185
3186static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3187                          const struct iovec *fast_iov, struct iov_iter *iter)
3188{
3189        struct io_async_rw *rw = req->async_data;
3190
3191        memcpy(&rw->iter, iter, sizeof(*iter));
3192        rw->free_iovec = iovec;
3193        rw->bytes_done = 0;
3194        /* can only be fixed buffers, no need to do anything */
3195        if (iov_iter_is_bvec(iter))
3196                return;
3197        if (!iovec) {
3198                unsigned iov_off = 0;
3199
3200                rw->iter.iov = rw->fast_iov;
3201                if (iter->iov != fast_iov) {
3202                        iov_off = iter->iov - fast_iov;
3203                        rw->iter.iov += iov_off;
3204                }
3205                if (rw->fast_iov != fast_iov)
3206                        memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3207                               sizeof(struct iovec) * iter->nr_segs);
3208        } else {
3209                req->flags |= REQ_F_NEED_CLEANUP;
3210        }
3211}
3212
3213static inline int __io_alloc_async_data(struct io_kiocb *req)
3214{
3215        WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3216        req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3217        return req->async_data == NULL;
3218}
3219
3220static int io_alloc_async_data(struct io_kiocb *req)
3221{
3222        if (!io_op_defs[req->opcode].needs_async_data)
3223                return 0;
3224
3225        return  __io_alloc_async_data(req);
3226}
3227
3228static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3229                             const struct iovec *fast_iov,
3230                             struct iov_iter *iter, bool force)
3231{
3232        if (!force && !io_op_defs[req->opcode].needs_async_data)
3233                return 0;
3234        if (!req->async_data) {
3235                if (__io_alloc_async_data(req))
3236                        return -ENOMEM;
3237
3238                io_req_map_rw(req, iovec, fast_iov, iter);
3239        }
3240        return 0;
3241}
3242
3243static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3244{
3245        struct io_async_rw *iorw = req->async_data;
3246        struct iovec *iov = iorw->fast_iov;
3247        ssize_t ret;
3248
3249        ret = __io_import_iovec(rw, req, &iov, &iorw->iter, false);
3250        if (unlikely(ret < 0))
3251                return ret;
3252
3253        iorw->bytes_done = 0;
3254        iorw->free_iovec = iov;
3255        if (iov)
3256                req->flags |= REQ_F_NEED_CLEANUP;
3257        return 0;
3258}
3259
3260static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3261{
3262        ssize_t ret;
3263
3264        ret = io_prep_rw(req, sqe);
3265        if (ret)
3266                return ret;
3267
3268        if (unlikely(!(req->file->f_mode & FMODE_READ)))
3269                return -EBADF;
3270
3271        /* either don't need iovec imported or already have it */
3272        if (!req->async_data)
3273                return 0;
3274        return io_rw_prep_async(req, READ);
3275}
3276
3277/*
3278 * This is our waitqueue callback handler, registered through lock_page_async()
3279 * when we initially tried to do the IO with the iocb armed our waitqueue.
3280 * This gets called when the page is unlocked, and we generally expect that to
3281 * happen when the page IO is completed and the page is now uptodate. This will
3282 * queue a task_work based retry of the operation, attempting to copy the data
3283 * again. If the latter fails because the page was NOT uptodate, then we will
3284 * do a thread based blocking retry of the operation. That's the unexpected
3285 * slow path.
3286 */
3287static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3288                             int sync, void *arg)
3289{
3290        struct wait_page_queue *wpq;
3291        struct io_kiocb *req = wait->private;
3292        struct wait_page_key *key = arg;
3293        int ret;
3294
3295        wpq = container_of(wait, struct wait_page_queue, wait);
3296
3297        if (!wake_page_match(wpq, key))
3298                return 0;
3299
3300        req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3301        list_del_init(&wait->entry);
3302
3303        init_task_work(&req->task_work, io_req_task_submit);
3304        percpu_ref_get(&req->ctx->refs);
3305
3306        /* submit ref gets dropped, acquire a new one */
3307        refcount_inc(&req->refs);
3308        ret = io_req_task_work_add(req, true);
3309        if (unlikely(ret)) {
3310                struct task_struct *tsk;
3311
3312                /* queue just for cancelation */
3313                init_task_work(&req->task_work, io_req_task_cancel);
3314                tsk = io_wq_get_task(req->ctx->io_wq);
3315                task_work_add(tsk, &req->task_work, TWA_NONE);
3316                wake_up_process(tsk);
3317        }
3318        return 1;
3319}
3320
3321/*
3322 * This controls whether a given IO request should be armed for async page
3323 * based retry. If we return false here, the request is handed to the async
3324 * worker threads for retry. If we're doing buffered reads on a regular file,
3325 * we prepare a private wait_page_queue entry and retry the operation. This
3326 * will either succeed because the page is now uptodate and unlocked, or it
3327 * will register a callback when the page is unlocked at IO completion. Through
3328 * that callback, io_uring uses task_work to setup a retry of the operation.
3329 * That retry will attempt the buffered read again. The retry will generally
3330 * succeed, or in rare cases where it fails, we then fall back to using the
3331 * async worker threads for a blocking retry.
3332 */
3333static bool io_rw_should_retry(struct io_kiocb *req)
3334{
3335        struct io_async_rw *rw = req->async_data;
3336        struct wait_page_queue *wait = &rw->wpq;
3337        struct kiocb *kiocb = &req->rw.kiocb;
3338
3339        /* never retry for NOWAIT, we just complete with -EAGAIN */
3340        if (req->flags & REQ_F_NOWAIT)
3341                return false;
3342
3343        /* Only for buffered IO */
3344        if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3345                return false;
3346
3347        /*
3348         * just use poll if we can, and don't attempt if the fs doesn't
3349         * support callback based unlocks
3350         */
3351        if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3352                return false;
3353
3354        wait->wait.func = io_async_buf_func;
3355        wait->wait.private = req;
3356        wait->wait.flags = 0;
3357        INIT_LIST_HEAD(&wait->wait.entry);
3358        kiocb->ki_flags |= IOCB_WAITQ;
3359        kiocb->ki_flags &= ~IOCB_NOWAIT;
3360        kiocb->ki_waitq = wait;
3361        return true;
3362}
3363
3364static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3365{
3366        if (req->file->f_op->read_iter)
3367                return call_read_iter(req->file, &req->rw.kiocb, iter);
3368        else if (req->file->f_op->read)
3369                return loop_rw_iter(READ, req, iter);
3370        else
3371                return -EINVAL;
3372}
3373
3374static int io_read(struct io_kiocb *req, bool force_nonblock,
3375                   struct io_comp_state *cs)
3376{
3377        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3378        struct kiocb *kiocb = &req->rw.kiocb;
3379        struct iov_iter __iter, *iter = &__iter;
3380        struct io_async_rw *rw = req->async_data;
3381        ssize_t io_size, ret, ret2;
3382        size_t iov_count;
3383        bool no_async;
3384
3385        if (rw)
3386                iter = &rw->iter;
3387
3388        ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3389        if (ret < 0)
3390                return ret;
3391        iov_count = iov_iter_count(iter);
3392        io_size = ret;
3393        req->result = io_size;
3394        ret = 0;
3395
3396        /* Ensure we clear previously set non-block flag */
3397        if (!force_nonblock)
3398                kiocb->ki_flags &= ~IOCB_NOWAIT;
3399        else
3400                kiocb->ki_flags |= IOCB_NOWAIT;
3401
3402
3403        /* If the file doesn't support async, just async punt */
3404        no_async = force_nonblock && !io_file_supports_async(req->file, READ);
3405        if (no_async)
3406                goto copy_iov;
3407
3408        ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
3409        if (unlikely(ret))
3410                goto out_free;
3411
3412        ret = io_iter_do_read(req, iter);
3413
3414        if (!ret) {
3415                goto done;
3416        } else if (ret == -EIOCBQUEUED) {
3417                ret = 0;
3418                goto out_free;
3419        } else if (ret == -EAGAIN) {
3420                /* IOPOLL retry should happen for io-wq threads */
3421                if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3422                        goto done;
3423                /* no retry on NONBLOCK marked file */
3424                if (req->file->f_flags & O_NONBLOCK)
3425                        goto done;
3426                /* some cases will consume bytes even on error returns */
3427                iov_iter_revert(iter, iov_count - iov_iter_count(iter));
3428                ret = 0;
3429                goto copy_iov;
3430        } else if (ret < 0) {
3431                /* make sure -ERESTARTSYS -> -EINTR is done */
3432                goto done;
3433        }
3434
3435        /* read it all, or we did blocking attempt. no retry. */
3436        if (!iov_iter_count(iter) || !force_nonblock ||
3437            (req->file->f_flags & O_NONBLOCK))
3438                goto done;
3439
3440        io_size -= ret;
3441copy_iov:
3442        ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3443        if (ret2) {
3444                ret = ret2;
3445                goto out_free;
3446        }
3447        if (no_async)
3448                return -EAGAIN;
3449        rw = req->async_data;
3450        /* it's copied and will be cleaned with ->io */
3451        iovec = NULL;
3452        /* now use our persistent iterator, if we aren't already */
3453        iter = &rw->iter;
3454retry:
3455        rw->bytes_done += ret;
3456        /* if we can retry, do so with the callbacks armed */
3457        if (!io_rw_should_retry(req)) {
3458                kiocb->ki_flags &= ~IOCB_WAITQ;
3459                return -EAGAIN;
3460        }
3461
3462        /*
3463         * Now retry read with the IOCB_WAITQ parts set in the iocb. If we
3464         * get -EIOCBQUEUED, then we'll get a notification when the desired
3465         * page gets unlocked. We can also get a partial read here, and if we
3466         * do, then just retry at the new offset.
3467         */
3468        ret = io_iter_do_read(req, iter);
3469        if (ret == -EIOCBQUEUED) {
3470                ret = 0;
3471                goto out_free;
3472        } else if (ret > 0 && ret < io_size) {
3473                /* we got some bytes, but not all. retry. */
3474                goto retry;
3475        }
3476done:
3477        kiocb_done(kiocb, ret, cs);
3478        ret = 0;
3479out_free:
3480        /* it's reportedly faster than delegating the null check to kfree() */
3481        if (iovec)
3482                kfree(iovec);
3483        return ret;
3484}
3485
3486static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3487{
3488        ssize_t ret;
3489
3490        ret = io_prep_rw(req, sqe);
3491        if (ret)
3492                return ret;
3493
3494        if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3495                return -EBADF;
3496
3497        /* either don't need iovec imported or already have it */
3498        if (!req->async_data)
3499                return 0;
3500        return io_rw_prep_async(req, WRITE);
3501}
3502
3503static int io_write(struct io_kiocb *req, bool force_nonblock,
3504                    struct io_comp_state *cs)
3505{
3506        struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3507        struct kiocb *kiocb = &req->rw.kiocb;
3508        struct iov_iter __iter, *iter = &__iter;
3509        struct io_async_rw *rw = req->async_data;
3510        size_t iov_count;
3511        ssize_t ret, ret2, io_size;
3512
3513        if (rw)
3514                iter = &rw->iter;
3515
3516        ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3517        if (ret < 0)
3518                return ret;
3519        iov_count = iov_iter_count(iter);
3520        io_size = ret;
3521        req->result = io_size;
3522
3523        /* Ensure we clear previously set non-block flag */
3524        if (!force_nonblock)
3525                kiocb->ki_flags &= ~IOCB_NOWAIT;
3526        else
3527                kiocb->ki_flags |= IOCB_NOWAIT;
3528
3529        /* If the file doesn't support async, just async punt */
3530        if (force_nonblock && !io_file_supports_async(req->file, WRITE))
3531                goto copy_iov;
3532
3533        /* file path doesn't support NOWAIT for non-direct_IO */
3534        if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3535            (req->flags & REQ_F_ISREG))
3536                goto copy_iov;
3537
3538        ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), iov_count);
3539        if (unlikely(ret))
3540                goto out_free;
3541
3542        /*
3543         * Open-code file_start_write here to grab freeze protection,
3544         * which will be released by another thread in
3545         * io_complete_rw().  Fool lockdep by telling it the lock got
3546         * released so that it doesn't complain about the held lock when
3547         * we return to userspace.
3548         */
3549        if (req->flags & REQ_F_ISREG) {
3550                sb_start_write(file_inode(req->file)->i_sb);
3551                __sb_writers_release(file_inode(req->file)->i_sb,
3552                                        SB_FREEZE_WRITE);
3553        }
3554        kiocb->ki_flags |= IOCB_WRITE;
3555
3556        if (req->file->f_op->write_iter)
3557                ret2 = call_write_iter(req->file, kiocb, iter);
3558        else if (req->file->f_op->write)
3559                ret2 = loop_rw_iter(WRITE, req, iter);
3560        else
3561                ret2 = -EINVAL;
3562
3563        /*
3564         * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3565         * retry them without IOCB_NOWAIT.
3566         */
3567        if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3568                ret2 = -EAGAIN;
3569        /* no retry on NONBLOCK marked file */
3570        if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
3571                goto done;
3572        if (!force_nonblock || ret2 != -EAGAIN) {
3573                /* IOPOLL retry should happen for io-wq threads */
3574                if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3575                        goto copy_iov;
3576done:
3577                kiocb_done(kiocb, ret2, cs);
3578        } else {
3579copy_iov:
3580                /* some cases will consume bytes even on error returns */
3581                iov_iter_revert(iter, iov_count - iov_iter_count(iter));
3582                ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3583                if (!ret)
3584                        return -EAGAIN;
3585        }
3586out_free:
3587        /* it's reportedly faster than delegating the null check to kfree() */
3588        if (iovec)
3589                kfree(iovec);
3590        return ret;
3591}
3592
3593static int __io_splice_prep(struct io_kiocb *req,
3594                            const struct io_uring_sqe *sqe)
3595{
3596        struct io_splice* sp = &req->splice;
3597        unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3598
3599        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3600                return -EINVAL;
3601
3602        sp->file_in = NULL;
3603        sp->len = READ_ONCE(sqe->len);
3604        sp->flags = READ_ONCE(sqe->splice_flags);
3605
3606        if (unlikely(sp->flags & ~valid_flags))
3607                return -EINVAL;
3608
3609        sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in),
3610                                  (sp->flags & SPLICE_F_FD_IN_FIXED));
3611        if (!sp->file_in)
3612                return -EBADF;
3613        req->flags |= REQ_F_NEED_CLEANUP;
3614
3615        if (!S_ISREG(file_inode(sp->file_in)->i_mode)) {
3616                /*
3617                 * Splice operation will be punted aync, and here need to
3618                 * modify io_wq_work.flags, so initialize io_wq_work firstly.
3619                 */
3620                io_req_init_async(req);
3621                req->work.flags |= IO_WQ_WORK_UNBOUND;
3622        }
3623
3624        return 0;
3625}
3626
3627static int io_tee_prep(struct io_kiocb *req,
3628                       const struct io_uring_sqe *sqe)
3629{
3630        if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
3631                return -EINVAL;
3632        return __io_splice_prep(req, sqe);
3633}
3634
3635static int io_tee(struct io_kiocb *req, bool force_nonblock)
3636{
3637        struct io_splice *sp = &req->splice;
3638        struct file *in = sp->file_in;
3639        struct file *out = sp->file_out;
3640        unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3641        long ret = 0;
3642
3643        if (force_nonblock)
3644                return -EAGAIN;
3645        if (sp->len)
3646                ret = do_tee(in, out, sp->len, flags);
3647
3648        io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3649        req->flags &= ~REQ_F_NEED_CLEANUP;
3650
3651        if (ret != sp->len)
3652                req_set_fail_links(req);
3653        io_req_complete(req, ret);
3654        return 0;
3655}
3656
3657static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3658{
3659        struct io_splice* sp = &req->splice;
3660
3661        sp->off_in = READ_ONCE(sqe->splice_off_in);
3662        sp->off_out = READ_ONCE(sqe->off);
3663        return __io_splice_prep(req, sqe);
3664}
3665
3666static int io_splice(struct io_kiocb *req, bool force_nonblock)
3667{
3668        struct io_splice *sp = &req->splice;
3669        struct file *in = sp->file_in;
3670        struct file *out = sp->file_out;
3671        unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
3672        loff_t *poff_in, *poff_out;
3673        long ret = 0;
3674
3675        if (force_nonblock)
3676                return -EAGAIN;
3677
3678        poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
3679        poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
3680
3681        if (sp->len)
3682                ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
3683
3684        io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
3685        req->flags &= ~REQ_F_NEED_CLEANUP;
3686
3687        if (ret != sp->len)
3688                req_set_fail_links(req);
3689        io_req_complete(req, ret);
3690        return 0;
3691}
3692
3693/*
3694 * IORING_OP_NOP just posts a completion event, nothing else.
3695 */
3696static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
3697{
3698        struct io_ring_ctx *ctx = req->ctx;
3699
3700        if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3701                return -EINVAL;
3702
3703        __io_req_complete(req, 0, 0, cs);
3704        return 0;
3705}
3706
3707static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3708{
3709        struct io_ring_ctx *ctx = req->ctx;
3710
3711        if (!req->file)
3712                return -EBADF;
3713
3714        if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
3715                return -EINVAL;
3716        if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
3717                return -EINVAL;
3718
3719        req->sync.flags = READ_ONCE(sqe->fsync_flags);
3720        if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
3721                return -EINVAL;
3722
3723        req->sync.off = READ_ONCE(sqe->off);
3724        req->sync.len = READ_ONCE(sqe->len);
3725        return 0;
3726}
3727
3728static int io_fsync(struct io_kiocb *req, bool force_nonblock)
3729{
3730        loff_t end = req->sync.off + req->sync.len;
3731        int ret;
3732
3733        /* fsync always requires a blocking context */
3734        if (force_nonblock)
3735                return -EAGAIN;
3736
3737        ret = vfs_fsync_range(req->file, req->sync.off,
3738                                end > 0 ? end : LLONG_MAX,
3739                                req->sync.flags & IORING_FSYNC_DATASYNC);
3740        if (ret < 0)
3741                req_set_fail_links(req);
3742        io_req_complete(req, ret);
3743        return 0;
3744}
3745
3746static int io_fallocate_prep(struct io_kiocb *req,
3747                             const struct io_uring_sqe *sqe)
3748{
3749        if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
3750                return -EINVAL;
3751        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3752                return -EINVAL;
3753
3754        req->sync.off = READ_ONCE(sqe->off);
3755        req->sync.len = READ_ONCE(sqe->addr);
3756        req->sync.mode = READ_ONCE(sqe->len);
3757        return 0;
3758}
3759
3760static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
3761{
3762        int ret;
3763
3764        /* fallocate always requiring blocking context */
3765        if (force_nonblock)
3766                return -EAGAIN;
3767        ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
3768                                req->sync.len);
3769        if (ret < 0)
3770                req_set_fail_links(req);
3771        io_req_complete(req, ret);
3772        return 0;
3773}
3774
3775static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3776{
3777        const char __user *fname;
3778        int ret;
3779
3780        if (unlikely(sqe->ioprio || sqe->buf_index))
3781                return -EINVAL;
3782        if (unlikely(req->flags & REQ_F_FIXED_FILE))
3783                return -EBADF;
3784
3785        /* open.how should be already initialised */
3786        if (!(req->open.how.flags & O_PATH) && force_o_largefile())
3787                req->open.how.flags |= O_LARGEFILE;
3788
3789        req->open.dfd = READ_ONCE(sqe->fd);
3790        fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3791        req->open.filename = getname(fname);
3792        if (IS_ERR(req->open.filename)) {
3793                ret = PTR_ERR(req->open.filename);
3794                req->open.filename = NULL;
3795                return ret;
3796        }
3797        req->open.nofile = rlimit(RLIMIT_NOFILE);
3798        req->open.ignore_nonblock = false;
3799        req->flags |= REQ_F_NEED_CLEANUP;
3800        return 0;
3801}
3802
3803static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3804{
3805        u64 flags, mode;
3806
3807        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3808                return -EINVAL;
3809        mode = READ_ONCE(sqe->len);
3810        flags = READ_ONCE(sqe->open_flags);
3811        req->open.how = build_open_how(flags, mode);
3812        return __io_openat_prep(req, sqe);
3813}
3814
3815static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3816{
3817        struct open_how __user *how;
3818        size_t len;
3819        int ret;
3820
3821        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
3822                return -EINVAL;
3823        how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3824        len = READ_ONCE(sqe->len);
3825        if (len < OPEN_HOW_SIZE_VER0)
3826                return -EINVAL;
3827
3828        ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
3829                                        len);
3830        if (ret)
3831                return ret;
3832
3833        return __io_openat_prep(req, sqe);
3834}
3835
3836static int io_openat2(struct io_kiocb *req, bool force_nonblock)
3837{
3838        struct open_flags op;
3839        struct file *file;
3840        int ret;
3841
3842        if (force_nonblock && !req->open.ignore_nonblock)
3843                return -EAGAIN;
3844
3845        ret = build_open_flags(&req->open.how, &op);
3846        if (ret)
3847                goto err;
3848
3849        ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
3850        if (ret < 0)
3851                goto err;
3852
3853        file = do_filp_open(req->open.dfd, req->open.filename, &op);
3854        if (IS_ERR(file)) {
3855                put_unused_fd(ret);
3856                ret = PTR_ERR(file);
3857                /*
3858                 * A work-around to ensure that /proc/self works that way
3859                 * that it should - if we get -EOPNOTSUPP back, then assume
3860                 * that proc_self_get_link() failed us because we're in async
3861                 * context. We should be safe to retry this from the task
3862                 * itself with force_nonblock == false set, as it should not
3863                 * block on lookup. Would be nice to know this upfront and
3864                 * avoid the async dance, but doesn't seem feasible.
3865                 */
3866                if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
3867                        req->open.ignore_nonblock = true;
3868                        refcount_inc(&req->refs);
3869                        io_req_task_queue(req);
3870                        return 0;
3871                }
3872        } else {
3873                fsnotify_open(file);
3874                fd_install(ret, file);
3875        }
3876err:
3877        putname(req->open.filename);
3878        req->flags &= ~REQ_F_NEED_CLEANUP;
3879        if (ret < 0)
3880                req_set_fail_links(req);
3881        io_req_complete(req, ret);
3882        return 0;
3883}
3884
3885static int io_openat(struct io_kiocb *req, bool force_nonblock)
3886{
3887        return io_openat2(req, force_nonblock);
3888}
3889
3890static int io_remove_buffers_prep(struct io_kiocb *req,
3891                                  const struct io_uring_sqe *sqe)
3892{
3893        struct io_provide_buf *p = &req->pbuf;
3894        u64 tmp;
3895
3896        if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off)
3897                return -EINVAL;
3898
3899        tmp = READ_ONCE(sqe->fd);
3900        if (!tmp || tmp > USHRT_MAX)
3901                return -EINVAL;
3902
3903        memset(p, 0, sizeof(*p));
3904        p->nbufs = tmp;
3905        p->bgid = READ_ONCE(sqe->buf_group);
3906        return 0;
3907}
3908
3909static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
3910                               int bgid, unsigned nbufs)
3911{
3912        unsigned i = 0;
3913
3914        /* shouldn't happen */
3915        if (!nbufs)
3916                return 0;
3917
3918        /* the head kbuf is the list itself */
3919        while (!list_empty(&buf->list)) {
3920                struct io_buffer *nxt;
3921
3922                nxt = list_first_entry(&buf->list, struct io_buffer, list);
3923                list_del(&nxt->list);
3924                kfree(nxt);
3925                if (++i == nbufs)
3926                        return i;
3927        }
3928        i++;
3929        kfree(buf);
3930        idr_remove(&ctx->io_buffer_idr, bgid);
3931
3932        return i;
3933}
3934
3935static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
3936                             struct io_comp_state *cs)
3937{
3938        struct io_provide_buf *p = &req->pbuf;
3939        struct io_ring_ctx *ctx = req->ctx;
3940        struct io_buffer *head;
3941        int ret = 0;
3942
3943        io_ring_submit_lock(ctx, !force_nonblock);
3944
3945        lockdep_assert_held(&ctx->uring_lock);
3946
3947        ret = -ENOENT;
3948        head = idr_find(&ctx->io_buffer_idr, p->bgid);
3949        if (head)
3950                ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
3951
3952        io_ring_submit_lock(ctx, !force_nonblock);
3953        if (ret < 0)
3954                req_set_fail_links(req);
3955        __io_req_complete(req, ret, 0, cs);
3956        return 0;
3957}
3958
3959static int io_provide_buffers_prep(struct io_kiocb *req,
3960                                   const struct io_uring_sqe *sqe)
3961{
3962        struct io_provide_buf *p = &req->pbuf;
3963        u64 tmp;
3964
3965        if (sqe->ioprio || sqe->rw_flags)
3966                return -EINVAL;
3967
3968        tmp = READ_ONCE(sqe->fd);
3969        if (!tmp || tmp > USHRT_MAX)
3970                return -E2BIG;
3971        p->nbufs = tmp;
3972        p->addr = READ_ONCE(sqe->addr);
3973        p->len = READ_ONCE(sqe->len);
3974
3975        if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs)))
3976                return -EFAULT;
3977
3978        p->bgid = READ_ONCE(sqe->buf_group);
3979        tmp = READ_ONCE(sqe->off);
3980        if (tmp > USHRT_MAX)
3981                return -E2BIG;
3982        p->bid = tmp;
3983        return 0;
3984}
3985
3986static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
3987{
3988        struct io_buffer *buf;
3989        u64 addr = pbuf->addr;
3990        int i, bid = pbuf->bid;
3991
3992        for (i = 0; i < pbuf->nbufs; i++) {
3993                buf = kmalloc(sizeof(*buf), GFP_KERNEL);
3994                if (!buf)
3995                        break;
3996
3997                buf->addr = addr;
3998                buf->len = pbuf->len;
3999                buf->bid = bid;
4000                addr += pbuf->len;

4001                bid++;
4002                if (!*head) {
4003                        INIT_LIST_HEAD(&buf->list);
4004                        *head = buf;
4005                } else {
4006                        list_add_tail(&buf->list, &(*head)->list);
4007                }
4008        }
4009
4010        return i ? i : -ENOMEM;
4011}
4012
4013static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
4014                              struct io_comp_state *cs)
4015{
4016        struct io_provide_buf *p = &req->pbuf;
4017        struct io_ring_ctx *ctx = req->ctx;
4018        struct io_buffer *head, *list;
4019        int ret = 0;
4020
4021        io_ring_submit_lock(ctx, !force_nonblock);
4022
4023        lockdep_assert_held(&ctx->uring_lock);
4024
4025        list = head = idr_find(&ctx->io_buffer_idr, p->bgid);
4026
4027        ret = io_add_buffers(p, &head);
4028        if (ret < 0)
4029                goto out;
4030
4031        if (!list) {
4032                ret = idr_alloc(&ctx->io_buffer_idr, head, p->bgid, p->bgid + 1,
4033                                        GFP_KERNEL);
4034                if (ret < 0) {
4035                        __io_remove_buffers(ctx, head, p->bgid, -1U);
4036                        goto out;
4037                }
4038        }
4039out:
4040        io_ring_submit_unlock(ctx, !force_nonblock);
4041        if (ret < 0)
4042                req_set_fail_links(req);
4043        __io_req_complete(req, ret, 0, cs);
4044        return 0;
4045}
4046
4047static int io_epoll_ctl_prep(struct io_kiocb *req,
4048                             const struct io_uring_sqe *sqe)
4049{
4050#if defined(CONFIG_EPOLL)
4051        if (sqe->ioprio || sqe->buf_index)
4052                return -EINVAL;
4053        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4054                return -EINVAL;
4055
4056        req->epoll.epfd = READ_ONCE(sqe->fd);
4057        req->epoll.op = READ_ONCE(sqe->len);
4058        req->epoll.fd = READ_ONCE(sqe->off);
4059
4060        if (ep_op_has_event(req->epoll.op)) {
4061                struct epoll_event __user *ev;
4062
4063                ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4064                if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4065                        return -EFAULT;
4066        }
4067
4068        return 0;
4069#else
4070        return -EOPNOTSUPP;
4071#endif
4072}
4073
4074static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
4075                        struct io_comp_state *cs)
4076{
4077#if defined(CONFIG_EPOLL)
4078        struct io_epoll *ie = &req->epoll;
4079        int ret;
4080
4081        ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4082        if (force_nonblock && ret == -EAGAIN)
4083                return -EAGAIN;
4084
4085        if (ret < 0)
4086                req_set_fail_links(req);
4087        __io_req_complete(req, ret, 0, cs);
4088        return 0;
4089#else
4090        return -EOPNOTSUPP;
4091#endif
4092}
4093
4094static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4095{
4096#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4097        if (sqe->ioprio || sqe->buf_index || sqe->off)
4098                return -EINVAL;
4099        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4100                return -EINVAL;
4101
4102        req->madvise.addr = READ_ONCE(sqe->addr);
4103        req->madvise.len = READ_ONCE(sqe->len);
4104        req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4105        return 0;
4106#else
4107        return -EOPNOTSUPP;
4108#endif
4109}
4110
4111static int io_madvise(struct io_kiocb *req, bool force_nonblock)
4112{
4113#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4114        struct io_madvise *ma = &req->madvise;
4115        int ret;
4116
4117        if (force_nonblock)
4118                return -EAGAIN;
4119
4120        ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4121        if (ret < 0)
4122                req_set_fail_links(req);
4123        io_req_complete(req, ret);
4124        return 0;
4125#else
4126        return -EOPNOTSUPP;
4127#endif
4128}
4129
4130static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4131{
4132        if (sqe->ioprio || sqe->buf_index || sqe->addr)
4133                return -EINVAL;
4134        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4135                return -EINVAL;
4136
4137        req->fadvise.offset = READ_ONCE(sqe->off);
4138        req->fadvise.len = READ_ONCE(sqe->len);
4139        req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4140        return 0;
4141}
4142
4143static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
4144{
4145        struct io_fadvise *fa = &req->fadvise;
4146        int ret;
4147
4148        if (force_nonblock) {
4149                switch (fa->advice) {
4150                case POSIX_FADV_NORMAL:
4151                case POSIX_FADV_RANDOM:
4152                case POSIX_FADV_SEQUENTIAL:
4153                        break;
4154                default:
4155                        return -EAGAIN;
4156                }
4157        }
4158
4159        ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4160        if (ret < 0)
4161                req_set_fail_links(req);
4162        io_req_complete(req, ret);
4163        return 0;
4164}
4165
4166static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4167{
4168        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
4169                return -EINVAL;
4170        if (sqe->ioprio || sqe->buf_index)
4171                return -EINVAL;
4172        if (req->flags & REQ_F_FIXED_FILE)
4173                return -EBADF;
4174
4175        req->statx.dfd = READ_ONCE(sqe->fd);
4176        req->statx.mask = READ_ONCE(sqe->len);
4177        req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4178        req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4179        req->statx.flags = READ_ONCE(sqe->statx_flags);
4180
4181        return 0;
4182}
4183
4184static int io_statx(struct io_kiocb *req, bool force_nonblock)
4185{
4186        struct io_statx *ctx = &req->statx;
4187        int ret;
4188
4189        if (force_nonblock) {
4190                /* only need file table for an actual valid fd */
4191                if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
4192                        req->flags |= REQ_F_NO_FILE_TABLE;
4193                return -EAGAIN;
4194        }
4195
4196        ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4197                       ctx->buffer);
4198
4199        if (ret < 0)
4200                req_set_fail_links(req);
4201        io_req_complete(req, ret);
4202        return 0;
4203}
4204
4205static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4206{
4207        /*
4208         * If we queue this for async, it must not be cancellable. That would
4209         * leave the 'file' in an undeterminate state, and here need to modify
4210         * io_wq_work.flags, so initialize io_wq_work firstly.
4211         */
4212        io_req_init_async(req);
4213        req->work.flags |= IO_WQ_WORK_NO_CANCEL;
4214
4215        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4216                return -EINVAL;
4217        if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4218            sqe->rw_flags || sqe->buf_index)
4219                return -EINVAL;
4220        if (req->flags & REQ_F_FIXED_FILE)
4221                return -EBADF;
4222
4223        req->close.fd = READ_ONCE(sqe->fd);
4224        if ((req->file && req->file->f_op == &io_uring_fops))
4225                return -EBADF;
4226
4227        req->close.put_file = NULL;
4228        return 0;
4229}
4230
4231static int io_close(struct io_kiocb *req, bool force_nonblock,
4232                    struct io_comp_state *cs)
4233{
4234        struct io_close *close = &req->close;
4235        int ret;
4236
4237        /* might be already done during nonblock submission */
4238        if (!close->put_file) {
4239                ret = __close_fd_get_file(close->fd, &close->put_file);
4240                if (ret < 0)
4241                        return (ret == -ENOENT) ? -EBADF : ret;
4242        }
4243
4244        /* if the file has a flush method, be safe and punt to async */
4245        if (close->put_file->f_op->flush && force_nonblock) {
4246                /* was never set, but play safe */
4247                req->flags &= ~REQ_F_NOWAIT;
4248                /* avoid grabbing files - we don't need the files */
4249                req->flags |= REQ_F_NO_FILE_TABLE;
4250                return -EAGAIN;
4251        }
4252
4253        /* No ->flush() or already async, safely close from here */
4254        ret = filp_close(close->put_file, req->work.identity->files);
4255        if (ret < 0)
4256                req_set_fail_links(req);
4257        fput(close->put_file);
4258        close->put_file = NULL;
4259        __io_req_complete(req, ret, 0, cs);
4260        return 0;
4261}
4262
4263static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4264{
4265        struct io_ring_ctx *ctx = req->ctx;
4266
4267        if (!req->file)
4268                return -EBADF;
4269
4270        if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4271                return -EINVAL;
4272        if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
4273                return -EINVAL;
4274
4275        req->sync.off = READ_ONCE(sqe->off);
4276        req->sync.len = READ_ONCE(sqe->len);
4277        req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4278        return 0;
4279}
4280
4281static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
4282{
4283        int ret;
4284
4285        /* sync_file_range always requires a blocking context */
4286        if (force_nonblock)
4287                return -EAGAIN;
4288
4289        ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4290                                req->sync.flags);
4291        if (ret < 0)
4292                req_set_fail_links(req);
4293        io_req_complete(req, ret);
4294        return 0;
4295}
4296
4297#if defined(CONFIG_NET)
4298static int io_setup_async_msg(struct io_kiocb *req,
4299                              struct io_async_msghdr *kmsg)
4300{
4301        struct io_async_msghdr *async_msg = req->async_data;
4302
4303        if (async_msg)
4304                return -EAGAIN;
4305        if (io_alloc_async_data(req)) {
4306                if (kmsg->iov != kmsg->fast_iov)
4307                        kfree(kmsg->iov);
4308                return -ENOMEM;
4309        }
4310        async_msg = req->async_data;
4311        req->flags |= REQ_F_NEED_CLEANUP;
4312        memcpy(async_msg, kmsg, sizeof(*kmsg));
4313        return -EAGAIN;
4314}
4315
4316static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4317                               struct io_async_msghdr *iomsg)
4318{
4319        iomsg->iov = iomsg->fast_iov;
4320        iomsg->msg.msg_name = &iomsg->addr;
4321        return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4322                                   req->sr_msg.msg_flags, &iomsg->iov);
4323}
4324
4325static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4326{
4327        struct io_async_msghdr *async_msg = req->async_data;
4328        struct io_sr_msg *sr = &req->sr_msg;
4329        int ret;
4330
4331        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4332                return -EINVAL;
4333
4334        sr->msg_flags = READ_ONCE(sqe->msg_flags);
4335        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4336        sr->len = READ_ONCE(sqe->len);
4337
4338#ifdef CONFIG_COMPAT
4339        if (req->ctx->compat)
4340                sr->msg_flags |= MSG_CMSG_COMPAT;
4341#endif
4342
4343        if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
4344                return 0;
4345        ret = io_sendmsg_copy_hdr(req, async_msg);
4346        if (!ret)
4347                req->flags |= REQ_F_NEED_CLEANUP;
4348        return ret;
4349}
4350
4351static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
4352                      struct io_comp_state *cs)
4353{
4354        struct io_async_msghdr iomsg, *kmsg;
4355        struct socket *sock;
4356        unsigned flags;
4357        int ret;
4358
4359        sock = sock_from_file(req->file, &ret);
4360        if (unlikely(!sock))
4361                return ret;
4362
4363        if (req->async_data) {
4364                kmsg = req->async_data;
4365                kmsg->msg.msg_name = &kmsg->addr;
4366                /* if iov is set, it's allocated already */
4367                if (!kmsg->iov)
4368                        kmsg->iov = kmsg->fast_iov;
4369                kmsg->msg.msg_iter.iov = kmsg->iov;
4370        } else {
4371                ret = io_sendmsg_copy_hdr(req, &iomsg);
4372                if (ret)
4373                        return ret;
4374                kmsg = &iomsg;
4375        }
4376
4377        flags = req->sr_msg.msg_flags;
4378        if (flags & MSG_DONTWAIT)
4379                req->flags |= REQ_F_NOWAIT;
4380        else if (force_nonblock)
4381                flags |= MSG_DONTWAIT;
4382
4383        ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4384        if (force_nonblock && ret == -EAGAIN)
4385                return io_setup_async_msg(req, kmsg);
4386        if (ret == -ERESTARTSYS)
4387                ret = -EINTR;
4388
4389        if (kmsg->iov != kmsg->fast_iov)
4390                kfree(kmsg->iov);
4391        req->flags &= ~REQ_F_NEED_CLEANUP;
4392        if (ret < 0)
4393                req_set_fail_links(req);
4394        __io_req_complete(req, ret, 0, cs);
4395        return 0;
4396}
4397
4398static int io_send(struct io_kiocb *req, bool force_nonblock,
4399                   struct io_comp_state *cs)
4400{
4401        struct io_sr_msg *sr = &req->sr_msg;
4402        struct msghdr msg;
4403        struct iovec iov;
4404        struct socket *sock;
4405        unsigned flags;
4406        int ret;
4407
4408        sock = sock_from_file(req->file, &ret);
4409        if (unlikely(!sock))
4410                return ret;
4411
4412        ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4413        if (unlikely(ret))
4414                return ret;
4415
4416        msg.msg_name = NULL;
4417        msg.msg_control = NULL;
4418        msg.msg_controllen = 0;
4419        msg.msg_namelen = 0;
4420
4421        flags = req->sr_msg.msg_flags;
4422        if (flags & MSG_DONTWAIT)
4423                req->flags |= REQ_F_NOWAIT;
4424        else if (force_nonblock)
4425                flags |= MSG_DONTWAIT;
4426
4427        msg.msg_flags = flags;
4428        ret = sock_sendmsg(sock, &msg);
4429        if (force_nonblock && ret == -EAGAIN)
4430                return -EAGAIN;
4431        if (ret == -ERESTARTSYS)
4432                ret = -EINTR;
4433
4434        if (ret < 0)
4435                req_set_fail_links(req);
4436        __io_req_complete(req, ret, 0, cs);
4437        return 0;
4438}
4439
4440static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4441                                 struct io_async_msghdr *iomsg)
4442{
4443        struct io_sr_msg *sr = &req->sr_msg;
4444        struct iovec __user *uiov;
4445        size_t iov_len;
4446        int ret;
4447
4448        ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4449                                        &iomsg->uaddr, &uiov, &iov_len);
4450        if (ret)
4451                return ret;
4452
4453        if (req->flags & REQ_F_BUFFER_SELECT) {
4454                if (iov_len > 1)
4455                        return -EINVAL;
4456                if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
4457                        return -EFAULT;
4458                sr->len = iomsg->iov[0].iov_len;
4459                iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
4460                                sr->len);
4461                iomsg->iov = NULL;
4462        } else {
4463                ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4464                                     &iomsg->iov, &iomsg->msg.msg_iter,
4465                                     false);
4466                if (ret > 0)
4467                        ret = 0;
4468        }
4469
4470        return ret;
4471}
4472
4473#ifdef CONFIG_COMPAT
4474static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4475                                        struct io_async_msghdr *iomsg)
4476{
4477        struct compat_msghdr __user *msg_compat;
4478        struct io_sr_msg *sr = &req->sr_msg;
4479        struct compat_iovec __user *uiov;
4480        compat_uptr_t ptr;
4481        compat_size_t len;
4482        int ret;
4483
4484        msg_compat = (struct compat_msghdr __user *) sr->umsg;
4485        ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
4486                                        &ptr, &len);
4487        if (ret)
4488                return ret;
4489
4490        uiov = compat_ptr(ptr);
4491        if (req->flags & REQ_F_BUFFER_SELECT) {
4492                compat_ssize_t clen;
4493
4494                if (len > 1)
4495                        return -EINVAL;
4496                if (!access_ok(uiov, sizeof(*uiov)))
4497                        return -EFAULT;
4498                if (__get_user(clen, &uiov->iov_len))
4499                        return -EFAULT;
4500                if (clen < 0)
4501                        return -EINVAL;
4502                sr->len = clen;
4503                iomsg->iov[0].iov_len = clen;
4504                iomsg->iov = NULL;
4505        } else {
4506                ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4507                                   UIO_FASTIOV, &iomsg->iov,
4508                                   &iomsg->msg.msg_iter, true);
4509                if (ret < 0)
4510                        return ret;
4511        }
4512
4513        return 0;
4514}
4515#endif
4516
4517static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4518                               struct io_async_msghdr *iomsg)
4519{
4520        iomsg->msg.msg_name = &iomsg->addr;
4521        iomsg->iov = iomsg->fast_iov;
4522
4523#ifdef CONFIG_COMPAT
4524        if (req->ctx->compat)
4525                return __io_compat_recvmsg_copy_hdr(req, iomsg);
4526#endif
4527
4528        return __io_recvmsg_copy_hdr(req, iomsg);
4529}
4530
4531static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4532                                               bool needs_lock)
4533{
4534        struct io_sr_msg *sr = &req->sr_msg;
4535        struct io_buffer *kbuf;
4536
4537        kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
4538        if (IS_ERR(kbuf))
4539                return kbuf;
4540
4541        sr->kbuf = kbuf;
4542        req->flags |= REQ_F_BUFFER_SELECTED;
4543        return kbuf;
4544}
4545
4546static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
4547{
4548        return io_put_kbuf(req, req->sr_msg.kbuf);
4549}
4550
4551static int io_recvmsg_prep(struct io_kiocb *req,
4552                           const struct io_uring_sqe *sqe)
4553{
4554        struct io_async_msghdr *async_msg = req->async_data;
4555        struct io_sr_msg *sr = &req->sr_msg;
4556        int ret;
4557
4558        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4559                return -EINVAL;
4560
4561        sr->msg_flags = READ_ONCE(sqe->msg_flags);
4562        sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4563        sr->len = READ_ONCE(sqe->len);
4564        sr->bgid = READ_ONCE(sqe->buf_group);
4565
4566#ifdef CONFIG_COMPAT
4567        if (req->ctx->compat)
4568                sr->msg_flags |= MSG_CMSG_COMPAT;
4569#endif
4570
4571        if (!async_msg || !io_op_defs[req->opcode].needs_async_data)
4572                return 0;
4573        ret = io_recvmsg_copy_hdr(req, async_msg);
4574        if (!ret)
4575                req->flags |= REQ_F_NEED_CLEANUP;
4576        return ret;
4577}
4578
4579static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
4580                      struct io_comp_state *cs)
4581{
4582        struct io_async_msghdr iomsg, *kmsg;
4583        struct socket *sock;
4584        struct io_buffer *kbuf;
4585        unsigned flags;
4586        int ret, cflags = 0;
4587
4588        sock = sock_from_file(req->file, &ret);
4589        if (unlikely(!sock))
4590                return ret;
4591
4592        if (req->async_data) {
4593                kmsg = req->async_data;
4594                kmsg->msg.msg_name = &kmsg->addr;
4595                /* if iov is set, it's allocated already */
4596                if (!kmsg->iov)
4597                        kmsg->iov = kmsg->fast_iov;
4598                kmsg->msg.msg_iter.iov = kmsg->iov;
4599        } else {
4600                ret = io_recvmsg_copy_hdr(req, &iomsg);
4601                if (ret)
4602                        return ret;
4603                kmsg = &iomsg;
4604        }
4605
4606        if (req->flags & REQ_F_BUFFER_SELECT) {
4607                kbuf = io_recv_buffer_select(req, !force_nonblock);
4608                if (IS_ERR(kbuf))
4609                        return PTR_ERR(kbuf);
4610                kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
4611                iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
4612                                1, req->sr_msg.len);
4613        }
4614
4615        flags = req->sr_msg.msg_flags;
4616        if (flags & MSG_DONTWAIT)
4617                req->flags |= REQ_F_NOWAIT;
4618        else if (force_nonblock)
4619                flags |= MSG_DONTWAIT;
4620
4621        ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
4622                                        kmsg->uaddr, flags);
4623        if (force_nonblock && ret == -EAGAIN)
4624                return io_setup_async_msg(req, kmsg);
4625        if (ret == -ERESTARTSYS)
4626                ret = -EINTR;
4627
4628        if (req->flags & REQ_F_BUFFER_SELECTED)
4629                cflags = io_put_recv_kbuf(req);
4630        if (kmsg->iov != kmsg->fast_iov)
4631                kfree(kmsg->iov);
4632        req->flags &= ~REQ_F_NEED_CLEANUP;
4633        if (ret < 0)
4634                req_set_fail_links(req);
4635        __io_req_complete(req, ret, cflags, cs);
4636        return 0;
4637}
4638
4639static int io_recv(struct io_kiocb *req, bool force_nonblock,
4640                   struct io_comp_state *cs)
4641{
4642        struct io_buffer *kbuf;
4643        struct io_sr_msg *sr = &req->sr_msg;
4644        struct msghdr msg;
4645        void __user *buf = sr->buf;
4646        struct socket *sock;
4647        struct iovec iov;
4648        unsigned flags;
4649        int ret, cflags = 0;
4650
4651        sock = sock_from_file(req->file, &ret);
4652        if (unlikely(!sock))
4653                return ret;
4654
4655        if (req->flags & REQ_F_BUFFER_SELECT) {
4656                kbuf = io_recv_buffer_select(req, !force_nonblock);
4657                if (IS_ERR(kbuf))
4658                        return PTR_ERR(kbuf);
4659                buf = u64_to_user_ptr(kbuf->addr);
4660        }
4661
4662        ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
4663        if (unlikely(ret))
4664                goto out_free;
4665
4666        msg.msg_name = NULL;
4667        msg.msg_control = NULL;
4668        msg.msg_controllen = 0;
4669        msg.msg_namelen = 0;
4670        msg.msg_iocb = NULL;
4671        msg.msg_flags = 0;
4672
4673        flags = req->sr_msg.msg_flags;
4674        if (flags & MSG_DONTWAIT)
4675                req->flags |= REQ_F_NOWAIT;
4676        else if (force_nonblock)
4677                flags |= MSG_DONTWAIT;
4678
4679        ret = sock_recvmsg(sock, &msg, flags);
4680        if (force_nonblock && ret == -EAGAIN)
4681                return -EAGAIN;
4682        if (ret == -ERESTARTSYS)
4683                ret = -EINTR;
4684out_free:
4685        if (req->flags & REQ_F_BUFFER_SELECTED)
4686                cflags = io_put_recv_kbuf(req);
4687        if (ret < 0)
4688                req_set_fail_links(req);
4689        __io_req_complete(req, ret, cflags, cs);
4690        return 0;
4691}
4692
4693static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4694{
4695        struct io_accept *accept = &req->accept;
4696
4697        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4698                return -EINVAL;
4699        if (sqe->ioprio || sqe->len || sqe->buf_index)
4700                return -EINVAL;
4701
4702        accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4703        accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4704        accept->flags = READ_ONCE(sqe->accept_flags);
4705        accept->nofile = rlimit(RLIMIT_NOFILE);
4706        return 0;
4707}
4708
4709static int io_accept(struct io_kiocb *req, bool force_nonblock,
4710                     struct io_comp_state *cs)
4711{
4712        struct io_accept *accept = &req->accept;
4713        unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
4714        int ret;
4715
4716        if (req->file->f_flags & O_NONBLOCK)
4717                req->flags |= REQ_F_NOWAIT;
4718
4719        ret = __sys_accept4_file(req->file, file_flags, accept->addr,
4720                                        accept->addr_len, accept->flags,
4721                                        accept->nofile);
4722        if (ret == -EAGAIN && force_nonblock)
4723                return -EAGAIN;
4724        if (ret < 0) {
4725                if (ret == -ERESTARTSYS)
4726                        ret = -EINTR;
4727                req_set_fail_links(req);
4728        }
4729        __io_req_complete(req, ret, 0, cs);
4730        return 0;
4731}
4732
4733static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4734{
4735        struct io_connect *conn = &req->connect;
4736        struct io_async_connect *io = req->async_data;
4737
4738        if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
4739                return -EINVAL;
4740        if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
4741                return -EINVAL;
4742
4743        conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
4744        conn->addr_len =  READ_ONCE(sqe->addr2);
4745
4746        if (!io)
4747                return 0;
4748
4749        return move_addr_to_kernel(conn->addr, conn->addr_len,
4750                                        &io->address);
4751}
4752
4753static int io_connect(struct io_kiocb *req, bool force_nonblock,
4754                      struct io_comp_state *cs)
4755{
4756        struct io_async_connect __io, *io;
4757        unsigned file_flags;
4758        int ret;
4759
4760        if (req->async_data) {
4761                io = req->async_data;
4762        } else {
4763                ret = move_addr_to_kernel(req->connect.addr,
4764                                                req->connect.addr_len,
4765                                                &__io.address);
4766                if (ret)
4767                        goto out;
4768                io = &__io;
4769        }
4770
4771        file_flags = force_nonblock ? O_NONBLOCK : 0;
4772
4773        ret = __sys_connect_file(req->file, &io->address,
4774                                        req->connect.addr_len, file_flags);
4775        if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
4776                if (req->async_data)
4777                        return -EAGAIN;
4778                if (io_alloc_async_data(req)) {
4779                        ret = -ENOMEM;
4780                        goto out;
4781                }
4782                io = req->async_data;
4783                memcpy(req->async_data, &__io, sizeof(__io));
4784                return -EAGAIN;
4785        }
4786        if (ret == -ERESTARTSYS)
4787                ret = -EINTR;
4788out:
4789        if (ret < 0)
4790                req_set_fail_links(req);
4791        __io_req_complete(req, ret, 0, cs);
4792        return 0;
4793}
4794#else /* !CONFIG_NET */
4795static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4796{
4797        return -EOPNOTSUPP;
4798}
4799
4800static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
4801                      struct io_comp_state *cs)
4802{
4803        return -EOPNOTSUPP;
4804}
4805
4806static int io_send(struct io_kiocb *req, bool force_nonblock,
4807                   struct io_comp_state *cs)
4808{
4809        return -EOPNOTSUPP;
4810}
4811
4812static int io_recvmsg_prep(struct io_kiocb *req,
4813                           const struct io_uring_sqe *sqe)
4814{
4815        return -EOPNOTSUPP;
4816}
4817
4818static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
4819                      struct io_comp_state *cs)
4820{
4821        return -EOPNOTSUPP;
4822}
4823
4824static int io_recv(struct io_kiocb *req, bool force_nonblock,
4825                   struct io_comp_state *cs)
4826{
4827        return -EOPNOTSUPP;
4828}
4829
4830static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4831{
4832        return -EOPNOTSUPP;
4833}
4834
4835static int io_accept(struct io_kiocb *req, bool force_nonblock,
4836                     struct io_comp_state *cs)
4837{
4838        return -EOPNOTSUPP;
4839}
4840
4841static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4842{
4843        return -EOPNOTSUPP;
4844}
4845
4846static int io_connect(struct io_kiocb *req, bool force_nonblock,
4847                      struct io_comp_state *cs)
4848{
4849        return -EOPNOTSUPP;
4850}
4851#endif /* CONFIG_NET */
4852
4853struct io_poll_table {
4854        struct poll_table_struct pt;
4855        struct io_kiocb *req;
4856        int error;
4857};
4858
4859static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
4860                           __poll_t mask, task_work_func_t func)
4861{
4862        bool twa_signal_ok;
4863        int ret;
4864
4865        /* for instances that support it check for an event match first: */
4866        if (mask && !(mask & poll->events))
4867                return 0;
4868
4869        trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
4870
4871        list_del_init(&poll->wait.entry);
4872
4873        req->result = mask;
4874        init_task_work(&req->task_work, func);
4875        percpu_ref_get(&req->ctx->refs);
4876
4877        /*
4878         * If we using the signalfd wait_queue_head for this wakeup, then
4879         * it's not safe to use TWA_SIGNAL as we could be recursing on the
4880         * tsk->sighand->siglock on doing the wakeup. Should not be needed
4881         * either, as the normal wakeup will suffice.
4882         */
4883        twa_signal_ok = (poll->head != &req->task->sighand->signalfd_wqh);
4884
4885        /*
4886         * If this fails, then the task is exiting. When a task exits, the
4887         * work gets canceled, so just cancel this request as well instead
4888         * of executing it. We can't safely execute it anyway, as we may not
4889         * have the needed state needed for it anyway.
4890         */
4891        ret = io_req_task_work_add(req, twa_signal_ok);
4892        if (unlikely(ret)) {
4893                struct task_struct *tsk;
4894
4895                WRITE_ONCE(poll->canceled, true);
4896                tsk = io_wq_get_task(req->ctx->io_wq);
4897                task_work_add(tsk, &req->task_work, TWA_NONE);
4898                wake_up_process(tsk);
4899        }
4900        return 1;
4901}
4902
4903static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
4904        __acquires(&req->ctx->completion_lock)
4905{
4906        struct io_ring_ctx *ctx = req->ctx;
4907
4908        if (!req->result && !READ_ONCE(poll->canceled)) {
4909                struct poll_table_struct pt = { ._key = poll->events };
4910
4911                req->result = vfs_poll(req->file, &pt) & poll->events;
4912        }
4913
4914        spin_lock_irq(&ctx->completion_lock);
4915        if (!req->result && !READ_ONCE(poll->canceled)) {
4916                add_wait_queue(poll->head, &poll->wait);
4917                return true;
4918        }
4919
4920        return false;
4921}
4922
4923static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
4924{
4925        /* pure poll stashes this in ->async_data, poll driven retry elsewhere */
4926        if (req->opcode == IORING_OP_POLL_ADD)
4927                return req->async_data;
4928        return req->apoll->double_poll;
4929}
4930
4931static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
4932{
4933        if (req->opcode == IORING_OP_POLL_ADD)
4934                return &req->poll;
4935        return &req->apoll->poll;
4936}
4937
4938static void io_poll_remove_double(struct io_kiocb *req)
4939{
4940        struct io_poll_iocb *poll = io_poll_get_double(req);
4941
4942        lockdep_assert_held(&req->ctx->completion_lock);
4943
4944        if (poll && poll->head) {
4945                struct wait_queue_head *head = poll->head;
4946
4947                spin_lock(&head->lock);
4948                list_del_init(&poll->wait.entry);
4949                if (poll->wait.private)
4950                        refcount_dec(&req->refs);
4951                poll->head = NULL;
4952                spin_unlock(&head->lock);
4953        }
4954}
4955
4956static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
4957{
4958        struct io_ring_ctx *ctx = req->ctx;
4959
4960        io_poll_remove_double(req);
4961        req->poll.done = true;
4962        io_cqring_fill_event(req, error ? error : mangle_poll(mask));
4963        io_commit_cqring(ctx);
4964}
4965
4966static void io_poll_task_func(struct callback_head *cb)
4967{
4968        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
4969        struct io_ring_ctx *ctx = req->ctx;
4970        struct io_kiocb *nxt;
4971
4972        if (io_poll_rewait(req, &req->poll)) {
4973                spin_unlock_irq(&ctx->completion_lock);
4974        } else {
4975                hash_del(&req->hash_node);
4976                io_poll_complete(req, req->result, 0);
4977                spin_unlock_irq(&ctx->completion_lock);
4978
4979                nxt = io_put_req_find_next(req);
4980                io_cqring_ev_posted(ctx);
4981                if (nxt)
4982                        __io_req_task_submit(nxt);
4983        }
4984
4985        percpu_ref_put(&ctx->refs);
4986}
4987
4988static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
4989                               int sync, void *key)
4990{
4991        struct io_kiocb *req = wait->private;
4992        struct io_poll_iocb *poll = io_poll_get_single(req);
4993        __poll_t mask = key_to_poll(key);
4994
4995        /* for instances that support it check for an event match first: */
4996        if (mask && !(mask & poll->events))
4997                return 0;
4998
4999        list_del_init(&wait->entry);
5000

5001        if (poll && poll->head) {
5002                bool done;
5003
5004                spin_lock(&poll->head->lock);
5005                done = list_empty(&poll->wait.entry);
5006                if (!done)
5007                        list_del_init(&poll->wait.entry);
5008                /* make sure double remove sees this as being gone */
5009                wait->private = NULL;
5010                spin_unlock(&poll->head->lock);
5011                if (!done) {
5012                        /* use wait func handler, so it matches the rq type */
5013                        poll->wait.func(&poll->wait, mode, sync, key);
5014                }
5015        }
5016        refcount_dec(&req->refs);
5017        return 1;
5018}
5019
5020static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5021                              wait_queue_func_t wake_func)
5022{
5023        poll->head = NULL;
5024        poll->done = false;
5025        poll->canceled = false;
5026        poll->events = events;
5027        INIT_LIST_HEAD(&poll->wait.entry);
5028        init_waitqueue_func_entry(&poll->wait, wake_func);
5029}
5030
5031static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
5032                            struct wait_queue_head *head,
5033                            struct io_poll_iocb **poll_ptr)
5034{
5035        struct io_kiocb *req = pt->req;
5036
5037        /*
5038         * If poll->head is already set, it's because the file being polled
5039         * uses multiple waitqueues for poll handling (eg one for read, one
5040         * for write). Setup a separate io_poll_iocb if this happens.
5041         */
5042        if (unlikely(poll->head)) {
5043                struct io_poll_iocb *poll_one = poll;
5044
5045                /* already have a 2nd entry, fail a third attempt */
5046                if (*poll_ptr) {
5047                        pt->error = -EINVAL;
5048                        return;
5049                }
5050                poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5051                if (!poll) {
5052                        pt->error = -ENOMEM;
5053                        return;
5054                }
5055                io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake);
5056                refcount_inc(&req->refs);
5057                poll->wait.private = req;
5058                *poll_ptr = poll;
5059        }
5060
5061        pt->error = 0;
5062        poll->head = head;
5063
5064        if (poll->events & EPOLLEXCLUSIVE)
5065                add_wait_queue_exclusive(head, &poll->wait);
5066        else
5067                add_wait_queue(head, &poll->wait);
5068}
5069
5070static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5071                               struct poll_table_struct *p)
5072{
5073        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5074        struct async_poll *apoll = pt->req->apoll;
5075
5076        __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5077}
5078
5079static void io_async_task_func(struct callback_head *cb)
5080{
5081        struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
5082        struct async_poll *apoll = req->apoll;
5083        struct io_ring_ctx *ctx = req->ctx;
5084
5085        trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
5086
5087        if (io_poll_rewait(req, &apoll->poll)) {
5088                spin_unlock_irq(&ctx->completion_lock);
5089                percpu_ref_put(&ctx->refs);
5090                return;
5091        }
5092
5093        /* If req is still hashed, it cannot have been canceled. Don't check. */
5094        if (hash_hashed(&req->hash_node))
5095                hash_del(&req->hash_node);
5096
5097        io_poll_remove_double(req);
5098        spin_unlock_irq(&ctx->completion_lock);
5099
5100        if (!READ_ONCE(apoll->poll.canceled))
5101                __io_req_task_submit(req);
5102        else
5103                __io_req_task_cancel(req, -ECANCELED);
5104
5105        percpu_ref_put(&ctx->refs);
5106        kfree(apoll->double_poll);
5107        kfree(apoll);
5108}
5109
5110static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5111                        void *key)
5112{
5113        struct io_kiocb *req = wait->private;
5114        struct io_poll_iocb *poll = &req->apoll->poll;
5115
5116        trace_io_uring_poll_wake(req->ctx, req->opcode, req->user_data,
5117                                        key_to_poll(key));
5118
5119        return __io_async_wake(req, poll, key_to_poll(key), io_async_task_func);
5120}
5121
5122static void io_poll_req_insert(struct io_kiocb *req)
5123{
5124        struct io_ring_ctx *ctx = req->ctx;
5125        struct hlist_head *list;
5126
5127        list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5128        hlist_add_head(&req->hash_node, list);
5129}
5130
5131static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
5132                                      struct io_poll_iocb *poll,
5133                                      struct io_poll_table *ipt, __poll_t mask,
5134                                      wait_queue_func_t wake_func)
5135        __acquires(&ctx->completion_lock)
5136{
5137        struct io_ring_ctx *ctx = req->ctx;
5138        bool cancel = false;
5139
5140        INIT_HLIST_NODE(&req->hash_node);
5141        io_init_poll_iocb(poll, mask, wake_func);
5142        poll->file = req->file;
5143        poll->wait.private = req;
5144
5145        ipt->pt._key = mask;
5146        ipt->req = req;
5147        ipt->error = -EINVAL;
5148
5149        mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5150
5151        spin_lock_irq(&ctx->completion_lock);
5152        if (likely(poll->head)) {
5153                spin_lock(&poll->head->lock);
5154                if (unlikely(list_empty(&poll->wait.entry))) {
5155                        if (ipt->error)
5156                                cancel = true;
5157                        ipt->error = 0;
5158                        mask = 0;
5159                }
5160                if (mask || ipt->error)
5161                        list_del_init(&poll->wait.entry);
5162                else if (cancel)
5163                        WRITE_ONCE(poll->canceled, true);
5164                else if (!poll->done) /* actually waiting for an event */
5165                        io_poll_req_insert(req);
5166                spin_unlock(&poll->head->lock);
5167        }
5168
5169        return mask;
5170}
5171
5172static bool io_arm_poll_handler(struct io_kiocb *req)
5173{
5174        const struct io_op_def *def = &io_op_defs[req->opcode];
5175        struct io_ring_ctx *ctx = req->ctx;
5176        struct async_poll *apoll;
5177        struct io_poll_table ipt;
5178        __poll_t mask, ret;
5179        int rw;
5180
5181        if (!req->file || !file_can_poll(req->file))
5182                return false;
5183        if (req->flags & REQ_F_POLLED)
5184                return false;
5185        if (def->pollin)
5186                rw = READ;
5187        else if (def->pollout)
5188                rw = WRITE;
5189        else
5190                return false;
5191        /* if we can't nonblock try, then no point in arming a poll handler */
5192        if (!io_file_supports_async(req->file, rw))
5193                return false;
5194
5195        apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5196        if (unlikely(!apoll))
5197                return false;
5198        apoll->double_poll = NULL;
5199
5200        req->flags |= REQ_F_POLLED;
5201        req->apoll = apoll;
5202
5203        mask = 0;
5204        if (def->pollin)
5205                mask |= POLLIN | POLLRDNORM;
5206        if (def->pollout)
5207                mask |= POLLOUT | POLLWRNORM;
5208
5209        /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5210        if ((req->opcode == IORING_OP_RECVMSG) &&
5211            (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5212                mask &= ~POLLIN;
5213
5214        mask |= POLLERR | POLLPRI;
5215
5216        ipt.pt._qproc = io_async_queue_proc;
5217
5218        ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
5219                                        io_async_wake);
5220        if (ret || ipt.error) {
5221                io_poll_remove_double(req);
5222                spin_unlock_irq(&ctx->completion_lock);
5223                kfree(apoll->double_poll);
5224                kfree(apoll);
5225                return false;
5226        }
5227        spin_unlock_irq(&ctx->completion_lock);
5228        trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask,
5229                                        apoll->poll.events);
5230        return true;
5231}
5232
5233static bool __io_poll_remove_one(struct io_kiocb *req,
5234                                 struct io_poll_iocb *poll)
5235{
5236        bool do_complete = false;
5237
5238        spin_lock(&poll->head->lock);
5239        WRITE_ONCE(poll->canceled, true);
5240        if (!list_empty(&poll->wait.entry)) {
5241                list_del_init(&poll->wait.entry);
5242                do_complete = true;
5243        }
5244        spin_unlock(&poll->head->lock);
5245        hash_del(&req->hash_node);
5246        return do_complete;
5247}
5248
5249static bool io_poll_remove_one(struct io_kiocb *req)
5250{
5251        bool do_complete;
5252
5253        io_poll_remove_double(req);
5254
5255        if (req->opcode == IORING_OP_POLL_ADD) {
5256                do_complete = __io_poll_remove_one(req, &req->poll);
5257        } else {
5258                struct async_poll *apoll = req->apoll;
5259
5260                /* non-poll requests have submit ref still */
5261                do_complete = __io_poll_remove_one(req, &apoll->poll);
5262                if (do_complete) {
5263                        io_put_req(req);
5264                        kfree(apoll->double_poll);
5265                        kfree(apoll);
5266                }
5267        }
5268
5269        if (do_complete) {
5270                io_cqring_fill_event(req, -ECANCELED);
5271                io_commit_cqring(req->ctx);
5272                req_set_fail_links(req);
5273                io_put_req_deferred(req, 1);
5274        }
5275
5276        return do_complete;
5277}
5278
5279/*
5280 * Returns true if we found and killed one or more poll requests
5281 */
5282static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk)
5283{
5284        struct hlist_node *tmp;
5285        struct io_kiocb *req;
5286        int posted = 0, i;
5287
5288        spin_lock_irq(&ctx->completion_lock);
5289        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5290                struct hlist_head *list;
5291
5292                list = &ctx->cancel_hash[i];
5293                hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5294                        if (io_task_match(req, tsk))
5295                                posted += io_poll_remove_one(req);
5296                }
5297        }
5298        spin_unlock_irq(&ctx->completion_lock);
5299
5300        if (posted)
5301                io_cqring_ev_posted(ctx);
5302
5303        return posted != 0;
5304}
5305
5306static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
5307{
5308        struct hlist_head *list;
5309        struct io_kiocb *req;
5310
5311        list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5312        hlist_for_each_entry(req, list, hash_node) {
5313                if (sqe_addr != req->user_data)
5314                        continue;
5315                if (io_poll_remove_one(req))
5316                        return 0;
5317                return -EALREADY;
5318        }
5319
5320        return -ENOENT;
5321}
5322
5323static int io_poll_remove_prep(struct io_kiocb *req,
5324                               const struct io_uring_sqe *sqe)
5325{
5326        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5327                return -EINVAL;
5328        if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
5329            sqe->poll_events)
5330                return -EINVAL;
5331
5332        req->poll.addr = READ_ONCE(sqe->addr);
5333        return 0;
5334}
5335
5336/*
5337 * Find a running poll command that matches one specified in sqe->addr,
5338 * and remove it if found.
5339 */
5340static int io_poll_remove(struct io_kiocb *req)
5341{
5342        struct io_ring_ctx *ctx = req->ctx;
5343        u64 addr;
5344        int ret;
5345
5346        addr = req->poll.addr;
5347        spin_lock_irq(&ctx->completion_lock);
5348        ret = io_poll_cancel(ctx, addr);
5349        spin_unlock_irq(&ctx->completion_lock);
5350
5351        if (ret < 0)
5352                req_set_fail_links(req);
5353        io_req_complete(req, ret);
5354        return 0;
5355}
5356
5357static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5358                        void *key)
5359{
5360        struct io_kiocb *req = wait->private;
5361        struct io_poll_iocb *poll = &req->poll;
5362
5363        return __io_async_wake(req, poll, key_to_poll(key), io_poll_task_func);
5364}
5365
5366static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5367                               struct poll_table_struct *p)
5368{
5369        struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5370
5371        __io_queue_proc(&pt->req->poll, pt, head, (struct io_poll_iocb **) &pt->req->async_data);
5372}
5373
5374static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5375{
5376        struct io_poll_iocb *poll = &req->poll;
5377        u32 events;
5378
5379        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5380                return -EINVAL;
5381        if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
5382                return -EINVAL;
5383
5384        events = READ_ONCE(sqe->poll32_events);
5385#ifdef __BIG_ENDIAN
5386        events = swahw32(events);
5387#endif
5388        poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
5389                       (events & EPOLLEXCLUSIVE);
5390        return 0;
5391}
5392
5393static int io_poll_add(struct io_kiocb *req)
5394{
5395        struct io_poll_iocb *poll = &req->poll;
5396        struct io_ring_ctx *ctx = req->ctx;
5397        struct io_poll_table ipt;
5398        __poll_t mask;
5399
5400        ipt.pt._qproc = io_poll_queue_proc;
5401
5402        mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,
5403                                        io_poll_wake);
5404
5405        if (mask) { /* no async, we'd stolen it */
5406                ipt.error = 0;
5407                io_poll_complete(req, mask, 0);
5408        }
5409        spin_unlock_irq(&ctx->completion_lock);
5410
5411        if (mask) {
5412                io_cqring_ev_posted(ctx);
5413                io_put_req(req);
5414        }
5415        return ipt.error;
5416}
5417
5418static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
5419{
5420        struct io_timeout_data *data = container_of(timer,
5421                                                struct io_timeout_data, timer);
5422        struct io_kiocb *req = data->req;
5423        struct io_ring_ctx *ctx = req->ctx;
5424        unsigned long flags;
5425
5426        spin_lock_irqsave(&ctx->completion_lock, flags);
5427        list_del_init(&req->timeout.list);
5428        atomic_set(&req->ctx->cq_timeouts,
5429                atomic_read(&req->ctx->cq_timeouts) + 1);
5430
5431        io_cqring_fill_event(req, -ETIME);
5432        io_commit_cqring(ctx);
5433        spin_unlock_irqrestore(&ctx->completion_lock, flags);
5434
5435        io_cqring_ev_posted(ctx);
5436        req_set_fail_links(req);
5437        io_put_req(req);
5438        return HRTIMER_NORESTART;
5439}
5440
5441static int __io_timeout_cancel(struct io_kiocb *req)
5442{
5443        struct io_timeout_data *io = req->async_data;
5444        int ret;
5445
5446        ret = hrtimer_try_to_cancel(&io->timer);
5447        if (ret == -1)
5448                return -EALREADY;
5449        list_del_init(&req->timeout.list);
5450
5451        req_set_fail_links(req);
5452        io_cqring_fill_event(req, -ECANCELED);
5453        io_put_req_deferred(req, 1);
5454        return 0;
5455}
5456
5457static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
5458{
5459        struct io_kiocb *req;
5460        int ret = -ENOENT;
5461
5462        list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
5463                if (user_data == req->user_data) {
5464                        ret = 0;
5465                        break;
5466                }
5467        }
5468
5469        if (ret == -ENOENT)
5470                return ret;
5471
5472        return __io_timeout_cancel(req);
5473}
5474
5475static int io_timeout_remove_prep(struct io_kiocb *req,
5476                                  const struct io_uring_sqe *sqe)
5477{
5478        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5479                return -EINVAL;
5480        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5481                return -EINVAL;
5482        if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->timeout_flags)
5483                return -EINVAL;
5484
5485        req->timeout_rem.addr = READ_ONCE(sqe->addr);
5486        return 0;
5487}
5488
5489/*
5490 * Remove or update an existing timeout command
5491 */
5492static int io_timeout_remove(struct io_kiocb *req)
5493{
5494        struct io_ring_ctx *ctx = req->ctx;
5495        int ret;
5496
5497        spin_lock_irq(&ctx->completion_lock);
5498        ret = io_timeout_cancel(ctx, req->timeout_rem.addr);
5499
5500        io_cqring_fill_event(req, ret);
5501        io_commit_cqring(ctx);
5502        spin_unlock_irq(&ctx->completion_lock);
5503        io_cqring_ev_posted(ctx);
5504        if (ret < 0)
5505                req_set_fail_links(req);
5506        io_put_req(req);
5507        return 0;
5508}
5509
5510static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
5511                           bool is_timeout_link)
5512{
5513        struct io_timeout_data *data;
5514        unsigned flags;
5515        u32 off = READ_ONCE(sqe->off);
5516
5517        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5518                return -EINVAL;
5519        if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
5520                return -EINVAL;
5521        if (off && is_timeout_link)
5522                return -EINVAL;
5523        flags = READ_ONCE(sqe->timeout_flags);
5524        if (flags & ~IORING_TIMEOUT_ABS)
5525                return -EINVAL;
5526
5527        req->timeout.off = off;
5528
5529        if (!req->async_data && io_alloc_async_data(req))
5530                return -ENOMEM;
5531
5532        data = req->async_data;
5533        data->req = req;
5534
5535        if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
5536                return -EFAULT;
5537
5538        if (flags & IORING_TIMEOUT_ABS)
5539                data->mode = HRTIMER_MODE_ABS;
5540        else
5541                data->mode = HRTIMER_MODE_REL;
5542
5543        hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
5544        return 0;
5545}
5546
5547static int io_timeout(struct io_kiocb *req)
5548{
5549        struct io_ring_ctx *ctx = req->ctx;
5550        struct io_timeout_data *data = req->async_data;
5551        struct list_head *entry;
5552        u32 tail, off = req->timeout.off;
5553
5554        spin_lock_irq(&ctx->completion_lock);
5555
5556        /*
5557         * sqe->off holds how many events that need to occur for this
5558         * timeout event to be satisfied. If it isn't set, then this is
5559         * a pure timeout request, sequence isn't used.
5560         */
5561        if (io_is_timeout_noseq(req)) {
5562                entry = ctx->timeout_list.prev;
5563                goto add;
5564        }
5565
5566        tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
5567        req->timeout.target_seq = tail + off;
5568
5569        /*
5570         * Insertion sort, ensuring the first entry in the list is always
5571         * the one we need first.
5572         */
5573        list_for_each_prev(entry, &ctx->timeout_list) {
5574                struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
5575                                                  timeout.list);
5576
5577                if (io_is_timeout_noseq(nxt))
5578                        continue;
5579                /* nxt.seq is behind @tail, otherwise would've been completed */
5580                if (off >= nxt->timeout.target_seq - tail)
5581                        break;
5582        }
5583add:
5584        list_add(&req->timeout.list, entry);
5585        data->timer.function = io_timeout_fn;
5586        hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
5587        spin_unlock_irq(&ctx->completion_lock);
5588        return 0;
5589}
5590
5591static bool io_cancel_cb(struct io_wq_work *work, void *data)
5592{
5593        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
5594
5595        return req->user_data == (unsigned long) data;
5596}
5597
5598static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr)
5599{
5600        enum io_wq_cancel cancel_ret;
5601        int ret = 0;
5602
5603        cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false);
5604        switch (cancel_ret) {
5605        case IO_WQ_CANCEL_OK:
5606                ret = 0;
5607                break;
5608        case IO_WQ_CANCEL_RUNNING:
5609                ret = -EALREADY;
5610                break;
5611        case IO_WQ_CANCEL_NOTFOUND:
5612                ret = -ENOENT;
5613                break;
5614        }
5615
5616        return ret;
5617}
5618
5619static void io_async_find_and_cancel(struct io_ring_ctx *ctx,
5620                                     struct io_kiocb *req, __u64 sqe_addr,
5621                                     int success_ret)
5622{
5623        unsigned long flags;
5624        int ret;
5625
5626        ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr);
5627        if (ret != -ENOENT) {
5628                spin_lock_irqsave(&ctx->completion_lock, flags);
5629                goto done;
5630        }
5631
5632        spin_lock_irqsave(&ctx->completion_lock, flags);
5633        ret = io_timeout_cancel(ctx, sqe_addr);
5634        if (ret != -ENOENT)
5635                goto done;
5636        ret = io_poll_cancel(ctx, sqe_addr);
5637done:
5638        if (!ret)
5639                ret = success_ret;
5640        io_cqring_fill_event(req, ret);
5641        io_commit_cqring(ctx);
5642        spin_unlock_irqrestore(&ctx->completion_lock, flags);
5643        io_cqring_ev_posted(ctx);
5644
5645        if (ret < 0)
5646                req_set_fail_links(req);
5647        io_put_req(req);
5648}
5649
5650static int io_async_cancel_prep(struct io_kiocb *req,
5651                                const struct io_uring_sqe *sqe)
5652{
5653        if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5654                return -EINVAL;
5655        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5656                return -EINVAL;
5657        if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags)
5658                return -EINVAL;
5659
5660        req->cancel.addr = READ_ONCE(sqe->addr);
5661        return 0;
5662}
5663
5664static int io_async_cancel(struct io_kiocb *req)
5665{
5666        struct io_ring_ctx *ctx = req->ctx;
5667
5668        io_async_find_and_cancel(ctx, req, req->cancel.addr, 0);
5669        return 0;
5670}
5671
5672static int io_files_update_prep(struct io_kiocb *req,
5673                                const struct io_uring_sqe *sqe)
5674{
5675        if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
5676                return -EINVAL;
5677        if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
5678                return -EINVAL;
5679        if (sqe->ioprio || sqe->rw_flags)
5680                return -EINVAL;
5681
5682        req->files_update.offset = READ_ONCE(sqe->off);
5683        req->files_update.nr_args = READ_ONCE(sqe->len);
5684        if (!req->files_update.nr_args)
5685                return -EINVAL;
5686        req->files_update.arg = READ_ONCE(sqe->addr);
5687        return 0;
5688}
5689
5690static int io_files_update(struct io_kiocb *req, bool force_nonblock,
5691                           struct io_comp_state *cs)
5692{
5693        struct io_ring_ctx *ctx = req->ctx;
5694        struct io_uring_files_update up;
5695        int ret;
5696
5697        if (force_nonblock)
5698                return -EAGAIN;
5699
5700        up.offset = req->files_update.offset;
5701        up.fds = req->files_update.arg;
5702
5703        mutex_lock(&ctx->uring_lock);
5704        ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
5705        mutex_unlock(&ctx->uring_lock);
5706
5707        if (ret < 0)
5708                req_set_fail_links(req);
5709        __io_req_complete(req, ret, 0, cs);
5710        return 0;
5711}
5712
5713static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5714{
5715        switch (req->opcode) {
5716        case IORING_OP_NOP:
5717                return 0;
5718        case IORING_OP_READV:
5719        case IORING_OP_READ_FIXED:
5720        case IORING_OP_READ:
5721                return io_read_prep(req, sqe);
5722        case IORING_OP_WRITEV:
5723        case IORING_OP_WRITE_FIXED:
5724        case IORING_OP_WRITE:
5725                return io_write_prep(req, sqe);
5726        case IORING_OP_POLL_ADD:
5727                return io_poll_add_prep(req, sqe);
5728        case IORING_OP_POLL_REMOVE:
5729                return io_poll_remove_prep(req, sqe);
5730        case IORING_OP_FSYNC:
5731                return io_prep_fsync(req, sqe);
5732        case IORING_OP_SYNC_FILE_RANGE:
5733                return io_prep_sfr(req, sqe);
5734        case IORING_OP_SENDMSG:
5735        case IORING_OP_SEND:
5736                return io_sendmsg_prep(req, sqe);
5737        case IORING_OP_RECVMSG:
5738        case IORING_OP_RECV:
5739                return io_recvmsg_prep(req, sqe);
5740        case IORING_OP_CONNECT:
5741                return io_connect_prep(req, sqe);
5742        case IORING_OP_TIMEOUT:
5743                return io_timeout_prep(req, sqe, false);
5744        case IORING_OP_TIMEOUT_REMOVE:
5745                return io_timeout_remove_prep(req, sqe);
5746        case IORING_OP_ASYNC_CANCEL:
5747                return io_async_cancel_prep(req, sqe);
5748        case IORING_OP_LINK_TIMEOUT:
5749                return io_timeout_prep(req, sqe, true);
5750        case IORING_OP_ACCEPT:
5751                return io_accept_prep(req, sqe);
5752        case IORING_OP_FALLOCATE:
5753                return io_fallocate_prep(req, sqe);
5754        case IORING_OP_OPENAT:
5755                return io_openat_prep(req, sqe);
5756        case IORING_OP_CLOSE:
5757                return io_close_prep(req, sqe);
5758        case IORING_OP_FILES_UPDATE:
5759                return io_files_update_prep(req, sqe);
5760        case IORING_OP_STATX:
5761                return io_statx_prep(req, sqe);
5762        case IORING_OP_FADVISE:
5763                return io_fadvise_prep(req, sqe);
5764        case IORING_OP_MADVISE:
5765                return io_madvise_prep(req, sqe);
5766        case IORING_OP_OPENAT2:
5767                return io_openat2_prep(req, sqe);
5768        case IORING_OP_EPOLL_CTL:
5769                return io_epoll_ctl_prep(req, sqe);
5770        case IORING_OP_SPLICE:
5771                return io_splice_prep(req, sqe);
5772        case IORING_OP_PROVIDE_BUFFERS:
5773                return io_provide_buffers_prep(req, sqe);
5774        case IORING_OP_REMOVE_BUFFERS:
5775                return io_remove_buffers_prep(req, sqe);
5776        case IORING_OP_TEE:
5777                return io_tee_prep(req, sqe);
5778        }
5779
5780        printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
5781                        req->opcode);
5782        return-EINVAL;
5783}
5784
5785static int io_req_defer_prep(struct io_kiocb *req,
5786                             const struct io_uring_sqe *sqe)
5787{
5788        if (!sqe)
5789                return 0;
5790        if (io_alloc_async_data(req))
5791                return -EAGAIN;
5792        return io_req_prep(req, sqe);
5793}
5794
5795static u32 io_get_sequence(struct io_kiocb *req)
5796{
5797        struct io_kiocb *pos;
5798        struct io_ring_ctx *ctx = req->ctx;
5799        u32 total_submitted, nr_reqs = 1;
5800
5801        if (req->flags & REQ_F_LINK_HEAD)
5802                list_for_each_entry(pos, &req->link_list, link_list)
5803                        nr_reqs++;
5804
5805        total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
5806        return total_submitted - nr_reqs;
5807}
5808
5809static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5810{
5811        struct io_ring_ctx *ctx = req->ctx;
5812        struct io_defer_entry *de;
5813        int ret;
5814        u32 seq;
5815
5816        /* Still need defer if there is pending req in defer list. */
5817        if (likely(list_empty_careful(&ctx->defer_list) &&
5818                !(req->flags & REQ_F_IO_DRAIN)))
5819                return 0;
5820
5821        seq = io_get_sequence(req);
5822        /* Still a chance to pass the sequence check */
5823        if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
5824                return 0;
5825
5826        if (!req->async_data) {
5827                ret = io_req_defer_prep(req, sqe);
5828                if (ret)
5829                        return ret;
5830        }
5831        io_prep_async_link(req);
5832        de = kmalloc(sizeof(*de), GFP_KERNEL);
5833        if (!de)
5834                return -ENOMEM;
5835
5836        spin_lock_irq(&ctx->completion_lock);
5837        if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
5838                spin_unlock_irq(&ctx->completion_lock);
5839                kfree(de);
5840                io_queue_async_work(req);
5841                return -EIOCBQUEUED;
5842        }
5843
5844        trace_io_uring_defer(ctx, req, req->user_data);
5845        de->req = req;
5846        de->seq = seq;
5847        list_add_tail(&de->list, &ctx->defer_list);
5848        spin_unlock_irq(&ctx->completion_lock);
5849        return -EIOCBQUEUED;
5850}
5851
5852static void io_req_drop_files(struct io_kiocb *req)
5853{
5854        struct io_ring_ctx *ctx = req->ctx;
5855        unsigned long flags;
5856
5857        spin_lock_irqsave(&ctx->inflight_lock, flags);
5858        list_del(&req->inflight_entry);
5859        if (waitqueue_active(&ctx->inflight_wait))
5860                wake_up(&ctx->inflight_wait);
5861        spin_unlock_irqrestore(&ctx->inflight_lock, flags);
5862        req->flags &= ~REQ_F_INFLIGHT;
5863        put_files_struct(req->work.identity->files);
5864        put_nsproxy(req->work.identity->nsproxy);
5865        req->work.flags &= ~IO_WQ_WORK_FILES;
5866}
5867
5868static void __io_clean_op(struct io_kiocb *req)
5869{
5870        if (req->flags & REQ_F_BUFFER_SELECTED) {
5871                switch (req->opcode) {
5872                case IORING_OP_READV:
5873                case IORING_OP_READ_FIXED:
5874                case IORING_OP_READ:
5875                        kfree((void *)(unsigned long)req->rw.addr);
5876                        break;
5877                case IORING_OP_RECVMSG:
5878                case IORING_OP_RECV:
5879                        kfree(req->sr_msg.kbuf);
5880                        break;
5881                }
5882                req->flags &= ~REQ_F_BUFFER_SELECTED;
5883        }
5884
5885        if (req->flags & REQ_F_NEED_CLEANUP) {
5886                switch (req->opcode) {
5887                case IORING_OP_READV:
5888                case IORING_OP_READ_FIXED:
5889                case IORING_OP_READ:
5890                case IORING_OP_WRITEV:
5891                case IORING_OP_WRITE_FIXED:
5892                case IORING_OP_WRITE: {
5893                        struct io_async_rw *io = req->async_data;
5894                        if (io->free_iovec)
5895                                kfree(io->free_iovec);
5896                        break;
5897                        }
5898                case IORING_OP_RECVMSG:
5899                case IORING_OP_SENDMSG: {
5900                        struct io_async_msghdr *io = req->async_data;
5901                        if (io->iov != io->fast_iov)
5902                                kfree(io->iov);
5903                        break;
5904                        }
5905                case IORING_OP_SPLICE:
5906                case IORING_OP_TEE:
5907                        io_put_file(req, req->splice.file_in,
5908                                    (req->splice.flags & SPLICE_F_FD_IN_FIXED));
5909                        break;
5910                case IORING_OP_OPENAT:
5911                case IORING_OP_OPENAT2:
5912                        if (req->open.filename)
5913                                putname(req->open.filename);
5914                        break;
5915                }
5916                req->flags &= ~REQ_F_NEED_CLEANUP;
5917        }
5918
5919        if (req->flags & REQ_F_INFLIGHT)
5920                io_req_drop_files(req);
5921}
5922
5923static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock,
5924                        struct io_comp_state *cs)
5925{
5926        struct io_ring_ctx *ctx = req->ctx;
5927        int ret;
5928
5929        switch (req->opcode) {
5930        case IORING_OP_NOP:
5931                ret = io_nop(req, cs);
5932                break;
5933        case IORING_OP_READV:
5934        case IORING_OP_READ_FIXED:
5935        case IORING_OP_READ:
5936                ret = io_read(req, force_nonblock, cs);
5937                break;
5938        case IORING_OP_WRITEV:
5939        case IORING_OP_WRITE_FIXED:
5940        case IORING_OP_WRITE:
5941                ret = io_write(req, force_nonblock, cs);
5942                break;
5943        case IORING_OP_FSYNC:
5944                ret = io_fsync(req, force_nonblock);
5945                break;
5946        case IORING_OP_POLL_ADD:
5947                ret = io_poll_add(req);
5948                break;
5949        case IORING_OP_POLL_REMOVE:
5950                ret = io_poll_remove(req);
5951                break;
5952        case IORING_OP_SYNC_FILE_RANGE:
5953                ret = io_sync_file_range(req, force_nonblock);
5954                break;
5955        case IORING_OP_SENDMSG:
5956                ret = io_sendmsg(req, force_nonblock, cs);
5957                break;
5958        case IORING_OP_SEND:
5959                ret = io_send(req, force_nonblock, cs);
5960                break;
5961        case IORING_OP_RECVMSG:
5962                ret = io_recvmsg(req, force_nonblock, cs);
5963                break;
5964        case IORING_OP_RECV:
5965                ret = io_recv(req, force_nonblock, cs);
5966                break;
5967        case IORING_OP_TIMEOUT:
5968                ret = io_timeout(req);
5969                break;
5970        case IORING_OP_TIMEOUT_REMOVE:
5971                ret = io_timeout_remove(req);
5972                break;
5973        case IORING_OP_ACCEPT:
5974                ret = io_accept(req, force_nonblock, cs);
5975                break;
5976        case IORING_OP_CONNECT:
5977                ret = io_connect(req, force_nonblock, cs);
5978                break;
5979        case IORING_OP_ASYNC_CANCEL:
5980                ret = io_async_cancel(req);
5981                break;
5982        case IORING_OP_FALLOCATE:
5983                ret = io_fallocate(req, force_nonblock);
5984                break;
5985        case IORING_OP_OPENAT:
5986                ret = io_openat(req, force_nonblock);
5987                break;
5988        case IORING_OP_CLOSE:
5989                ret = io_close(req, force_nonblock, cs);
5990                break;
5991        case IORING_OP_FILES_UPDATE:
5992                ret = io_files_update(req, force_nonblock, cs);
5993                break;
5994        case IORING_OP_STATX:
5995                ret = io_statx(req, force_nonblock);
5996                break;
5997        case IORING_OP_FADVISE:
5998                ret = io_fadvise(req, force_nonblock);
5999                break;
6000        case IORING_OP_MADVISE:

6001                ret = io_madvise(req, force_nonblock);
6002                break;
6003        case IORING_OP_OPENAT2:
6004                ret = io_openat2(req, force_nonblock);
6005                break;
6006        case IORING_OP_EPOLL_CTL:
6007                ret = io_epoll_ctl(req, force_nonblock, cs);
6008                break;
6009        case IORING_OP_SPLICE:
6010                ret = io_splice(req, force_nonblock);
6011                break;
6012        case IORING_OP_PROVIDE_BUFFERS:
6013                ret = io_provide_buffers(req, force_nonblock, cs);
6014                break;
6015        case IORING_OP_REMOVE_BUFFERS:
6016                ret = io_remove_buffers(req, force_nonblock, cs);
6017                break;
6018        case IORING_OP_TEE:
6019                ret = io_tee(req, force_nonblock);
6020                break;
6021        default:
6022                ret = -EINVAL;
6023                break;
6024        }
6025
6026        if (ret)
6027                return ret;
6028
6029        /* If the op doesn't have a file, we're not polling for it */
6030        if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
6031                const bool in_async = io_wq_current_is_worker();
6032
6033                /* workqueue context doesn't hold uring_lock, grab it now */
6034                if (in_async)
6035                        mutex_lock(&ctx->uring_lock);
6036
6037                io_iopoll_req_issued(req);
6038
6039                if (in_async)
6040                        mutex_unlock(&ctx->uring_lock);
6041        }
6042
6043        return 0;
6044}
6045
6046static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
6047{
6048        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6049        struct io_kiocb *timeout;
6050        int ret = 0;
6051
6052        timeout = io_prep_linked_timeout(req);
6053        if (timeout)
6054                io_queue_linked_timeout(timeout);
6055
6056        /* if NO_CANCEL is set, we must still run the work */
6057        if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
6058                                IO_WQ_WORK_CANCEL) {
6059                ret = -ECANCELED;
6060        }
6061
6062        if (!ret) {
6063                do {
6064                        ret = io_issue_sqe(req, false, NULL);
6065                        /*
6066                         * We can get EAGAIN for polled IO even though we're
6067                         * forcing a sync submission from here, since we can't
6068                         * wait for request slots on the block side.
6069                         */
6070                        if (ret != -EAGAIN)
6071                                break;
6072                        cond_resched();
6073                } while (1);
6074        }
6075
6076        if (ret) {
6077                req_set_fail_links(req);
6078                io_req_complete(req, ret);
6079        }
6080
6081        return io_steal_work(req);
6082}
6083
6084static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6085                                              int index)
6086{
6087        struct fixed_file_table *table;
6088
6089        table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
6090        return table->files[index & IORING_FILE_TABLE_MASK];
6091}
6092
6093static struct file *io_file_get(struct io_submit_state *state,
6094                                struct io_kiocb *req, int fd, bool fixed)
6095{
6096        struct io_ring_ctx *ctx = req->ctx;
6097        struct file *file;
6098
6099        if (fixed) {
6100                if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6101                        return NULL;
6102                fd = array_index_nospec(fd, ctx->nr_user_files);
6103                file = io_file_from_index(ctx, fd);
6104                if (file) {
6105                        req->fixed_file_refs = &ctx->file_data->node->refs;
6106                        percpu_ref_get(req->fixed_file_refs);
6107                }
6108        } else {
6109                trace_io_uring_file_get(ctx, fd);
6110                file = __io_file_get(state, fd);
6111        }
6112
6113        return file;
6114}
6115
6116static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
6117                           int fd)
6118{
6119        bool fixed;
6120
6121        fixed = (req->flags & REQ_F_FIXED_FILE) != 0;
6122        if (unlikely(!fixed && io_async_submit(req->ctx)))
6123                return -EBADF;
6124
6125        req->file = io_file_get(state, req, fd, fixed);
6126        if (req->file || io_op_defs[req->opcode].needs_file_no_error)
6127                return 0;
6128        return -EBADF;
6129}
6130
6131static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
6132{
6133        struct io_timeout_data *data = container_of(timer,
6134                                                struct io_timeout_data, timer);
6135        struct io_kiocb *req = data->req;
6136        struct io_ring_ctx *ctx = req->ctx;
6137        struct io_kiocb *prev = NULL;
6138        unsigned long flags;
6139
6140        spin_lock_irqsave(&ctx->completion_lock, flags);
6141
6142        /*
6143         * We don't expect the list to be empty, that will only happen if we
6144         * race with the completion of the linked work.
6145         */
6146        if (!list_empty(&req->link_list)) {
6147                prev = list_entry(req->link_list.prev, struct io_kiocb,
6148                                  link_list);
6149                if (refcount_inc_not_zero(&prev->refs))
6150                        list_del_init(&req->link_list);
6151                else
6152                        prev = NULL;
6153        }
6154
6155        spin_unlock_irqrestore(&ctx->completion_lock, flags);
6156
6157        if (prev) {
6158                req_set_fail_links(prev);
6159                io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
6160                io_put_req(prev);
6161        } else {
6162                io_req_complete(req, -ETIME);
6163        }
6164        return HRTIMER_NORESTART;
6165}
6166
6167static void __io_queue_linked_timeout(struct io_kiocb *req)
6168{
6169        /*
6170         * If the list is now empty, then our linked request finished before
6171         * we got a chance to setup the timer
6172         */
6173        if (!list_empty(&req->link_list)) {
6174                struct io_timeout_data *data = req->async_data;
6175
6176                data->timer.function = io_link_timeout_fn;
6177                hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
6178                                data->mode);
6179        }
6180}
6181
6182static void io_queue_linked_timeout(struct io_kiocb *req)
6183{
6184        struct io_ring_ctx *ctx = req->ctx;
6185
6186        spin_lock_irq(&ctx->completion_lock);
6187        __io_queue_linked_timeout(req);
6188        spin_unlock_irq(&ctx->completion_lock);
6189
6190        /* drop submission reference */
6191        io_put_req(req);
6192}
6193
6194static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
6195{
6196        struct io_kiocb *nxt;
6197
6198        if (!(req->flags & REQ_F_LINK_HEAD))
6199                return NULL;
6200        if (req->flags & REQ_F_LINK_TIMEOUT)
6201                return NULL;
6202
6203        nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
6204                                        link_list);
6205        if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
6206                return NULL;
6207
6208        nxt->flags |= REQ_F_LTIMEOUT_ACTIVE;
6209        req->flags |= REQ_F_LINK_TIMEOUT;
6210        return nxt;
6211}
6212
6213static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs)
6214{
6215        struct io_kiocb *linked_timeout;
6216        const struct cred *old_creds = NULL;
6217        int ret;
6218
6219again:
6220        linked_timeout = io_prep_linked_timeout(req);
6221
6222        if ((req->flags & REQ_F_WORK_INITIALIZED) &&
6223            (req->work.flags & IO_WQ_WORK_CREDS) &&
6224            req->work.identity->creds != current_cred()) {
6225                if (old_creds)
6226                        revert_creds(old_creds);
6227                if (old_creds == req->work.identity->creds)
6228                        old_creds = NULL; /* restored original creds */
6229                else
6230                        old_creds = override_creds(req->work.identity->creds);
6231        }
6232
6233        ret = io_issue_sqe(req, true, cs);
6234
6235        /*
6236         * We async punt it if the file wasn't marked NOWAIT, or if the file
6237         * doesn't support non-blocking read/write attempts
6238         */
6239        if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
6240                if (!io_arm_poll_handler(req)) {
6241                        /*
6242                         * Queued up for async execution, worker will release
6243                         * submit reference when the iocb is actually submitted.
6244                         */
6245                        io_queue_async_work(req);
6246                }
6247
6248                if (linked_timeout)
6249                        io_queue_linked_timeout(linked_timeout);
6250        } else if (likely(!ret)) {
6251                /* drop submission reference */
6252                req = io_put_req_find_next(req);
6253                if (linked_timeout)
6254                        io_queue_linked_timeout(linked_timeout);
6255
6256                if (req) {
6257                        if (!(req->flags & REQ_F_FORCE_ASYNC))
6258                                goto again;
6259                        io_queue_async_work(req);
6260                }
6261        } else {
6262                /* un-prep timeout, so it'll be killed as any other linked */
6263                req->flags &= ~REQ_F_LINK_TIMEOUT;
6264                req_set_fail_links(req);
6265                io_put_req(req);
6266                io_req_complete(req, ret);
6267        }
6268
6269        if (old_creds)
6270                revert_creds(old_creds);
6271}
6272
6273static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6274                         struct io_comp_state *cs)
6275{
6276        int ret;
6277
6278        ret = io_req_defer(req, sqe);
6279        if (ret) {
6280                if (ret != -EIOCBQUEUED) {
6281fail_req:
6282                        req_set_fail_links(req);
6283                        io_put_req(req);
6284                        io_req_complete(req, ret);
6285                }
6286        } else if (req->flags & REQ_F_FORCE_ASYNC) {
6287                if (!req->async_data) {
6288                        ret = io_req_defer_prep(req, sqe);
6289                        if (unlikely(ret))
6290                                goto fail_req;
6291                }
6292                io_queue_async_work(req);
6293        } else {
6294                if (sqe) {
6295                        ret = io_req_prep(req, sqe);
6296                        if (unlikely(ret))
6297                                goto fail_req;
6298                }
6299                __io_queue_sqe(req, cs);
6300        }
6301}
6302
6303static inline void io_queue_link_head(struct io_kiocb *req,
6304                                      struct io_comp_state *cs)
6305{
6306        if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
6307                io_put_req(req);
6308                io_req_complete(req, -ECANCELED);
6309        } else
6310                io_queue_sqe(req, NULL, cs);
6311}
6312
6313static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6314                         struct io_kiocb **link, struct io_comp_state *cs)
6315{
6316        struct io_ring_ctx *ctx = req->ctx;
6317        int ret;
6318
6319        /*
6320         * If we already have a head request, queue this one for async
6321         * submittal once the head completes. If we don't have a head but
6322         * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
6323         * submitted sync once the chain is complete. If none of those
6324         * conditions are true (normal request), then just queue it.
6325         */
6326        if (*link) {
6327                struct io_kiocb *head = *link;
6328
6329                /*
6330                 * Taking sequential execution of a link, draining both sides
6331                 * of the link also fullfils IOSQE_IO_DRAIN semantics for all
6332                 * requests in the link. So, it drains the head and the
6333                 * next after the link request. The last one is done via
6334                 * drain_next flag to persist the effect across calls.
6335                 */
6336                if (req->flags & REQ_F_IO_DRAIN) {
6337                        head->flags |= REQ_F_IO_DRAIN;
6338                        ctx->drain_next = 1;
6339                }
6340                ret = io_req_defer_prep(req, sqe);
6341                if (unlikely(ret)) {
6342                        /* fail even hard links since we don't submit */
6343                        head->flags |= REQ_F_FAIL_LINK;
6344                        return ret;
6345                }
6346                trace_io_uring_link(ctx, req, head);
6347                list_add_tail(&req->link_list, &head->link_list);
6348
6349                /* last request of a link, enqueue the link */
6350                if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
6351                        io_queue_link_head(head, cs);
6352                        *link = NULL;
6353                }
6354        } else {
6355                if (unlikely(ctx->drain_next)) {
6356                        req->flags |= REQ_F_IO_DRAIN;
6357                        ctx->drain_next = 0;
6358                }
6359                if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
6360                        req->flags |= REQ_F_LINK_HEAD;
6361                        INIT_LIST_HEAD(&req->link_list);
6362
6363                        ret = io_req_defer_prep(req, sqe);
6364                        if (unlikely(ret))
6365                                req->flags |= REQ_F_FAIL_LINK;
6366                        *link = req;
6367                } else {
6368                        io_queue_sqe(req, sqe, cs);
6369                }
6370        }
6371
6372        return 0;
6373}
6374
6375/*
6376 * Batched submission is done, ensure local IO is flushed out.
6377 */
6378static void io_submit_state_end(struct io_submit_state *state)
6379{
6380        if (!list_empty(&state->comp.list))
6381                io_submit_flush_completions(&state->comp);
6382        blk_finish_plug(&state->plug);
6383        io_state_file_put(state);
6384        if (state->free_reqs)
6385                kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
6386}
6387
6388/*
6389 * Start submission side cache.
6390 */
6391static void io_submit_state_start(struct io_submit_state *state,
6392                                  struct io_ring_ctx *ctx, unsigned int max_ios)
6393{
6394        blk_start_plug(&state->plug);
6395        state->comp.nr = 0;
6396        INIT_LIST_HEAD(&state->comp.list);
6397        state->comp.ctx = ctx;
6398        state->free_reqs = 0;
6399        state->file = NULL;
6400        state->ios_left = max_ios;
6401}
6402
6403static void io_commit_sqring(struct io_ring_ctx *ctx)
6404{
6405        struct io_rings *rings = ctx->rings;
6406
6407        /*
6408         * Ensure any loads from the SQEs are done at this point,
6409         * since once we write the new head, the application could
6410         * write new data to them.
6411         */
6412        smp_store_release(&rings->sq.head, ctx->cached_sq_head);
6413}
6414
6415/*
6416 * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory
6417 * that is mapped by userspace. This means that care needs to be taken to
6418 * ensure that reads are stable, as we cannot rely on userspace always
6419 * being a good citizen. If members of the sqe are validated and then later
6420 * used, it's important that those reads are done through READ_ONCE() to
6421 * prevent a re-load down the line.
6422 */
6423static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
6424{
6425        u32 *sq_array = ctx->sq_array;
6426        unsigned head;
6427
6428        /*
6429         * The cached sq head (or cq tail) serves two purposes:
6430         *
6431         * 1) allows us to batch the cost of updating the user visible
6432         *    head updates.
6433         * 2) allows the kernel side to track the head on its own, even
6434         *    though the application is the one updating it.
6435         */
6436        head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
6437        if (likely(head < ctx->sq_entries))
6438                return &ctx->sq_sqes[head];
6439
6440        /* drop invalid entries */
6441        ctx->cached_sq_dropped++;
6442        WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
6443        return NULL;
6444}
6445
6446static inline void io_consume_sqe(struct io_ring_ctx *ctx)
6447{
6448        ctx->cached_sq_head++;
6449}
6450
6451/*
6452 * Check SQE restrictions (opcode and flags).
6453 *
6454 * Returns 'true' if SQE is allowed, 'false' otherwise.
6455 */
6456static inline bool io_check_restriction(struct io_ring_ctx *ctx,
6457                                        struct io_kiocb *req,
6458                                        unsigned int sqe_flags)
6459{
6460        if (!ctx->restricted)
6461                return true;
6462
6463        if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
6464                return false;
6465
6466        if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
6467            ctx->restrictions.sqe_flags_required)
6468                return false;
6469
6470        if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
6471                          ctx->restrictions.sqe_flags_required))
6472                return false;
6473
6474        return true;
6475}
6476
6477#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
6478                                IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
6479                                IOSQE_BUFFER_SELECT)
6480
6481static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
6482                       const struct io_uring_sqe *sqe,
6483                       struct io_submit_state *state)
6484{
6485        unsigned int sqe_flags;
6486        int id, ret;
6487
6488        req->opcode = READ_ONCE(sqe->opcode);
6489        req->user_data = READ_ONCE(sqe->user_data);
6490        req->async_data = NULL;
6491        req->file = NULL;
6492        req->ctx = ctx;
6493        req->flags = 0;
6494        /* one is dropped after submission, the other at completion */
6495        refcount_set(&req->refs, 2);
6496        req->task = current;
6497        req->result = 0;
6498
6499        if (unlikely(req->opcode >= IORING_OP_LAST))
6500                return -EINVAL;
6501
6502        if (unlikely(io_sq_thread_acquire_mm(ctx, req)))
6503                return -EFAULT;
6504
6505        sqe_flags = READ_ONCE(sqe->flags);
6506        /* enforce forwards compatibility on users */
6507        if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
6508                return -EINVAL;
6509
6510        if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
6511                return -EACCES;
6512
6513        if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
6514            !io_op_defs[req->opcode].buffer_select)
6515                return -EOPNOTSUPP;
6516
6517        id = READ_ONCE(sqe->personality);
6518        if (id) {
6519                struct io_identity *iod;
6520
6521                iod = idr_find(&ctx->personality_idr, id);
6522                if (unlikely(!iod))
6523                        return -EINVAL;
6524                refcount_inc(&iod->count);
6525
6526                __io_req_init_async(req);
6527                get_cred(iod->creds);
6528                req->work.identity = iod;
6529                req->work.flags |= IO_WQ_WORK_CREDS;
6530        }
6531
6532        /* same numerical values with corresponding REQ_F_*, safe to copy */
6533        req->flags |= sqe_flags;
6534
6535        if (!io_op_defs[req->opcode].needs_file)
6536                return 0;
6537
6538        ret = io_req_set_file(state, req, READ_ONCE(sqe->fd));
6539        state->ios_left--;
6540        return ret;
6541}
6542
6543static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
6544{
6545        struct io_submit_state state;
6546        struct io_kiocb *link = NULL;
6547        int i, submitted = 0;
6548
6549        /* if we have a backlog and couldn't flush it all, return BUSY */
6550        if (test_bit(0, &ctx->sq_check_overflow)) {
6551                if (!list_empty(&ctx->cq_overflow_list) &&
6552                    !io_cqring_overflow_flush(ctx, false, NULL, NULL))
6553                        return -EBUSY;
6554        }
6555
6556        /* make sure SQ entry isn't read before tail */
6557        nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
6558
6559        if (!percpu_ref_tryget_many(&ctx->refs, nr))
6560                return -EAGAIN;
6561
6562        percpu_counter_add(&current->io_uring->inflight, nr);
6563        refcount_add(nr, &current->usage);
6564
6565        io_submit_state_start(&state, ctx, nr);
6566
6567        for (i = 0; i < nr; i++) {
6568                const struct io_uring_sqe *sqe;
6569                struct io_kiocb *req;
6570                int err;
6571
6572                sqe = io_get_sqe(ctx);
6573                if (unlikely(!sqe)) {
6574                        io_consume_sqe(ctx);
6575                        break;
6576                }
6577                req = io_alloc_req(ctx, &state);
6578                if (unlikely(!req)) {
6579                        if (!submitted)
6580                                submitted = -EAGAIN;
6581                        break;
6582                }
6583                io_consume_sqe(ctx);
6584                /* will complete beyond this point, count as submitted */
6585                submitted++;
6586
6587                err = io_init_req(ctx, req, sqe, &state);
6588                if (unlikely(err)) {
6589fail_req:
6590                        io_put_req(req);
6591                        io_req_complete(req, err);
6592                        break;
6593                }
6594
6595                trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
6596                                                true, io_async_submit(ctx));
6597                err = io_submit_sqe(req, sqe, &link, &state.comp);
6598                if (err)
6599                        goto fail_req;
6600        }
6601
6602        if (unlikely(submitted != nr)) {
6603                int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
6604                struct io_uring_task *tctx = current->io_uring;
6605                int unused = nr - ref_used;
6606
6607                percpu_ref_put_many(&ctx->refs, unused);
6608                percpu_counter_sub(&tctx->inflight, unused);
6609                put_task_struct_many(current, unused);
6610        }
6611        if (link)
6612                io_queue_link_head(link, &state.comp);
6613        io_submit_state_end(&state);
6614
6615         /* Commit SQ ring head once we've consumed and submitted all SQEs */
6616        io_commit_sqring(ctx);
6617
6618        return submitted;
6619}
6620
6621static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
6622{
6623        /* Tell userspace we may need a wakeup call */
6624        spin_lock_irq(&ctx->completion_lock);
6625        ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
6626        spin_unlock_irq(&ctx->completion_lock);
6627}
6628
6629static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
6630{
6631        spin_lock_irq(&ctx->completion_lock);
6632        ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6633        spin_unlock_irq(&ctx->completion_lock);
6634}
6635
6636static int io_sq_wake_function(struct wait_queue_entry *wqe, unsigned mode,
6637                               int sync, void *key)
6638{
6639        struct io_ring_ctx *ctx = container_of(wqe, struct io_ring_ctx, sqo_wait_entry);
6640        int ret;
6641
6642        ret = autoremove_wake_function(wqe, mode, sync, key);
6643        if (ret) {
6644                unsigned long flags;
6645
6646                spin_lock_irqsave(&ctx->completion_lock, flags);
6647                ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
6648                spin_unlock_irqrestore(&ctx->completion_lock, flags);
6649        }
6650        return ret;
6651}
6652
6653enum sq_ret {
6654        SQT_IDLE        = 1,
6655        SQT_SPIN        = 2,
6656        SQT_DID_WORK    = 4,
6657};
6658
6659static enum sq_ret __io_sq_thread(struct io_ring_ctx *ctx,
6660                                  unsigned long start_jiffies, bool cap_entries)
6661{
6662        unsigned long timeout = start_jiffies + ctx->sq_thread_idle;
6663        struct io_sq_data *sqd = ctx->sq_data;
6664        unsigned int to_submit;
6665        int ret = 0;
6666
6667again:
6668        if (!list_empty(&ctx->iopoll_list)) {
6669                unsigned nr_events = 0;
6670
6671                mutex_lock(&ctx->uring_lock);
6672                if (!list_empty(&ctx->iopoll_list) && !need_resched())
6673                        io_do_iopoll(ctx, &nr_events, 0);
6674                mutex_unlock(&ctx->uring_lock);
6675        }
6676
6677        to_submit = io_sqring_entries(ctx);
6678
6679        /*
6680         * If submit got -EBUSY, flag us as needing the application
6681         * to enter the kernel to reap and flush events.
6682         */
6683        if (!to_submit || ret == -EBUSY || need_resched()) {
6684                /*
6685                 * Drop cur_mm before scheduling, we can't hold it for
6686                 * long periods (or over schedule()). Do this before
6687                 * adding ourselves to the waitqueue, as the unuse/drop
6688                 * may sleep.
6689                 */
6690                io_sq_thread_drop_mm();
6691
6692                /*
6693                 * We're polling. If we're within the defined idle
6694                 * period, then let us spin without work before going
6695                 * to sleep. The exception is if we got EBUSY doing
6696                 * more IO, we should wait for the application to
6697                 * reap events and wake us up.
6698                 */
6699                if (!list_empty(&ctx->iopoll_list) || need_resched() ||
6700                    (!time_after(jiffies, timeout) && ret != -EBUSY &&
6701                    !percpu_ref_is_dying(&ctx->refs)))
6702                        return SQT_SPIN;
6703
6704                prepare_to_wait(&sqd->wait, &ctx->sqo_wait_entry,
6705                                        TASK_INTERRUPTIBLE);
6706
6707                /*
6708                 * While doing polled IO, before going to sleep, we need
6709                 * to check if there are new reqs added to iopoll_list,
6710                 * it is because reqs may have been punted to io worker
6711                 * and will be added to iopoll_list later, hence check
6712                 * the iopoll_list again.
6713                 */
6714                if ((ctx->flags & IORING_SETUP_IOPOLL) &&
6715                    !list_empty_careful(&ctx->iopoll_list)) {
6716                        finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
6717                        goto again;
6718                }
6719
6720                to_submit = io_sqring_entries(ctx);
6721                if (!to_submit || ret == -EBUSY)
6722                        return SQT_IDLE;
6723        }
6724
6725        finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
6726        io_ring_clear_wakeup_flag(ctx);
6727
6728        /* if we're handling multiple rings, cap submit size for fairness */
6729        if (cap_entries && to_submit > 8)
6730                to_submit = 8;
6731
6732        mutex_lock(&ctx->uring_lock);
6733        if (likely(!percpu_ref_is_dying(&ctx->refs)))
6734                ret = io_submit_sqes(ctx, to_submit);
6735        mutex_unlock(&ctx->uring_lock);
6736
6737        if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait))
6738                wake_up(&ctx->sqo_sq_wait);
6739
6740        return SQT_DID_WORK;
6741}
6742
6743static void io_sqd_init_new(struct io_sq_data *sqd)
6744{
6745        struct io_ring_ctx *ctx;
6746
6747        while (!list_empty(&sqd->ctx_new_list)) {
6748                ctx = list_first_entry(&sqd->ctx_new_list, struct io_ring_ctx, sqd_list);
6749                init_wait(&ctx->sqo_wait_entry);
6750                ctx->sqo_wait_entry.func = io_sq_wake_function;
6751                list_move_tail(&ctx->sqd_list, &sqd->ctx_list);
6752                complete(&ctx->sq_thread_comp);
6753        }
6754}
6755
6756static int io_sq_thread(void *data)
6757{
6758        struct cgroup_subsys_state *cur_css = NULL;
6759        const struct cred *old_cred = NULL;
6760        struct io_sq_data *sqd = data;
6761        struct io_ring_ctx *ctx;
6762        unsigned long start_jiffies;
6763
6764        start_jiffies = jiffies;
6765        while (!kthread_should_stop()) {
6766                enum sq_ret ret = 0;
6767                bool cap_entries;
6768
6769                /*
6770                 * Any changes to the sqd lists are synchronized through the
6771                 * kthread parking. This synchronizes the thread vs users,
6772                 * the users are synchronized on the sqd->ctx_lock.
6773                 */
6774                if (kthread_should_park())
6775                        kthread_parkme();
6776
6777                if (unlikely(!list_empty(&sqd->ctx_new_list)))
6778                        io_sqd_init_new(sqd);
6779
6780                cap_entries = !list_is_singular(&sqd->ctx_list);
6781
6782                list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
6783                        if (current->cred != ctx->creds) {
6784                                if (old_cred)
6785                                        revert_creds(old_cred);
6786                                old_cred = override_creds(ctx->creds);
6787                        }
6788                        io_sq_thread_associate_blkcg(ctx, &cur_css);
6789#ifdef CONFIG_AUDIT
6790                        current->loginuid = ctx->loginuid;
6791                        current->sessionid = ctx->sessionid;
6792#endif
6793
6794                        ret |= __io_sq_thread(ctx, start_jiffies, cap_entries);
6795
6796                        io_sq_thread_drop_mm();
6797                }
6798
6799                if (ret & SQT_SPIN) {
6800                        io_run_task_work();
6801                        cond_resched();
6802                } else if (ret == SQT_IDLE) {
6803                        if (kthread_should_park())
6804                                continue;
6805                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6806                                io_ring_set_wakeup_flag(ctx);
6807                        schedule();
6808                        start_jiffies = jiffies;
6809                        list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
6810                                io_ring_clear_wakeup_flag(ctx);
6811                }
6812        }
6813
6814        io_run_task_work();
6815
6816        if (cur_css)
6817                io_sq_thread_unassociate_blkcg();
6818        if (old_cred)
6819                revert_creds(old_cred);
6820
6821        kthread_parkme();
6822
6823        return 0;
6824}
6825
6826struct io_wait_queue {
6827        struct wait_queue_entry wq;
6828        struct io_ring_ctx *ctx;
6829        unsigned to_wait;
6830        unsigned nr_timeouts;
6831};
6832
6833static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
6834{
6835        struct io_ring_ctx *ctx = iowq->ctx;
6836
6837        /*
6838         * Wake up if we have enough events, or if a timeout occurred since we
6839         * started waiting. For timeouts, we always want to return to userspace,
6840         * regardless of event count.
6841         */
6842        return io_cqring_events(ctx, noflush) >= iowq->to_wait ||
6843                        atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
6844}
6845
6846static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
6847                            int wake_flags, void *key)
6848{
6849        struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
6850                                                        wq);
6851
6852        /* use noflush == true, as we can't safely rely on locking context */
6853        if (!io_should_wake(iowq, true))
6854                return -1;
6855
6856        return autoremove_wake_function(curr, mode, wake_flags, key);
6857}
6858
6859static int io_run_task_work_sig(void)
6860{
6861        if (io_run_task_work())
6862                return 1;
6863        if (!signal_pending(current))
6864                return 0;
6865        if (current->jobctl & JOBCTL_TASK_WORK) {
6866                spin_lock_irq(&current->sighand->siglock);
6867                current->jobctl &= ~JOBCTL_TASK_WORK;
6868                recalc_sigpending();
6869                spin_unlock_irq(&current->sighand->siglock);
6870                return 1;
6871        }
6872        return -EINTR;
6873}
6874
6875/*
6876 * Wait until events become available, if we don't already have some. The
6877 * application must reap them itself, as they reside on the shared cq ring.
6878 */
6879static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
6880                          const sigset_t __user *sig, size_t sigsz)
6881{
6882        struct io_wait_queue iowq = {
6883                .wq = {
6884                        .private        = current,
6885                        .func           = io_wake_function,
6886                        .entry          = LIST_HEAD_INIT(iowq.wq.entry),
6887                },
6888                .ctx            = ctx,
6889                .to_wait        = min_events,
6890        };
6891        struct io_rings *rings = ctx->rings;
6892        int ret = 0;
6893
6894        do {
6895                if (io_cqring_events(ctx, false) >= min_events)
6896                        return 0;
6897                if (!io_run_task_work())
6898                        break;
6899        } while (1);
6900
6901        if (sig) {
6902#ifdef CONFIG_COMPAT
6903                if (in_compat_syscall())
6904                        ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
6905                                                      sigsz);
6906                else
6907#endif
6908                        ret = set_user_sigmask(sig, sigsz);
6909
6910                if (ret)
6911                        return ret;
6912        }
6913
6914        iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
6915        trace_io_uring_cqring_wait(ctx, min_events);
6916        do {
6917                prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
6918                                                TASK_INTERRUPTIBLE);
6919                /* make sure we run task_work before checking for signals */
6920                ret = io_run_task_work_sig();
6921                if (ret > 0)
6922                        continue;
6923                else if (ret < 0)
6924                        break;
6925                if (io_should_wake(&iowq, false))
6926                        break;
6927                schedule();
6928        } while (1);
6929        finish_wait(&ctx->wait, &iowq.wq);
6930
6931        restore_saved_sigmask_unless(ret == -EINTR);
6932
6933        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
6934}
6935
6936static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
6937{
6938#if defined(CONFIG_UNIX)
6939        if (ctx->ring_sock) {
6940                struct sock *sock = ctx->ring_sock->sk;
6941                struct sk_buff *skb;
6942
6943                while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
6944                        kfree_skb(skb);
6945        }
6946#else
6947        int i;
6948
6949        for (i = 0; i < ctx->nr_user_files; i++) {
6950                struct file *file;
6951
6952                file = io_file_from_index(ctx, i);
6953                if (file)
6954                        fput(file);
6955        }
6956#endif
6957}
6958
6959static void io_file_ref_kill(struct percpu_ref *ref)
6960{
6961        struct fixed_file_data *data;
6962
6963        data = container_of(ref, struct fixed_file_data, refs);
6964        complete(&data->done);
6965}
6966
6967static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
6968{
6969        struct fixed_file_data *data = ctx->file_data;
6970        struct fixed_file_ref_node *ref_node = NULL;
6971        unsigned nr_tables, i;
6972
6973        if (!data)
6974                return -ENXIO;
6975
6976        spin_lock(&data->lock);
6977        ref_node = data->node;
6978        spin_unlock(&data->lock);
6979        if (ref_node)
6980                percpu_ref_kill(&ref_node->refs);
6981
6982        percpu_ref_kill(&data->refs);
6983
6984        /* wait for all refs nodes to complete */
6985        flush_delayed_work(&ctx->file_put_work);
6986        wait_for_completion(&data->done);
6987
6988        __io_sqe_files_unregister(ctx);
6989        nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
6990        for (i = 0; i < nr_tables; i++)
6991                kfree(data->table[i].files);
6992        kfree(data->table);
6993        percpu_ref_exit(&data->refs);
6994        kfree(data);
6995        ctx->file_data = NULL;
6996        ctx->nr_user_files = 0;
6997        return 0;
6998}
6999
7000static void io_put_sq_data(struct io_sq_data *sqd)

7001{
7002        if (refcount_dec_and_test(&sqd->refs)) {
7003                /*
7004                 * The park is a bit of a work-around, without it we get
7005                 * warning spews on shutdown with SQPOLL set and affinity
7006                 * set to a single CPU.
7007                 */
7008                if (sqd->thread) {
7009                        kthread_park(sqd->thread);
7010                        kthread_stop(sqd->thread);
7011                }
7012
7013                kfree(sqd);
7014        }
7015}
7016
7017static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
7018{
7019        struct io_ring_ctx *ctx_attach;
7020        struct io_sq_data *sqd;
7021        struct fd f;
7022
7023        f = fdget(p->wq_fd);
7024        if (!f.file)
7025                return ERR_PTR(-ENXIO);
7026        if (f.file->f_op != &io_uring_fops) {
7027                fdput(f);
7028                return ERR_PTR(-EINVAL);
7029        }
7030
7031        ctx_attach = f.file->private_data;
7032        sqd = ctx_attach->sq_data;
7033        if (!sqd) {
7034                fdput(f);
7035                return ERR_PTR(-EINVAL);
7036        }
7037
7038        refcount_inc(&sqd->refs);
7039        fdput(f);
7040        return sqd;
7041}
7042
7043static struct io_sq_data *io_get_sq_data(struct io_uring_params *p)
7044{
7045        struct io_sq_data *sqd;
7046
7047        if (p->flags & IORING_SETUP_ATTACH_WQ)
7048                return io_attach_sq_data(p);
7049
7050        sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
7051        if (!sqd)
7052                return ERR_PTR(-ENOMEM);
7053
7054        refcount_set(&sqd->refs, 1);
7055        INIT_LIST_HEAD(&sqd->ctx_list);
7056        INIT_LIST_HEAD(&sqd->ctx_new_list);
7057        mutex_init(&sqd->ctx_lock);
7058        mutex_init(&sqd->lock);
7059        init_waitqueue_head(&sqd->wait);
7060        return sqd;
7061}
7062
7063static void io_sq_thread_unpark(struct io_sq_data *sqd)
7064        __releases(&sqd->lock)
7065{
7066        if (!sqd->thread)
7067                return;
7068        kthread_unpark(sqd->thread);
7069        mutex_unlock(&sqd->lock);
7070}
7071
7072static void io_sq_thread_park(struct io_sq_data *sqd)
7073        __acquires(&sqd->lock)
7074{
7075        if (!sqd->thread)
7076                return;
7077        mutex_lock(&sqd->lock);
7078        kthread_park(sqd->thread);
7079}
7080
7081static void io_sq_thread_stop(struct io_ring_ctx *ctx)
7082{
7083        struct io_sq_data *sqd = ctx->sq_data;
7084
7085        if (sqd) {
7086                if (sqd->thread) {
7087                        /*
7088                         * We may arrive here from the error branch in
7089                         * io_sq_offload_create() where the kthread is created
7090                         * without being waked up, thus wake it up now to make
7091                         * sure the wait will complete.
7092                         */
7093                        wake_up_process(sqd->thread);
7094                        wait_for_completion(&ctx->sq_thread_comp);
7095
7096                        io_sq_thread_park(sqd);
7097                }
7098
7099                mutex_lock(&sqd->ctx_lock);
7100                list_del(&ctx->sqd_list);
7101                mutex_unlock(&sqd->ctx_lock);
7102
7103                if (sqd->thread) {
7104                        finish_wait(&sqd->wait, &ctx->sqo_wait_entry);
7105                        io_sq_thread_unpark(sqd);
7106                }
7107
7108                io_put_sq_data(sqd);
7109                ctx->sq_data = NULL;
7110        }
7111}
7112
7113static void io_finish_async(struct io_ring_ctx *ctx)
7114{
7115        io_sq_thread_stop(ctx);
7116
7117        if (ctx->io_wq) {
7118                io_wq_destroy(ctx->io_wq);
7119                ctx->io_wq = NULL;
7120        }
7121}
7122
7123#if defined(CONFIG_UNIX)
7124/*
7125 * Ensure the UNIX gc is aware of our file set, so we are certain that
7126 * the io_uring can be safely unregistered on process exit, even if we have
7127 * loops in the file referencing.
7128 */
7129static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
7130{
7131        struct sock *sk = ctx->ring_sock->sk;
7132        struct scm_fp_list *fpl;
7133        struct sk_buff *skb;
7134        int i, nr_files;
7135
7136        fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
7137        if (!fpl)
7138                return -ENOMEM;
7139
7140        skb = alloc_skb(0, GFP_KERNEL);
7141        if (!skb) {
7142                kfree(fpl);
7143                return -ENOMEM;
7144        }
7145
7146        skb->sk = sk;
7147
7148        nr_files = 0;
7149        fpl->user = get_uid(ctx->user);
7150        for (i = 0; i < nr; i++) {
7151                struct file *file = io_file_from_index(ctx, i + offset);
7152
7153                if (!file)
7154                        continue;
7155                fpl->fp[nr_files] = get_file(file);
7156                unix_inflight(fpl->user, fpl->fp[nr_files]);
7157                nr_files++;
7158        }
7159
7160        if (nr_files) {
7161                fpl->max = SCM_MAX_FD;
7162                fpl->count = nr_files;
7163                UNIXCB(skb).fp = fpl;
7164                skb->destructor = unix_destruct_scm;
7165                refcount_add(skb->truesize, &sk->sk_wmem_alloc);
7166                skb_queue_head(&sk->sk_receive_queue, skb);
7167
7168                for (i = 0; i < nr_files; i++)
7169                        fput(fpl->fp[i]);
7170        } else {
7171                kfree_skb(skb);
7172                kfree(fpl);
7173        }
7174
7175        return 0;
7176}
7177
7178/*
7179 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
7180 * causes regular reference counting to break down. We rely on the UNIX
7181 * garbage collection to take care of this problem for us.
7182 */
7183static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7184{
7185        unsigned left, total;
7186        int ret = 0;
7187
7188        total = 0;
7189        left = ctx->nr_user_files;
7190        while (left) {
7191                unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
7192
7193                ret = __io_sqe_files_scm(ctx, this_files, total);
7194                if (ret)
7195                        break;
7196                left -= this_files;
7197                total += this_files;
7198        }
7199
7200        if (!ret)
7201                return 0;
7202
7203        while (total < ctx->nr_user_files) {
7204                struct file *file = io_file_from_index(ctx, total);
7205
7206                if (file)
7207                        fput(file);
7208                total++;
7209        }
7210
7211        return ret;
7212}
7213#else
7214static int io_sqe_files_scm(struct io_ring_ctx *ctx)
7215{
7216        return 0;
7217}
7218#endif
7219
7220static int io_sqe_alloc_file_tables(struct fixed_file_data *file_data,
7221                                    unsigned nr_tables, unsigned nr_files)
7222{
7223        int i;
7224
7225        for (i = 0; i < nr_tables; i++) {
7226                struct fixed_file_table *table = &file_data->table[i];
7227                unsigned this_files;
7228
7229                this_files = min(nr_files, IORING_MAX_FILES_TABLE);
7230                table->files = kcalloc(this_files, sizeof(struct file *),
7231                                        GFP_KERNEL);
7232                if (!table->files)
7233                        break;
7234                nr_files -= this_files;
7235        }
7236
7237        if (i == nr_tables)
7238                return 0;
7239
7240        for (i = 0; i < nr_tables; i++) {
7241                struct fixed_file_table *table = &file_data->table[i];
7242                kfree(table->files);
7243        }
7244        return 1;
7245}
7246
7247static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
7248{
7249#if defined(CONFIG_UNIX)
7250        struct sock *sock = ctx->ring_sock->sk;
7251        struct sk_buff_head list, *head = &sock->sk_receive_queue;
7252        struct sk_buff *skb;
7253        int i;
7254
7255        __skb_queue_head_init(&list);
7256
7257        /*
7258         * Find the skb that holds this file in its SCM_RIGHTS. When found,
7259         * remove this entry and rearrange the file array.
7260         */
7261        skb = skb_dequeue(head);
7262        while (skb) {
7263                struct scm_fp_list *fp;
7264
7265                fp = UNIXCB(skb).fp;
7266                for (i = 0; i < fp->count; i++) {
7267                        int left;
7268
7269                        if (fp->fp[i] != file)
7270                                continue;
7271
7272                        unix_notinflight(fp->user, fp->fp[i]);
7273                        left = fp->count - 1 - i;
7274                        if (left) {
7275                                memmove(&fp->fp[i], &fp->fp[i + 1],
7276                                                left * sizeof(struct file *));
7277                        }
7278                        fp->count--;
7279                        if (!fp->count) {
7280                                kfree_skb(skb);
7281                                skb = NULL;
7282                        } else {
7283                                __skb_queue_tail(&list, skb);
7284                        }
7285                        fput(file);
7286                        file = NULL;
7287                        break;
7288                }
7289
7290                if (!file)
7291                        break;
7292
7293                __skb_queue_tail(&list, skb);
7294
7295                skb = skb_dequeue(head);
7296        }
7297
7298        if (skb_peek(&list)) {
7299                spin_lock_irq(&head->lock);
7300                while ((skb = __skb_dequeue(&list)) != NULL)
7301                        __skb_queue_tail(head, skb);
7302                spin_unlock_irq(&head->lock);
7303        }
7304#else
7305        fput(file);
7306#endif
7307}
7308
7309struct io_file_put {
7310        struct list_head list;
7311        struct file *file;
7312};
7313
7314static void __io_file_put_work(struct fixed_file_ref_node *ref_node)
7315{
7316        struct fixed_file_data *file_data = ref_node->file_data;
7317        struct io_ring_ctx *ctx = file_data->ctx;
7318        struct io_file_put *pfile, *tmp;
7319
7320        list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) {
7321                list_del(&pfile->list);
7322                io_ring_file_put(ctx, pfile->file);
7323                kfree(pfile);
7324        }
7325
7326        percpu_ref_exit(&ref_node->refs);
7327        kfree(ref_node);
7328        percpu_ref_put(&file_data->refs);
7329}
7330
7331static void io_file_put_work(struct work_struct *work)
7332{
7333        struct io_ring_ctx *ctx;
7334        struct llist_node *node;
7335
7336        ctx = container_of(work, struct io_ring_ctx, file_put_work.work);
7337        node = llist_del_all(&ctx->file_put_llist);
7338
7339        while (node) {
7340                struct fixed_file_ref_node *ref_node;
7341                struct llist_node *next = node->next;
7342
7343                ref_node = llist_entry(node, struct fixed_file_ref_node, llist);
7344                __io_file_put_work(ref_node);
7345                node = next;
7346        }
7347}
7348
7349static void io_file_data_ref_zero(struct percpu_ref *ref)
7350{
7351        struct fixed_file_ref_node *ref_node;
7352        struct fixed_file_data *data;
7353        struct io_ring_ctx *ctx;
7354        bool first_add = false;
7355        int delay = HZ;
7356
7357        ref_node = container_of(ref, struct fixed_file_ref_node, refs);
7358        data = ref_node->file_data;
7359        ctx = data->ctx;
7360
7361        spin_lock(&data->lock);
7362        ref_node->done = true;
7363
7364        while (!list_empty(&data->ref_list)) {
7365                ref_node = list_first_entry(&data->ref_list,
7366                                        struct fixed_file_ref_node, node);
7367                /* recycle ref nodes in order */
7368                if (!ref_node->done)
7369                        break;
7370                list_del(&ref_node->node);
7371                first_add |= llist_add(&ref_node->llist, &ctx->file_put_llist);
7372        }
7373        spin_unlock(&data->lock);
7374
7375        if (percpu_ref_is_dying(&data->refs))
7376                delay = 0;
7377
7378        if (!delay)
7379                mod_delayed_work(system_wq, &ctx->file_put_work, 0);
7380        else if (first_add)
7381                queue_delayed_work(system_wq, &ctx->file_put_work, delay);
7382}
7383
7384static struct fixed_file_ref_node *alloc_fixed_file_ref_node(
7385                        struct io_ring_ctx *ctx)
7386{
7387        struct fixed_file_ref_node *ref_node;
7388
7389        ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7390        if (!ref_node)
7391                return ERR_PTR(-ENOMEM);
7392
7393        if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero,
7394                            0, GFP_KERNEL)) {
7395                kfree(ref_node);
7396                return ERR_PTR(-ENOMEM);
7397        }
7398        INIT_LIST_HEAD(&ref_node->node);
7399        INIT_LIST_HEAD(&ref_node->file_list);
7400        ref_node->file_data = ctx->file_data;
7401        ref_node->done = false;
7402        return ref_node;
7403}
7404
7405static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node)
7406{
7407        percpu_ref_exit(&ref_node->refs);
7408        kfree(ref_node);
7409}
7410
7411static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
7412                                 unsigned nr_args)
7413{
7414        __s32 __user *fds = (__s32 __user *) arg;
7415        unsigned nr_tables, i;
7416        struct file *file;
7417        int fd, ret = -ENOMEM;
7418        struct fixed_file_ref_node *ref_node;
7419        struct fixed_file_data *file_data;
7420
7421        if (ctx->file_data)
7422                return -EBUSY;
7423        if (!nr_args)
7424                return -EINVAL;
7425        if (nr_args > IORING_MAX_FIXED_FILES)
7426                return -EMFILE;
7427
7428        file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
7429        if (!file_data)
7430                return -ENOMEM;
7431        file_data->ctx = ctx;
7432        init_completion(&file_data->done);
7433        INIT_LIST_HEAD(&file_data->ref_list);
7434        spin_lock_init(&file_data->lock);
7435
7436        nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
7437        file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
7438                                   GFP_KERNEL);
7439        if (!file_data->table)
7440                goto out_free;
7441
7442        if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
7443                                PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
7444                goto out_free;
7445
7446        if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
7447                goto out_ref;
7448        ctx->file_data = file_data;
7449
7450        for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
7451                struct fixed_file_table *table;
7452                unsigned index;
7453
7454                if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
7455                        ret = -EFAULT;
7456                        goto out_fput;
7457                }
7458                /* allow sparse sets */
7459                if (fd == -1)
7460                        continue;
7461
7462                file = fget(fd);
7463                ret = -EBADF;
7464                if (!file)
7465                        goto out_fput;
7466
7467                /*
7468                 * Don't allow io_uring instances to be registered. If UNIX
7469                 * isn't enabled, then this causes a reference cycle and this
7470                 * instance can never get freed. If UNIX is enabled we'll
7471                 * handle it just fine, but there's still no point in allowing
7472                 * a ring fd as it doesn't support regular read/write anyway.
7473                 */
7474                if (file->f_op == &io_uring_fops) {
7475                        fput(file);
7476                        goto out_fput;
7477                }
7478                table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7479                index = i & IORING_FILE_TABLE_MASK;
7480                table->files[index] = file;
7481        }
7482
7483        ret = io_sqe_files_scm(ctx);
7484        if (ret) {
7485                io_sqe_files_unregister(ctx);
7486                return ret;
7487        }
7488
7489        ref_node = alloc_fixed_file_ref_node(ctx);
7490        if (IS_ERR(ref_node)) {
7491                io_sqe_files_unregister(ctx);
7492                return PTR_ERR(ref_node);
7493        }
7494
7495        file_data->node = ref_node;
7496        spin_lock(&file_data->lock);
7497        list_add_tail(&ref_node->node, &file_data->ref_list);
7498        spin_unlock(&file_data->lock);
7499        percpu_ref_get(&file_data->refs);
7500        return ret;
7501out_fput:
7502        for (i = 0; i < ctx->nr_user_files; i++) {
7503                file = io_file_from_index(ctx, i);
7504                if (file)
7505                        fput(file);
7506        }
7507        for (i = 0; i < nr_tables; i++)
7508                kfree(file_data->table[i].files);
7509        ctx->nr_user_files = 0;
7510out_ref:
7511        percpu_ref_exit(&file_data->refs);
7512out_free:
7513        kfree(file_data->table);
7514        kfree(file_data);
7515        ctx->file_data = NULL;
7516        return ret;
7517}
7518
7519static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
7520                                int index)
7521{
7522#if defined(CONFIG_UNIX)
7523        struct sock *sock = ctx->ring_sock->sk;
7524        struct sk_buff_head *head = &sock->sk_receive_queue;
7525        struct sk_buff *skb;
7526
7527        /*
7528         * See if we can merge this file into an existing skb SCM_RIGHTS
7529         * file set. If there's no room, fall back to allocating a new skb
7530         * and filling it in.
7531         */
7532        spin_lock_irq(&head->lock);
7533        skb = skb_peek(head);
7534        if (skb) {
7535                struct scm_fp_list *fpl = UNIXCB(skb).fp;
7536
7537                if (fpl->count < SCM_MAX_FD) {
7538                        __skb_unlink(skb, head);
7539                        spin_unlock_irq(&head->lock);
7540                        fpl->fp[fpl->count] = get_file(file);
7541                        unix_inflight(fpl->user, fpl->fp[fpl->count]);
7542                        fpl->count++;
7543                        spin_lock_irq(&head->lock);
7544                        __skb_queue_head(head, skb);
7545                } else {
7546                        skb = NULL;
7547                }
7548        }
7549        spin_unlock_irq(&head->lock);
7550
7551        if (skb) {
7552                fput(file);
7553                return 0;
7554        }
7555
7556        return __io_sqe_files_scm(ctx, 1, index);
7557#else
7558        return 0;
7559#endif
7560}
7561
7562static int io_queue_file_removal(struct fixed_file_data *data,
7563                                 struct file *file)
7564{
7565        struct io_file_put *pfile;
7566        struct fixed_file_ref_node *ref_node = data->node;
7567
7568        pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
7569        if (!pfile)
7570                return -ENOMEM;
7571
7572        pfile->file = file;
7573        list_add(&pfile->list, &ref_node->file_list);
7574
7575        return 0;
7576}
7577
7578static int __io_sqe_files_update(struct io_ring_ctx *ctx,
7579                                 struct io_uring_files_update *up,
7580                                 unsigned nr_args)
7581{
7582        struct fixed_file_data *data = ctx->file_data;
7583        struct fixed_file_ref_node *ref_node;
7584        struct file *file;
7585        __s32 __user *fds;
7586        int fd, i, err;
7587        __u32 done;
7588        bool needs_switch = false;
7589
7590        if (check_add_overflow(up->offset, nr_args, &done))
7591                return -EOVERFLOW;
7592        if (done > ctx->nr_user_files)
7593                return -EINVAL;
7594
7595        ref_node = alloc_fixed_file_ref_node(ctx);
7596        if (IS_ERR(ref_node))
7597                return PTR_ERR(ref_node);
7598
7599        done = 0;
7600        fds = u64_to_user_ptr(up->fds);
7601        while (nr_args) {
7602                struct fixed_file_table *table;
7603                unsigned index;
7604
7605                err = 0;
7606                if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
7607                        err = -EFAULT;
7608                        break;
7609                }
7610                i = array_index_nospec(up->offset, ctx->nr_user_files);
7611                table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
7612                index = i & IORING_FILE_TABLE_MASK;
7613                if (table->files[index]) {
7614                        file = table->files[index];
7615                        err = io_queue_file_removal(data, file);
7616                        if (err)
7617                                break;
7618                        table->files[index] = NULL;
7619                        needs_switch = true;
7620                }
7621                if (fd != -1) {
7622                        file = fget(fd);
7623                        if (!file) {
7624                                err = -EBADF;
7625                                break;
7626                        }
7627                        /*
7628                         * Don't allow io_uring instances to be registered. If
7629                         * UNIX isn't enabled, then this causes a reference
7630                         * cycle and this instance can never get freed. If UNIX
7631                         * is enabled we'll handle it just fine, but there's
7632                         * still no point in allowing a ring fd as it doesn't
7633                         * support regular read/write anyway.
7634                         */
7635                        if (file->f_op == &io_uring_fops) {
7636                                fput(file);
7637                                err = -EBADF;
7638                                break;
7639                        }
7640                        table->files[index] = file;
7641                        err = io_sqe_file_register(ctx, file, i);
7642                        if (err) {
7643                                table->files[index] = NULL;
7644                                fput(file);
7645                                break;
7646                        }
7647                }
7648                nr_args--;
7649                done++;
7650                up->offset++;
7651        }
7652
7653        if (needs_switch) {
7654                percpu_ref_kill(&data->node->refs);
7655                spin_lock(&data->lock);
7656                list_add_tail(&ref_node->node, &data->ref_list);
7657                data->node = ref_node;
7658                spin_unlock(&data->lock);
7659                percpu_ref_get(&ctx->file_data->refs);
7660        } else
7661                destroy_fixed_file_ref_node(ref_node);
7662
7663        return done ? done : err;
7664}
7665
7666static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
7667                               unsigned nr_args)
7668{
7669        struct io_uring_files_update up;
7670
7671        if (!ctx->file_data)
7672                return -ENXIO;
7673        if (!nr_args)
7674                return -EINVAL;
7675        if (copy_from_user(&up, arg, sizeof(up)))
7676                return -EFAULT;
7677        if (up.resv)
7678                return -EINVAL;
7679
7680        return __io_sqe_files_update(ctx, &up, nr_args);
7681}
7682
7683static void io_free_work(struct io_wq_work *work)
7684{
7685        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
7686
7687        /* Consider that io_steal_work() relies on this ref */
7688        io_put_req(req);
7689}
7690
7691static int io_init_wq_offload(struct io_ring_ctx *ctx,
7692                              struct io_uring_params *p)
7693{
7694        struct io_wq_data data;
7695        struct fd f;
7696        struct io_ring_ctx *ctx_attach;
7697        unsigned int concurrency;
7698        int ret = 0;
7699
7700        data.user = ctx->user;
7701        data.free_work = io_free_work;
7702        data.do_work = io_wq_submit_work;
7703
7704        if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
7705                /* Do QD, or 4 * CPUS, whatever is smallest */
7706                concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
7707
7708                ctx->io_wq = io_wq_create(concurrency, &data);
7709                if (IS_ERR(ctx->io_wq)) {
7710                        ret = PTR_ERR(ctx->io_wq);
7711                        ctx->io_wq = NULL;
7712                }
7713                return ret;
7714        }
7715
7716        f = fdget(p->wq_fd);
7717        if (!f.file)
7718                return -EBADF;
7719
7720        if (f.file->f_op != &io_uring_fops) {
7721                ret = -EINVAL;
7722                goto out_fput;
7723        }
7724
7725        ctx_attach = f.file->private_data;
7726        /* @io_wq is protected by holding the fd */
7727        if (!io_wq_get(ctx_attach->io_wq, &data)) {
7728                ret = -EINVAL;
7729                goto out_fput;
7730        }
7731
7732        ctx->io_wq = ctx_attach->io_wq;
7733out_fput:
7734        fdput(f);
7735        return ret;
7736}
7737
7738static int io_uring_alloc_task_context(struct task_struct *task)
7739{
7740        struct io_uring_task *tctx;
7741        int ret;
7742
7743        tctx = kmalloc(sizeof(*tctx), GFP_KERNEL);
7744        if (unlikely(!tctx))
7745                return -ENOMEM;
7746
7747        ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
7748        if (unlikely(ret)) {
7749                kfree(tctx);
7750                return ret;
7751        }
7752
7753        xa_init(&tctx->xa);
7754        init_waitqueue_head(&tctx->wait);
7755        tctx->last = NULL;
7756        atomic_set(&tctx->in_idle, 0);
7757        tctx->sqpoll = false;
7758        io_init_identity(&tctx->__identity);
7759        tctx->identity = &tctx->__identity;
7760        task->io_uring = tctx;
7761        return 0;
7762}
7763
7764void __io_uring_free(struct task_struct *tsk)
7765{
7766        struct io_uring_task *tctx = tsk->io_uring;
7767
7768        WARN_ON_ONCE(!xa_empty(&tctx->xa));
7769        WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1);
7770        if (tctx->identity != &tctx->__identity)
7771                kfree(tctx->identity);
7772        percpu_counter_destroy(&tctx->inflight);
7773        kfree(tctx);
7774        tsk->io_uring = NULL;
7775}
7776
7777static int io_sq_offload_create(struct io_ring_ctx *ctx,
7778                                struct io_uring_params *p)
7779{
7780        int ret;
7781
7782        if (ctx->flags & IORING_SETUP_SQPOLL) {
7783                struct io_sq_data *sqd;
7784
7785                ret = -EPERM;
7786                if (!capable(CAP_SYS_ADMIN))
7787                        goto err;
7788
7789                sqd = io_get_sq_data(p);
7790                if (IS_ERR(sqd)) {
7791                        ret = PTR_ERR(sqd);
7792                        goto err;
7793                }
7794
7795                ctx->sq_data = sqd;
7796                io_sq_thread_park(sqd);
7797                mutex_lock(&sqd->ctx_lock);
7798                list_add(&ctx->sqd_list, &sqd->ctx_new_list);
7799                mutex_unlock(&sqd->ctx_lock);
7800                io_sq_thread_unpark(sqd);
7801
7802                ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
7803                if (!ctx->sq_thread_idle)
7804                        ctx->sq_thread_idle = HZ;
7805
7806                if (sqd->thread)
7807                        goto done;
7808
7809                if (p->flags & IORING_SETUP_SQ_AFF) {
7810                        int cpu = p->sq_thread_cpu;
7811
7812                        ret = -EINVAL;
7813                        if (cpu >= nr_cpu_ids)
7814                                goto err;
7815                        if (!cpu_online(cpu))
7816                                goto err;
7817
7818                        sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd,
7819                                                        cpu, "io_uring-sq");
7820                } else {
7821                        sqd->thread = kthread_create(io_sq_thread, sqd,
7822                                                        "io_uring-sq");
7823                }
7824                if (IS_ERR(sqd->thread)) {
7825                        ret = PTR_ERR(sqd->thread);
7826                        sqd->thread = NULL;
7827                        goto err;
7828                }
7829                ret = io_uring_alloc_task_context(sqd->thread);
7830                if (ret)
7831                        goto err;
7832        } else if (p->flags & IORING_SETUP_SQ_AFF) {
7833                /* Can't have SQ_AFF without SQPOLL */
7834                ret = -EINVAL;
7835                goto err;
7836        }
7837
7838done:
7839        ret = io_init_wq_offload(ctx, p);
7840        if (ret)
7841                goto err;
7842
7843        return 0;
7844err:
7845        io_finish_async(ctx);
7846        return ret;
7847}
7848
7849static void io_sq_offload_start(struct io_ring_ctx *ctx)
7850{
7851        struct io_sq_data *sqd = ctx->sq_data;
7852
7853        if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread)
7854                wake_up_process(sqd->thread);
7855}
7856
7857static inline void __io_unaccount_mem(struct user_struct *user,
7858                                      unsigned long nr_pages)
7859{
7860        atomic_long_sub(nr_pages, &user->locked_vm);
7861}
7862
7863static inline int __io_account_mem(struct user_struct *user,
7864                                   unsigned long nr_pages)
7865{
7866        unsigned long page_limit, cur_pages, new_pages;
7867
7868        /* Don't allow more pages than we can safely lock */
7869        page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
7870
7871        do {
7872                cur_pages = atomic_long_read(&user->locked_vm);
7873                new_pages = cur_pages + nr_pages;
7874                if (new_pages > page_limit)
7875                        return -ENOMEM;
7876        } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
7877                                        new_pages) != cur_pages);
7878
7879        return 0;
7880}
7881
7882static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7883                             enum io_mem_account acct)
7884{
7885        if (ctx->limit_mem)
7886                __io_unaccount_mem(ctx->user, nr_pages);
7887
7888        if (ctx->mm_account) {
7889                if (acct == ACCT_LOCKED)
7890                        ctx->mm_account->locked_vm -= nr_pages;
7891                else if (acct == ACCT_PINNED)
7892                        atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
7893        }
7894}
7895
7896static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
7897                          enum io_mem_account acct)
7898{
7899        int ret;
7900
7901        if (ctx->limit_mem) {
7902                ret = __io_account_mem(ctx->user, nr_pages);
7903                if (ret)
7904                        return ret;
7905        }
7906
7907        if (ctx->mm_account) {
7908                if (acct == ACCT_LOCKED)
7909                        ctx->mm_account->locked_vm += nr_pages;
7910                else if (acct == ACCT_PINNED)
7911                        atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
7912        }
7913
7914        return 0;
7915}
7916
7917static void io_mem_free(void *ptr)
7918{
7919        struct page *page;
7920
7921        if (!ptr)
7922                return;
7923
7924        page = virt_to_head_page(ptr);
7925        if (put_page_testzero(page))
7926                free_compound_page(page);
7927}
7928
7929static void *io_mem_alloc(size_t size)
7930{
7931        gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
7932                                __GFP_NORETRY;
7933
7934        return (void *) __get_free_pages(gfp_flags, get_order(size));
7935}
7936
7937static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
7938                                size_t *sq_offset)
7939{
7940        struct io_rings *rings;
7941        size_t off, sq_array_size;
7942
7943        off = struct_size(rings, cqes, cq_entries);
7944        if (off == SIZE_MAX)
7945                return SIZE_MAX;
7946
7947#ifdef CONFIG_SMP
7948        off = ALIGN(off, SMP_CACHE_BYTES);
7949        if (off == 0)
7950                return SIZE_MAX;
7951#endif
7952
7953        if (sq_offset)
7954                *sq_offset = off;
7955
7956        sq_array_size = array_size(sizeof(u32), sq_entries);
7957        if (sq_array_size == SIZE_MAX)
7958                return SIZE_MAX;
7959
7960        if (check_add_overflow(off, sq_array_size, &off))
7961                return SIZE_MAX;
7962
7963        return off;
7964}
7965
7966static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries)
7967{
7968        size_t pages;
7969
7970        pages = (size_t)1 << get_order(
7971                rings_size(sq_entries, cq_entries, NULL));
7972        pages += (size_t)1 << get_order(
7973                array_size(sizeof(struct io_uring_sqe), sq_entries));
7974
7975        return pages;
7976}
7977
7978static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
7979{
7980        int i, j;
7981
7982        if (!ctx->user_bufs)
7983                return -ENXIO;
7984
7985        for (i = 0; i < ctx->nr_user_bufs; i++) {
7986                struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
7987
7988                for (j = 0; j < imu->nr_bvecs; j++)
7989                        unpin_user_page(imu->bvec[j].bv_page);
7990
7991                if (imu->acct_pages)
7992                        io_unaccount_mem(ctx, imu->acct_pages, ACCT_PINNED);
7993                kvfree(imu->bvec);
7994                imu->nr_bvecs = 0;
7995        }
7996
7997        kfree(ctx->user_bufs);
7998        ctx->user_bufs = NULL;
7999        ctx->nr_user_bufs = 0;
8000        return 0;

8001}
8002
8003static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8004                       void __user *arg, unsigned index)
8005{
8006        struct iovec __user *src;
8007
8008#ifdef CONFIG_COMPAT
8009        if (ctx->compat) {
8010                struct compat_iovec __user *ciovs;
8011                struct compat_iovec ciov;
8012
8013                ciovs = (struct compat_iovec __user *) arg;
8014                if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8015                        return -EFAULT;
8016
8017                dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8018                dst->iov_len = ciov.iov_len;
8019                return 0;
8020        }
8021#endif
8022        src = (struct iovec __user *) arg;
8023        if (copy_from_user(dst, &src[index], sizeof(*dst)))
8024                return -EFAULT;
8025        return 0;
8026}
8027
8028/*
8029 * Not super efficient, but this is just a registration time. And we do cache
8030 * the last compound head, so generally we'll only do a full search if we don't
8031 * match that one.
8032 *
8033 * We check if the given compound head page has already been accounted, to
8034 * avoid double accounting it. This allows us to account the full size of the
8035 * page, not just the constituent pages of a huge page.
8036 */
8037static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8038                                  int nr_pages, struct page *hpage)
8039{
8040        int i, j;
8041
8042        /* check current page array */
8043        for (i = 0; i < nr_pages; i++) {
8044                if (!PageCompound(pages[i]))
8045                        continue;
8046                if (compound_head(pages[i]) == hpage)
8047                        return true;
8048        }
8049
8050        /* check previously registered pages */
8051        for (i = 0; i < ctx->nr_user_bufs; i++) {
8052                struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8053
8054                for (j = 0; j < imu->nr_bvecs; j++) {
8055                        if (!PageCompound(imu->bvec[j].bv_page))
8056                                continue;
8057                        if (compound_head(imu->bvec[j].bv_page) == hpage)
8058                                return true;
8059                }
8060        }
8061
8062        return false;
8063}
8064
8065static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
8066                                 int nr_pages, struct io_mapped_ubuf *imu,
8067                                 struct page **last_hpage)
8068{
8069        int i, ret;
8070
8071        for (i = 0; i < nr_pages; i++) {
8072                if (!PageCompound(pages[i])) {
8073                        imu->acct_pages++;
8074                } else {
8075                        struct page *hpage;
8076
8077                        hpage = compound_head(pages[i]);
8078                        if (hpage == *last_hpage)
8079                                continue;
8080                        *last_hpage = hpage;
8081                        if (headpage_already_acct(ctx, pages, i, hpage))
8082                                continue;
8083                        imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
8084                }
8085        }
8086
8087        if (!imu->acct_pages)
8088                return 0;
8089
8090        ret = io_account_mem(ctx, imu->acct_pages, ACCT_PINNED);
8091        if (ret)
8092                imu->acct_pages = 0;
8093        return ret;
8094}
8095
8096static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
8097                                  unsigned nr_args)
8098{
8099        struct vm_area_struct **vmas = NULL;
8100        struct page **pages = NULL;
8101        struct page *last_hpage = NULL;
8102        int i, j, got_pages = 0;
8103        int ret = -EINVAL;
8104
8105        if (ctx->user_bufs)
8106                return -EBUSY;
8107        if (!nr_args || nr_args > UIO_MAXIOV)
8108                return -EINVAL;
8109
8110        ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
8111                                        GFP_KERNEL);
8112        if (!ctx->user_bufs)
8113                return -ENOMEM;
8114
8115        for (i = 0; i < nr_args; i++) {
8116                struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
8117                unsigned long off, start, end, ubuf;
8118                int pret, nr_pages;
8119                struct iovec iov;
8120                size_t size;
8121
8122                ret = io_copy_iov(ctx, &iov, arg, i);
8123                if (ret)
8124                        goto err;
8125
8126                /*
8127                 * Don't impose further limits on the size and buffer
8128                 * constraints here, we'll -EINVAL later when IO is
8129                 * submitted if they are wrong.
8130                 */
8131                ret = -EFAULT;
8132                if (!iov.iov_base || !iov.iov_len)
8133                        goto err;
8134
8135                /* arbitrary limit, but we need something */
8136                if (iov.iov_len > SZ_1G)
8137                        goto err;
8138
8139                ubuf = (unsigned long) iov.iov_base;
8140                end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
8141                start = ubuf >> PAGE_SHIFT;
8142                nr_pages = end - start;
8143
8144                ret = 0;
8145                if (!pages || nr_pages > got_pages) {
8146                        kvfree(vmas);
8147                        kvfree(pages);
8148                        pages = kvmalloc_array(nr_pages, sizeof(struct page *),
8149                                                GFP_KERNEL);
8150                        vmas = kvmalloc_array(nr_pages,
8151                                        sizeof(struct vm_area_struct *),
8152                                        GFP_KERNEL);
8153                        if (!pages || !vmas) {
8154                                ret = -ENOMEM;
8155                                goto err;
8156                        }
8157                        got_pages = nr_pages;
8158                }
8159
8160                imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
8161                                                GFP_KERNEL);
8162                ret = -ENOMEM;
8163                if (!imu->bvec)
8164                        goto err;
8165
8166                ret = 0;
8167                mmap_read_lock(current->mm);
8168                pret = pin_user_pages(ubuf, nr_pages,
8169                                      FOLL_WRITE | FOLL_LONGTERM,
8170                                      pages, vmas);
8171                if (pret == nr_pages) {
8172                        /* don't support file backed memory */
8173                        for (j = 0; j < nr_pages; j++) {
8174                                struct vm_area_struct *vma = vmas[j];
8175
8176                                if (vma->vm_file &&
8177                                    !is_file_hugepages(vma->vm_file)) {
8178                                        ret = -EOPNOTSUPP;
8179                                        break;
8180                                }
8181                        }
8182                } else {
8183                        ret = pret < 0 ? pret : -EFAULT;
8184                }
8185                mmap_read_unlock(current->mm);
8186                if (ret) {
8187                        /*
8188                         * if we did partial map, or found file backed vmas,
8189                         * release any pages we did get
8190                         */
8191                        if (pret > 0)
8192                                unpin_user_pages(pages, pret);
8193                        kvfree(imu->bvec);
8194                        goto err;
8195                }
8196
8197                ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
8198                if (ret) {
8199                        unpin_user_pages(pages, pret);
8200                        kvfree(imu->bvec);
8201                        goto err;
8202                }
8203
8204                off = ubuf & ~PAGE_MASK;
8205                size = iov.iov_len;
8206                for (j = 0; j < nr_pages; j++) {
8207                        size_t vec_len;
8208
8209                        vec_len = min_t(size_t, size, PAGE_SIZE - off);
8210                        imu->bvec[j].bv_page = pages[j];
8211                        imu->bvec[j].bv_len = vec_len;
8212                        imu->bvec[j].bv_offset = off;
8213                        off = 0;
8214                        size -= vec_len;
8215                }
8216                /* store original address for later verification */
8217                imu->ubuf = ubuf;
8218                imu->len = iov.iov_len;
8219                imu->nr_bvecs = nr_pages;
8220
8221                ctx->nr_user_bufs++;
8222        }
8223        kvfree(pages);
8224        kvfree(vmas);
8225        return 0;
8226err:
8227        kvfree(pages);
8228        kvfree(vmas);
8229        io_sqe_buffer_unregister(ctx);
8230        return ret;
8231}
8232
8233static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
8234{
8235        __s32 __user *fds = arg;
8236        int fd;
8237
8238        if (ctx->cq_ev_fd)
8239                return -EBUSY;
8240
8241        if (copy_from_user(&fd, fds, sizeof(*fds)))
8242                return -EFAULT;
8243
8244        ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
8245        if (IS_ERR(ctx->cq_ev_fd)) {
8246                int ret = PTR_ERR(ctx->cq_ev_fd);
8247                ctx->cq_ev_fd = NULL;
8248                return ret;
8249        }
8250
8251        return 0;
8252}
8253
8254static int io_eventfd_unregister(struct io_ring_ctx *ctx)
8255{
8256        if (ctx->cq_ev_fd) {
8257                eventfd_ctx_put(ctx->cq_ev_fd);
8258                ctx->cq_ev_fd = NULL;
8259                return 0;
8260        }
8261
8262        return -ENXIO;
8263}
8264
8265static int __io_destroy_buffers(int id, void *p, void *data)
8266{
8267        struct io_ring_ctx *ctx = data;
8268        struct io_buffer *buf = p;
8269
8270        __io_remove_buffers(ctx, buf, id, -1U);
8271        return 0;
8272}
8273
8274static void io_destroy_buffers(struct io_ring_ctx *ctx)
8275{
8276        idr_for_each(&ctx->io_buffer_idr, __io_destroy_buffers, ctx);
8277        idr_destroy(&ctx->io_buffer_idr);
8278}
8279
8280static void io_ring_ctx_free(struct io_ring_ctx *ctx)
8281{
8282        io_finish_async(ctx);
8283        io_sqe_buffer_unregister(ctx);
8284
8285        if (ctx->sqo_task) {
8286                put_task_struct(ctx->sqo_task);
8287                ctx->sqo_task = NULL;
8288                mmdrop(ctx->mm_account);
8289                ctx->mm_account = NULL;
8290        }
8291
8292#ifdef CONFIG_BLK_CGROUP
8293        if (ctx->sqo_blkcg_css)
8294                css_put(ctx->sqo_blkcg_css);
8295#endif
8296
8297        io_sqe_files_unregister(ctx);
8298        io_eventfd_unregister(ctx);
8299        io_destroy_buffers(ctx);
8300        idr_destroy(&ctx->personality_idr);
8301
8302#if defined(CONFIG_UNIX)
8303        if (ctx->ring_sock) {
8304                ctx->ring_sock->file = NULL; /* so that iput() is called */
8305                sock_release(ctx->ring_sock);
8306        }
8307#endif
8308
8309        io_mem_free(ctx->rings);
8310        io_mem_free(ctx->sq_sqes);
8311
8312        percpu_ref_exit(&ctx->refs);
8313        free_uid(ctx->user);
8314        put_cred(ctx->creds);
8315        kfree(ctx->cancel_hash);
8316        kmem_cache_free(req_cachep, ctx->fallback_req);
8317        kfree(ctx);
8318}
8319
8320static __poll_t io_uring_poll(struct file *file, poll_table *wait)
8321{
8322        struct io_ring_ctx *ctx = file->private_data;
8323        __poll_t mask = 0;
8324
8325        poll_wait(file, &ctx->cq_wait, wait);
8326        /*
8327         * synchronizes with barrier from wq_has_sleeper call in
8328         * io_commit_cqring
8329         */
8330        smp_rmb();
8331        if (!io_sqring_full(ctx))
8332                mask |= EPOLLOUT | EPOLLWRNORM;
8333        if (io_cqring_events(ctx, false))
8334                mask |= EPOLLIN | EPOLLRDNORM;
8335
8336        return mask;
8337}
8338
8339static int io_uring_fasync(int fd, struct file *file, int on)
8340{
8341        struct io_ring_ctx *ctx = file->private_data;
8342
8343        return fasync_helper(fd, file, on, &ctx->cq_fasync);
8344}
8345
8346static int io_remove_personalities(int id, void *p, void *data)
8347{
8348        struct io_ring_ctx *ctx = data;
8349        struct io_identity *iod;
8350
8351        iod = idr_remove(&ctx->personality_idr, id);
8352        if (iod) {
8353                put_cred(iod->creds);
8354                if (refcount_dec_and_test(&iod->count))
8355                        kfree(iod);
8356        }
8357        return 0;
8358}
8359
8360static void io_ring_exit_work(struct work_struct *work)
8361{
8362        struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
8363                                               exit_work);
8364
8365        /*
8366         * If we're doing polled IO and end up having requests being
8367         * submitted async (out-of-line), then completions can come in while
8368         * we're waiting for refs to drop. We need to reap these manually,
8369         * as nobody else will be looking for them.
8370         */
8371        do {
8372                if (ctx->rings)
8373                        io_cqring_overflow_flush(ctx, true, NULL, NULL);
8374                io_iopoll_try_reap_events(ctx);
8375        } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
8376        io_ring_ctx_free(ctx);
8377}
8378
8379static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
8380{
8381        mutex_lock(&ctx->uring_lock);
8382        percpu_ref_kill(&ctx->refs);
8383        mutex_unlock(&ctx->uring_lock);
8384
8385        io_kill_timeouts(ctx, NULL);
8386        io_poll_remove_all(ctx, NULL);
8387
8388        if (ctx->io_wq)
8389                io_wq_cancel_all(ctx->io_wq);
8390
8391        /* if we failed setting up the ctx, we might not have any rings */
8392        if (ctx->rings)
8393                io_cqring_overflow_flush(ctx, true, NULL, NULL);
8394        io_iopoll_try_reap_events(ctx);
8395        idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
8396
8397        /*
8398         * Do this upfront, so we won't have a grace period where the ring
8399         * is closed but resources aren't reaped yet. This can cause
8400         * spurious failure in setting up a new ring.
8401         */
8402        io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
8403                         ACCT_LOCKED);
8404
8405        INIT_WORK(&ctx->exit_work, io_ring_exit_work);
8406        /*
8407         * Use system_unbound_wq to avoid spawning tons of event kworkers
8408         * if we're exiting a ton of rings at the same time. It just adds
8409         * noise and overhead, there's no discernable change in runtime
8410         * over using system_wq.
8411         */
8412        queue_work(system_unbound_wq, &ctx->exit_work);
8413}
8414
8415static int io_uring_release(struct inode *inode, struct file *file)
8416{
8417        struct io_ring_ctx *ctx = file->private_data;
8418
8419        file->private_data = NULL;
8420        io_ring_ctx_wait_and_kill(ctx);
8421        return 0;
8422}
8423
8424static bool io_wq_files_match(struct io_wq_work *work, void *data)
8425{
8426        struct files_struct *files = data;
8427
8428        return !files || ((work->flags & IO_WQ_WORK_FILES) &&
8429                                work->identity->files == files);
8430}
8431
8432/*
8433 * Returns true if 'preq' is the link parent of 'req'
8434 */
8435static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
8436{
8437        struct io_kiocb *link;
8438
8439        if (!(preq->flags & REQ_F_LINK_HEAD))
8440                return false;
8441
8442        list_for_each_entry(link, &preq->link_list, link_list) {
8443                if (link == req)
8444                        return true;
8445        }
8446
8447        return false;
8448}
8449
8450/*
8451 * We're looking to cancel 'req' because it's holding on to our files, but
8452 * 'req' could be a link to another request. See if it is, and cancel that
8453 * parent request if so.
8454 */
8455static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
8456{
8457        struct hlist_node *tmp;
8458        struct io_kiocb *preq;
8459        bool found = false;
8460        int i;
8461
8462        spin_lock_irq(&ctx->completion_lock);
8463        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
8464                struct hlist_head *list;
8465
8466                list = &ctx->cancel_hash[i];
8467                hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
8468                        found = io_match_link(preq, req);
8469                        if (found) {
8470                                io_poll_remove_one(preq);
8471                                break;
8472                        }
8473                }
8474        }
8475        spin_unlock_irq(&ctx->completion_lock);
8476        return found;
8477}
8478
8479static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
8480                                   struct io_kiocb *req)
8481{
8482        struct io_kiocb *preq;
8483        bool found = false;
8484
8485        spin_lock_irq(&ctx->completion_lock);
8486        list_for_each_entry(preq, &ctx->timeout_list, timeout.list) {
8487                found = io_match_link(preq, req);
8488                if (found) {
8489                        __io_timeout_cancel(preq);
8490                        break;
8491                }
8492        }
8493        spin_unlock_irq(&ctx->completion_lock);
8494        return found;
8495}
8496
8497static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
8498{
8499        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8500        bool ret;
8501
8502        if (req->flags & REQ_F_LINK_TIMEOUT) {
8503                unsigned long flags;
8504                struct io_ring_ctx *ctx = req->ctx;
8505
8506                /* protect against races with linked timeouts */
8507                spin_lock_irqsave(&ctx->completion_lock, flags);
8508                ret = io_match_link(req, data);
8509                spin_unlock_irqrestore(&ctx->completion_lock, flags);
8510        } else {
8511                ret = io_match_link(req, data);
8512        }
8513        return ret;
8514}
8515
8516static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
8517{
8518        enum io_wq_cancel cret;
8519
8520        /* cancel this particular work, if it's running */
8521        cret = io_wq_cancel_work(ctx->io_wq, &req->work);
8522        if (cret != IO_WQ_CANCEL_NOTFOUND)
8523                return;
8524
8525        /* find links that hold this pending, cancel those */
8526        cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
8527        if (cret != IO_WQ_CANCEL_NOTFOUND)
8528                return;
8529
8530        /* if we have a poll link holding this pending, cancel that */
8531        if (io_poll_remove_link(ctx, req))
8532                return;
8533
8534        /* final option, timeout link is holding this req pending */
8535        io_timeout_remove_link(ctx, req);
8536}
8537
8538static void io_cancel_defer_files(struct io_ring_ctx *ctx,
8539                                  struct task_struct *task,
8540                                  struct files_struct *files)
8541{
8542        struct io_defer_entry *de = NULL;
8543        LIST_HEAD(list);
8544
8545        spin_lock_irq(&ctx->completion_lock);
8546        list_for_each_entry_reverse(de, &ctx->defer_list, list) {
8547                if (io_task_match(de->req, task) &&
8548                    io_match_files(de->req, files)) {
8549                        list_cut_position(&list, &ctx->defer_list, &de->list);
8550                        break;
8551                }
8552        }
8553        spin_unlock_irq(&ctx->completion_lock);
8554
8555        while (!list_empty(&list)) {
8556                de = list_first_entry(&list, struct io_defer_entry, list);
8557                list_del_init(&de->list);
8558                req_set_fail_links(de->req);
8559                io_put_req(de->req);
8560                io_req_complete(de->req, -ECANCELED);
8561                kfree(de);
8562        }
8563}
8564
8565/*
8566 * Returns true if we found and killed one or more files pinning requests
8567 */
8568static bool io_uring_cancel_files(struct io_ring_ctx *ctx,
8569                                  struct files_struct *files)
8570{
8571        if (list_empty_careful(&ctx->inflight_list))
8572                return false;
8573
8574        /* cancel all at once, should be faster than doing it one by one*/
8575        io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
8576
8577        while (!list_empty_careful(&ctx->inflight_list)) {
8578                struct io_kiocb *cancel_req = NULL, *req;
8579                DEFINE_WAIT(wait);
8580
8581                spin_lock_irq(&ctx->inflight_lock);
8582                list_for_each_entry(req, &ctx->inflight_list, inflight_entry) {
8583                        if (files && (req->work.flags & IO_WQ_WORK_FILES) &&
8584                            req->work.identity->files != files)
8585                                continue;
8586                        /* req is being completed, ignore */
8587                        if (!refcount_inc_not_zero(&req->refs))
8588                                continue;
8589                        cancel_req = req;
8590                        break;
8591                }
8592                if (cancel_req)
8593                        prepare_to_wait(&ctx->inflight_wait, &wait,
8594                                                TASK_UNINTERRUPTIBLE);
8595                spin_unlock_irq(&ctx->inflight_lock);
8596
8597                /* We need to keep going until we don't find a matching req */
8598                if (!cancel_req)
8599                        break;
8600                /* cancel this request, or head link requests */
8601                io_attempt_cancel(ctx, cancel_req);
8602                io_put_req(cancel_req);
8603                /* cancellations _may_ trigger task work */
8604                io_run_task_work();
8605                schedule();
8606                finish_wait(&ctx->inflight_wait, &wait);
8607        }
8608
8609        return true;
8610}
8611
8612static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
8613{
8614        struct io_kiocb *req = container_of(work, struct io_kiocb, work);
8615        struct task_struct *task = data;
8616
8617        return io_task_match(req, task);
8618}
8619
8620static bool __io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
8621                                            struct task_struct *task,
8622                                            struct files_struct *files)
8623{
8624        bool ret;
8625
8626        ret = io_uring_cancel_files(ctx, files);
8627        if (!files) {
8628                enum io_wq_cancel cret;
8629
8630                cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, task, true);
8631                if (cret != IO_WQ_CANCEL_NOTFOUND)
8632                        ret = true;
8633
8634                /* SQPOLL thread does its own polling */
8635                if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
8636                        while (!list_empty_careful(&ctx->iopoll_list)) {
8637                                io_iopoll_try_reap_events(ctx);
8638                                ret = true;
8639                        }
8640                }
8641
8642                ret |= io_poll_remove_all(ctx, task);
8643                ret |= io_kill_timeouts(ctx, task);
8644        }
8645
8646        return ret;
8647}
8648
8649/*
8650 * We need to iteratively cancel requests, in case a request has dependent
8651 * hard links. These persist even for failure of cancelations, hence keep
8652 * looping until none are found.
8653 */
8654static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx,
8655                                          struct files_struct *files)
8656{
8657        struct task_struct *task = current;
8658
8659        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
8660                task = ctx->sq_data->thread;
8661                atomic_inc(&task->io_uring->in_idle);
8662                io_sq_thread_park(ctx->sq_data);
8663        }
8664
8665        if (files)
8666                io_cancel_defer_files(ctx, NULL, files);
8667        else
8668                io_cancel_defer_files(ctx, task, NULL);
8669
8670        io_cqring_overflow_flush(ctx, true, task, files);
8671
8672        while (__io_uring_cancel_task_requests(ctx, task, files)) {
8673                io_run_task_work();
8674                cond_resched();
8675        }
8676
8677        if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) {
8678                atomic_dec(&task->io_uring->in_idle);
8679                /*
8680                 * If the files that are going away are the ones in the thread
8681                 * identity, clear them out.
8682                 */
8683                if (task->io_uring->identity->files == files)
8684                        task->io_uring->identity->files = NULL;
8685                io_sq_thread_unpark(ctx->sq_data);
8686        }
8687}
8688
8689/*
8690 * Note that this task has used io_uring. We use it for cancelation purposes.
8691 */
8692static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file)
8693{
8694        struct io_uring_task *tctx = current->io_uring;
8695
8696        if (unlikely(!tctx)) {
8697                int ret;
8698
8699                ret = io_uring_alloc_task_context(current);
8700                if (unlikely(ret))
8701                        return ret;
8702                tctx = current->io_uring;
8703        }
8704        if (tctx->last != file) {
8705                void *old = xa_load(&tctx->xa, (unsigned long)file);
8706
8707                if (!old) {
8708                        get_file(file);
8709                        xa_store(&tctx->xa, (unsigned long)file, file, GFP_KERNEL);
8710                }
8711                tctx->last = file;
8712        }
8713
8714        /*
8715         * This is race safe in that the task itself is doing this, hence it
8716         * cannot be going through the exit/cancel paths at the same time.
8717         * This cannot be modified while exit/cancel is running.
8718         */
8719        if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL))
8720                tctx->sqpoll = true;
8721
8722        return 0;
8723}
8724
8725/*
8726 * Remove this io_uring_file -> task mapping.
8727 */
8728static void io_uring_del_task_file(struct file *file)
8729{
8730        struct io_uring_task *tctx = current->io_uring;
8731
8732        if (tctx->last == file)
8733                tctx->last = NULL;
8734        file = xa_erase(&tctx->xa, (unsigned long)file);
8735        if (file)
8736                fput(file);
8737}
8738
8739/*
8740 * Drop task note for this file if we're the only ones that hold it after
8741 * pending fput()
8742 */
8743static void io_uring_attempt_task_drop(struct file *file)
8744{
8745        if (!current->io_uring)
8746                return;
8747        /*
8748         * fput() is pending, will be 2 if the only other ref is our potential
8749         * task file note. If the task is exiting, drop regardless of count.
8750         */
8751        if (fatal_signal_pending(current) || (current->flags & PF_EXITING) ||
8752            atomic_long_read(&file->f_count) == 2)
8753                io_uring_del_task_file(file);
8754}
8755
8756void __io_uring_files_cancel(struct files_struct *files)
8757{
8758        struct io_uring_task *tctx = current->io_uring;
8759        struct file *file;
8760        unsigned long index;
8761
8762        /* make sure overflow events are dropped */
8763        atomic_inc(&tctx->in_idle);
8764
8765        xa_for_each(&tctx->xa, index, file) {
8766                struct io_ring_ctx *ctx = file->private_data;
8767
8768                io_uring_cancel_task_requests(ctx, files);
8769                if (files)
8770                        io_uring_del_task_file(file);
8771        }
8772
8773        atomic_dec(&tctx->in_idle);
8774}
8775
8776static s64 tctx_inflight(struct io_uring_task *tctx)
8777{
8778        unsigned long index;
8779        struct file *file;
8780        s64 inflight;
8781
8782        inflight = percpu_counter_sum(&tctx->inflight);
8783        if (!tctx->sqpoll)
8784                return inflight;
8785
8786        /*
8787         * If we have SQPOLL rings, then we need to iterate and find them, and
8788         * add the pending count for those.
8789         */
8790        xa_for_each(&tctx->xa, index, file) {
8791                struct io_ring_ctx *ctx = file->private_data;
8792
8793                if (ctx->flags & IORING_SETUP_SQPOLL) {
8794                        struct io_uring_task *__tctx = ctx->sqo_task->io_uring;
8795
8796                        inflight += percpu_counter_sum(&__tctx->inflight);
8797                }
8798        }
8799
8800        return inflight;
8801}
8802
8803/*
8804 * Find any io_uring fd that this task has registered or done IO on, and cancel
8805 * requests.
8806 */
8807void __io_uring_task_cancel(void)
8808{
8809        struct io_uring_task *tctx = current->io_uring;
8810        DEFINE_WAIT(wait);
8811        s64 inflight;
8812
8813        /* make sure overflow events are dropped */
8814        atomic_inc(&tctx->in_idle);
8815
8816        do {
8817                /* read completions before cancelations */
8818                inflight = tctx_inflight(tctx);
8819                if (!inflight)
8820                        break;
8821                __io_uring_files_cancel(NULL);
8822
8823                prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE);
8824
8825                /*
8826                 * If we've seen completions, retry. This avoids a race where
8827                 * a completion comes in before we did prepare_to_wait().
8828                 */
8829                if (inflight != tctx_inflight(tctx))
8830                        continue;
8831                schedule();
8832        } while (1);
8833
8834        finish_wait(&tctx->wait, &wait);
8835        atomic_dec(&tctx->in_idle);
8836}
8837
8838static int io_uring_flush(struct file *file, void *data)
8839{
8840        io_uring_attempt_task_drop(file);
8841        return 0;
8842}
8843
8844static void *io_uring_validate_mmap_request(struct file *file,
8845                                            loff_t pgoff, size_t sz)
8846{
8847        struct io_ring_ctx *ctx = file->private_data;
8848        loff_t offset = pgoff << PAGE_SHIFT;
8849        struct page *page;
8850        void *ptr;
8851
8852        switch (offset) {
8853        case IORING_OFF_SQ_RING:
8854        case IORING_OFF_CQ_RING:
8855                ptr = ctx->rings;
8856                break;
8857        case IORING_OFF_SQES:
8858                ptr = ctx->sq_sqes;
8859                break;
8860        default:
8861                return ERR_PTR(-EINVAL);
8862        }
8863
8864        page = virt_to_head_page(ptr);
8865        if (sz > page_size(page))
8866                return ERR_PTR(-EINVAL);
8867
8868        return ptr;
8869}
8870
8871#ifdef CONFIG_MMU
8872
8873static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8874{
8875        size_t sz = vma->vm_end - vma->vm_start;
8876        unsigned long pfn;
8877        void *ptr;
8878
8879        ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
8880        if (IS_ERR(ptr))
8881                return PTR_ERR(ptr);
8882
8883        pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
8884        return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
8885}
8886
8887#else /* !CONFIG_MMU */
8888
8889static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
8890{
8891        return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
8892}
8893
8894static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
8895{
8896        return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
8897}
8898
8899static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
8900        unsigned long addr, unsigned long len,
8901        unsigned long pgoff, unsigned long flags)
8902{
8903        void *ptr;
8904
8905        ptr = io_uring_validate_mmap_request(file, pgoff, len);
8906        if (IS_ERR(ptr))
8907                return PTR_ERR(ptr);
8908
8909        return (unsigned long) ptr;
8910}
8911
8912#endif /* !CONFIG_MMU */
8913
8914static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
8915{
8916        DEFINE_WAIT(wait);
8917
8918        do {
8919                if (!io_sqring_full(ctx))
8920                        break;
8921
8922                prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
8923
8924                if (!io_sqring_full(ctx))
8925                        break;
8926
8927                schedule();
8928        } while (!signal_pending(current));
8929
8930        finish_wait(&ctx->sqo_sq_wait, &wait);
8931}
8932
8933SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
8934                u32, min_complete, u32, flags, const sigset_t __user *, sig,
8935                size_t, sigsz)
8936{
8937        struct io_ring_ctx *ctx;
8938        long ret = -EBADF;
8939        int submitted = 0;
8940        struct fd f;
8941
8942        io_run_task_work();
8943
8944        if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
8945                        IORING_ENTER_SQ_WAIT))
8946                return -EINVAL;
8947
8948        f = fdget(fd);
8949        if (!f.file)
8950                return -EBADF;
8951
8952        ret = -EOPNOTSUPP;
8953        if (f.file->f_op != &io_uring_fops)
8954                goto out_fput;
8955
8956        ret = -ENXIO;
8957        ctx = f.file->private_data;
8958        if (!percpu_ref_tryget(&ctx->refs))
8959                goto out_fput;
8960
8961        ret = -EBADFD;
8962        if (ctx->flags & IORING_SETUP_R_DISABLED)
8963                goto out;
8964
8965        /*
8966         * For SQ polling, the thread will do all submissions and completions.
8967         * Just return the requested submit count, and wake the thread if
8968         * we were asked to.
8969         */
8970        ret = 0;
8971        if (ctx->flags & IORING_SETUP_SQPOLL) {
8972                if (!list_empty_careful(&ctx->cq_overflow_list))
8973                        io_cqring_overflow_flush(ctx, false, NULL, NULL);
8974                if (flags & IORING_ENTER_SQ_WAKEUP)
8975                        wake_up(&ctx->sq_data->wait);
8976                if (flags & IORING_ENTER_SQ_WAIT)
8977                        io_sqpoll_wait_sq(ctx);
8978                submitted = to_submit;
8979        } else if (to_submit) {
8980                ret = io_uring_add_task_file(ctx, f.file);
8981                if (unlikely(ret))
8982                        goto out;
8983                mutex_lock(&ctx->uring_lock);
8984                submitted = io_submit_sqes(ctx, to_submit);
8985                mutex_unlock(&ctx->uring_lock);
8986
8987                if (submitted != to_submit)
8988                        goto out;
8989        }
8990        if (flags & IORING_ENTER_GETEVENTS) {
8991                min_complete = min(min_complete, ctx->cq_entries);
8992
8993                /*
8994                 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
8995                 * space applications don't need to do io completion events
8996                 * polling again, they can rely on io_sq_thread to do polling
8997                 * work, which can reduce cpu usage and uring_lock contention.
8998                 */
8999                if (ctx->flags & IORING_SETUP_IOPOLL &&
9000                    !(ctx->flags & IORING_SETUP_SQPOLL)) {

9001                        ret = io_iopoll_check(ctx, min_complete);
9002                } else {
9003                        ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
9004                }
9005        }
9006
9007out:
9008        percpu_ref_put(&ctx->refs);
9009out_fput:
9010        fdput(f);
9011        return submitted ? submitted : ret;
9012}
9013
9014#ifdef CONFIG_PROC_FS
9015static int io_uring_show_cred(int id, void *p, void *data)
9016{
9017        struct io_identity *iod = p;
9018        const struct cred *cred = iod->creds;
9019        struct seq_file *m = data;
9020        struct user_namespace *uns = seq_user_ns(m);
9021        struct group_info *gi;
9022        kernel_cap_t cap;
9023        unsigned __capi;
9024        int g;
9025
9026        seq_printf(m, "%5d\n", id);
9027        seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
9028        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
9029        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
9030        seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
9031        seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
9032        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
9033        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
9034        seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
9035        seq_puts(m, "\n\tGroups:\t");
9036        gi = cred->group_info;
9037        for (g = 0; g < gi->ngroups; g++) {
9038                seq_put_decimal_ull(m, g ? " " : "",
9039                                        from_kgid_munged(uns, gi->gid[g]));
9040        }
9041        seq_puts(m, "\n\tCapEff:\t");
9042        cap = cred->cap_effective;
9043        CAP_FOR_EACH_U32(__capi)
9044                seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
9045        seq_putc(m, '\n');
9046        return 0;
9047}
9048
9049static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
9050{
9051        struct io_sq_data *sq = NULL;
9052        bool has_lock;
9053        int i;
9054
9055        /*
9056         * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
9057         * since fdinfo case grabs it in the opposite direction of normal use
9058         * cases. If we fail to get the lock, we just don't iterate any
9059         * structures that could be going away outside the io_uring mutex.
9060         */
9061        has_lock = mutex_trylock(&ctx->uring_lock);
9062
9063        if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL))
9064                sq = ctx->sq_data;
9065
9066        seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
9067        seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
9068        seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
9069        for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
9070                struct fixed_file_table *table;
9071                struct file *f;
9072
9073                table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
9074                f = table->files[i & IORING_FILE_TABLE_MASK];
9075                if (f)
9076                        seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
9077                else
9078                        seq_printf(m, "%5u: <none>\n", i);
9079        }
9080        seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
9081        for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
9082                struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
9083
9084                seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
9085                                                (unsigned int) buf->len);
9086        }
9087        if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
9088                seq_printf(m, "Personalities:\n");
9089                idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
9090        }
9091        seq_printf(m, "PollList:\n");
9092        spin_lock_irq(&ctx->completion_lock);
9093        for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
9094                struct hlist_head *list = &ctx->cancel_hash[i];
9095                struct io_kiocb *req;
9096
9097                hlist_for_each_entry(req, list, hash_node)
9098                        seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
9099                                        req->task->task_works != NULL);
9100        }
9101        spin_unlock_irq(&ctx->completion_lock);
9102        if (has_lock)
9103                mutex_unlock(&ctx->uring_lock);
9104}
9105
9106static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
9107{
9108        struct io_ring_ctx *ctx = f->private_data;
9109
9110        if (percpu_ref_tryget(&ctx->refs)) {
9111                __io_uring_show_fdinfo(ctx, m);
9112                percpu_ref_put(&ctx->refs);
9113        }
9114}
9115#endif
9116
9117static const struct file_operations io_uring_fops = {
9118        .release        = io_uring_release,
9119        .flush          = io_uring_flush,
9120        .mmap           = io_uring_mmap,
9121#ifndef CONFIG_MMU
9122        .get_unmapped_area = io_uring_nommu_get_unmapped_area,
9123        .mmap_capabilities = io_uring_nommu_mmap_capabilities,
9124#endif
9125        .poll           = io_uring_poll,
9126        .fasync         = io_uring_fasync,
9127#ifdef CONFIG_PROC_FS
9128        .show_fdinfo    = io_uring_show_fdinfo,
9129#endif
9130};
9131
9132static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
9133                                  struct io_uring_params *p)
9134{
9135        struct io_rings *rings;
9136        size_t size, sq_array_offset;
9137
9138        /* make sure these are sane, as we already accounted them */
9139        ctx->sq_entries = p->sq_entries;
9140        ctx->cq_entries = p->cq_entries;
9141
9142        size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
9143        if (size == SIZE_MAX)
9144                return -EOVERFLOW;
9145
9146        rings = io_mem_alloc(size);
9147        if (!rings)
9148                return -ENOMEM;
9149
9150        ctx->rings = rings;
9151        ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
9152        rings->sq_ring_mask = p->sq_entries - 1;
9153        rings->cq_ring_mask = p->cq_entries - 1;
9154        rings->sq_ring_entries = p->sq_entries;
9155        rings->cq_ring_entries = p->cq_entries;
9156        ctx->sq_mask = rings->sq_ring_mask;
9157        ctx->cq_mask = rings->cq_ring_mask;
9158
9159        size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
9160        if (size == SIZE_MAX) {
9161                io_mem_free(ctx->rings);
9162                ctx->rings = NULL;
9163                return -EOVERFLOW;
9164        }
9165
9166        ctx->sq_sqes = io_mem_alloc(size);
9167        if (!ctx->sq_sqes) {
9168                io_mem_free(ctx->rings);
9169                ctx->rings = NULL;
9170                return -ENOMEM;
9171        }
9172
9173        return 0;
9174}
9175
9176/*
9177 * Allocate an anonymous fd, this is what constitutes the application
9178 * visible backing of an io_uring instance. The application mmaps this
9179 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
9180 * we have to tie this fd to a socket for file garbage collection purposes.
9181 */
9182static int io_uring_get_fd(struct io_ring_ctx *ctx)
9183{
9184        struct file *file;
9185        int ret;
9186        int fd;
9187
9188#if defined(CONFIG_UNIX)
9189        ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
9190                                &ctx->ring_sock);
9191        if (ret)
9192                return ret;
9193#endif
9194
9195        ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
9196        if (ret < 0)
9197                goto err;
9198        fd = ret;
9199
9200        file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
9201                                        O_RDWR | O_CLOEXEC);
9202        if (IS_ERR(file)) {
9203                put_unused_fd(fd);
9204                ret = PTR_ERR(file);
9205                goto err;
9206        }
9207
9208#if defined(CONFIG_UNIX)
9209        ctx->ring_sock->file = file;
9210#endif
9211        ret = io_uring_add_task_file(ctx, file);
9212        if (ret) {
9213                fput(file);
9214                put_unused_fd(fd);
9215                goto err;
9216        }
9217        fd_install(fd, file);
9218        return fd;
9219err:
9220#if defined(CONFIG_UNIX)
9221        sock_release(ctx->ring_sock);
9222        ctx->ring_sock = NULL;
9223#endif
9224        return ret;
9225}
9226
9227static int io_uring_create(unsigned entries, struct io_uring_params *p,
9228                           struct io_uring_params __user *params)
9229{
9230        struct user_struct *user = NULL;
9231        struct io_ring_ctx *ctx;
9232        bool limit_mem;
9233        int ret;
9234
9235        if (!entries)
9236                return -EINVAL;
9237        if (entries > IORING_MAX_ENTRIES) {
9238                if (!(p->flags & IORING_SETUP_CLAMP))
9239                        return -EINVAL;
9240                entries = IORING_MAX_ENTRIES;
9241        }
9242
9243        /*
9244         * Use twice as many entries for the CQ ring. It's possible for the
9245         * application to drive a higher depth than the size of the SQ ring,
9246         * since the sqes are only used at submission time. This allows for
9247         * some flexibility in overcommitting a bit. If the application has
9248         * set IORING_SETUP_CQSIZE, it will have passed in the desired number
9249         * of CQ ring entries manually.
9250         */
9251        p->sq_entries = roundup_pow_of_two(entries);
9252        if (p->flags & IORING_SETUP_CQSIZE) {
9253                /*
9254                 * If IORING_SETUP_CQSIZE is set, we do the same roundup
9255                 * to a power-of-two, if it isn't already. We do NOT impose
9256                 * any cq vs sq ring sizing.
9257                 */
9258                if (!p->cq_entries)
9259                        return -EINVAL;
9260                if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
9261                        if (!(p->flags & IORING_SETUP_CLAMP))
9262                                return -EINVAL;
9263                        p->cq_entries = IORING_MAX_CQ_ENTRIES;
9264                }
9265                p->cq_entries = roundup_pow_of_two(p->cq_entries);
9266                if (p->cq_entries < p->sq_entries)
9267                        return -EINVAL;
9268        } else {
9269                p->cq_entries = 2 * p->sq_entries;
9270        }
9271
9272        user = get_uid(current_user());
9273        limit_mem = !capable(CAP_IPC_LOCK);
9274
9275        if (limit_mem) {
9276                ret = __io_account_mem(user,
9277                                ring_pages(p->sq_entries, p->cq_entries));
9278                if (ret) {
9279                        free_uid(user);
9280                        return ret;
9281                }
9282        }
9283
9284        ctx = io_ring_ctx_alloc(p);
9285        if (!ctx) {
9286                if (limit_mem)
9287                        __io_unaccount_mem(user, ring_pages(p->sq_entries,
9288                                                                p->cq_entries));
9289                free_uid(user);
9290                return -ENOMEM;
9291        }
9292        ctx->compat = in_compat_syscall();
9293        ctx->user = user;
9294        ctx->creds = get_current_cred();
9295#ifdef CONFIG_AUDIT
9296        ctx->loginuid = current->loginuid;
9297        ctx->sessionid = current->sessionid;
9298#endif
9299        ctx->sqo_task = get_task_struct(current);
9300
9301        /*
9302         * This is just grabbed for accounting purposes. When a process exits,
9303         * the mm is exited and dropped before the files, hence we need to hang
9304         * on to this mm purely for the purposes of being able to unaccount
9305         * memory (locked/pinned vm). It's not used for anything else.
9306         */
9307        mmgrab(current->mm);
9308        ctx->mm_account = current->mm;
9309
9310#ifdef CONFIG_BLK_CGROUP
9311        /*
9312         * The sq thread will belong to the original cgroup it was inited in.
9313         * If the cgroup goes offline (e.g. disabling the io controller), then
9314         * issued bios will be associated with the closest cgroup later in the
9315         * block layer.
9316         */
9317        rcu_read_lock();
9318        ctx->sqo_blkcg_css = blkcg_css();
9319        ret = css_tryget_online(ctx->sqo_blkcg_css);
9320        rcu_read_unlock();
9321        if (!ret) {
9322                /* don't init against a dying cgroup, have the user try again */
9323                ctx->sqo_blkcg_css = NULL;
9324                ret = -ENODEV;
9325                goto err;
9326        }
9327#endif
9328
9329        /*
9330         * Account memory _before_ installing the file descriptor. Once
9331         * the descriptor is installed, it can get closed at any time. Also
9332         * do this before hitting the general error path, as ring freeing
9333         * will un-account as well.
9334         */
9335        io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
9336                       ACCT_LOCKED);
9337        ctx->limit_mem = limit_mem;
9338
9339        ret = io_allocate_scq_urings(ctx, p);
9340        if (ret)
9341                goto err;
9342
9343        ret = io_sq_offload_create(ctx, p);
9344        if (ret)
9345                goto err;
9346
9347        if (!(p->flags & IORING_SETUP_R_DISABLED))
9348                io_sq_offload_start(ctx);
9349
9350        memset(&p->sq_off, 0, sizeof(p->sq_off));
9351        p->sq_off.head = offsetof(struct io_rings, sq.head);
9352        p->sq_off.tail = offsetof(struct io_rings, sq.tail);
9353        p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
9354        p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
9355        p->sq_off.flags = offsetof(struct io_rings, sq_flags);
9356        p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
9357        p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
9358
9359        memset(&p->cq_off, 0, sizeof(p->cq_off));
9360        p->cq_off.head = offsetof(struct io_rings, cq.head);
9361        p->cq_off.tail = offsetof(struct io_rings, cq.tail);
9362        p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
9363        p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
9364        p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
9365        p->cq_off.cqes = offsetof(struct io_rings, cqes);
9366        p->cq_off.flags = offsetof(struct io_rings, cq_flags);
9367
9368        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
9369                        IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
9370                        IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
9371                        IORING_FEAT_POLL_32BITS;
9372
9373        if (copy_to_user(params, p, sizeof(*p))) {
9374                ret = -EFAULT;
9375                goto err;
9376        }
9377
9378        /*
9379         * Install ring fd as the very last thing, so we don't risk someone
9380         * having closed it before we finish setup
9381         */
9382        ret = io_uring_get_fd(ctx);
9383        if (ret < 0)
9384                goto err;
9385
9386        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
9387        return ret;
9388err:
9389        io_ring_ctx_wait_and_kill(ctx);
9390        return ret;
9391}
9392
9393/*
9394 * Sets up an aio uring context, and returns the fd. Applications asks for a
9395 * ring size, we return the actual sq/cq ring sizes (among other things) in the
9396 * params structure passed in.
9397 */
9398static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
9399{
9400        struct io_uring_params p;
9401        int i;
9402
9403        if (copy_from_user(&p, params, sizeof(p)))
9404                return -EFAULT;
9405        for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
9406                if (p.resv[i])
9407                        return -EINVAL;
9408        }
9409
9410        if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
9411                        IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
9412                        IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
9413                        IORING_SETUP_R_DISABLED))
9414                return -EINVAL;
9415
9416        return  io_uring_create(entries, &p, params);
9417}
9418
9419SYSCALL_DEFINE2(io_uring_setup, u32, entries,
9420                struct io_uring_params __user *, params)
9421{
9422        return io_uring_setup(entries, params);
9423}
9424
9425static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
9426{
9427        struct io_uring_probe *p;
9428        size_t size;
9429        int i, ret;
9430
9431        size = struct_size(p, ops, nr_args);
9432        if (size == SIZE_MAX)
9433                return -EOVERFLOW;
9434        p = kzalloc(size, GFP_KERNEL);
9435        if (!p)
9436                return -ENOMEM;
9437
9438        ret = -EFAULT;
9439        if (copy_from_user(p, arg, size))
9440                goto out;
9441        ret = -EINVAL;
9442        if (memchr_inv(p, 0, size))
9443                goto out;
9444
9445        p->last_op = IORING_OP_LAST - 1;
9446        if (nr_args > IORING_OP_LAST)
9447                nr_args = IORING_OP_LAST;
9448
9449        for (i = 0; i < nr_args; i++) {
9450                p->ops[i].op = i;
9451                if (!io_op_defs[i].not_supported)
9452                        p->ops[i].flags = IO_URING_OP_SUPPORTED;
9453        }
9454        p->ops_len = i;
9455
9456        ret = 0;
9457        if (copy_to_user(arg, p, size))
9458                ret = -EFAULT;
9459out:
9460        kfree(p);
9461        return ret;
9462}
9463
9464static int io_register_personality(struct io_ring_ctx *ctx)
9465{
9466        struct io_identity *id;
9467        int ret;
9468
9469        id = kmalloc(sizeof(*id), GFP_KERNEL);
9470        if (unlikely(!id))
9471                return -ENOMEM;
9472
9473        io_init_identity(id);
9474        id->creds = get_current_cred();
9475
9476        ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL);
9477        if (ret < 0) {
9478                put_cred(id->creds);
9479                kfree(id);
9480        }
9481        return ret;
9482}
9483
9484static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
9485{
9486        struct io_identity *iod;
9487
9488        iod = idr_remove(&ctx->personality_idr, id);
9489        if (iod) {
9490                put_cred(iod->creds);
9491                if (refcount_dec_and_test(&iod->count))
9492                        kfree(iod);
9493                return 0;
9494        }
9495
9496        return -EINVAL;
9497}
9498
9499static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
9500                                    unsigned int nr_args)
9501{
9502        struct io_uring_restriction *res;
9503        size_t size;
9504        int i, ret;
9505
9506        /* Restrictions allowed only if rings started disabled */
9507        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9508                return -EBADFD;
9509
9510        /* We allow only a single restrictions registration */
9511        if (ctx->restrictions.registered)
9512                return -EBUSY;
9513
9514        if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
9515                return -EINVAL;
9516
9517        size = array_size(nr_args, sizeof(*res));
9518        if (size == SIZE_MAX)
9519                return -EOVERFLOW;
9520
9521        res = memdup_user(arg, size);
9522        if (IS_ERR(res))
9523                return PTR_ERR(res);
9524
9525        ret = 0;
9526
9527        for (i = 0; i < nr_args; i++) {
9528                switch (res[i].opcode) {
9529                case IORING_RESTRICTION_REGISTER_OP:
9530                        if (res[i].register_op >= IORING_REGISTER_LAST) {
9531                                ret = -EINVAL;
9532                                goto out;
9533                        }
9534
9535                        __set_bit(res[i].register_op,
9536                                  ctx->restrictions.register_op);
9537                        break;
9538                case IORING_RESTRICTION_SQE_OP:
9539                        if (res[i].sqe_op >= IORING_OP_LAST) {
9540                                ret = -EINVAL;
9541                                goto out;
9542                        }
9543
9544                        __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
9545                        break;
9546                case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
9547                        ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
9548                        break;
9549                case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
9550                        ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
9551                        break;
9552                default:
9553                        ret = -EINVAL;
9554                        goto out;
9555                }
9556        }
9557
9558out:
9559        /* Reset all restrictions if an error happened */
9560        if (ret != 0)
9561                memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
9562        else
9563                ctx->restrictions.registered = true;
9564
9565        kfree(res);
9566        return ret;
9567}
9568
9569static int io_register_enable_rings(struct io_ring_ctx *ctx)
9570{
9571        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
9572                return -EBADFD;
9573
9574        if (ctx->restrictions.registered)
9575                ctx->restricted = 1;
9576
9577        ctx->flags &= ~IORING_SETUP_R_DISABLED;
9578
9579        io_sq_offload_start(ctx);
9580
9581        return 0;
9582}
9583
9584static bool io_register_op_must_quiesce(int op)
9585{
9586        switch (op) {
9587        case IORING_UNREGISTER_FILES:
9588        case IORING_REGISTER_FILES_UPDATE:
9589        case IORING_REGISTER_PROBE:
9590        case IORING_REGISTER_PERSONALITY:
9591        case IORING_UNREGISTER_PERSONALITY:
9592                return false;
9593        default:
9594                return true;
9595        }
9596}
9597
9598static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
9599                               void __user *arg, unsigned nr_args)
9600        __releases(ctx->uring_lock)
9601        __acquires(ctx->uring_lock)
9602{
9603        int ret;
9604
9605        /*
9606         * We're inside the ring mutex, if the ref is already dying, then
9607         * someone else killed the ctx or is already going through
9608         * io_uring_register().
9609         */
9610        if (percpu_ref_is_dying(&ctx->refs))
9611                return -ENXIO;
9612
9613        if (io_register_op_must_quiesce(opcode)) {
9614                percpu_ref_kill(&ctx->refs);
9615
9616                /*
9617                 * Drop uring mutex before waiting for references to exit. If
9618                 * another thread is currently inside io_uring_enter() it might
9619                 * need to grab the uring_lock to make progress. If we hold it
9620                 * here across the drain wait, then we can deadlock. It's safe
9621                 * to drop the mutex here, since no new references will come in
9622                 * after we've killed the percpu ref.
9623                 */
9624                mutex_unlock(&ctx->uring_lock);
9625                do {
9626                        ret = wait_for_completion_interruptible(&ctx->ref_comp);
9627                        if (!ret)
9628                                break;
9629                        ret = io_run_task_work_sig();
9630                        if (ret < 0)
9631                                break;
9632                } while (1);
9633
9634                mutex_lock(&ctx->uring_lock);
9635
9636                if (ret) {
9637                        percpu_ref_resurrect(&ctx->refs);
9638                        goto out_quiesce;
9639                }
9640        }
9641
9642        if (ctx->restricted) {
9643                if (opcode >= IORING_REGISTER_LAST) {
9644                        ret = -EINVAL;
9645                        goto out;
9646                }
9647
9648                if (!test_bit(opcode, ctx->restrictions.register_op)) {
9649                        ret = -EACCES;
9650                        goto out;
9651                }
9652        }
9653
9654        switch (opcode) {
9655        case IORING_REGISTER_BUFFERS:
9656                ret = io_sqe_buffer_register(ctx, arg, nr_args);
9657                break;
9658        case IORING_UNREGISTER_BUFFERS:
9659                ret = -EINVAL;
9660                if (arg || nr_args)
9661                        break;
9662                ret = io_sqe_buffer_unregister(ctx);
9663                break;
9664        case IORING_REGISTER_FILES:
9665                ret = io_sqe_files_register(ctx, arg, nr_args);
9666                break;
9667        case IORING_UNREGISTER_FILES:
9668                ret = -EINVAL;
9669                if (arg || nr_args)
9670                        break;
9671                ret = io_sqe_files_unregister(ctx);
9672                break;
9673        case IORING_REGISTER_FILES_UPDATE:
9674                ret = io_sqe_files_update(ctx, arg, nr_args);
9675                break;
9676        case IORING_REGISTER_EVENTFD:
9677        case IORING_REGISTER_EVENTFD_ASYNC:
9678                ret = -EINVAL;
9679                if (nr_args != 1)
9680                        break;
9681                ret = io_eventfd_register(ctx, arg);
9682                if (ret)
9683                        break;
9684                if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
9685                        ctx->eventfd_async = 1;
9686                else
9687                        ctx->eventfd_async = 0;
9688                break;
9689        case IORING_UNREGISTER_EVENTFD:
9690                ret = -EINVAL;
9691                if (arg || nr_args)
9692                        break;
9693                ret = io_eventfd_unregister(ctx);
9694                break;
9695        case IORING_REGISTER_PROBE:
9696                ret = -EINVAL;
9697                if (!arg || nr_args > 256)
9698                        break;
9699                ret = io_probe(ctx, arg, nr_args);
9700                break;
9701        case IORING_REGISTER_PERSONALITY:
9702                ret = -EINVAL;
9703                if (arg || nr_args)
9704                        break;
9705                ret = io_register_personality(ctx);
9706                break;
9707        case IORING_UNREGISTER_PERSONALITY:
9708                ret = -EINVAL;
9709                if (arg)
9710                        break;
9711                ret = io_unregister_personality(ctx, nr_args);
9712                break;
9713        case IORING_REGISTER_ENABLE_RINGS:
9714                ret = -EINVAL;
9715                if (arg || nr_args)
9716                        break;
9717                ret = io_register_enable_rings(ctx);
9718                break;
9719        case IORING_REGISTER_RESTRICTIONS:
9720                ret = io_register_restrictions(ctx, arg, nr_args);
9721                break;
9722        default:
9723                ret = -EINVAL;
9724                break;
9725        }
9726
9727out:
9728        if (io_register_op_must_quiesce(opcode)) {
9729                /* bring the ctx back to life */
9730                percpu_ref_reinit(&ctx->refs);
9731out_quiesce:
9732                reinit_completion(&ctx->ref_comp);
9733        }
9734        return ret;
9735}
9736
9737SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
9738                void __user *, arg, unsigned int, nr_args)
9739{
9740        struct io_ring_ctx *ctx;
9741        long ret = -EBADF;
9742        struct fd f;
9743
9744        f = fdget(fd);
9745        if (!f.file)
9746                return -EBADF;
9747
9748        ret = -EOPNOTSUPP;
9749        if (f.file->f_op != &io_uring_fops)
9750                goto out_fput;
9751
9752        ctx = f.file->private_data;
9753
9754        mutex_lock(&ctx->uring_lock);
9755        ret = __io_uring_register(ctx, opcode, arg, nr_args);
9756        mutex_unlock(&ctx->uring_lock);
9757        trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
9758                                                        ctx->cq_ev_fd != NULL, ret);
9759out_fput:
9760        fdput(f);
9761        return ret;
9762}
9763
9764static int __init io_uring_init(void)
9765{
9766#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
9767        BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
9768        BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
9769} while (0)
9770
9771#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
9772        __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
9773        BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
9774        BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
9775        BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
9776        BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
9777        BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
9778        BUILD_BUG_SQE_ELEM(8,  __u64,  off);
9779        BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
9780        BUILD_BUG_SQE_ELEM(16, __u64,  addr);
9781        BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
9782        BUILD_BUG_SQE_ELEM(24, __u32,  len);
9783        BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
9784        BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
9785        BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
9786        BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
9787        BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
9788        BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
9789        BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
9790        BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
9791        BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
9792        BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
9793        BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
9794        BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
9795        BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
9796        BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
9797        BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
9798        BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
9799        BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
9800        BUILD_BUG_SQE_ELEM(42, __u16,  personality);
9801        BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
9802
9803        BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
9804        BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
9805        req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
9806        return 0;
9807};
9808__initcall(io_uring_init);
9809