linux/io_uring/io_uring.c
Breno Leitao f30bd4d038 io_uring: Split io_issue_def struct
This patch removes some "cold" fields from `struct io_issue_def`.

The plan is to keep only highly used fields into `struct io_issue_def`, so,
it may be hot in the cache. The hot fields are basically all the bitfields
and the callback functions for .issue and .prep.

The other less frequently used fields are now located in a secondary and
cold struct, called `io_cold_def`.

This is the size for the structs:

Before: io_issue_def = 56 bytes
After: io_issue_def = 24 bytes; io_cold_def = 40 bytes

Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/20230112144411.2624698-2-leitao@debian.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-01-29 15:17:41 -07:00

4391 lines
113 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Shared application/kernel submission and completion ring pairs, for
* supporting fast/efficient IO.
*
* A note on the read/write ordering memory barriers that are matched between
* the application and kernel side.
*
* After the application reads the CQ ring tail, it must use an
* appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
* before writing the tail (using smp_load_acquire to read the tail will
* do). It also needs a smp_mb() before updating CQ head (ordering the
* entry load(s) with the head store), pairing with an implicit barrier
* through a control-dependency in io_get_cqe (smp_store_release to
* store head will do). Failure to do so could lead to reading invalid
* CQ entries.
*
* Likewise, the application must use an appropriate smp_wmb() before
* writing the SQ tail (ordering SQ entry stores with the tail store),
* which pairs with smp_load_acquire in io_get_sqring (smp_store_release
* to store the tail will do). And it needs a barrier ordering the SQ
* head load before writing new SQ entries (smp_load_acquire to read
* head will do).
*
* When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
* needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
* updating the SQ tail; a full memory barrier smp_mb() is needed
* between.
*
* Also see the examples in the liburing library:
*
* git://git.kernel.dk/liburing
*
* io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
* from data shared between the kernel and application. This is done both
* for ordering purposes, but also to ensure that once a value is loaded from
* data that the application could potentially modify, it remains stable.
*
* Copyright (C) 2018-2019 Jens Axboe
* Copyright (c) 2018-2019 Christoph Hellwig
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io-wq.h"
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
#include "notif.h"
#include "timeout.h"
#include "poll.h"
#include "alloc_cache.h"
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)
#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
#define IO_COMPL_BATCH 32
#define IO_REQ_ALLOC_BATCH 8
enum {
IO_CHECK_CQ_OVERFLOW_BIT,
IO_CHECK_CQ_DROPPED_BIT,
};
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
IO_EVENTFD_OP_FREE_BIT,
};
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
u32 seq;
};
/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
static void io_dismantle_req(struct io_kiocb *req);
static void io_clean_op(struct io_kiocb *req);
static void io_queue_sqe(struct io_kiocb *req);
static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static __cold void io_fallback_tw(struct io_uring_task *tctx);
static struct kmem_cache *req_cachep;
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
if (io_is_uring_fops(file)) {
struct io_ring_ctx *ctx = file->private_data;
return ctx->ring_sock->sk;
}
#endif
return NULL;
}
EXPORT_SYMBOL(io_uring_get_socket);
static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
ctx->submit_state.cqes_count)
__io_submit_flush_completions(ctx);
}
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}
static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{
return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
}
static bool io_match_linked(struct io_kiocb *head)
{
struct io_kiocb *req;
io_for_each_link(req, head) {
if (req->flags & REQ_F_INFLIGHT)
return true;
}
return false;
}
/*
* As io_match_task() but protected against racing with linked timeouts.
* User must not hold timeout_lock.
*/
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
{
bool matched;
if (task && head->task != task)
return false;
if (cancel_all)
return true;
if (head->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = head->ctx;
/* protect against races with linked timeouts */
spin_lock_irq(&ctx->timeout_lock);
matched = io_match_linked(head);
spin_unlock_irq(&ctx->timeout_lock);
} else {
matched = io_match_linked(head);
}
return matched;
}
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
req_set_fail(req);
io_req_set_res(req, res, 0);
}
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
complete(&ctx->ref_comp);
}
static __cold void io_fallback_req_func(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
fallback_work.work);
struct llist_node *node = llist_del_all(&ctx->fallback_llist);
struct io_kiocb *req, *tmp;
bool locked = true;
mutex_lock(&ctx->uring_lock);
llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
req->io_task_work.func(req, &locked);
if (WARN_ON_ONCE(!locked))
return;
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
}
static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{
unsigned hash_buckets = 1U << bits;
size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
table->hbs = kmalloc(hash_size, GFP_KERNEL);
if (!table->hbs)
return -ENOMEM;
table->hash_bits = bits;
init_hash_table(table, hash_buckets);
return 0;
}
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
xa_init(&ctx->io_bl_xa);
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread, but
* don't keep too many buckets to not overconsume memory.
*/
hash_bits = ilog2(p->cq_entries) - 5;
hash_bits = clamp(hash_bits, 1, 8);
if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
goto err;
if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
goto err;
ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
if (!ctx->dummy_ubuf)
goto err;
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
0, GFP_KERNEL))
goto err;
ctx->flags = p->flags;
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
io_alloc_cache_init(&ctx->apoll_cache);
io_alloc_cache_init(&ctx->netmsg_cache);
init_completion(&ctx->ref_comp);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
init_waitqueue_head(&ctx->poll_wq);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->io_buffers_pages);
INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
spin_lock_init(&ctx->rsrc_ref_lock);
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw);
init_llist_head(&ctx->rsrc_put_llist);
init_llist_head(&ctx->work_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
return ctx;
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
}
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
ctx->cq_extra--;
}
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
struct io_ring_ctx *ctx = req->ctx;
return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
}
return false;
}
static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
atomic_inc(&req->task->io_uring->inflight_tracked);
}
}
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!req->link))
return NULL;
req->flags &= ~REQ_F_ARM_LTIMEOUT;
req->flags |= REQ_F_LINK_TIMEOUT;
/* linked timeouts should have two refs once prep'ed */
io_req_set_refcount(req);
__io_req_set_refcount(req->link, 2);
return req->link;
}
static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
return NULL;
return __io_prep_linked_timeout(req);
}
static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
io_queue_linked_timeout(__io_prep_linked_timeout(req));
}
static inline void io_arm_ltimeout(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
__io_arm_ltimeout(req);
}
static void io_prep_async_work(struct io_kiocb *req)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CREDS)) {
req->flags |= REQ_F_CREDS;
req->creds = get_current_cred();
}
req->work.list.next = NULL;
req->work.flags = 0;
req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
if (req->file && !io_req_ffs_set(req))
req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT;
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
io_wq_hash_work(&req->work, file_inode(req->file));
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
}
}
static void io_prep_async_link(struct io_kiocb *req)
{
struct io_kiocb *cur;
if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock_irq(&ctx->timeout_lock);
io_for_each_link(cur, req)
io_prep_async_work(cur);
spin_unlock_irq(&ctx->timeout_lock);
} else {
io_for_each_link(cur, req)
io_prep_async_work(cur);
}
}
void io_queue_iowq(struct io_kiocb *req, bool *dont_use)
{
struct io_kiocb *link = io_prep_linked_timeout(req);
struct io_uring_task *tctx = req->task->io_uring;
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
/* init ->work of the whole link before punting */
io_prep_async_link(req);
/*
* Not expected to happen, but if we do have a bug where this _can_
* happen, catch it here and ensure the request is marked as
* canceled. That will make io-wq go through the usual work cancel
* procedure rather than attempt to run this request (or create a new
* worker for it).
*/
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
}
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
if (req_need_defer(de->req, de->seq))
break;
list_del_init(&de->list);
io_req_task_queue(de->req);
kfree(de);
}
}
static void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
* it regardless.
*/
if (atomic_dec_and_test(&ev_fd->refs)) {
eventfd_ctx_put(ev_fd->cq_ev_fd);
kfree(ev_fd);
}
}
static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd = NULL;
rcu_read_lock();
/*
* rcu_dereference ctx->io_ev_fd once and use it for both for checking
* and eventfd_signal
*/
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists incase an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
if (unlikely(!ev_fd))
goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
goto out;
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out;
if (likely(eventfd_signal_allowed())) {
eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
else
atomic_dec(&ev_fd->refs);
}
out:
rcu_read_unlock();
}
static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
bool skip;
spin_lock(&ctx->completion_lock);
/*
* Eventfd should only get triggered when at least one event has been
* posted. Some applications rely on the eventfd notification count
* only changing IFF a new CQE has been added to the CQ ring. There's
* no depedency on 1:1 relationship between how many times this
* function is called (and hence the eventfd count) and number of CQEs
* posted to the CQ ring.
*/
skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
if (skip)
return;
io_eventfd_signal(ctx);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->poll_activated)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
if (ctx->drain_active) {
spin_lock(&ctx->completion_lock);
io_queue_deferred(ctx);
spin_unlock(&ctx->completion_lock);
}
if (ctx->has_evfd)
io_eventfd_flush_signal(ctx);
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
__acquires(ctx->completion_lock)
{
if (!ctx->task_complete)
spin_lock(&ctx->completion_lock);
}
static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
{
if (!ctx->task_complete)
spin_unlock(&ctx->completion_lock);
}
static inline void io_cq_lock(struct io_ring_ctx *ctx)
__acquires(ctx->completion_lock)
{
spin_lock(&ctx->completion_lock);
}
static inline void io_cq_unlock(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{
spin_unlock(&ctx->completion_lock);
}
/* keep it inlined for io_submit_flush_completions() */
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{
io_commit_cqring(ctx);
__io_cq_unlock(ctx);
io_commit_cqring_flush(ctx);
io_cqring_wake(ctx);
}
static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{
io_commit_cqring(ctx);
__io_cq_unlock(ctx);
io_commit_cqring_flush(ctx);
/*
* As ->task_complete implies that the ring is single tasked, cq_wait
* may only be waited on by the current in io_cqring_wait(), but since
* it will re-check the wakeup conditions once we return we can safely
* skip waking it up.
*/
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) {
smp_mb();
__io_cqring_wake(ctx);
}
}
void io_cq_unlock_post(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_commit_cqring_flush(ctx);
io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
struct io_overflow_cqe *ocqe;
LIST_HEAD(list);
io_cq_lock(ctx);
list_splice_init(&ctx->cq_overflow_list, &list);
clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
io_cq_unlock(ctx);
while (!list_empty(&list)) {
ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
list_del(&ocqe->list);
kfree(ocqe);
}
}
/* Returns true if there are no backlogged entries after the flush */
static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
size_t cqe_size = sizeof(struct io_uring_cqe);
if (__io_cqring_events(ctx) == ctx->cq_entries)
return;
if (ctx->flags & IORING_SETUP_CQE32)
cqe_size <<= 1;
io_cq_lock(ctx);
while (!list_empty(&ctx->cq_overflow_list)) {
struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
struct io_overflow_cqe *ocqe;
if (!cqe)
break;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
memcpy(cqe, &ocqe->cqe, cqe_size);
list_del(&ocqe->list);
kfree(ocqe);
}
if (list_empty(&ctx->cq_overflow_list)) {
clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
io_cq_unlock_post(ctx);
}
static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
__io_cqring_overflow_flush(ctx);
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&ctx->uring_lock);
}
static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
io_cqring_do_overflow_flush(ctx);
}
static void __io_put_task(struct task_struct *task, int nr)
{
struct io_uring_task *tctx = task->io_uring;
percpu_counter_sub(&tctx->inflight, nr);
if (unlikely(atomic_read(&tctx->in_idle)))
wake_up(&tctx->wait);
put_task_struct_many(task, nr);
}
/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
if (likely(task == current))
task->io_uring->cached_refs += nr;
else
__io_put_task(task, nr);
}
void io_task_refs_refill(struct io_uring_task *tctx)
{
unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
percpu_counter_add(&tctx->inflight, refill);
refcount_add(refill, &current->usage);
tctx->cached_refs += refill;
}
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
struct io_uring_task *tctx = task->io_uring;
unsigned int refs = tctx->cached_refs;
if (refs) {
tctx->cached_refs = 0;
percpu_counter_sub(&tctx->inflight, refs);
put_task_struct_many(task, refs);
}
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags, u64 extra1, u64 extra2)
{
struct io_overflow_cqe *ocqe;
size_t ocq_size = sizeof(struct io_overflow_cqe);
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
lockdep_assert_held(&ctx->completion_lock);
if (is_cqe32)
ocq_size += sizeof(struct io_uring_cqe);
ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* or cannot allocate an overflow entry, then we need to drop it
* on the floor.
*/
io_account_cq_overflow(ctx);
set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
ocqe->cqe.flags = cflags;
if (is_cqe32) {
ocqe->cqe.big_cqe[0] = extra1;
ocqe->cqe.big_cqe[1] = extra2;
}
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
bool io_req_cqe_overflow(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_CQE32_INIT)) {
req->extra1 = 0;
req->extra2 = 0;
}
return io_cqring_event_overflow(req->ctx, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
req->extra1, req->extra2);
}
/*
* writes to the cq entry need to come after reading head; the
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
unsigned int free, queued, len;
/*
* Posting into the CQ when there are pending overflowed CQEs may break
* ordering guarantees, which will affect links, F_MORE users and more.
* Force overflow the completion.
*/
if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
return NULL;
/* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
free = ctx->cq_entries - queued;
/* we need a contiguous range, limit based on the current array offset */
len = min(free, ctx->cq_entries - off);
if (!len)
return NULL;
if (ctx->flags & IORING_SETUP_CQE32) {
off <<= 1;
len <<= 1;
}
ctx->cqe_cached = &rings->cqes[off];
ctx->cqe_sentinel = ctx->cqe_cached + len;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return &rings->cqes[off];
}
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
struct io_uring_cqe *cqe;
ctx->cq_extra++;
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqe(ctx);
if (likely(cqe)) {
trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
if (ctx->flags & IORING_SETUP_CQE32) {
WRITE_ONCE(cqe->big_cqe[0], 0);
WRITE_ONCE(cqe->big_cqe[1], 0);
}
return true;
}
return false;
}
static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_submit_state *state = &ctx->submit_state;
unsigned int i;
lockdep_assert_held(&ctx->uring_lock);
for (i = 0; i < state->cqes_count; i++) {
struct io_uring_cqe *cqe = &state->cqes[i];
if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
if (ctx->task_complete) {
spin_lock(&ctx->completion_lock);
io_cqring_event_overflow(ctx, cqe->user_data,
cqe->res, cqe->flags, 0, 0);
spin_unlock(&ctx->completion_lock);
} else {
io_cqring_event_overflow(ctx, cqe->user_data,
cqe->res, cqe->flags, 0, 0);
}
}
}
state->cqes_count = 0;
}
static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
bool allow_overflow)
{
bool filled;
io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled && allow_overflow)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
io_cq_unlock_post(ctx);
return filled;
}
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
}
bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags,
bool allow_overflow)
{
struct io_uring_cqe *cqe;
unsigned int length;
if (!defer)
return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
length = ARRAY_SIZE(ctx->submit_state.cqes);
lockdep_assert_held(&ctx->uring_lock);
if (ctx->submit_state.cqes_count == length) {
__io_cq_lock(ctx);
__io_flush_post_cqes(ctx);
/* no need to flush - flush is deferred */
__io_cq_unlock_post(ctx);
}
/* For defered completions this is not as strict as it is otherwise,
* however it's main job is to prevent unbounded posted completions,
* and in that it works just as well.
*/
if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
return false;
cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++];
cqe->user_data = user_data;
cqe->res = res;
cqe->flags = cflags;
return true;
}
static void __io_req_complete_post(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
io_cq_lock(ctx);
if (!(req->flags & REQ_F_CQE_SKIP))
io_fill_cqe_req(ctx, req);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
*/
if (req_ref_put_and_test(req)) {
if (req->flags & IO_REQ_LINK_FLAGS) {
if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
if (req->link) {
io_req_task_queue(req->link);
req->link = NULL;
}
}
io_put_kbuf_comp(req);
io_dismantle_req(req);
io_req_put_rsrc(req);
/*
* Selected buffer deallocation in io_clean_op() assumes that
* we don't hold ->completion_lock. Clean them here to avoid
* deadlocks.
*/
io_put_task(req->task, 1);
wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
}
io_cq_unlock_post(ctx);
}
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) {
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
!(req->ctx->flags & IORING_SETUP_IOPOLL)) {
__io_req_complete_post(req);
} else {
struct io_ring_ctx *ctx = req->ctx;
mutex_lock(&ctx->uring_lock);
__io_req_complete_post(req);
mutex_unlock(&ctx->uring_lock);
}
}
void io_req_defer_failed(struct io_kiocb *req, s32 res)
__must_hold(&ctx->uring_lock)
{
const struct io_cold_def *def = &io_cold_defs[req->opcode];
lockdep_assert_held(&req->ctx->uring_lock);
req_set_fail(req);
io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
if (def->fail)
def->fail(req);
io_req_complete_defer(req);
}
/*
* Don't initialise the fields below on every allocation, but do that in
* advance and keep them valid across allocations.
*/
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
req->ctx = ctx;
req->link = NULL;
req->async_data = NULL;
/* not necessary, but safer to zero */
req->cqe.res = 0;
}
static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
struct io_submit_state *state)
{
spin_lock(&ctx->completion_lock);
wq_list_splice(&ctx->locked_free_list, &state->free_list);
ctx->locked_free_nr = 0;
spin_unlock(&ctx->completion_lock);
}
/*
* A request might get retired back into the request caches even before opcode
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
* Because of that, io_alloc_req() should be called only under ->uring_lock
* and with extra caution to not get a request that is still worked on.
*/
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
void *reqs[IO_REQ_ALLOC_BATCH];
int ret, i;
/*
* If we have more than a batch's worth of requests in our IRQ side
* locked cache, grab the lock and move them over to our submission
* side cache.
*/
if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
if (!io_req_cache_empty(ctx))
return true;
}
ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
reqs[0] = kmem_cache_alloc(req_cachep, gfp);
if (!reqs[0])
return false;
ret = 1;
}
percpu_ref_get_many(&ctx->refs, ret);
for (i = 0; i < ret; i++) {
struct io_kiocb *req = reqs[i];
io_preinit_req(req, ctx);
io_req_add_to_cache(req, ctx);
}
return true;
}
static inline void io_dismantle_req(struct io_kiocb *req)
{
unsigned int flags = req->flags;
if (unlikely(flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
if (!(flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
}
__cold void io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
io_req_put_rsrc(req);
io_dismantle_req(req);
io_put_task(req->task, 1);
spin_lock(&ctx->completion_lock);
wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
ctx->locked_free_nr++;
spin_unlock(&ctx->completion_lock);
}
static void __io_req_find_next_prep(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
io_disarm_next(req);
spin_unlock(&ctx->completion_lock);
}
static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt;
/*
* If LINK is set, we have dependent requests in this chain. If we
* didn't fail this request, queue the first one up, moving any other
* dependencies to the next request. In case of failure, fail the rest
* of the chain.
*/
if (unlikely(req->flags & IO_DISARM_MASK))
__io_req_find_next_prep(req);
nxt = req->link;
req->link = NULL;
return nxt;
}
static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (*locked) {
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
*locked = false;
}
percpu_ref_put(&ctx->refs);
}
static unsigned int handle_tw_list(struct llist_node *node,
struct io_ring_ctx **ctx, bool *locked,
struct llist_node *last)
{
unsigned int count = 0;
while (node != last) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
prefetch(container_of(next, struct io_kiocb, io_task_work.node));
if (req->ctx != *ctx) {
ctx_flush_and_put(*ctx, locked);
*ctx = req->ctx;
/* if not contended, grab and improve batching */
*locked = mutex_trylock(&(*ctx)->uring_lock);
percpu_ref_get(&(*ctx)->refs);
}
req->io_task_work.func(req, locked);
node = next;
count++;
}
return count;
}
/**
* io_llist_xchg - swap all entries in a lock-less list
* @head: the head of lock-less list to delete all entries
* @new: new entry as the head of the list
*
* If list is empty, return NULL, otherwise, return the pointer to the first entry.
* The order of entries returned is from the newest to the oldest added one.
*/
static inline struct llist_node *io_llist_xchg(struct llist_head *head,
struct llist_node *new)
{
return xchg(&head->first, new);
}
/**
* io_llist_cmpxchg - possibly swap all entries in a lock-less list
* @head: the head of lock-less list to delete all entries
* @old: expected old value of the first entry of the list
* @new: new entry as the head of the list
*
* perform a cmpxchg on the first entry of the list.
*/
static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head,
struct llist_node *old,
struct llist_node *new)
{
return cmpxchg(&head->first, old, new);
}
void tctx_task_work(struct callback_head *cb)
{
bool uring_locked = false;
struct io_ring_ctx *ctx = NULL;
struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
task_work);
struct llist_node fake = {};
struct llist_node *node;
unsigned int loops = 1;
unsigned int count;
if (unlikely(current->flags & PF_EXITING)) {
io_fallback_tw(tctx);
return;
}
node = io_llist_xchg(&tctx->task_list, &fake);
count = handle_tw_list(node, &ctx, &uring_locked, NULL);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
while (node != &fake) {
loops++;
node = io_llist_xchg(&tctx->task_list, &fake);
count += handle_tw_list(node, &ctx, &uring_locked, &fake);
node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL);
}
ctx_flush_and_put(ctx, &uring_locked);
/* relaxed read is enough as only the task itself sets ->in_idle */
if (unlikely(atomic_read(&tctx->in_idle)))
io_uring_drop_tctx_refs(current);
trace_io_uring_task_work_run(tctx, count, loops);
}
static __cold void io_fallback_tw(struct io_uring_task *tctx)
{
struct llist_node *node = llist_del_all(&tctx->task_list);
struct io_kiocb *req;
while (node) {
req = container_of(node, struct io_kiocb, io_task_work.node);
node = node->next;
if (llist_add(&req->io_task_work.node,
&req->ctx->fallback_llist))
schedule_delayed_work(&req->ctx->fallback_work, 1);
}
}
static void io_req_local_work_add(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
percpu_ref_get(&ctx->refs);
if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
percpu_ref_put(&ctx->refs);
return;
}
/* needed for the following wake up */
smp_mb__after_atomic();
if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
io_move_task_work_from_local(ctx);
percpu_ref_put(&ctx->refs);
return;
}
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
io_eventfd_signal(ctx);
if (READ_ONCE(ctx->cq_waiting))
wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
percpu_ref_put(&ctx->refs);
}
void __io_req_task_work_add(struct io_kiocb *req, bool allow_local)
{
struct io_uring_task *tctx = req->task->io_uring;
struct io_ring_ctx *ctx = req->ctx;
if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
io_req_local_work_add(req);
return;
}
/* task_work already pending, we're done */
if (!llist_add(&req->io_task_work.node, &tctx->task_list))
return;
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
io_fallback_tw(tctx);
}
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
struct llist_node *node;
node = llist_del_all(&ctx->work_llist);
while (node) {
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
node = node->next;
__io_req_task_work_add(req, false);
}
}
static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked)
{
struct llist_node *node;
unsigned int loops = 0;
int ret = 0;
if (WARN_ON_ONCE(ctx->submitter_task != current))
return -EEXIST;
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
again:
node = io_llist_xchg(&ctx->work_llist, NULL);
while (node) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
io_task_work.node);
prefetch(container_of(next, struct io_kiocb, io_task_work.node));
req->io_task_work.func(req, locked);
ret++;
node = next;
}
loops++;
if (!llist_empty(&ctx->work_llist))
goto again;
if (*locked) {
io_submit_flush_completions(ctx);
if (!llist_empty(&ctx->work_llist))
goto again;
}
trace_io_uring_local_work_run(ctx, ret, loops);
return ret;
}
static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
{
bool locked;
int ret;
if (llist_empty(&ctx->work_llist))
return 0;
locked = true;
ret = __io_run_local_work(ctx, &locked);
/* shouldn't happen! */
if (WARN_ON_ONCE(!locked))
mutex_lock(&ctx->uring_lock);
return ret;
}
static int io_run_local_work(struct io_ring_ctx *ctx)
{
bool locked = mutex_trylock(&ctx->uring_lock);
int ret;
ret = __io_run_local_work(ctx, &locked);
if (locked)
mutex_unlock(&ctx->uring_lock);
return ret;
}
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
{
io_tw_lock(req->ctx, locked);
io_req_defer_failed(req, req->cqe.res);
}
void io_req_task_submit(struct io_kiocb *req, bool *locked)
{
io_tw_lock(req->ctx, locked);
/* req->task == current here, checking PF_EXITING is safe */
if (likely(!(req->task->flags & PF_EXITING)))
io_queue_sqe(req);
else
io_req_defer_failed(req, -EFAULT);
}
void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
io_req_set_res(req, ret, 0);
req->io_task_work.func = io_req_task_cancel;
io_req_task_work_add(req);
}
void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
io_req_task_work_add(req);
}
void io_queue_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = io_req_find_next(req);
if (nxt)
io_req_task_queue(nxt);
}
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
{
struct task_struct *task = NULL;
int task_refs = 0;
do {
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
continue;
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
struct async_poll *apoll = req->apoll;
if (apoll->double_poll)
kfree(apoll->double_poll);
if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
kfree(apoll);
req->flags &= ~REQ_F_POLLED;
}
if (req->flags & IO_REQ_LINK_FLAGS)
io_queue_next(req);
if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
io_clean_op(req);
}
if (!(req->flags & REQ_F_FIXED_FILE))
io_put_file(req->file);
io_req_put_rsrc_locked(req, ctx);
if (req->task != task) {
if (task)
io_put_task(task, task_refs);
task = req->task;
task_refs = 0;
}
task_refs++;
node = req->comp_list.next;
io_req_add_to_cache(req, ctx);
} while (node);
if (task)
io_put_task(task, task_refs);
}
static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_wq_work_node *node, *prev;
struct io_submit_state *state = &ctx->submit_state;
__io_cq_lock(ctx);
/* must come first to preserve CQE ordering in failure cases */
if (state->cqes_count)
__io_flush_post_cqes(ctx);
wq_list_for_each(node, prev, &state->compl_reqs) {
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP) &&
unlikely(!__io_fill_cqe_req(ctx, req))) {
if (ctx->task_complete) {
spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req);
spin_unlock(&ctx->completion_lock);
} else {
io_req_cqe_overflow(req);
}
}
}
__io_cq_unlock_post_flush(ctx);
if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
}
/*
* Drop reference to request, return next in chain (if there is one) if this
* was the last reference to this request.
*/
static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
{
struct io_kiocb *nxt = NULL;
if (req_ref_put_and_test(req)) {
if (unlikely(req->flags & IO_REQ_LINK_FLAGS))
nxt = io_req_find_next(req);
io_free_req(req);
}
return nxt;
}
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{
/* See comment at the top of this file */
smp_rmb();
return __io_cqring_events(ctx);
}
/*
* We can't just wait for polled events to come to us, we have to actively
* find and complete them.
*/
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_IOPOLL))
return;
mutex_lock(&ctx->uring_lock);
while (!wq_list_empty(&ctx->iopoll_list)) {
/* let it sleep and repeat later if can't complete a request */
if (io_do_iopoll(ctx, true) == 0)
break;
/*
* Ensure we allow local-to-the-cpu processing to take place,
* in this case we need to ensure that we reap all events.
* Also let task_work, etc. to progress by releasing the mutex
*/
if (need_resched()) {
mutex_unlock(&ctx->uring_lock);
cond_resched();
mutex_lock(&ctx->uring_lock);
}
}
mutex_unlock(&ctx->uring_lock);
}
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
unsigned int nr_events = 0;
int ret = 0;
unsigned long check_cq;
if (!io_allowed_run_tw(ctx))
return -EEXIST;
check_cq = READ_ONCE(ctx->check_cq);
if (unlikely(check_cq)) {
if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
__io_cqring_overflow_flush(ctx);
/*
* Similarly do not spin if we have not informed the user of any
* dropped CQE.
*/
if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
return -EBADR;
}
/*
* Don't enter poll loop if we already have events pending.
* If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error).
*/
if (io_cqring_events(ctx))
return 0;
do {
/*
* If a submit got punted to a workqueue, we can have the
* application entering polling for a command before it gets
* issued. That app will hold the uring_lock for the duration
* of the poll right here, so we need to take a breather every
* now and then to ensure that the issue has a chance to add
* the poll to the issued list. Otherwise we can spin here
* forever, while the workqueue is stuck trying to acquire the
* very same mutex.
*/
if (wq_list_empty(&ctx->iopoll_list) ||
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
(void) io_run_local_work_locked(ctx);
if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
mutex_unlock(&ctx->uring_lock);
io_run_task_work();
mutex_lock(&ctx->uring_lock);
}
/* some requests don't go through iopoll_list */
if (tail != ctx->cached_cq_tail ||
wq_list_empty(&ctx->iopoll_list))
break;
}
ret = io_do_iopoll(ctx, !min);
if (ret < 0)
break;
nr_events += ret;
ret = 0;
} while (nr_events < min && !need_resched());
return ret;
}
void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
if (*locked)
io_req_complete_defer(req);
else
io_req_complete_post(req, IO_URING_F_UNLOCKED);
}
/*
* After the iocb has been issued, it's safe to be found on the poll list.
* Adding the kiocb to the list AFTER submission ensures that we don't
* find it from a io_do_iopoll() thread before the issuer is done
* accessing the kiocb cookie.
*/
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
/* workqueue context doesn't hold uring_lock, grab it now */
if (unlikely(needs_lock))
mutex_lock(&ctx->uring_lock);
/*
* Track whether we have multiple files in our lists. This will impact
* how we do polling eventually, not spinning if we're on potentially
* different devices.
*/
if (wq_list_empty(&ctx->iopoll_list)) {
ctx->poll_multi_queue = false;
} else if (!ctx->poll_multi_queue) {
struct io_kiocb *list_req;
list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
comp_list);
if (list_req->file != req->file)
ctx->poll_multi_queue = true;
}
/*
* For fast devices, IO may have already completed. If it has, add
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
else
wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
if (unlikely(needs_lock)) {
/*
* If IORING_SETUP_SQPOLL is enabled, sqes are either handle
* in sq thread task context or in io worker task context. If
* current task context is sq thread, we don't need to check
* whether should wake up sq thread.
*/
if ((ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
mutex_unlock(&ctx->uring_lock);
}
}
static bool io_bdev_nowait(struct block_device *bdev)
{
return !bdev || bdev_nowait(bdev);
}
/*
* If we tracked the file through the SCM inflight mechanism, we could support
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
static bool __io_file_supports_nowait(struct file *file, umode_t mode)
{
if (S_ISBLK(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(I_BDEV(file->f_mapping->host)))
return true;
return false;
}
if (S_ISSOCK(mode))
return true;
if (S_ISREG(mode)) {
if (IS_ENABLED(CONFIG_BLOCK) &&
io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
!io_is_uring_fops(file))
return true;
return false;
}
/* any ->read/write should understand O_NONBLOCK */
if (file->f_flags & O_NONBLOCK)
return true;
return file->f_mode & FMODE_NOWAIT;
}
/*
* If we tracked the file through the SCM inflight mechanism, we could support
* any file. For now, just ensure that anything potentially problematic is done
* inline.
*/
unsigned int io_file_get_flags(struct file *file)
{
umode_t mode = file_inode(file)->i_mode;
unsigned int res = 0;
if (S_ISREG(mode))
res |= FFS_ISREG;
if (__io_file_supports_nowait(file, mode))
res |= FFS_NOWAIT;
return res;
}
bool io_alloc_async_data(struct io_kiocb *req)
{
WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
if (req->async_data) {
req->flags |= REQ_F_ASYNC_DATA;
return false;
}
return true;
}
int io_req_prep_async(struct io_kiocb *req)
{
const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
const struct io_issue_def *def = &io_issue_defs[req->opcode];
/* assign early for deferred execution for non-fixed file */
if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
req->file = io_file_get_normal(req, req->cqe.fd);
if (!cdef->prep_async)
return 0;
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (!def->manual_alloc) {
if (io_alloc_async_data(req))
return -EAGAIN;
}
return cdef->prep_async(req);
}
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
struct io_kiocb *cur;
/* need original cached_sq_head, but it was increased for each req */
io_for_each_link(cur, req)
seq--;
return seq;
}
static __cold void io_drain_req(struct io_kiocb *req)
__must_hold(&ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_defer_entry *de;
int ret;
u32 seq = io_get_sequence(req);
/* Still need defer if there is pending req in defer list. */
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
queue:
ctx->drain_active = false;
io_req_task_queue(req);
return;
}
spin_unlock(&ctx->completion_lock);
io_prep_async_link(req);
de = kmalloc(sizeof(*de), GFP_KERNEL);
if (!de) {
ret = -ENOMEM;
io_req_defer_failed(req, ret);
return;
}
spin_lock(&ctx->completion_lock);
if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
spin_unlock(&ctx->completion_lock);
kfree(de);
goto queue;
}
trace_io_uring_defer(req);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
spin_unlock(&ctx->completion_lock);
}
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED) {
spin_lock(&req->ctx->completion_lock);
io_put_kbuf_comp(req);
spin_unlock(&req->ctx->completion_lock);
}
if (req->flags & REQ_F_NEED_CLEANUP) {
const struct io_cold_def *def = &io_cold_defs[req->opcode];
if (def->cleanup)
def->cleanup(req);
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
kfree(req->apoll->double_poll);
kfree(req->apoll);
req->apoll = NULL;
}
if (req->flags & REQ_F_INFLIGHT) {
struct io_uring_task *tctx = req->task->io_uring;
atomic_dec(&tctx->inflight_tracked);
}
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
if (req->flags & REQ_F_ASYNC_DATA) {
kfree(req->async_data);
req->async_data = NULL;
}
req->flags &= ~IO_REQ_CLEAN_FLAGS;
}
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags)
{
if (req->file || !io_issue_defs[req->opcode].needs_file)
return true;
if (req->flags & REQ_F_FIXED_FILE)
req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
else
req->file = io_file_get_normal(req, req->cqe.fd);
return !!req->file;
}
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
const struct cred *creds = NULL;
int ret;
if (unlikely(!io_assign_file(req, issue_flags)))
return -EBADF;
if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);
if (!def->audit_skip)
audit_uring_entry(req->opcode);
ret = def->issue(req, issue_flags);
if (!def->audit_skip)
audit_uring_exit(!ret, ret);
if (creds)
revert_creds(creds);
if (ret == IOU_OK) {
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_defer(req);
else
io_req_complete_post(req, issue_flags);
} else if (ret != IOU_ISSUE_SKIP_COMPLETE)
return ret;
/* If the op doesn't have a file, we're not polling for it */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
io_iopoll_req_issued(req, issue_flags);
return 0;
}
int io_poll_issue(struct io_kiocb *req, bool *locked)
{
io_tw_lock(req->ctx, locked);
return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
IO_URING_F_COMPLETE_DEFER);
}
struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
req = io_put_req_find_next(req);
return req ? &req->work : NULL;
}
void io_wq_submit_work(struct io_wq_work *work)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
const struct io_issue_def *def = &io_issue_defs[req->opcode];
unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
bool needs_poll = false;
int ret = 0, err = -ECANCELED;
/* one will be dropped by ->io_wq_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
__io_req_set_refcount(req, 2);
else
req_ref_get(req);
io_arm_ltimeout(req);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) {
fail:
io_req_task_queue_fail(req, err);
return;
}
if (!io_assign_file(req, issue_flags)) {
err = -EBADF;
work->flags |= IO_WQ_WORK_CANCEL;
goto fail;
}
if (req->flags & REQ_F_FORCE_ASYNC) {
bool opcode_poll = def->pollin || def->pollout;
if (opcode_poll && file_can_poll(req->file)) {
needs_poll = true;
issue_flags |= IO_URING_F_NONBLOCK;
}
}
do {
ret = io_issue_sqe(req, issue_flags);
if (ret != -EAGAIN)
break;
/*
* We can get EAGAIN for iopolled IO even though we're
* forcing a sync submission from here, since we can't
* wait for request slots on the block side.
*/
if (!needs_poll) {
if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
break;
cond_resched();
continue;
}
if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
return;
/* aborted or ready, in either case retry blocking */
needs_poll = false;
issue_flags &= ~IO_URING_F_NONBLOCK;
} while (1);
/* avoid locking problems by failing it from a clean context */
if (ret < 0)
io_req_task_queue_fail(req, ret);
}
inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct file *file = NULL;
unsigned long file_ptr;
io_ring_submit_lock(ctx, issue_flags);
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
goto out;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file_ptr &= ~FFS_MASK;
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
io_req_set_rsrc_node(req, ctx, 0);
out:
io_ring_submit_unlock(ctx, issue_flags);
return file;
}
struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{
struct file *file = fget(fd);
trace_io_uring_file_get(req, fd);
/* we don't allow fixed io_uring files */
if (file && io_is_uring_fops(file))
io_req_track_inflight(req);
return file;
}
static void io_queue_async(struct io_kiocb *req, int ret)
__must_hold(&req->ctx->uring_lock)
{
struct io_kiocb *linked_timeout;
if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
io_req_defer_failed(req, ret);
return;
}
linked_timeout = io_prep_linked_timeout(req);
switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
io_kbuf_recycle(req, 0);
io_req_task_queue(req);
break;
case IO_APOLL_ABORTED:
io_kbuf_recycle(req, 0);
io_queue_iowq(req, NULL);
break;
case IO_APOLL_OK:
break;
}
if (linked_timeout)
io_queue_linked_timeout(linked_timeout);
}
static inline void io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
int ret;
ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
* doesn't support non-blocking read/write attempts
*/
if (likely(!ret))
io_arm_ltimeout(req);
else
io_queue_async(req, ret);
}
static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{
if (unlikely(req->flags & REQ_F_FAIL)) {
/*
* We don't submit, fail them all, for that replace hardlinks
* with normal links. Extra REQ_F_LINK is tolerated.
*/
req->flags &= ~REQ_F_HARDLINK;
req->flags |= REQ_F_LINK;
io_req_defer_failed(req, req->cqe.res);
} else {
int ret = io_req_prep_async(req);
if (unlikely(ret)) {
io_req_defer_failed(req, ret);
return;
}
if (unlikely(req->ctx->drain_active))
io_drain_req(req);
else
io_queue_iowq(req, NULL);
}
}
/*
* Check SQE restrictions (opcode and flags).
*
* Returns 'true' if SQE is allowed, 'false' otherwise.
*/
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{
if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
return false;
if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
ctx->restrictions.sqe_flags_required)
return false;
if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
ctx->restrictions.sqe_flags_required))
return false;
return true;
}
static void io_init_req_drain(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *head = ctx->submit_state.link.head;
ctx->drain_active = true;
if (head) {
/*
* If we need to drain a request in the middle of a link, drain
* the head request and the next request/link after the current
* link. Considering sequential execution of links,
* REQ_F_IO_DRAIN will be maintained for every request of our
* link.
*/
head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
ctx->drain_next = true;
}
}
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
const struct io_issue_def *def;
unsigned int sqe_flags;
int personality;
u8 opcode;
/* req is partially pre-initialised, see io_preinit_req() */
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags = sqe_flags = READ_ONCE(sqe->flags);
req->cqe.user_data = READ_ONCE(sqe->user_data);
req->file = NULL;
req->rsrc_node = NULL;
req->task = current;
if (unlikely(opcode >= IORING_OP_LAST)) {
req->opcode = 0;
return -EINVAL;
}
def = &io_issue_defs[opcode];
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
return -EINVAL;
if (sqe_flags & IOSQE_BUFFER_SELECT) {
if (!def->buffer_select)
return -EOPNOTSUPP;
req->buf_index = READ_ONCE(sqe->buf_group);
}
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
if (sqe_flags & IOSQE_IO_DRAIN) {
if (ctx->drain_disabled)
return -EOPNOTSUPP;
io_init_req_drain(req);
}
}
if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
return -EACCES;
/* knock it to the slow queue path, will be drained there */
if (ctx->drain_active)
req->flags |= REQ_F_FORCE_ASYNC;
/* if there is no link, we're at "next" request and need to drain */
if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
ctx->drain_next = false;
ctx->drain_active = true;
req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
}
}
if (!def->ioprio && sqe->ioprio)
return -EINVAL;
if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (def->needs_file) {
struct io_submit_state *state = &ctx->submit_state;
req->cqe.fd = READ_ONCE(sqe->fd);
/*
* Plug now if we have more than 2 IO left after this, and the
* target is potentially a read/write to block based storage.
*/
if (state->need_plug && def->plug) {
state->plug_started = true;
state->need_plug = false;
blk_start_plug_nr_ios(&state->plug, state->submit_nr);
}
}
personality = READ_ONCE(sqe->personality);
if (personality) {
int ret;
req->creds = xa_load(&ctx->personalities, personality);
if (!req->creds)
return -EINVAL;
get_cred(req->creds);
ret = security_uring_override_creds(req->creds);
if (ret) {
put_cred(req->creds);
return ret;
}
req->flags |= REQ_F_CREDS;
}
return def->prep(req, sqe);
}
static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
struct io_kiocb *req, int ret)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_submit_link *link = &ctx->submit_state.link;
struct io_kiocb *head = link->head;
trace_io_uring_req_failed(sqe, req, ret);
/*
* Avoid breaking links in the middle as it renders links with SQPOLL
* unusable. Instead of failing eagerly, continue assembling the link if
* applicable and mark the head with REQ_F_FAIL. The link flushing code
* should find the flag and handle the rest.
*/
req_fail_link_node(req, ret);
if (head && !(head->flags & REQ_F_FAIL))
req_fail_link_node(head, -ECANCELED);
if (!(req->flags & IO_REQ_LINK_FLAGS)) {
if (head) {
link->last->link = req;
link->head = NULL;
req = head;
}
io_queue_sqe_fallback(req);
return ret;
}
if (head)
link->last->link = req;
else
link->head = req;
link->last = req;
return 0;
}
static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{
struct io_submit_link *link = &ctx->submit_state.link;
int ret;
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret))
return io_submit_fail_init(sqe, req, ret);
/* don't need @sqe from now on */
trace_io_uring_submit_sqe(req, true);
/*
* If we already have a head request, queue this one for async
* submittal once the head completes. If we don't have a head but
* IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
* submitted sync once the chain is complete. If none of those
* conditions are true (normal request), then just queue it.
*/
if (unlikely(link->head)) {
ret = io_req_prep_async(req);
if (unlikely(ret))
return io_submit_fail_init(sqe, req, ret);
trace_io_uring_link(req, link->head);
link->last->link = req;
link->last = req;
if (req->flags & IO_REQ_LINK_FLAGS)
return 0;
/* last request of the link, flush it */
req = link->head;
link->head = NULL;
if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
goto fallback;
} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
if (req->flags & IO_REQ_LINK_FLAGS) {
link->head = req;
link->last = req;
} else {
fallback:
io_queue_sqe_fallback(req);
}
return 0;
}
io_queue_sqe(req);
return 0;
}
/*
* Batched submission is done, ensure local IO is flushed out.
*/
static void io_submit_state_end(struct io_ring_ctx *ctx)
{
struct io_submit_state *state = &ctx->submit_state;
if (unlikely(state->link.head))
io_queue_sqe_fallback(state->link.head);
/* flush only after queuing links as they can generate completions */
io_submit_flush_completions(ctx);
if (state->plug_started)
blk_finish_plug(&state->plug);
}
/*
* Start submission side cache.
*/
static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{
state->plug_started = false;
state->need_plug = max_ios > 2;
state->submit_nr = max_ios;
/* set only head, no need to init link_last in advance */
state->link.head = NULL;
}
static void io_commit_sqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
/*
* Ensure any loads from the SQEs are done at this point,
* since once we write the new head, the application could
* write new data to them.
*/
smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
/*
* Fetch an sqe, if one is available. Note this returns a pointer to memory
* that is mapped by userspace. This means that care needs to be taken to
* ensure that reads are stable, as we cannot rely on userspace always
* being a good citizen. If members of the sqe are validated and then later
* used, it's important that those reads are done through READ_ONCE() to
* prevent a re-load down the line.
*/
static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
{
unsigned head, mask = ctx->sq_entries - 1;
unsigned sq_idx = ctx->cached_sq_head++ & mask;
/*
* The cached sq head (or cq tail) serves two purposes:
*
* 1) allows us to batch the cost of updating the user visible
* head updates.
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
head = READ_ONCE(ctx->sq_array[sq_idx]);
if (likely(head < ctx->sq_entries)) {
/* double index for 128-byte SQEs, twice as long */
if (ctx->flags & IORING_SETUP_SQE128)
head <<= 1;
return &ctx->sq_sqes[head];
}
/* drop invalid entries */
ctx->cq_extra--;
WRITE_ONCE(ctx->rings->sq_dropped,
READ_ONCE(ctx->rings->sq_dropped) + 1);
return NULL;
}
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{
unsigned int entries = io_sqring_entries(ctx);
unsigned int left;
int ret;
if (unlikely(!entries))
return 0;
/* make sure SQ entry isn't read before tail */
ret = left = min3(nr, ctx->sq_entries, entries);
io_get_task_refs(left);
io_submit_state_start(&ctx->submit_state, left);
do {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
if (unlikely(!io_alloc_req_refill(ctx)))
break;
req = io_alloc_req(ctx);
sqe = io_get_sqe(ctx);
if (unlikely(!sqe)) {
io_req_add_to_cache(req, ctx);
break;
}
/*
* Continue submitting even for sqe failure if the
* ring was setup with IORING_SETUP_SUBMIT_ALL
*/
if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
left--;
break;
}
} while (--left);
if (unlikely(left)) {
ret -= left;
/* try again if it submitted nothing and can't allocate a req */
if (!ret && io_req_cache_empty(ctx))
ret = -EAGAIN;
current->io_uring->cached_refs += left;
}
io_submit_state_end(ctx);
/* Commit SQ ring head once we've consumed and submitted all SQEs */
io_commit_sqring(ctx);
return ret;
}
struct io_wait_queue {
struct wait_queue_entry wq;
struct io_ring_ctx *ctx;
unsigned cq_tail;
unsigned nr_timeouts;
ktime_t timeout;
};
static inline bool io_has_work(struct io_ring_ctx *ctx)
{
return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
!llist_empty(&ctx->work_llist);
}
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
struct io_ring_ctx *ctx = iowq->ctx;
int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
/*
* Wake up if we have enough events, or if a timeout occurred since we
* started waiting. For timeouts, we always want to return to userspace,
* regardless of event count.
*/
return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
int wake_flags, void *key)
{
struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
/*
* Cannot safely flush overflowed CQEs from here, ensure we wake up
* the task, and the next invocation will do it.
*/
if (io_should_wake(iowq) || io_has_work(iowq->ctx))
return autoremove_wake_function(curr, mode, wake_flags, key);
return -1;
}
int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
if (!llist_empty(&ctx->work_llist)) {
__set_current_state(TASK_RUNNING);
if (io_run_local_work(ctx) > 0)
return 1;
}
if (io_run_task_work() > 0)
return 1;
if (task_sigpending(current))
return -EINTR;
return 0;
}
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
if (unlikely(READ_ONCE(ctx->check_cq)))
return 1;
if (unlikely(!llist_empty(&ctx->work_llist)))
return 1;
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
return 1;
if (unlikely(task_sigpending(current)))
return -EINTR;
if (unlikely(io_should_wake(iowq)))
return 0;
if (iowq->timeout == KTIME_MAX)
schedule();
else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
return -ETIME;
return 0;
}
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
*/
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz,
struct __kernel_timespec __user *uts)
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
int ret;
if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (!llist_empty(&ctx->work_llist))
io_run_local_work(ctx);
io_run_task_work();
io_cqring_overflow_flush(ctx);
/* if user messes with these they will just get an early return */
if (__io_cqring_events_user(ctx) >= min_events)
return 0;
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
sigsz);
else
#endif
ret = set_user_sigmask(sig, sigsz);
if (ret)
return ret;
}
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
iowq.ctx = ctx;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
iowq.timeout = KTIME_MAX;
if (uts) {
struct timespec64 ts;
if (get_timespec64(&ts, uts))
return -EFAULT;
iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
}
trace_io_uring_cqring_wait(ctx, min_events);
do {
unsigned long check_cq;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
WRITE_ONCE(ctx->cq_waiting, 1);
set_current_state(TASK_INTERRUPTIBLE);
} else {
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
}
ret = io_cqring_wait_schedule(ctx, &iowq);
__set_current_state(TASK_RUNNING);
WRITE_ONCE(ctx->cq_waiting, 0);
if (ret < 0)
break;
/*
* Run task_work after scheduling and before io_should_wake().
* If we got woken because of task_work being processed, run it
* now rather than let the caller do another wait loop.
*/
io_run_task_work();
if (!llist_empty(&ctx->work_llist))
io_run_local_work(ctx);
check_cq = READ_ONCE(ctx->check_cq);
if (unlikely(check_cq)) {
/* let the caller flush overflows, retry */
if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
io_cqring_do_overflow_flush(ctx);
if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
ret = -EBADR;
break;
}
}
if (io_should_wake(&iowq)) {
ret = 0;
break;
}
cond_resched();
} while (1);
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
finish_wait(&ctx->cq_wait, &iowq.wq);
restore_saved_sigmask_unless(ret == -EINTR);
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
static void io_mem_free(void *ptr)
{
struct page *page;
if (!ptr)
return;
page = virt_to_head_page(ptr);
if (put_page_testzero(page))
free_compound_page(page);
}
static void *io_mem_alloc(size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
return (void *) __get_free_pages(gfp, get_order(size));
}
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset)
{
struct io_rings *rings;
size_t off, sq_array_size;
off = struct_size(rings, cqes, cq_entries);
if (off == SIZE_MAX)
return SIZE_MAX;
if (ctx->flags & IORING_SETUP_CQE32) {
if (check_shl_overflow(off, 1, &off))
return SIZE_MAX;
}
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
if (off == 0)
return SIZE_MAX;
#endif
if (sq_offset)
*sq_offset = off;
sq_array_size = array_size(sizeof(u32), sq_entries);
if (sq_array_size == SIZE_MAX)
return SIZE_MAX;
if (check_add_overflow(off, sq_array_size, &off))
return SIZE_MAX;
return off;
}
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async)
{
struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
if (!ev_fd)
return -ENOMEM;
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd);
return ret;
}
spin_lock(&ctx->completion_lock);
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
atomic_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
return 0;
}
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
call_rcu(&ev_fd->rcu, io_eventfd_ops);
return 0;
}
return -ENXIO;
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
{
int nr = 0;
mutex_lock(&ctx->uring_lock);
io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
while (!io_req_cache_empty(ctx)) {
struct io_kiocb *req = io_alloc_req(ctx);
kmem_cache_free(req_cachep, req);
nr++;
}
if (nr)
percpu_ref_put_many(&ctx->refs, nr);
mutex_unlock(&ctx->uring_lock);
}
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
io_rsrc_refs_drop(ctx);
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
io_wait_rsrc_data(ctx->buf_data);
io_wait_rsrc_data(ctx->file_data);
mutex_lock(&ctx->uring_lock);
if (ctx->buf_data)
__io_sqe_buffers_unregister(ctx);
if (ctx->file_data)
__io_sqe_files_unregister(ctx);
io_cqring_overflow_kill(ctx);
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
mutex_unlock(&ctx->uring_lock);
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
if (ctx->submitter_task)
put_task_struct(ctx->submitter_task);
/* there are no registered resources left, nobody uses it */
if (ctx->rsrc_node)
io_rsrc_node_destroy(ctx->rsrc_node);
if (ctx->rsrc_backup_node)
io_rsrc_node_destroy(ctx->rsrc_backup_node);
flush_delayed_work(&ctx->rsrc_put_work);
flush_delayed_work(&ctx->fallback_work);
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
#if defined(CONFIG_UNIX)
if (ctx->ring_sock) {
ctx->ring_sock->file = NULL; /* so that iput() is called */
sock_release(ctx->ring_sock);
}
#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
if (ctx->mm_account) {
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
}
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
kfree(ctx->dummy_ubuf);
kfree(ctx->io_bl);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
static __cold void io_activate_pollwq_cb(struct callback_head *cb)
{
struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
poll_wq_task_work);
mutex_lock(&ctx->uring_lock);
ctx->poll_activated = true;
mutex_unlock(&ctx->uring_lock);
/*
* Wake ups for some events between start of polling and activation
* might've been lost due to loose synchronisation.
*/
wake_up_all(&ctx->poll_wq);
percpu_ref_put(&ctx->refs);
}
static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
/* already activated or in progress */
if (ctx->poll_activated || ctx->poll_wq_task_work.func)
goto out;
if (WARN_ON_ONCE(!ctx->task_complete))
goto out;
if (!ctx->submitter_task)
goto out;
/*
* with ->submitter_task only the submitter task completes requests, we
* only need to sync with it, which is done by injecting a tw
*/
init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
percpu_ref_get(&ctx->refs);
if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
percpu_ref_put(&ctx->refs);
out:
spin_unlock(&ctx->completion_lock);
}
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
struct io_ring_ctx *ctx = file->private_data;
__poll_t mask = 0;
if (unlikely(!ctx->poll_activated))
io_activate_pollwq(ctx);
poll_wait(file, &ctx->poll_wq, wait);
/*
* synchronizes with barrier from wq_has_sleeper call in
* io_commit_cqring
*/
smp_rmb();
if (!io_sqring_full(ctx))
mask |= EPOLLOUT | EPOLLWRNORM;
/*
* Don't flush cqring overflow list here, just do a simple check.
* Otherwise there could possible be ABBA deadlock:
* CPU0 CPU1
* ---- ----
* lock(&ctx->uring_lock);
* lock(&ep->mtx);
* lock(&ctx->uring_lock);
* lock(&ep->mtx);
*
* Users may get EPOLLIN meanwhile seeing nothing in cqring, this
* pushes them to do the flush.
*/
if (io_cqring_events(ctx) || io_has_work(ctx))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
}
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *creds;
creds = xa_erase(&ctx->personalities, id);
if (creds) {
put_cred(creds);
return 0;
}
return -EINVAL;
}
struct io_tctx_exit {
struct callback_head task_work;
struct completion completion;
struct io_ring_ctx *ctx;
};
static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
struct io_uring_task *tctx = current->io_uring;
struct io_tctx_exit *work;
work = container_of(cb, struct io_tctx_exit, task_work);
/*
* When @in_idle, we're in cancellation and it's racy to remove the
* node. It'll be removed by the end of cancellation, just ignore it.
* tctx can be NULL if the queueing of this task_work raced with
* work cancelation off the exec path.
*/
if (tctx && !atomic_read(&tctx->in_idle))
io_uring_del_tctx_node((unsigned long)work->ctx);
complete(&work->completion);
}
static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
return req->ctx == data;
}
static __cold void io_ring_exit_work(struct work_struct *work)
{
struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
unsigned long timeout = jiffies + HZ * 60 * 5;
unsigned long interval = HZ / 20;
struct io_tctx_exit exit;
struct io_tctx_node *node;
int ret;
/*
* If we're doing polled IO and end up having requests being
* submitted async (out-of-line), then completions can come in while
* we're waiting for refs to drop. We need to reap these manually,
* as nobody else will be looking for them.
*/
do {
if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
mutex_lock(&ctx->uring_lock);
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);
while (io_uring_try_cancel_requests(ctx, NULL, true))
cond_resched();
if (ctx->sq_data) {
struct io_sq_data *sqd = ctx->sq_data;
struct task_struct *tsk;
io_sq_thread_park(sqd);
tsk = sqd->thread;
if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
io_wq_cancel_cb(tsk->io_uring->io_wq,
io_cancel_ctx_cb, ctx, true);
io_sq_thread_unpark(sqd);
}
io_req_caches_free(ctx);
if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
/* there is little hope left, don't run it too often */
interval = HZ * 60;
}
} while (!wait_for_completion_timeout(&ctx->ref_comp, interval));
init_completion(&exit.completion);
init_task_work(&exit.task_work, io_tctx_exit_cb);
exit.ctx = ctx;
/*
* Some may use context even when all refs and requests have been put,
* and they are free to do so while still holding uring_lock or
* completion_lock, see io_req_task_submit(). Apart from other work,
* this lock/unlock section also waits them to finish.
*/
mutex_lock(&ctx->uring_lock);
while (!list_empty(&ctx->tctx_list)) {
WARN_ON_ONCE(time_after(jiffies, timeout));
node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
ctx_node);
/* don't spin on a single task if cancellation failed */
list_rotate_left(&ctx->tctx_list);
ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
if (WARN_ON_ONCE(ret))
continue;
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&exit.completion);
mutex_lock(&ctx->uring_lock);
}
mutex_unlock(&ctx->uring_lock);
spin_lock(&ctx->completion_lock);
spin_unlock(&ctx->completion_lock);
io_ring_ctx_free(ctx);
}
static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
unsigned long index;
struct creds *creds;
mutex_lock(&ctx->uring_lock);
percpu_ref_kill(&ctx->refs);
xa_for_each(&ctx->personalities, index, creds)
io_unregister_personality(ctx, index);
if (ctx->rings)
io_poll_remove_all(ctx, NULL, true);
mutex_unlock(&ctx->uring_lock);
/*
* If we failed setting up the ctx, we might not have any rings
* and therefore did not submit any requests
*/
if (ctx->rings)
io_kill_timeouts(ctx, NULL, true);
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
/*
* Use system_unbound_wq to avoid spawning tons of event kworkers
* if we're exiting a ton of rings at the same time. It just adds
* noise and overhead, there's no discernable change in runtime
* over using system_wq.
*/
queue_work(system_unbound_wq, &ctx->exit_work);
}
static int io_uring_release(struct inode *inode, struct file *file)
{
struct io_ring_ctx *ctx = file->private_data;
file->private_data = NULL;
io_ring_ctx_wait_and_kill(ctx);
return 0;
}
struct io_task_cancel {
struct task_struct *task;
bool all;
};
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_task_cancel *cancel = data;
return io_match_task_safe(req, cancel->task, cancel->all);
}
static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
{
struct io_defer_entry *de;
LIST_HEAD(list);
spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
if (io_match_task_safe(de->req, task, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
spin_unlock(&ctx->completion_lock);
if (list_empty(&list))
return false;
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
io_req_task_queue_fail(de->req, -ECANCELED);
kfree(de);
}
return true;
}
static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
struct io_tctx_node *node;
enum io_wq_cancel cret;
bool ret = false;
mutex_lock(&ctx->uring_lock);
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
/*
* io_wq will stay alive while we hold uring_lock, because it's
* killed after ctx nodes, which requires to take the lock.
*/
if (!tctx || !tctx->io_wq)
continue;
cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
}
mutex_unlock(&ctx->uring_lock);
return ret;
}
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
{
struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
struct io_uring_task *tctx = task ? task->io_uring : NULL;
enum io_wq_cancel cret;
bool ret = false;
/* failed during ring init, it couldn't have issued any requests */
if (!ctx->rings)
return false;
if (!task) {
ret |= io_uring_try_cancel_iowq(ctx);
} else if (tctx && tctx->io_wq) {
/*
* Cancels requests of all rings, not only @ctx, but
* it's fine as the task is in exit/exec.
*/
cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
&cancel, true);
ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
}
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
(ctx->sq_data && ctx->sq_data->thread == current)) {
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
}
}
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx) > 0;
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
ret |= io_run_task_work() > 0;
return ret;
}
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
if (tracked)
return atomic_read(&tctx->inflight_tracked);
return percpu_counter_sum(&tctx->inflight);
}
/*
* Find any io_uring ctx that this task has registered or done IO on, and cancel
* requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
*/
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{
struct io_uring_task *tctx = current->io_uring;
struct io_ring_ctx *ctx;
s64 inflight;
DEFINE_WAIT(wait);
WARN_ON_ONCE(sqd && sqd->thread != current);
if (!current->io_uring)
return;
if (tctx->io_wq)
io_wq_exit_start(tctx->io_wq);
atomic_inc(&tctx->in_idle);
do {
bool loop = false;
io_uring_drop_tctx_refs(current);
/* read completions before cancelations */
inflight = tctx_inflight(tctx, !cancel_all);
if (!inflight)
break;
if (!sqd) {
struct io_tctx_node *node;
unsigned long index;
xa_for_each(&tctx->xa, index, node) {
/* sqpoll task will cancel all its requests */
if (node->ctx->sq_data)
continue;
loop |= io_uring_try_cancel_requests(node->ctx,
current, cancel_all);
}
} else {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
loop |= io_uring_try_cancel_requests(ctx,
current,
cancel_all);
}
if (loop) {
cond_resched();
continue;
}
prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
io_run_task_work();
io_uring_drop_tctx_refs(current);
/*
* If we've seen completions, retry without waiting. This
* avoids a race where a completion comes in before we did
* prepare_to_wait().
*/
if (inflight == tctx_inflight(tctx, !cancel_all))
schedule();
finish_wait(&tctx->wait, &wait);
} while (1);
io_uring_clean_tctx(tctx);
if (cancel_all) {
/*
* We shouldn't run task_works after cancel, so just leave
* ->in_idle set for normal exit.
*/
atomic_dec(&tctx->in_idle);
/* for exec all current's requests should be gone, kill tctx */
__io_uring_free(current);
}
}
void __io_uring_cancel(bool cancel_all)
{
io_uring_cancel_generic(cancel_all, NULL);
}
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;
switch (offset) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
ptr = ctx->rings;
break;
case IORING_OFF_SQES:
ptr = ctx->sq_sqes;
break;
default:
return ERR_PTR(-EINVAL);
}
page = virt_to_head_page(ptr);
if (sz > page_size(page))
return ERR_PTR(-EINVAL);
return ptr;
}
#ifdef CONFIG_MMU
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
void *ptr;
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
#else /* !CONFIG_MMU */
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
}
static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
{
return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
}
static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
void *ptr;
ptr = io_uring_validate_mmap_request(file, pgoff, len);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
return (unsigned long) ptr;
}
#endif /* !CONFIG_MMU */
static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{
if (flags & IORING_ENTER_EXT_ARG) {
struct io_uring_getevents_arg arg;
if (argsz != sizeof(arg))
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
}
return 0;
}
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
struct __kernel_timespec __user **ts,
const sigset_t __user **sig)
{
struct io_uring_getevents_arg arg;
/*
* If EXT_ARG isn't set, then we have no timespec and the argp pointer
* is just a pointer to the sigset_t.
*/
if (!(flags & IORING_ENTER_EXT_ARG)) {
*sig = (const sigset_t __user *) argp;
*ts = NULL;
return 0;
}
/*
* EXT_ARG is set - ensure we agree on the size of it and copy in our
* timespec and sigset_t pointers if good.
*/
if (*argsz != sizeof(arg))
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
if (arg.pad)
return -EINVAL;
*sig = u64_to_user_ptr(arg.sigmask);
*argsz = arg.sigmask_sz;
*ts = u64_to_user_ptr(arg.ts);
return 0;
}
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const void __user *, argp,
size_t, argsz)
{
struct io_ring_ctx *ctx;
struct fd f;
long ret;
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
IORING_ENTER_REGISTERED_RING)))
return -EINVAL;
/*
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we
* need only dereference our task private array to find it.
*/
if (flags & IORING_ENTER_REGISTERED_RING) {
struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL;
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
f.file = tctx->registered_rings[fd];
f.flags = 0;
if (unlikely(!f.file))
return -EBADF;
} else {
f = fdget(fd);
if (unlikely(!f.file))
return -EBADF;
ret = -EOPNOTSUPP;
if (unlikely(!io_is_uring_fops(f.file)))
goto out;
}
ctx = f.file->private_data;
ret = -EBADFD;
if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
goto out;
/*
* For SQ polling, the thread will do all submissions and completions.
* Just return the requested submit count, and wake the thread if
* we were asked to.
*/
ret = 0;
if (ctx->flags & IORING_SETUP_SQPOLL) {
io_cqring_overflow_flush(ctx);
if (unlikely(ctx->sq_data->thread == NULL)) {
ret = -EOWNERDEAD;
goto out;
}
if (flags & IORING_ENTER_SQ_WAKEUP)
wake_up(&ctx->sq_data->wait);
if (flags & IORING_ENTER_SQ_WAIT)
io_sqpoll_wait_sq(ctx);
ret = to_submit;
} else if (to_submit) {
ret = io_uring_add_tctx_node(ctx);
if (unlikely(ret))
goto out;
mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit);
if (ret != to_submit) {
mutex_unlock(&ctx->uring_lock);
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
if (ctx->syscall_iopoll)
goto iopoll_locked;
/*
* Ignore errors, we'll soon call io_cqring_wait() and
* it should handle ownership problems if any.
*/
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
(void)io_run_local_work_locked(ctx);
}
mutex_unlock(&ctx->uring_lock);
}
if (flags & IORING_ENTER_GETEVENTS) {
int ret2;
if (ctx->syscall_iopoll) {
/*
* We disallow the app entering submit/complete with
* polling, but we still need to lock the ring to
* prevent racing with polled issue that got punted to
* a workqueue.
*/
mutex_lock(&ctx->uring_lock);
iopoll_locked:
ret2 = io_validate_ext_arg(flags, argp, argsz);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
ret2 = io_iopoll_check(ctx, min_complete);
}
mutex_unlock(&ctx->uring_lock);
} else {
const sigset_t __user *sig;
struct __kernel_timespec __user *ts;
ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
ret2 = io_cqring_wait(ctx, min_complete, sig,
argsz, ts);
}
}
if (!ret) {
ret = ret2;
/*
* EBADR indicates that one or more CQE were dropped.
* Once the user has been informed we can clear the bit
* as they are obviously ok with those drops.
*/
if (unlikely(ret2 == -EBADR))
clear_bit(IO_CHECK_CQ_DROPPED_BIT,
&ctx->check_cq);
}
}
out:
fdput(f);
return ret;
}
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.mmap = io_uring_mmap,
#ifndef CONFIG_MMU
.get_unmapped_area = io_uring_nommu_get_unmapped_area,
.mmap_capabilities = io_uring_nommu_mmap_capabilities,
#endif
.poll = io_uring_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = io_uring_show_fdinfo,
#endif
};
bool io_is_uring_fops(struct file *file)
{
return file->f_op == &io_uring_fops;
}
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
struct io_rings *rings;
size_t size, sq_array_offset;
/* make sure these are sane, as we already accounted them */
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
rings = io_mem_alloc(size);
if (!rings)
return -ENOMEM;
ctx->rings = rings;
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
rings->sq_ring_mask = p->sq_entries - 1;
rings->cq_ring_mask = p->cq_entries - 1;
rings->sq_ring_entries = p->sq_entries;
rings->cq_ring_entries = p->cq_entries;
if (p->flags & IORING_SETUP_SQE128)
size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
else
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
io_mem_free(ctx->rings);
ctx->rings = NULL;
return -EOVERFLOW;
}
ctx->sq_sqes = io_mem_alloc(size);
if (!ctx->sq_sqes) {
io_mem_free(ctx->rings);
ctx->rings = NULL;
return -ENOMEM;
}
return 0;
}
static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
{
int ret, fd;
fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (fd < 0)
return fd;
ret = __io_uring_add_tctx_node(ctx);
if (ret) {
put_unused_fd(fd);
return ret;
}
fd_install(fd, file);
return fd;
}
/*
* Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this
* fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
* we have to tie this fd to a socket for file garbage collection purposes.
*/
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{
struct file *file;
#if defined(CONFIG_UNIX)
int ret;
ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
&ctx->ring_sock);
if (ret)
return ERR_PTR(ret);
#endif
file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC, NULL);
#if defined(CONFIG_UNIX)
if (IS_ERR(file)) {
sock_release(ctx->ring_sock);
ctx->ring_sock = NULL;
} else {
ctx->ring_sock->file = file;
}
#endif
return file;
}
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{
struct io_ring_ctx *ctx;
struct file *file;
int ret;
if (!entries)
return -EINVAL;
if (entries > IORING_MAX_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
entries = IORING_MAX_ENTRIES;
}
/*
* Use twice as many entries for the CQ ring. It's possible for the
* application to drive a higher depth than the size of the SQ ring,
* since the sqes are only used at submission time. This allows for
* some flexibility in overcommitting a bit. If the application has
* set IORING_SETUP_CQSIZE, it will have passed in the desired number
* of CQ ring entries manually.
*/
p->sq_entries = roundup_pow_of_two(entries);
if (p->flags & IORING_SETUP_CQSIZE) {
/*
* If IORING_SETUP_CQSIZE is set, we do the same roundup
* to a power-of-two, if it isn't already. We do NOT impose
* any cq vs sq ring sizing.
*/
if (!p->cq_entries)
return -EINVAL;
if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
p->cq_entries = IORING_MAX_CQ_ENTRIES;
}
p->cq_entries = roundup_pow_of_two(p->cq_entries);
if (p->cq_entries < p->sq_entries)
return -EINVAL;
} else {
p->cq_entries = 2 * p->sq_entries;
}
ctx = io_ring_ctx_alloc(p);
if (!ctx)
return -ENOMEM;
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
!(ctx->flags & IORING_SETUP_IOPOLL) &&
!(ctx->flags & IORING_SETUP_SQPOLL))
ctx->task_complete = true;
/*
* lazy poll_wq activation relies on ->task_complete for synchronisation
* purposes, see io_activate_pollwq()
*/
if (!ctx->task_complete)
ctx->poll_activated = true;
/*
* When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
* space applications don't need to do io completion events
* polling again, they can rely on io_sq_thread to do polling
* work, which can reduce cpu usage and uring_lock contention.
*/
if (ctx->flags & IORING_SETUP_IOPOLL &&
!(ctx->flags & IORING_SETUP_SQPOLL))
ctx->syscall_iopoll = 1;
ctx->compat = in_compat_syscall();
if (!capable(CAP_IPC_LOCK))
ctx->user = get_uid(current_user());
/*
* For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
* COOP_TASKRUN is set, then IPIs are never needed by the app.
*/
ret = -EINVAL;
if (ctx->flags & IORING_SETUP_SQPOLL) {
/* IPI related flags don't make sense with SQPOLL */
if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_DEFER_TASKRUN))
goto err;
ctx->notify_method = TWA_SIGNAL_NO_IPI;
} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
ctx->notify_method = TWA_SIGNAL_NO_IPI;
} else {
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
goto err;
ctx->notify_method = TWA_SIGNAL;
}
/*
* For DEFER_TASKRUN we require the completion task to be the same as the
* submission task. This implies that there is only one submitter, so enforce
* that.
*/
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
!(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
goto err;
}
/*
* This is just grabbed for accounting purposes. When a process exits,
* the mm is exited and dropped before the files, hence we need to hang
* on to this mm purely for the purposes of being able to unaccount
* memory (locked/pinned vm). It's not used for anything else.
*/
mmgrab(current->mm);
ctx->mm_account = current->mm;
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
/* always set a rsrc node */
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
io_rsrc_node_switch(ctx, NULL);
memset(&p->sq_off, 0, sizeof(p->sq_off));
p->sq_off.head = offsetof(struct io_rings, sq.head);
p->sq_off.tail = offsetof(struct io_rings, sq.tail);
p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
p->sq_off.flags = offsetof(struct io_rings, sq_flags);
p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
memset(&p->cq_off, 0, sizeof(p->cq_off));
p->cq_off.head = offsetof(struct io_rings, cq.head);
p->cq_off.tail = offsetof(struct io_rings, cq.tail);
p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
p->cq_off.cqes = offsetof(struct io_rings, cqes);
p->cq_off.flags = offsetof(struct io_rings, cq_flags);
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
goto err;
}
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
&& !(ctx->flags & IORING_SETUP_R_DISABLED))
WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
file = io_uring_get_file(ctx);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
goto err;
}
/*
* Install ring fd as the very last thing, so we don't risk someone
* having closed it before we finish setup
*/
ret = io_uring_install_fd(ctx, file);
if (ret < 0) {
/* fput will clean it up */
fput(file);
return ret;
}
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
io_ring_ctx_wait_and_kill(ctx);
return ret;
}
/*
* Sets up an aio uring context, and returns the fd. Applications asks for a
* ring size, we return the actual sq/cq ring sizes (among other things) in the
* params structure passed in.
*/
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{
struct io_uring_params p;
int i;
if (copy_from_user(&p, params, sizeof(p)))
return -EFAULT;
for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
if (p.resv[i])
return -EINVAL;
}
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
return -EINVAL;
return io_uring_create(entries, &p, params);
}
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
struct io_uring_params __user *, params)
{
return io_uring_setup(entries, params);
}
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
int i, ret;
size = struct_size(p, ops, nr_args);
if (size == SIZE_MAX)
return -EOVERFLOW;
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
ret = -EFAULT;
if (copy_from_user(p, arg, size))
goto out;
ret = -EINVAL;
if (memchr_inv(p, 0, size))
goto out;
p->last_op = IORING_OP_LAST - 1;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
if (!io_issue_defs[i].not_supported)
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
ret = 0;
if (copy_to_user(arg, p, size))
ret = -EFAULT;
out:
kfree(p);
return ret;
}
static int io_register_personality(struct io_ring_ctx *ctx)
{
const struct cred *creds;
u32 id;
int ret;
creds = get_current_cred();
ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
if (ret < 0) {
put_cred(creds);
return ret;
}
return id;
}
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
void __user *arg, unsigned int nr_args)
{
struct io_uring_restriction *res;
size_t size;
int i, ret;
/* Restrictions allowed only if rings started disabled */
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
/* We allow only a single restrictions registration */
if (ctx->restrictions.registered)
return -EBUSY;
if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
return -EINVAL;
size = array_size(nr_args, sizeof(*res));
if (size == SIZE_MAX)
return -EOVERFLOW;
res = memdup_user(arg, size);
if (IS_ERR(res))
return PTR_ERR(res);
ret = 0;
for (i = 0; i < nr_args; i++) {
switch (res[i].opcode) {
case IORING_RESTRICTION_REGISTER_OP:
if (res[i].register_op >= IORING_REGISTER_LAST) {
ret = -EINVAL;
goto out;
}
__set_bit(res[i].register_op,
ctx->restrictions.register_op);
break;
case IORING_RESTRICTION_SQE_OP:
if (res[i].sqe_op >= IORING_OP_LAST) {
ret = -EINVAL;
goto out;
}
__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
break;
case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
break;
case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
break;
default:
ret = -EINVAL;
goto out;
}
}
out:
/* Reset all restrictions if an error happened */
if (ret != 0)
memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
else
ctx->restrictions.registered = true;
kfree(res);
return ret;
}
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EBADFD;
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
/*
* Lazy activation attempts would fail if it was polled before
* submitter_task is set.
*/
if (wq_has_sleeper(&ctx->poll_wq))
io_activate_pollwq(ctx);
}
if (ctx->restrictions.registered)
ctx->restricted = 1;
ctx->flags &= ~IORING_SETUP_R_DISABLED;
if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
wake_up(&ctx->sq_data->wait);
return 0;
}
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
void __user *arg, unsigned len)
{
struct io_uring_task *tctx = current->io_uring;
cpumask_var_t new_mask;
int ret;
if (!tctx || !tctx->io_wq)
return -EINVAL;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
return -ENOMEM;
cpumask_clear(new_mask);
if (len > cpumask_size())
len = cpumask_size();
if (in_compat_syscall()) {
ret = compat_get_bitmap(cpumask_bits(new_mask),
(const compat_ulong_t __user *)arg,
len * 8 /* CHAR_BIT */);
} else {
ret = copy_from_user(new_mask, arg, len);
}
if (ret) {
free_cpumask_var(new_mask);
return -EFAULT;
}
ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
free_cpumask_var(new_mask);
return ret;
}
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
struct io_uring_task *tctx = current->io_uring;
if (!tctx || !tctx->io_wq)
return -EINVAL;
return io_wq_cpu_affinity(tctx->io_wq, NULL);
}
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
void __user *arg)
__must_hold(&ctx->uring_lock)
{
struct io_tctx_node *node;
struct io_uring_task *tctx = NULL;
struct io_sq_data *sqd = NULL;
__u32 new_count[2];
int i, ret;
if (copy_from_user(new_count, arg, sizeof(new_count)))
return -EFAULT;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i] > INT_MAX)
return -EINVAL;
if (ctx->flags & IORING_SETUP_SQPOLL) {
sqd = ctx->sq_data;
if (sqd) {
/*
* Observe the correct sqd->lock -> ctx->uring_lock
* ordering. Fine to drop uring_lock here, we hold
* a ref to the ctx.
*/
refcount_inc(&sqd->refs);
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
if (sqd->thread)
tctx = sqd->thread->io_uring;
}
} else {
tctx = current->io_uring;
}
BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
for (i = 0; i < ARRAY_SIZE(new_count); i++)
if (new_count[i])
ctx->iowq_limits[i] = new_count[i];
ctx->iowq_limits_set = true;
if (tctx && tctx->io_wq) {
ret = io_wq_max_workers(tctx->io_wq, new_count);
if (ret)
goto err;
} else {
memset(new_count, 0, sizeof(new_count));
}
if (sqd) {
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
}
if (copy_to_user(arg, new_count, sizeof(new_count)))
return -EFAULT;
/* that's it for SQPOLL, only the SQPOLL task creates requests */
if (sqd)
return 0;
/* now propagate the restriction to all registered users */
list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
struct io_uring_task *tctx = node->task->io_uring;
if (WARN_ON_ONCE(!tctx->io_wq))
continue;
for (i = 0; i < ARRAY_SIZE(new_count); i++)
new_count[i] = ctx->iowq_limits[i];
/* ignore errors, it always returns zero anyway */
(void)io_wq_max_workers(tctx->io_wq, new_count);
}
return 0;
err:
if (sqd) {
mutex_unlock(&sqd->lock);
io_put_sq_data(sqd);
}
return ret;
}
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
__acquires(ctx->uring_lock)
{
int ret;
/*
* We don't quiesce the refs for register anymore and so it can't be
* dying as we're holding a file ref here.
*/
if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
return -ENXIO;
if (ctx->submitter_task && ctx->submitter_task != current)
return -EEXIST;
if (ctx->restricted) {
opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
if (!test_bit(opcode, ctx->restrictions.register_op))
return -EACCES;
}
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = -EFAULT;
if (!arg)
break;
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_BUFFERS:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
ret = -EFAULT;
if (!arg)
break;
ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_FILES:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_sqe_files_unregister(ctx);
break;
case IORING_REGISTER_FILES_UPDATE:
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg, 0);
break;
case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_eventfd_unregister(ctx);
break;
case IORING_REGISTER_PROBE:
ret = -EINVAL;
if (!arg || nr_args > 256)
break;
ret = io_probe(ctx, arg, nr_args);
break;
case IORING_REGISTER_PERSONALITY:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_personality(ctx);
break;
case IORING_UNREGISTER_PERSONALITY:
ret = -EINVAL;
if (arg)
break;
ret = io_unregister_personality(ctx, nr_args);
break;
case IORING_REGISTER_ENABLE_RINGS:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_enable_rings(ctx);
break;
case IORING_REGISTER_RESTRICTIONS:
ret = io_register_restrictions(ctx, arg, nr_args);
break;
case IORING_REGISTER_FILES2:
ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
break;
case IORING_REGISTER_FILES_UPDATE2:
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_FILE);
break;
case IORING_REGISTER_BUFFERS2:
ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
break;
case IORING_REGISTER_BUFFERS_UPDATE:
ret = io_register_rsrc_update(ctx, arg, nr_args,
IORING_RSRC_BUFFER);
break;
case IORING_REGISTER_IOWQ_AFF:
ret = -EINVAL;
if (!arg || !nr_args)
break;
ret = io_register_iowq_aff(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_IOWQ_AFF:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_unregister_iowq_aff(ctx);
break;
case IORING_REGISTER_IOWQ_MAX_WORKERS:
ret = -EINVAL;
if (!arg || nr_args != 2)
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
case IORING_REGISTER_RING_FDS:
ret = io_ringfd_register(ctx, arg, nr_args);
break;
case IORING_UNREGISTER_RING_FDS:
ret = io_ringfd_unregister(ctx, arg, nr_args);
break;
case IORING_REGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_pbuf_ring(ctx, arg);
break;
case IORING_UNREGISTER_PBUF_RING:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_unregister_pbuf_ring(ctx, arg);
break;
case IORING_REGISTER_SYNC_CANCEL:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_sync_cancel(ctx, arg);
break;
case IORING_REGISTER_FILE_ALLOC_RANGE:
ret = -EINVAL;
if (!arg || nr_args)
break;
ret = io_register_file_alloc_range(ctx, arg);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct fd f;
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
f = fdget(fd);
if (!f.file)
return -EBADF;
ret = -EOPNOTSUPP;
if (!io_is_uring_fops(f.file))
goto out_fput;
ctx = f.file->private_data;
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
return ret;
}
static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
} while (0)
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
#define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
BUILD_BUG_SQE_ELEM(0, __u8, opcode);
BUILD_BUG_SQE_ELEM(1, __u8, flags);
BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
BUILD_BUG_SQE_ELEM(4, __s32, fd);
BUILD_BUG_SQE_ELEM(8, __u64, off);
BUILD_BUG_SQE_ELEM(8, __u64, addr2);
BUILD_BUG_SQE_ELEM(8, __u32, cmd_op);
BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
BUILD_BUG_SQE_ELEM(16, __u64, addr);
BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
BUILD_BUG_SQE_ELEM(24, __u32, len);
BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
BUILD_BUG_SQE_ELEM(32, __u64, user_data);
BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
BUILD_BUG_SQE_ELEM(42, __u16, personality);
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
sizeof(struct io_uring_rsrc_update));
BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
sizeof(struct io_uring_rsrc_update2));
/* ->buf_index is u16 */
BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
offsetof(struct io_uring_buf_ring, tail));
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
io_uring_optable_init();
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT);
return 0;
};
__initcall(io_uring_init);