io_uring: add timeout support for io_uring_enter()

Now users who want to get woken when waiting for events should submit a
timeout command first. It is not safe for applications that split SQ and
CQ handling between two threads, such as mysql. Users should synchronize
the two threads explicitly to protect SQ and that will impact the
performance.

This patch adds support for timeout to existing io_uring_enter(). To
avoid overloading arguments, it introduces a new parameter structure
which contains sigmask and timeout.

I have tested the workloads with one thread submiting nop requests
while the other reaping the cqe with timeout. It shows 1.8~2x faster
when the iodepth is 16.

Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Signed-off-by: Hao Xu <haoxu@linux.alibaba.com>
[axboe: various cleanups/fixes, and name change to SIG_IS_DATA]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Hao Xu 2020-11-03 10:54:37 +08:00 committed by Jens Axboe
parent 27926b683d
commit c73ebb685f
3 changed files with 72 additions and 8 deletions

View File

@ -7118,7 +7118,8 @@ static int io_run_task_work_sig(void)
* application must reap them itself, as they reside on the shared cq ring.
*/
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
const sigset_t __user *sig, size_t sigsz,
struct __kernel_timespec __user *uts)
{
struct io_wait_queue iowq = {
.wq = {
@ -7130,6 +7131,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings;
struct timespec64 ts;
signed long timeout = 0;
int ret = 0;
do {
@ -7152,6 +7155,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
if (uts) {
if (get_timespec64(&ts, uts))
return -EFAULT;
timeout = timespec64_to_jiffies(&ts);
}
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
@ -7165,7 +7174,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
if (io_should_wake(&iowq, false))
break;
schedule();
if (uts) {
timeout = schedule_timeout(timeout);
if (timeout == 0) {
ret = -ETIME;
break;
}
} else {
schedule();
}
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@ -9167,9 +9184,39 @@ static void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
finish_wait(&ctx->sqo_sq_wait, &wait);
}
static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
struct __kernel_timespec __user **ts,
const sigset_t __user **sig)
{
struct io_uring_getevents_arg arg;
/*
* If EXT_ARG isn't set, then we have no timespec and the argp pointer
* is just a pointer to the sigset_t.
*/
if (!(flags & IORING_ENTER_EXT_ARG)) {
*sig = (const sigset_t __user *) argp;
*ts = NULL;
return 0;
}
/*
* EXT_ARG is set - ensure we agree on the size of it and copy in our
* timespec and sigset_t pointers if good.
*/
if (*argsz != sizeof(arg))
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
*sig = u64_to_user_ptr(arg.sigmask);
*argsz = arg.sigmask_sz;
*ts = u64_to_user_ptr(arg.ts);
return 0;
}
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const sigset_t __user *, sig,
size_t, sigsz)
u32, min_complete, u32, flags, const void __user *, argp,
size_t, argsz)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
@ -9179,7 +9226,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
IORING_ENTER_SQ_WAIT))
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))
return -EINVAL;
f = fdget(fd);
@ -9225,6 +9272,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
goto out;
}
if (flags & IORING_ENTER_GETEVENTS) {
const sigset_t __user *sig;
struct __kernel_timespec __user *ts;
ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
if (unlikely(ret))
goto out;
min_complete = min(min_complete, ctx->cq_entries);
/*
@ -9237,7 +9291,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
!(ctx->flags & IORING_SETUP_SQPOLL)) {
ret = io_iopoll_check(ctx, min_complete);
} else {
ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
}
}
@ -9600,7 +9654,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED;
IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
IORING_FEAT_EXT_ARG;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;

View File

@ -317,7 +317,7 @@ asmlinkage long sys_io_uring_setup(u32 entries,
struct io_uring_params __user *p);
asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
u32 min_complete, u32 flags,
const sigset_t __user *sig, size_t sigsz);
const void __user *argp, size_t argsz);
asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
void __user *arg, unsigned int nr_args);

View File

@ -231,6 +231,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_GETEVENTS (1U << 0)
#define IORING_ENTER_SQ_WAKEUP (1U << 1)
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@ -259,6 +260,7 @@ struct io_uring_params {
#define IORING_FEAT_FAST_POLL (1U << 5)
#define IORING_FEAT_POLL_32BITS (1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
#define IORING_FEAT_EXT_ARG (1U << 8)
/*
* io_uring_register(2) opcodes and arguments
@ -335,4 +337,11 @@ enum {
IORING_RESTRICTION_LAST
};
struct io_uring_getevents_arg {
__u64 sigmask;
__u32 sigmask_sz;
__u32 pad;
__u64 ts;
};
#endif