for-5.2/io_uring-20190507

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAlzR3t0QHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgptEYD/wIREUHkb/k/Wx9QIfEi28/reNr+iMnhhVD
 Xqw3G9cjuw423NgFYV09cGtpDB7q34f4JTQZfMvCyRKQzKDFMq++gdjPd8ELHpMb
 mnM3apSaY6N1Og1PMsPrAEiKiKShov7eLTj5UmRtGHUndnfnDrKG8rZ5XeZO7gBo
 N0q9XA6QQsJdmDlwgkr7uoby4gMi6HQ3oAfw4qaZrl7wpwBJqq2tz46vMVQYf7xI
 dqWOSeVxAjsrJC3Xzlnooi2TbXlK84j2zdl+CCpaloPtsmSEVs2pl6oeZ2MdraFi
 nzmGMenepV1DmoHleweUPm0Rc2mRwC/x7DXlaIjK3YeWzJK79fbOx/cUl6H+124n
 MGPpRutEIvQTNG7e4gFl/73I0K/QYY5axZvfl2P0cHI1jPCoP3LqPHR+ZP13o6tm
 rPgCrDbdFNaSvrdna9j2qRVa2vsuBTJ/cxM/ciQjsGZvMUXE3b49rZnw9ON3Y0I2
 sJCm1mP+/rNh40yV6xTMD3gH+dI4L484BO21v9u9Qc03M/OQ8mKR3pJ8XYMT1PF1
 rQp6uFi83wab0XRcBI0PL6xFsQyvWtWdgILOhqubqGdGeZYmEQKRGTEPMnlLnfFA
 bZZpPmuvOz8qerlM5TADDyrzHIJJ1Ej98x7jyvZAWjwwgJngvJDatgrdXqLu0XfU
 2cMnNwCLiw==
 =rMo3
 -----END PGP SIGNATURE-----

Merge tag 'for-5.2/io_uring-20190507' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Set of changes/improvements for io_uring. This contains:

   - Fix of a shadowed variable (Colin)

   - Add support for draining commands (me)

   - Add support for sync_file_range() (me)

   - Add eventfd support (me)

   - cpu_online() fix (Shenghui)

   - Removal of a redundant ->error assignment (Stefan)"

* tag 'for-5.2/io_uring-20190507' of git://git.kernel.dk/linux-block:
  io_uring: use cpu_online() to check p->sq_thread_cpu instead of cpu_possible()
  io_uring: fix shadowed variable ret return code being not checked
  req->error only used for iopoll
  io_uring: add support for eventfd notifications
  io_uring: add support for IORING_OP_SYNC_FILE_RANGE
  fs: add sync_file_range() helper
  io_uring: add support for marking commands as draining
This commit is contained in:
Linus Torvalds 2019-05-07 18:30:11 -07:00
commit 52ae2456d6
4 changed files with 269 additions and 72 deletions

View File

@ -222,6 +222,8 @@ struct io_ring_ctx {
unsigned sq_mask;
unsigned sq_thread_idle;
struct io_uring_sqe *sq_sqes;
struct list_head defer_list;
} ____cacheline_aligned_in_smp;
/* IO offload */
@ -239,6 +241,7 @@ struct io_ring_ctx {
unsigned cq_mask;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
struct eventfd_ctx *cq_ev_fd;
} ____cacheline_aligned_in_smp;
/*
@ -327,8 +330,11 @@ struct io_kiocb {
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
#define REQ_F_SEQ_PREV 8 /* sequential with previous */
#define REQ_F_PREPPED 16 /* prep already done */
#define REQ_F_IO_DRAIN 32 /* drain existing IO first */
#define REQ_F_IO_DRAINED 64 /* drain done */
u64 user_data;
u64 error;
u32 error; /* iopoll result from callback */
u32 sequence;
struct work_struct work;
};
@ -356,6 +362,8 @@ struct io_submit_state {
unsigned int ios_left;
};
static void io_sq_wq_submit_work(struct work_struct *work);
static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
@ -407,10 +415,36 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->cancel_list);
INIT_LIST_HEAD(&ctx->defer_list);
return ctx;
}
static void io_commit_cqring(struct io_ring_ctx *ctx)
static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN)
return false;
return req->sequence > ctx->cached_cq_tail + ctx->sq_ring->dropped;
}
static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
if (list_empty(&ctx->defer_list))
return NULL;
req = list_first_entry(&ctx->defer_list, struct io_kiocb, list);
if (!io_sequence_defer(ctx, req)) {
list_del_init(&req->list);
return req;
}
return NULL;
}
static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_cq_ring *ring = ctx->cq_ring;
@ -425,6 +459,18 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
}
}
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
__io_commit_cqring(ctx);
while ((req = io_get_deferred_req(ctx)) != NULL) {
req->flags |= REQ_F_IO_DRAINED;
queue_work(ctx->sqo_wq, &req->work);
}
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
{
struct io_cq_ring *ring = ctx->cq_ring;
@ -471,6 +517,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
if (ctx->cq_ev_fd)
eventfd_signal(ctx->cq_ev_fd, 1);
}
static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data,
@ -1222,6 +1270,54 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
int ret = 0;
if (!req->file)
return -EBADF;
/* Prep already done (EAGAIN retry) */
if (req->flags & REQ_F_PREPPED)
return 0;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
req->flags |= REQ_F_PREPPED;
return ret;
}
static int io_sync_file_range(struct io_kiocb *req,
const struct io_uring_sqe *sqe,
bool force_nonblock)
{
loff_t sqe_off;
loff_t sqe_len;
unsigned flags;
int ret;
ret = io_prep_sfr(req, sqe);
if (ret)
return ret;
/* sync_file_range always requires a blocking context */
if (force_nonblock)
return -EAGAIN;
sqe_off = READ_ONCE(sqe->off);
sqe_len = READ_ONCE(sqe->len);
flags = READ_ONCE(sqe->sync_range_flags);
ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
io_cqring_add_event(req->ctx, sqe->user_data, ret, 0);
io_put_req(req);
return 0;
}
static void io_poll_remove_one(struct io_kiocb *req)
{
struct io_poll_iocb *poll = &req->poll;
@ -1424,7 +1520,6 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
spin_unlock(&poll->head->lock);
}
if (mask) { /* no async, we'd stolen it */
req->error = mangle_poll(mask);
ipt.error = 0;
io_poll_complete(ctx, req, mask);
}
@ -1437,6 +1532,34 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return ipt.error;
}
static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_uring_sqe *sqe_copy;
if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list))
return 0;
sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
if (!sqe_copy)
return -EAGAIN;
spin_lock_irq(&ctx->completion_lock);
if (!io_sequence_defer(ctx, req) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
kfree(sqe_copy);
return 0;
}
memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
req->submit.sqe = sqe_copy;
INIT_WORK(&req->work, io_sq_wq_submit_work);
list_add_tail(&req->list, &ctx->defer_list);
spin_unlock_irq(&ctx->completion_lock);
return -EIOCBQUEUED;
}
static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct sqe_submit *s, bool force_nonblock)
{
@ -1476,6 +1599,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
case IORING_OP_POLL_REMOVE:
ret = io_poll_remove(req, s->sqe);
break;
case IORING_OP_SYNC_FILE_RANGE:
ret = io_sync_file_range(req, s->sqe, force_nonblock);
break;
default:
ret = -EINVAL;
break;
@ -1684,6 +1810,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
flags = READ_ONCE(s->sqe->flags);
fd = READ_ONCE(s->sqe->fd);
if (flags & IOSQE_IO_DRAIN) {
req->flags |= REQ_F_IO_DRAIN;
req->sequence = ctx->cached_sq_head - 1;
}
if (!io_op_needs_file(s->sqe)) {
req->file = NULL;
return 0;
@ -1713,7 +1844,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
int ret;
/* enforce forwards compatibility on users */
if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
return -EINVAL;
req = io_get_req(ctx, state);
@ -1724,6 +1855,13 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
if (unlikely(ret))
goto out;
ret = io_req_defer(ctx, req, s->sqe);
if (ret) {
if (ret == -EIOCBQUEUED)
ret = 0;
return ret;
}
ret = __io_submit_sqe(ctx, req, s, true);
if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
struct io_uring_sqe *sqe_copy;
@ -2225,7 +2363,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
left = ctx->nr_user_files;
while (left) {
unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
int ret;
ret = __io_sqe_files_scm(ctx, this_files, total);
if (ret)
@ -2334,7 +2471,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
nr_cpu_ids);
ret = -EINVAL;
if (!cpu_possible(cpu))
if (!cpu_online(cpu))
goto err;
ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread,
@ -2621,6 +2758,38 @@ err:
return ret;
}
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
__s32 __user *fds = arg;
int fd;
if (ctx->cq_ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ctx->cq_ev_fd)) {
int ret = PTR_ERR(ctx->cq_ev_fd);
ctx->cq_ev_fd = NULL;
return ret;
}
return 0;
}
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
if (ctx->cq_ev_fd) {
eventfd_ctx_put(ctx->cq_ev_fd);
ctx->cq_ev_fd = NULL;
return 0;
}
return -ENXIO;
}
static void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_finish_async(ctx);
@ -2630,6 +2799,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_iopoll_reap_events(ctx);
io_sqe_buffer_unregister(ctx);
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
#if defined(CONFIG_UNIX)
if (ctx->ring_sock)
@ -3043,6 +3213,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_sqe_files_unregister(ctx);
break;
case IORING_REGISTER_EVENTFD:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_eventfd_unregister(ctx);
break;
default:
ret = -EINVAL;
break;

139
fs/sync.c
View File

@ -234,6 +234,77 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
return do_fsync(fd, 1);
}
int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
unsigned int flags)
{
int ret;
struct address_space *mapping;
loff_t endbyte; /* inclusive */
umode_t i_mode;
ret = -EINVAL;
if (flags & ~VALID_FLAGS)
goto out;
endbyte = offset + nbytes;
if ((s64)offset < 0)
goto out;
if ((s64)endbyte < 0)
goto out;
if (endbyte < offset)
goto out;
if (sizeof(pgoff_t) == 4) {
if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* The range starts outside a 32 bit machine's
* pagecache addressing capabilities. Let it "succeed"
*/
ret = 0;
goto out;
}
if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* Out to EOF
*/
nbytes = 0;
}
}
if (nbytes == 0)
endbyte = LLONG_MAX;
else
endbyte--; /* inclusive */
i_mode = file_inode(file)->i_mode;
ret = -ESPIPE;
if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
!S_ISLNK(i_mode))
goto out;
mapping = file->f_mapping;
ret = 0;
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
ret = file_fdatawait_range(file, offset, endbyte);
if (ret < 0)
goto out;
}
if (flags & SYNC_FILE_RANGE_WRITE) {
ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
if (ret < 0)
goto out;
}
if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
ret = file_fdatawait_range(file, offset, endbyte);
out:
return ret;
}
/*
* sys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
@ -286,77 +357,13 @@ int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
{
int ret;
struct fd f;
struct address_space *mapping;
loff_t endbyte; /* inclusive */
umode_t i_mode;
ret = -EINVAL;
if (flags & ~VALID_FLAGS)
goto out;
endbyte = offset + nbytes;
if ((s64)offset < 0)
goto out;
if ((s64)endbyte < 0)
goto out;
if (endbyte < offset)
goto out;
if (sizeof(pgoff_t) == 4) {
if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* The range starts outside a 32 bit machine's
* pagecache addressing capabilities. Let it "succeed"
*/
ret = 0;
goto out;
}
if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
/*
* Out to EOF
*/
nbytes = 0;
}
}
if (nbytes == 0)
endbyte = LLONG_MAX;
else
endbyte--; /* inclusive */
ret = -EBADF;
f = fdget(fd);
if (!f.file)
goto out;
if (f.file)
ret = sync_file_range(f.file, offset, nbytes, flags);
i_mode = file_inode(f.file)->i_mode;
ret = -ESPIPE;
if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
!S_ISLNK(i_mode))
goto out_put;
mapping = f.file->f_mapping;
ret = 0;
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
ret = file_fdatawait_range(f.file, offset, endbyte);
if (ret < 0)
goto out_put;
}
if (flags & SYNC_FILE_RANGE_WRITE) {
ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
if (ret < 0)
goto out_put;
}
if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
ret = file_fdatawait_range(f.file, offset, endbyte);
out_put:
fdput(f);
out:
return ret;
}

View File

@ -2789,6 +2789,9 @@ extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern int vfs_fsync(struct file *file, int datasync);
extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
unsigned int flags);
/*
* Sync the bytes written if this was a synchronous write. Expect ki_pos
* to already be updated for the write, and will return either the amount

View File

@ -26,6 +26,7 @@ struct io_uring_sqe {
__kernel_rwf_t rw_flags;
__u32 fsync_flags;
__u16 poll_events;
__u32 sync_range_flags;
};
__u64 user_data; /* data to be passed back at completion time */
union {
@ -38,6 +39,7 @@ struct io_uring_sqe {
* sqe->flags
*/
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
/*
* io_uring_setup() flags
@ -54,6 +56,7 @@ struct io_uring_sqe {
#define IORING_OP_WRITE_FIXED 5
#define IORING_OP_POLL_ADD 6
#define IORING_OP_POLL_REMOVE 7
#define IORING_OP_SYNC_FILE_RANGE 8
/*
* sqe->fsync_flags
@ -133,5 +136,7 @@ struct io_uring_params {
#define IORING_UNREGISTER_BUFFERS 1
#define IORING_REGISTER_FILES 2
#define IORING_UNREGISTER_FILES 3
#define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5
#endif