Merge branch 'for-6.15/io_uring-epoll-wait' into for-6.15/io_uring-reg-vec

* for-6.15/io_uring-epoll-wait:
  io_uring/epoll: add support for IORING_OP_EPOLL_WAIT
  io_uring/epoll: remove CONFIG_EPOLL guards
  eventpoll: add epoll_sendevents() helper
  eventpoll: abstract out ep_try_send_events() helper
  eventpoll: abstract out parameter sanity checking
pull/1188/head
Jens Axboe 2025-03-07 09:07:19 -07:00
commit 6e3da40ed6
7 changed files with 122 additions and 30 deletions

View File

@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
return ret;
}
/*
 * Attempt a single delivery of ready events to user space.
 *
 * Returns the number of events copied out (which may be 0), or a
 * negative error. A positive count means user space consumed events,
 * at which point NAPI busy-poll irq suspension can be re-armed.
 * Callers with a remaining timeout are expected to retry on 0.
 */
static int ep_try_send_events(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	int ret = ep_send_events(ep, events, maxevents);

	if (ret > 0)
		ep_suspend_napi_irqs(ep);
	return ret;
}
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@ -2031,17 +2047,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
while (1) {
if (eavail) {
/*
* Try to transfer events to user space. In case we get
* 0 events and there's still timeout left over, we go
* trying again in search of more luck.
*/
res = ep_send_events(ep, events, maxevents);
if (res) {
if (res > 0)
ep_suspend_napi_irqs(ep);
res = ep_try_send_events(ep, events, maxevents);
if (res)
return res;
}
}
if (timed_out)
@ -2445,6 +2453,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
/*
 * Common parameter sanity checks shared by the epoll_wait() family
 * and the io_uring entry point.
 *
 * Validates the event count range, verifies the user buffer is
 * writable, and confirms @file really is an eventpoll file.
 * Returns 0 on success, -EINVAL or -EFAULT otherwise.
 */
static int ep_check_params(struct file *file, struct epoll_event __user *evs,
			   int maxevents)
{
	/* Event count must be positive and bounded */
	if (maxevents < 1 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* The destination area must be writable by the caller */
	if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Reject fds that don't refer to an eventpoll instance */
	if (!is_file_epoll(file))
		return -EINVAL;

	return 0;
}
/*
 * Non-blocking event delivery entry point (used by io_uring's
 * IORING_OP_EPOLL_WAIT). Validates the arguments and, if events
 * appear ready, tries one send to user space.
 *
 * Returns the number of events delivered, 0 when nothing was ready,
 * or a negative error from parameter validation / the send attempt.
 */
int epoll_sendevents(struct file *file, struct epoll_event __user *events,
		     int maxevents)
{
	struct eventpoll *ep;
	int err;

	err = ep_check_params(file, events, maxevents);
	if (unlikely(err))
		return err;

	ep = file->private_data;

	/*
	 * The readiness check is inherently racy, but that's fine -
	 * the caller retries based on poll readiness anyway.
	 */
	if (!ep_events_available(ep))
		return 0;

	return ep_try_send_events(ep, events, maxevents);
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
@ -2453,26 +2502,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
return -EFAULT;
int ret;
/* Get the "struct file *" for the eventpoll file */
CLASS(fd, f)(epfd);
if (fd_empty(f))
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
if (!is_file_epoll(fd_file(f)))
return -EINVAL;
ret = ep_check_params(fd_file(f), events, maxevents);
if (unlikely(ret))
return ret;
/*
* At this point it is safe to assume that the "private_data" contains

View File

@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t
/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);
/* Copy ready events to userspace */
int epoll_sendevents(struct file *file, struct epoll_event __user *events,
int maxevents);
/*
* This is called from inside fs/file_table.c:__fput() to unlink files
* from the eventpoll interface. We need to have this facility to cleanup

View File

@ -280,6 +280,7 @@ enum io_uring_op {
IORING_OP_BIND,
IORING_OP_LISTEN,
IORING_OP_RECV_ZC,
IORING_OP_EPOLL_WAIT,
/* this goes last, obviously */
IORING_OP_LAST,

View File

@ -11,10 +11,11 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
epoll.o statx.o timeout.o fdinfo.o \
cancel.o waitid.o register.o \
truncate.o memmap.o alloc_cache.o
statx.o timeout.o fdinfo.o cancel.o \
waitid.o register.o truncate.o \
memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o

View File

@ -12,7 +12,6 @@
#include "io_uring.h"
#include "epoll.h"
#if defined(CONFIG_EPOLL)
struct io_epoll {
struct file *file;
int epfd;
@ -21,6 +20,12 @@ struct io_epoll {
struct epoll_event event;
};
struct io_epoll_wait {
struct file *file;
int maxevents;
struct epoll_event __user *events;
};
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll);
@ -58,4 +63,30 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0);
return IOU_OK;
}
#endif
/*
 * Prepare an IORING_OP_EPOLL_WAIT request: reject SQE fields this
 * opcode does not use, then capture the user event buffer pointer
 * and the maximum event count from the SQE.
 */
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_epoll_wait *wait = io_kiocb_to_cmd(req, struct io_epoll_wait);

	/* No offset, flags, buffer index or splice fd for this opcode */
	if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
		return -EINVAL;

	wait->events = u64_to_user_ptr(READ_ONCE(sqe->addr));
	wait->maxevents = READ_ONCE(sqe->len);
	return 0;
}
/*
 * Issue an IORING_OP_EPOLL_WAIT request. Performs one non-blocking
 * delivery attempt via epoll_sendevents(); a 0 result means no events
 * were ready, so the request is re-armed with -EAGAIN. Errors mark the
 * request failed; otherwise the event count is posted as the result.
 */
int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_epoll_wait *wait = io_kiocb_to_cmd(req, struct io_epoll_wait);
	int ret = epoll_sendevents(req->file, wait->events, wait->maxevents);

	if (!ret)
		return -EAGAIN;
	if (ret < 0)
		req_set_fail(req);

	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

View File

@ -3,4 +3,6 @@
#if defined(CONFIG_EPOLL)
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags);
#endif

View File

@ -527,6 +527,17 @@ const struct io_issue_def io_issue_defs[] = {
.issue = io_recvzc,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_EPOLL_WAIT] = {
.needs_file = 1,
.audit_skip = 1,
.pollin = 1,
#if defined(CONFIG_EPOLL)
.prep = io_epoll_wait_prep,
.issue = io_epoll_wait,
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
@ -761,6 +772,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_RECV_ZC] = {
.name = "RECV_ZC",
},
[IORING_OP_EPOLL_WAIT] = {
.name = "EPOLL_WAIT",
},
};
const char *io_uring_get_opcode(u8 opcode)