From 2410251cde0bac9f660f276307d6c967466eef0c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:25 +0100 Subject: [PATCH 01/42] net: timestamp: add helper returning skb's tx tstamp Add a helper function skb_get_tx_timestamp() that returns a tx timestamp associated with an error queue skb. Signed-off-by: Pavel Begunkov Acked-by: Willem de Bruijn Link: https://patch.msgid.link/702357dd8936ef4c0d3864441e853bfe3224a677.1750065793.git.asml.silence@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++++ net/socket.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 92e7c1aae3cc..f5f5a9ad290b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2677,6 +2677,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts); + static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { diff --git a/net/socket.c b/net/socket.c index 9a0e720f0859..2cab805943c0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -843,6 +843,52 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, sizeof(ts_pktinfo), &ts_pktinfo); } +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk) +{ + const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 tsflags = READ_ONCE(sk->sk_tsflags); + + if (serr->ee.ee_errno != ENOMSG || + serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING) + return false; + + /* software time stamp available and wanted */ + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp) + return true; + /* hardware time stamps available and wanted */ + return (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + skb_hwtstamps(skb)->hwtstamp; +} + +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts) +{ + u32 tsflags = READ_ONCE(sk->sk_tsflags); + ktime_t hwtstamp; + int if_index = 0; + + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && + ktime_to_timespec64_cond(skb->tstamp, ts)) + return SOF_TIMESTAMPING_TX_SOFTWARE; + + if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) || + skb_is_swtx_tstamp(skb, false)) + return -ENOENT; + + if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) + hwtstamp = get_timestamp(sk, skb, &if_index); + else + hwtstamp = skb_hwtstamps(skb)->hwtstamp; + + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) + hwtstamp = ptp_convert_timestamp(&hwtstamp, + READ_ONCE(sk->sk_bind_phc)); + if (!ktime_to_timespec64_cond(hwtstamp, ts)) + return -ENOENT; + + return SOF_TIMESTAMPING_TX_HARDWARE; +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ From 4d811e395bbe54ba2febb3940d4b6c4741f360a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2025 11:48:33 -0600 Subject: [PATCH 02/42] io_uring: add IO_URING_F_INLINE issue flag Set when the execution of the request is done inline from the system call itself. Any deferred issue will never have this flag set. 
Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/io_uring.c | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 2922635986f5..054c43c02c96 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -26,6 +26,8 @@ enum io_uring_cmd_flags { IO_URING_F_MULTISHOT = 4, /* executed by io-wq */ IO_URING_F_IOWQ = 8, + /* executed inline from syscall */ + IO_URING_F_INLINE = 16, /* int's last bit, sign checks are usually faster than a bit test */ IO_URING_F_NONBLOCK = INT_MIN, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5111ec040c53..c60d1c286c87 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -147,7 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all, bool is_sqpoll_thread); -static void io_queue_sqe(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -1377,7 +1377,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); else - io_queue_sqe(req); + io_queue_sqe(req, 0); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1960,12 +1960,14 @@ static void io_queue_async(struct io_kiocb *req, int ret) } } -static inline void io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags) __must_hold(&req->ctx->uring_lock) { + unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_COMPLETE_DEFER | extra_flags; int ret; - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + ret = io_issue_sqe(req, issue_flags); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2221,7 +2223,7 @@ fallback: return 0; } - io_queue_sqe(req); + io_queue_sqe(req, IO_URING_F_INLINE); return 0; } From af19388a973877b2349df46c4487a789cd3148ed Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2025 11:33:52 -0600 Subject: [PATCH 03/42] io_uring: add struct io_cold_def->sqe_copy() method Will be called by the core of io_uring, if inline issue is not going to be tried for a request. Opcodes can define this handler to defer copying of SQE data that should remain stable. Only called if IO_URING_F_INLINE is set. If it isn't set, then there's a bug in the core handling of this, and -EFAULT will be returned instead to terminate the request. This will trigger a WARN_ON_ONCE(). Don't expect this to ever trigger, and down the line this can be removed. Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/io_uring.c | 27 +++++++++++++++++++++++++-- io_uring/opdef.h | 1 + 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 054c43c02c96..4ab3bdc103f2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -504,6 +504,7 @@ enum { REQ_F_BUF_NODE_BIT, REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, + REQ_F_SQE_COPIED_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -593,6 +594,8 @@ enum { * For SEND_ZC, whether to import buffers (i.e. the first issue). 
*/ REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), + /* ->sqe_copy() has been called, if necessary */ + REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c60d1c286c87..3a23c8713f1b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1938,14 +1938,34 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_queue_async(struct io_kiocb *req, int ret) +static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (req->flags & REQ_F_SQE_COPIED) + return 0; + req->flags |= REQ_F_SQE_COPIED; + if (!def->sqe_copy) + return 0; + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE))) + return -EFAULT; + def->sqe_copy(req); + return 0; +} + +static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret) __must_hold(&req->ctx->uring_lock) { if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { +fail: io_req_defer_failed(req, ret); return; } + ret = io_req_sqe_copy(req, issue_flags); + if (unlikely(ret)) + goto fail; + switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); @@ -1974,7 +1994,7 @@ static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags) * doesn't support non-blocking read/write attempts */ if (unlikely(ret)) - io_queue_async(req, ret); + io_queue_async(req, issue_flags, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) @@ -1989,6 +2009,8 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { + /* can't fail with IO_URING_F_INLINE */ + io_req_sqe_copy(req, IO_URING_F_INLINE); if (unlikely(req->ctx->drain_active)) io_drain_req(req); else @@ -2200,6 +2222,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, */ if (unlikely(link->head)) { trace_io_uring_link(req, link->last); + io_req_sqe_copy(req, IO_URING_F_INLINE); link->last->link = req; link->last = req; diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 719a52104abe..c2f0907ed78c 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -38,6 +38,7 @@ struct io_issue_def { struct io_cold_def { const char *name; + void (*sqe_copy)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; From ead21053bf34941c7c7bf680d29b8d15af5406de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Jun 2025 14:00:27 -0600 Subject: [PATCH 04/42] io_uring/uring_cmd: get rid of io_uring_cmd_prep_setup() It's a pretty pointless helper, just allocates and copies data. Fold it into io_uring_cmd_prep(). 
Reviewed-by: Caleb Sander Mateos Reviewed-by: Anuj Gupta Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 929cad6ee326..e204f4941d72 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -181,8 +181,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -static int io_uring_cmd_prep_setup(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_async_cmd *ac; @@ -190,6 +189,18 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, /* see io_uring_cmd_get_async_data() */ BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0); + if (sqe->__pad1) + return -EINVAL; + + ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags); + if (ioucmd->flags & ~IORING_URING_CMD_MASK) + return -EINVAL; + + if (ioucmd->flags & IORING_URING_CMD_FIXED) + req->buf_index = READ_ONCE(sqe->buf_index); + + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); if (!ac) return -ENOMEM; @@ -207,25 +218,6 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, return 0; } -int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - - if (sqe->__pad1) - return -EINVAL; - - ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags); - if (ioucmd->flags & ~IORING_URING_CMD_MASK) - return -EINVAL; - - if (ioucmd->flags & IORING_URING_CMD_FIXED) - req->buf_index = READ_ONCE(sqe->buf_index); - - ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - - return io_uring_cmd_prep_setup(req, sqe); -} - int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); From ecf47d452ced9be162831192fcfb3e9f5cdcde7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 Jun 2025 11:39:17 -0600 Subject: [PATCH 05/42] io_uring/uring_cmd: implement ->sqe_copy() to avoid unnecessary copies uring_cmd currently copies the full SQE at prep time, just in case it needs it to be stable. However, for inline completions or requests that get queued up on the device side, there's no need to ever copy the SQE. This is particularly important, as various use cases of uring_cmd will be using 128b sized SQEs. Opt in to using ->sqe_copy() to let the core of io_uring decide when to copy SQEs. This callback will only be called if it is safe to do so. 
Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- io_uring/opdef.c | 1 + io_uring/uring_cmd.c | 23 +++++++++++++---------- io_uring/uring_cmd.h | 1 + 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 6e0882b051f9..287f9a23b816 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -759,6 +759,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", + .sqe_copy = io_uring_cmd_sqe_copy, .cleanup = io_uring_cmd_cleanup, }, [IORING_OP_SEND_ZC] = { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e204f4941d72..9ad0ea5398c2 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -205,19 +205,22 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!ac) return -ENOMEM; ac->data.op_data = NULL; - - /* - * Unconditionally cache the SQE for now - this is only needed for - * requests that go async, but prep handlers must ensure that any - * sqe data is stable beyond prep. Since uring_cmd is special in - * that it doesn't read in per-op data, play it safe and ensure that - * any SQE data is stable beyond prep. This can later get relaxed. - */ - memcpy(ac->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = ac->sqes; + ioucmd->sqe = sqe; return 0; } +void io_uring_cmd_sqe_copy(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_async_cmd *ac = req->async_data; + + /* Should not happen, as REQ_F_SQE_COPIED covers this */ + if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes)) + return; + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = ac->sqes; +} + int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index e6a5142c890e..a6dad47afc6b 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -11,6 +11,7 @@ struct io_async_cmd { int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +void io_uring_cmd_sqe_copy(struct io_kiocb *req); void io_uring_cmd_cleanup(struct io_kiocb *req); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, From cb9ccfb404e700dc0db59d68242d79fe386bb3f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Jun 2025 17:05:19 -0600 Subject: [PATCH 06/42] io_uring/nop: add IORING_NOP_TW completion flag To test and profile the overhead of io_uring task_work and the various types of it, add IORING_NOP_TW which tells nop to signal completions through task_work rather than complete them inline. 
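A minimal liburing-based sketch of driving the new flag from userspace; the liburing calls and the nop_flags SQE field name follow current uapi/liburing conventions and are illustrative assumptions, only IORING_NOP_TW itself comes from this patch:

#include <stdio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	/* ask nop to complete via task_work instead of inline */
	sqe->nop_flags = IORING_NOP_TW;

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("nop completed, res=%d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
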
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/nop.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cfd17e382082..8c3d43caab02 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -449,6 +449,7 @@ enum io_uring_msg_ring_flags { #define IORING_NOP_FILE (1U << 1) #define IORING_NOP_FIXED_FILE (1U << 2) #define IORING_NOP_FIXED_BUFFER (1U << 3) +#define IORING_NOP_TW (1U << 4) /* * IO completion data structure (Completion Queue Entry) diff --git a/io_uring/nop.c b/io_uring/nop.c index 6ac2de761fd3..20ed0f85b1c2 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -20,7 +20,8 @@ struct io_nop { }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ - IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) + IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ + IORING_NOP_TW) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -68,5 +69,10 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); + if (nop->flags & IORING_NOP_TW) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return IOU_ISSUE_SKIP_COMPLETE; + } return IOU_COMPLETE; } From 162151889267089bb920609830c35f9272087c3f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:26 +0100 Subject: [PATCH 07/42] io_uring/poll: introduce io_arm_apoll() In preparation to allowing commands to do file polling, add a helper that takes the desired poll event mask and arms it for polling. We won't be able to use io_arm_poll_handler() with IORING_OP_URING_CMD as it tries to infer the mask from the opcode data, and we can't unify it across all commands. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7ee5633f2dc45fd15243f1a60965f7e30e1c48e8.1750065793.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 44 +++++++++++++++++++++++++++----------------- io_uring/poll.h | 1 + 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 0526062e2f81..c7e9fb34563d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -669,33 +669,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, return apoll; } -int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; + mask |= EPOLLET; if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if (req->flags & REQ_F_CLEAR_POLLIN) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if (def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - apoll = io_req_alloc_apoll(req, issue_flags); if (!apoll) return IO_APOLL_ABORTED; @@ -712,6 +697,31 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + __poll_t mask = POLLPRI | POLLERR; + + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + 
if (!io_file_can_poll(req)) + return IO_APOLL_ABORTED; + + if (def->pollin) { + mask |= EPOLLIN | EPOLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if (req->flags & REQ_F_CLEAR_POLLIN) + mask &= ~EPOLLIN; + } else { + mask |= EPOLLOUT | EPOLLWRNORM; + } + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; + + return io_arm_apoll(req, issue_flags, mask); +} + /* * Returns true if we found and killed one or more poll requests */ diff --git a/io_uring/poll.h b/io_uring/poll.h index 27e2db2ed4ae..c8438286dfa0 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -41,6 +41,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); +int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); From b95575495948a81ac9b0110aa721ea061dd850d9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:27 +0100 Subject: [PATCH 08/42] io_uring/cmd: allow multishot polled commands Some commands like timestamping in the next patch can make use of multishot polling, i.e. REQ_F_APOLL_MULTISHOT. Add support for that, which is condensed in a single helper called io_cmd_poll_multishot(). The user who wants to continue with a request in a multishot mode must call the function, and only if it returns 0 the user is free to proceed. Apart from normal terminal errors, it can also end up with -EIOCBQUEUED, in which case the user must forward it to the core io_uring. It's forbidden to use task work while the request is executing in a multishot mode. The API is not foolproof, hence it's not exported to modules nor exposed in public headers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bcf97c31659662c72b69fc8fcdf2a88cfc16e430.1750065793.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 23 +++++++++++++++++++++++ io_uring/uring_cmd.h | 3 +++ 2 files changed, 26 insertions(+) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9ad0ea5398c2..02cec6231831 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -12,6 +12,7 @@ #include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" +#include "poll.h" void io_cmd_cache_free(const void *entry) { @@ -136,6 +137,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) + return; + ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; __io_req_task_work_add(req, flags); @@ -158,6 +162,9 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) + return; + io_uring_cmd_del_cancelable(ioucmd, issue_flags); if (ret < 0) @@ -305,3 +312,19 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) io_req_queue_iowq(req); } + +int io_cmd_poll_multishot(struct io_uring_cmd *cmd, + unsigned int issue_flags, __poll_t mask) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + int ret; + + if (likely(req->flags & REQ_F_APOLL_MULTISHOT)) + return 0; + + req->flags |= REQ_F_APOLL_MULTISHOT; + mask &= ~EPOLLONESHOT; + + ret = io_arm_apoll(req, issue_flags, mask); + return ret == IO_APOLL_OK ? 
-EIOCBQUEUED : -ECANCELED; +} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index a6dad47afc6b..50a6ccb831df 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -18,3 +18,6 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); void io_cmd_cache_free(const void *entry); + +int io_cmd_poll_multishot(struct io_uring_cmd *cmd, + unsigned int issue_flags, __poll_t mask); From ac479eac22e81c0ff56c6bdb93fad787015149cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:28 +0100 Subject: [PATCH 09/42] io_uring: add mshot helper for posting CQE32 Add a helper for posting 32 byte CQEs in a multishot mode and add a cmd helper on top. As it specifically works with requests, the helper ignore the passed in cqe->user_data and sets it to the one stored in the request. The command helper is only valid with multishot requests. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c29d7720c16e1f981cfaa903df187138baa3946b.1750065793.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 40 ++++++++++++++++++++++++++++++++++++++++ io_uring/io_uring.h | 1 + io_uring/uring_cmd.c | 11 +++++++++++ io_uring/uring_cmd.h | 4 ++++ 4 files changed, 56 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3a23c8713f1b..895740c955d0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -793,6 +793,21 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) return true; } +static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, + struct io_uring_cqe src_cqe[2]) +{ + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + return false; + if (unlikely(!io_get_cqe(ctx, &cqe))) + return false; + + memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); + trace_io_uring_complete(ctx, NULL, cqe); + return true; +} + static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { @@ -904,6 +919,31 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) return posted; } +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. 
+ */ +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + lockdep_assert(!io_wq_current_is_worker()); + lockdep_assert_held(&ctx->uring_lock); + + cqe[0].user_data = req->cqe.user_data; + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux32(ctx, cqe); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux32(ctx, cqe); + } + + ctx->submit_state.cq_flush = true; + return posted; +} + static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 66c1ca73f55e..dc17162e7af1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -81,6 +81,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); void io_req_track_inflight(struct io_kiocb *req); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 02cec6231831..b228b84a510f 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -328,3 +328,14 @@ int io_cmd_poll_multishot(struct io_uring_cmd *cmd, ret = io_arm_apoll(req, issue_flags, mask); return ret == IO_APOLL_OK ? -EIOCBQUEUED : -ECANCELED; } + +bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct io_uring_cqe cqe[2]) +{ + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_MULTISHOT))) + return false; + return io_req_post_cqe32(req, cqe); +} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 50a6ccb831df..9e11da10ecab 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -17,6 +17,10 @@ void io_uring_cmd_cleanup(struct io_kiocb *req); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); +bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct io_uring_cqe cqe[2]); + void io_cmd_cache_free(const void *entry); int io_cmd_poll_multishot(struct io_uring_cmd *cmd, From 9e4ed359b8efad0e8ad4510d8ad22bf0b060526a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:29 +0100 Subject: [PATCH 10/42] io_uring/netcmd: add tx timestamping cmd support Add a new socket command which returns tx time stamps to the user. It provide an alternative to the existing error queue recvmsg interface. The command works in a polled multishot mode, which means io_uring will poll the socket and keep posting timestamps until the request is cancelled or fails in any other way (e.g. with no space in the CQ). It reuses the net infra and grabs timestamps from the socket's error queue. The command requires IORING_SETUP_CQE32. All non-final CQEs (marked with IORING_CQE_F_MORE) have cqe->res set to the tskey, and the upper 16 bits of cqe->flags keep tstype (i.e. offset by IORING_CQE_BUFFER_SHIFT). The timevalue is store in the upper part of the extended CQE. The final completion won't have IORING_CQE_F_MORE and will have cqe->res storing 0/error. 
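A sketch of how a test program might consume these CQEs; the SQE is filled by hand since only the cmd_op and the CQE layout are defined by this patch, and it assumes the socket already has SO_TIMESTAMPING tx reporting enabled and that the ring was created with IORING_SETUP_CQE32 (the liburing calls are illustrative):

#include <stdio.h>
#include <string.h>
#include <liburing.h>

/* Drain tx timestamps for sockfd; keeps reaping CQEs until the terminal
 * completion (no IORING_CQE_F_MORE) arrives. */
static void drain_tx_timestamps(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = SOCKET_URING_OP_TX_TIMESTAMP;

	io_uring_submit(ring);

	while (io_uring_wait_cqe(ring, &cqe) == 0) {
		if (!(cqe->flags & IORING_CQE_F_MORE)) {
			/* terminal completion: 0 or -errno */
			fprintf(stderr, "tx timestamping ended: %d\n", cqe->res);
			io_uring_cqe_seen(ring, cqe);
			break;
		}
		unsigned int tskey = cqe->res;
		unsigned int tstype = cqe->flags >> IORING_TIMESTAMP_TYPE_SHIFT;
		int hw = !!(cqe->flags & IORING_CQE_F_TSTAMP_HW);
		/* the second half of the 32-byte CQE carries struct io_timespec */
		unsigned long long sec = cqe->big_cqe[0], nsec = cqe->big_cqe[1];

		printf("tskey=%u type=%u hw=%d %llu.%09llu\n",
		       tskey, tstype, hw, sec, nsec);
		io_uring_cqe_seen(ring, cqe);
	}
}
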
Suggested-by: Vadim Fedorenko Acked-by: Willem de Bruijn Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/92ee66e6b33b8de062a977843d825f58f21ecd37.1750065793.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 16 +++++++ io_uring/cmd_net.c | 82 +++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8c3d43caab02..85600ad0ac08 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -969,6 +969,22 @@ enum io_uring_socket_op { SOCKET_URING_OP_SIOCOUTQ, SOCKET_URING_OP_GETSOCKOPT, SOCKET_URING_OP_SETSOCKOPT, + SOCKET_URING_OP_TX_TIMESTAMP, +}; + +/* + * SOCKET_URING_OP_TX_TIMESTAMP definitions + */ + +#define IORING_TIMESTAMP_HW_SHIFT 16 +/* The cqe->flags bit from which the timestamp type is stored */ +#define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1) +/* The cqe->flags flag signifying whether it's a hardware timestamp */ +#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT); + +struct io_timespec { + __u64 tv_sec; + __u64 tv_nsec; }; /* Zero copy receive refill queue entry */ diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c index e99170c7d41a..3866fe6ff541 100644 --- a/io_uring/cmd_net.c +++ b/io_uring/cmd_net.c @@ -1,5 +1,6 @@ #include #include +#include #include #include "uring_cmd.h" @@ -51,6 +52,85 @@ static inline int io_uring_cmd_setsockopt(struct socket *sock, optlen); } +static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk, + struct sk_buff *skb, unsigned issue_flags) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct io_uring_cqe cqe[2]; + struct io_timespec *iots; + struct timespec64 ts; + u32 tstype, tskey; + int ret; + + BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec)); + + ret = skb_get_tx_timestamp(skb, sk, &ts); + if (ret < 0) + return false; + + tskey = serr->ee.ee_data; + tstype = serr->ee.ee_info; + + cqe->user_data = 0; + cqe->res = tskey; + cqe->flags = IORING_CQE_F_MORE; + cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; + if (ret == SOF_TIMESTAMPING_TX_HARDWARE) + cqe->flags |= IORING_CQE_F_TSTAMP_HW; + + iots = (struct io_timespec *)&cqe[1]; + iots->tv_sec = ts.tv_sec; + iots->tv_nsec = ts.tv_nsec; + return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe); +} + +static int io_uring_cmd_timestamp(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct sock *sk = sock->sk; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *tmp; + struct sk_buff_head list; + int ret; + + if (!(issue_flags & IO_URING_F_CQE32)) + return -EINVAL; + ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR); + if (unlikely(ret)) + return ret; + + if (skb_queue_empty_lockless(q)) + return -EAGAIN; + __skb_queue_head_init(&list); + + scoped_guard(spinlock_irq, &q->lock) { + skb_queue_walk_safe(q, skb, tmp) { + /* don't support skbs with payload */ + if (!skb_has_tx_timestamp(skb, sk) || skb->len) + continue; + __skb_unlink(skb, q); + __skb_queue_tail(&list, skb); + } + } + + while (1) { + skb = skb_peek(&list); + if (!skb) + break; + if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags)) + break; + __skb_dequeue(&list); + consume_skb(skb); + } + + if (!unlikely(skb_queue_empty(&list))) { + scoped_guard(spinlock_irqsave, &q->lock) + skb_queue_splice(q, &list); + } + return -EAGAIN; +} + int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct 
socket *sock = cmd->file->private_data; @@ -76,6 +156,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) return io_uring_cmd_getsockopt(sock, cmd, issue_flags); case SOCKET_URING_OP_SETSOCKOPT: return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_TX_TIMESTAMP: + return io_uring_cmd_timestamp(sock, cmd, issue_flags); default: return -EOPNOTSUPP; } From 94b2030968be70b33fed9a5514a5967c7f20aebc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Jun 2025 11:36:54 -0600 Subject: [PATCH 11/42] io_uring: remove errant ';' from IORING_CQE_F_TSTAMP_HW definition An errant ';' slipped into that definition, which will cause some compilers to complain when it's used in an application: timestamp.c:257:45: error: empty expression statement has no effect; remove unnecessary ';' to silence this warning [-Werror,-Wextra-semi-stmt] 257 | hwts = cqe->flags & IORING_CQE_F_TSTAMP_HW; | ^ Fixes: 9e4ed359b8ef ("io_uring/netcmd: add tx timestamping cmd support") Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 85600ad0ac08..b6be063693c8 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -980,7 +980,7 @@ enum io_uring_socket_op { /* The cqe->flags bit from which the timestamp type is stored */ #define IORING_TIMESTAMP_TYPE_SHIFT (IORING_TIMESTAMP_HW_SHIFT + 1) /* The cqe->flags flag signifying whether it's a hardware timestamp */ -#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT); +#define IORING_CQE_F_TSTAMP_HW ((__u32)1 << IORING_TIMESTAMP_HW_SHIFT) struct io_timespec { __u64 tv_sec; From 3a0ae385f69e9b2d87c9b017c4ffb5567c015197 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:51 +0100 Subject: [PATCH 12/42] io_uring/mock: add basic infra for test mock files io_uring commands provide an ioctl style interface for files to implement file specific operations. io_uring provides many features and advanced api to commands, and it's getting hard to test as it requires specific files/devices. Add basic infrastucture for creating special mock files that will be implementing the cmd api and using various io_uring features we want to test. It'll also be useful to test some more obscure read/write/polling edge cases in the future. 
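A sketch of creating a mock file from a test program; the /dev/io_uring_mock path assumes the usual miscdevice node naming, <linux/io_uring/mock_file.h> is the installed uapi header added here, the liburing calls are illustrative, and CAP_SYS_ADMIN is required (later patches in this series extend struct io_uring_mock_create with file_size and rw_delay_ns, used in the fragments further below):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <liburing.h>
#include <linux/io_uring/mock_file.h>

int main(void)
{
	struct io_uring_mock_create mc = { 0 };
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int mgr;

	mgr = open("/dev/io_uring_mock", O_RDWR);
	if (mgr < 0 || io_uring_queue_init(4, &ring, 0) < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = mgr;
	sqe->cmd_op = IORING_MOCK_MGR_CMD_CREATE;
	sqe->addr = (unsigned long)&mc;	/* struct io_uring_mock_create, in/out */
	sqe->len = sizeof(mc);

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	if (cqe->res == 0)
		printf("created mock file, fd %u\n", mc.out_fd);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}
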
Suggested-by: chase xd Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/93f21b0af58c1367a2b22635d5a7d694ad0272fc.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + include/uapi/linux/io_uring/mock_file.h | 22 ++++ init/Kconfig | 11 ++ io_uring/Makefile | 1 + io_uring/mock_file.c | 148 ++++++++++++++++++++++++ 5 files changed, 183 insertions(+) create mode 100644 include/uapi/linux/io_uring/mock_file.h create mode 100644 io_uring/mock_file.c diff --git a/MAINTAINERS b/MAINTAINERS index c3f7fbd0d67a..24e11687f8b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12679,6 +12679,7 @@ F: include/linux/io_uring.h F: include/linux/io_uring_types.h F: include/trace/events/io_uring.h F: include/uapi/linux/io_uring.h +F: include/uapi/linux/io_uring/ F: io_uring/ IPMI SUBSYSTEM diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h new file mode 100644 index 000000000000..a44273fd526d --- /dev/null +++ b/include/uapi/linux/io_uring/mock_file.h @@ -0,0 +1,22 @@ +#ifndef LINUX_IO_URING_MOCK_FILE_H +#define LINUX_IO_URING_MOCK_FILE_H + +#include + +struct io_uring_mock_probe { + __u64 features; + __u64 __resv[9]; +}; + +struct io_uring_mock_create { + __u32 out_fd; + __u32 flags; + __u64 __resv[15]; +}; + +enum { + IORING_MOCK_MGR_CMD_PROBE, + IORING_MOCK_MGR_CMD_CREATE, +}; + +#endif diff --git a/init/Kconfig b/init/Kconfig index af4c2f085455..c40a7c65fb4c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1801,6 +1801,17 @@ config GCOV_PROFILE_URING the io_uring subsystem, hence this should only be enabled for specific test purposes. +config IO_URING_MOCK_FILE + tristate "Enable io_uring mock files (Experimental)" if EXPERT + default n + depends on IO_URING + help + Enable mock files for io_uring subststem testing. The ABI might + still change, so it's still experimental and should only be enabled + for specific test purposes. + + If unsure, say N. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y diff --git a/io_uring/Makefile b/io_uring/Makefile index d97c6b51d584..b3f1bd492804 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -21,3 +21,4 @@ obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o obj-$(CONFIG_NET) += net.o cmd_net.o obj-$(CONFIG_PROC_FS) += fdinfo.o +obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c new file mode 100644 index 000000000000..3681d0b8d8de --- /dev/null +++ b/io_uring/mock_file.c @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + return -ENOTSUPP; +} + +static const struct file_operations io_mock_fops = { + .owner = THIS_MODULE, + .uring_cmd = io_mock_cmd, +}; + +static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_create mc, __user *uarg; + struct file *file = NULL; + size_t uarg_size; + int fd, ret; + + /* + * It's a testing only driver that allows exercising edge cases + * that wouldn't be possible to hit otherwise. 
+ */ + add_taint(TAINT_TEST, LOCKDEP_STILL_OK); + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index) + return -EINVAL; + if (uarg_size != sizeof(mc)) + return -EINVAL; + + memset(&mc, 0, sizeof(mc)); + if (copy_from_user(&mc, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags) + return -EINVAL; + + fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (fd < 0) + return fd; + + file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, + NULL, O_RDWR | O_CLOEXEC, NULL); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + mc.out_fd = fd; + if (copy_to_user(uarg, &mc, uarg_size)) { + fput(file); + ret = -EFAULT; + goto fail; + } + + fd_install(fd, file); + return 0; +fail: + put_unused_fd(fd); + return ret; +} + +static int io_probe_mock(struct io_uring_cmd *cmd) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_probe mp, __user *uarg; + size_t uarg_size; + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index || + uarg_size != sizeof(mp)) + return -EINVAL; + + memset(&mp, 0, sizeof(mp)); + if (copy_from_user(&mp, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(&mp, sizeof(mp))) + return -EINVAL; + + mp.features = 0; + + if (copy_to_user(uarg, &mp, uarg_size)) + return -EFAULT; + return 0; +} + +static int iou_mock_mgr_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd->cmd_op) { + case IORING_MOCK_MGR_CMD_PROBE: + return io_probe_mock(cmd); + case IORING_MOCK_MGR_CMD_CREATE: + return io_create_mock_file(cmd, issue_flags); + } + return -EOPNOTSUPP; +} + +static const struct file_operations iou_mock_dev_fops = { + .owner = THIS_MODULE, + .uring_cmd = iou_mock_mgr_cmd, +}; + +static struct miscdevice iou_mock_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "io_uring_mock", + .fops = &iou_mock_dev_fops, +}; + +static int __init io_mock_init(void) +{ + int ret; + + ret = misc_register(&iou_mock_miscdev); + if (ret < 0) { + pr_err("Could not initialize io_uring mock device\n"); + return ret; + } + return 0; +} + +static void __exit io_mock_exit(void) +{ + misc_deregister(&iou_mock_miscdev); +} + +module_init(io_mock_init) +module_exit(io_mock_exit) + +MODULE_AUTHOR("Pavel Begunkov "); +MODULE_DESCRIPTION("io_uring mock file"); +MODULE_LICENSE("GPL"); From 4aac001f780388b252534396feaf49b250eae27f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:52 +0100 Subject: [PATCH 13/42] io_uring/mock: add cmd using vectored regbufs There is a command api allowing to import vectored registered buffers, add a new mock command that uses the feature and simply copies the specified registered buffer into user space or vice versa. 
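A hedged sketch of the intended userspace flow, assuming a mock fd created as above and a buffer already registered at index 0 with io_uring_register_buffers(); the SQE fields mirror what io_cmd_copy_regbuf() reads: addr points at iovec(s) inside the registered buffer, len is the iovec count, addr3 is a plain user buffer, file_index carries the copy flags, and IORING_URING_CMD_FIXED plus buf_index select the registered buffer:

#include <string.h>
#include <sys/uio.h>
#include <liburing.h>
#include <linux/io_uring/mock_file.h>

/* Copy the first 4KiB of registered buffer 0 into dst via the mock file;
 * regbuf must point into the memory registered at index 0. */
static int copy_from_regbuf(struct io_uring *ring, int mock_fd,
			    void *regbuf, void *dst)
{
	struct iovec vec = { .iov_base = regbuf, .iov_len = 4096 };
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = mock_fd;
	sqe->cmd_op = IORING_MOCK_CMD_COPY_REGBUF;
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
	sqe->buf_index = 0;			/* registered buffer to import */
	sqe->addr = (unsigned long)&vec;	/* iovec(s) within that buffer */
	sqe->len = 1;
	sqe->addr3 = (unsigned long)dst;	/* plain user memory */
	sqe->file_index = IORING_MOCK_COPY_FROM; /* regbuf -> dst */

	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	ret = cqe->res;				/* bytes copied or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}
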
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/229a113fd7de6b27dbef9567f7c0bf4475c9017d.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 14 +++++ io_uring/mock_file.c | 70 ++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index a44273fd526d..73aca477d5c8 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -3,6 +3,12 @@ #include +enum { + IORING_MOCK_FEAT_CMD_COPY, + + IORING_MOCK_FEAT_END, +}; + struct io_uring_mock_probe { __u64 features; __u64 __resv[9]; @@ -19,4 +25,12 @@ enum { IORING_MOCK_MGR_CMD_CREATE, }; +enum { + IORING_MOCK_CMD_COPY_REGBUF, +}; + +enum { + IORING_MOCK_COPY_FROM = 1, +}; + #endif diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 3681d0b8d8de..8285393f4a5b 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -9,8 +9,76 @@ #include #include +#define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM + +static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf) +{ + size_t ret, copied = 0; + size_t buflen = PAGE_SIZE; + void *tmp_buf; + + tmp_buf = kzalloc(buflen, GFP_KERNEL); + if (!tmp_buf) + return -ENOMEM; + + while (iov_iter_count(reg_iter)) { + size_t len = min(iov_iter_count(reg_iter), buflen); + + if (iov_iter_rw(reg_iter) == ITER_SOURCE) { + ret = copy_from_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + if (copy_to_user(ubuf, tmp_buf, ret)) + break; + } else { + if (copy_from_user(tmp_buf, ubuf, len)) + break; + ret = copy_to_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + } + ubuf += ret; + copied += ret; + } + + kfree(tmp_buf); + return copied; +} + +static int io_cmd_copy_regbuf(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + const struct iovec __user *iovec; + unsigned flags, iovec_len; + struct iov_iter iter; + void __user *ubuf; + int dir, ret; + + ubuf = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + iovec = u64_to_user_ptr(READ_ONCE(sqe->addr)); + iovec_len = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->file_index); + + if (unlikely(sqe->ioprio || sqe->__pad1)) + return -EINVAL; + if (flags & ~IO_VALID_COPY_CMD_FLAGS) + return -EINVAL; + + dir = (flags & IORING_MOCK_COPY_FROM) ? ITER_SOURCE : ITER_DEST; + ret = io_uring_cmd_import_fixed_vec(cmd, iovec, iovec_len, dir, &iter, + issue_flags); + if (ret) + return ret; + ret = io_copy_regbuf(&iter, ubuf); + return ret ? ret : -EFAULT; +} + static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { + switch (cmd->cmd_op) { + case IORING_MOCK_CMD_COPY_REGBUF: + return io_cmd_copy_regbuf(cmd, issue_flags); + } return -ENOTSUPP; } @@ -91,7 +159,7 @@ static int io_probe_mock(struct io_uring_cmd *cmd) if (!mem_is_zero(&mp, sizeof(mp))) return -EINVAL; - mp.features = 0; + mp.features = IORING_MOCK_FEAT_END; if (copy_to_user(uarg, &mp, uarg_size)) return -EFAULT; From d1aa0346571436203a24cc3fc0c80f14cabbd630 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:53 +0100 Subject: [PATCH 14/42] io_uring/mock: add sync read/write Add support for synchronous zero read/write for mock files. 
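Assuming a mock fd created with file_size set to 4096 via the create flow above, ordinary read/write behaves like a zero-filled file of that size, and accesses crossing the end fail with EINVAL; a small illustrative check:

#include <unistd.h>
#include <string.h>

static int check_mock_zero_read(int mock_fd)
{
	char buf[4096];

	memset(buf, 0xff, sizeof(buf));
	if (pread(mock_fd, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf))
		return -1;
	/* data is synthesised with iov_iter_zero(), so it reads back as zeroes */
	return (buf[0] | buf[4095]) ? -1 : 0;
}
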
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/571f3c9fe688e918256a06a722d3db6ced9ca3d5.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 4 +- io_uring/mock_file.c | 67 +++++++++++++++++++++++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index 73aca477d5c8..de27295bb365 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -5,6 +5,7 @@ enum { IORING_MOCK_FEAT_CMD_COPY, + IORING_MOCK_FEAT_RW_ZERO, IORING_MOCK_FEAT_END, }; @@ -17,7 +18,8 @@ struct io_uring_mock_probe { struct io_uring_mock_create { __u32 out_fd; __u32 flags; - __u64 __resv[15]; + __u64 file_size; + __u64 __resv[14]; }; enum { diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 8285393f4a5b..90160ccb50f0 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -9,6 +9,10 @@ #include #include +struct io_mock_file { + size_t size; +}; + #define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf) @@ -82,18 +86,59 @@ static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -ENOTSUPP; } +static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(to); + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + return iov_iter_zero(len, to); +} + +static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(from); + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + iov_iter_advance(from, len); + return len; +} + +static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) +{ + struct io_mock_file *mf = file->private_data; + + return fixed_size_llseek(file, offset, whence, mf->size); +} + +static int io_mock_release(struct inode *inode, struct file *file) +{ + struct io_mock_file *mf = file->private_data; + + kfree(mf); + return 0; +} + static const struct file_operations io_mock_fops = { .owner = THIS_MODULE, + .release = io_mock_release, .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = io_mock_llseek, }; static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) { const struct io_uring_sqe *sqe = cmd->sqe; struct io_uring_mock_create mc, __user *uarg; + struct io_mock_file *mf = NULL; struct file *file = NULL; size_t uarg_size; - int fd, ret; + int fd = -1, ret; /* * It's a testing only driver that allows exercising edge cases @@ -114,18 +159,28 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag return -EFAULT; if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags) return -EINVAL; + if (mc.file_size > SZ_1G) + return -EINVAL; + mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT); + if (!mf) + return -ENOMEM; - fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) - return fd; + goto fail; + mf->size = mc.file_size; file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, - NULL, O_RDWR | O_CLOEXEC, NULL); + mf, O_RDWR | O_CLOEXEC, NULL); if (IS_ERR(file)) { ret = PTR_ERR(file); goto fail; } + file->f_mode |= FMODE_READ | FMODE_CAN_READ | + FMODE_WRITE | FMODE_CAN_WRITE | + FMODE_LSEEK; + mc.out_fd 
= fd; if (copy_to_user(uarg, &mc, uarg_size)) { fput(file); @@ -136,7 +191,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag fd_install(fd, file); return 0; fail: - put_unused_fd(fd); + if (fd >= 0) + put_unused_fd(fd); + kfree(mf); return ret; } From 2f71d2386f4feed5bfb9ee7b3d2c0ad953db1fa5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:54 +0100 Subject: [PATCH 15/42] io_uring/mock: allow to choose FMODE_NOWAIT Add an option to choose whether the file supports FMODE_NOWAIT, that changes the execution path io_uring request takes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e532565b05a05b23589d237c24ee1a3d90c2fd9.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 5 +++++ io_uring/mock_file.c | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index de27295bb365..125949d2b5ce 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -6,6 +6,7 @@ enum { IORING_MOCK_FEAT_CMD_COPY, IORING_MOCK_FEAT_RW_ZERO, + IORING_MOCK_FEAT_RW_NOWAIT, IORING_MOCK_FEAT_END, }; @@ -15,6 +16,10 @@ struct io_uring_mock_probe { __u64 __resv[9]; }; +enum { + IORING_MOCK_CREATE_F_SUPPORT_NOWAIT = 1, +}; + struct io_uring_mock_create { __u32 out_fd; __u32 flags; diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 90160ccb50f0..0eb1d3bd6368 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -131,6 +131,8 @@ static const struct file_operations io_mock_fops = { .llseek = io_mock_llseek, }; +#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) + static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) { const struct io_uring_sqe *sqe = cmd->sqe; @@ -157,7 +159,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag memset(&mc, 0, sizeof(mc)); if (copy_from_user(&mc, uarg, uarg_size)) return -EFAULT; - if (!mem_is_zero(mc.__resv, sizeof(mc.__resv)) || mc.flags) + if (!mem_is_zero(mc.__resv, sizeof(mc.__resv))) + return -EINVAL; + if (mc.flags & ~IO_VALID_CREATE_FLAGS) return -EINVAL; if (mc.file_size > SZ_1G) return -EINVAL; @@ -180,6 +184,8 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag file->f_mode |= FMODE_READ | FMODE_CAN_READ | FMODE_WRITE | FMODE_CAN_WRITE | FMODE_LSEEK; + if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) + file->f_mode |= FMODE_NOWAIT; mc.out_fd = fd; if (copy_to_user(uarg, &mc, uarg_size)) { From 0c98a44329c10bf904434524425cb42043513bd6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:55 +0100 Subject: [PATCH 16/42] io_uring/mock: support for async read/write Let the user to specify a delay to read/write request. io_uring will start a timer, return -EIOCBQUEUED and complete the request asynchronously after the delay pass. 
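In terms of the create interface above, the delay is just another field of struct io_uring_mock_create; an illustrative fragment (the value is arbitrary, the driver caps rw_delay_ns at one second):

/* Reads and writes on the resulting fd return -EIOCBQUEUED internally and
 * complete about 1ms later from the hrtimer callback, which lets io_uring's
 * async completion paths be exercised deterministically. */
struct io_uring_mock_create mc = {
	.file_size   = 4096,
	.rw_delay_ns = 1000 * 1000,	/* 1ms */
};
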
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/38f9d2e143fda8522c90a724b74630e68f9bbd16.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 4 +- io_uring/mock_file.c | 59 +++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index 125949d2b5ce..c8fa77e39c68 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -7,6 +7,7 @@ enum { IORING_MOCK_FEAT_CMD_COPY, IORING_MOCK_FEAT_RW_ZERO, IORING_MOCK_FEAT_RW_NOWAIT, + IORING_MOCK_FEAT_RW_ASYNC, IORING_MOCK_FEAT_END, }; @@ -24,7 +25,8 @@ struct io_uring_mock_create { __u32 out_fd; __u32 flags; __u64 file_size; - __u64 __resv[14]; + __u64 rw_delay_ns; + __u64 __resv[13]; }; enum { diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index 0eb1d3bd6368..ed6a5505763e 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -4,13 +4,22 @@ #include #include #include +#include +#include #include #include #include +struct io_mock_iocb { + struct kiocb *iocb; + struct hrtimer timer; + int res; +}; + struct io_mock_file { - size_t size; + size_t size; + u64 rw_delay_ns; }; #define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM @@ -86,14 +95,48 @@ static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -ENOTSUPP; } +static enum hrtimer_restart io_mock_rw_timer_expired(struct hrtimer *timer) +{ + struct io_mock_iocb *mio = container_of(timer, struct io_mock_iocb, timer); + struct kiocb *iocb = mio->iocb; + + WRITE_ONCE(iocb->private, NULL); + iocb->ki_complete(iocb, mio->res); + kfree(mio); + return HRTIMER_NORESTART; +} + +static ssize_t io_mock_delay_rw(struct kiocb *iocb, size_t len) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + struct io_mock_iocb *mio; + + mio = kzalloc(sizeof(*mio), GFP_KERNEL); + if (!mio) + return -ENOMEM; + + mio->iocb = iocb; + mio->res = len; + hrtimer_setup(&mio->timer, io_mock_rw_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_start(&mio->timer, ns_to_ktime(mf->rw_delay_ns), + HRTIMER_MODE_REL); + return -EIOCBQUEUED; +} + static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct io_mock_file *mf = iocb->ki_filp->private_data; size_t len = iov_iter_count(to); + size_t nr_zeroed; if (iocb->ki_pos + len > mf->size) return -EINVAL; - return iov_iter_zero(len, to); + nr_zeroed = iov_iter_zero(len, to); + if (!mf->rw_delay_ns || nr_zeroed != len) + return nr_zeroed; + + return io_mock_delay_rw(iocb, len); } static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -103,8 +146,12 @@ static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_pos + len > mf->size) return -EINVAL; - iov_iter_advance(from, len); - return len; + if (!mf->rw_delay_ns) { + iov_iter_advance(from, len); + return len; + } + + return io_mock_delay_rw(iocb, len); } static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) @@ -165,6 +212,9 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag return -EINVAL; if (mc.file_size > SZ_1G) return -EINVAL; + if (mc.rw_delay_ns > NSEC_PER_SEC) + return -EINVAL; + mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT); if (!mf) return -ENOMEM; @@ -174,6 +224,7 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag goto fail; mf->size = mc.file_size; + mf->rw_delay_ns = 
mc.rw_delay_ns; file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, mf, O_RDWR | O_CLOEXEC, NULL); if (IS_ERR(file)) { From e448d578264a9512d38deb8c418954d5f3e20712 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Jun 2025 19:16:56 +0100 Subject: [PATCH 17/42] io_uring/mock: add trivial poll handler Add a flag that enables polling on the mock file. For now it's trivially says that there is always data available, it'll be extended in the future. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f16de043ec4876d65fae294fc99ade57415fba0c.1750599274.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/mock_file.h | 2 ++ io_uring/mock_file.c | 37 +++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/io_uring/mock_file.h b/include/uapi/linux/io_uring/mock_file.h index c8fa77e39c68..debeee8e4527 100644 --- a/include/uapi/linux/io_uring/mock_file.h +++ b/include/uapi/linux/io_uring/mock_file.h @@ -8,6 +8,7 @@ enum { IORING_MOCK_FEAT_RW_ZERO, IORING_MOCK_FEAT_RW_NOWAIT, IORING_MOCK_FEAT_RW_ASYNC, + IORING_MOCK_FEAT_POLL, IORING_MOCK_FEAT_END, }; @@ -19,6 +20,7 @@ struct io_uring_mock_probe { enum { IORING_MOCK_CREATE_F_SUPPORT_NOWAIT = 1, + IORING_MOCK_CREATE_F_POLL = 2, }; struct io_uring_mock_create { diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c index ed6a5505763e..45d3735b2708 100644 --- a/io_uring/mock_file.c +++ b/io_uring/mock_file.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,8 @@ struct io_mock_iocb { struct io_mock_file { size_t size; u64 rw_delay_ns; + bool pollable; + struct wait_queue_head poll_wq; }; #define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM @@ -161,6 +164,18 @@ static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) return fixed_size_llseek(file, offset, whence, mf->size); } +static __poll_t io_mock_poll(struct file *file, struct poll_table_struct *pt) +{ + struct io_mock_file *mf = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &mf->poll_wq, pt); + + mask |= EPOLLOUT | EPOLLWRNORM; + mask |= EPOLLIN | EPOLLRDNORM; + return mask; +} + static int io_mock_release(struct inode *inode, struct file *file) { struct io_mock_file *mf = file->private_data; @@ -178,10 +193,22 @@ static const struct file_operations io_mock_fops = { .llseek = io_mock_llseek, }; -#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) +static const struct file_operations io_mock_poll_fops = { + .owner = THIS_MODULE, + .release = io_mock_release, + .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = io_mock_llseek, + .poll = io_mock_poll, +}; + +#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT | \ + IORING_MOCK_CREATE_F_POLL) static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) { + const struct file_operations *fops = &io_mock_fops; const struct io_uring_sqe *sqe = cmd->sqe; struct io_uring_mock_create mc, __user *uarg; struct io_mock_file *mf = NULL; @@ -223,9 +250,15 @@ static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flag if (fd < 0) goto fail; + init_waitqueue_head(&mf->poll_wq); mf->size = mc.file_size; mf->rw_delay_ns = mc.rw_delay_ns; - file = anon_inode_create_getfile("[io_uring_mock]", &io_mock_fops, + if (mc.flags & IORING_MOCK_CREATE_F_POLL) { + fops = &io_mock_poll_fops; + mf->pollable = true; + } + + file = 
anon_inode_create_getfile("[io_uring_mock]", fops, mf, O_RDWR | O_CLOEXEC, NULL); if (IS_ERR(file)) { ret = PTR_ERR(file); From daa01d954b13a178c216b6a91f8451a7b83b3bf6 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 19 Jun 2025 08:34:34 -0600 Subject: [PATCH 18/42] io_uring/rsrc: skip atomic refcount for uncloned buffers io_buffer_unmap() performs an atomic decrement of the io_mapped_ubuf's reference count in case it has been cloned into another io_ring_ctx's registered buffer table. This is an expensive operation and unnecessary in the common case that the io_mapped_ubuf is only registered once. Load the reference count first and check whether it's 1. In that case, skip the atomic decrement and immediately free the io_mapped_ubuf. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250619143435.3474028-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d724602697e7..fc51ca7de733 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -135,8 +135,10 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { - if (!refcount_dec_and_test(&imu->refs)) - return; + if (unlikely(refcount_read(&imu->refs) > 1)) { + if (!refcount_dec_and_test(&imu->refs)) + return; + } if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); From cf73d9970ea4f8cace5d8f02d2565a2723003112 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 21:31:54 +0100 Subject: [PATCH 19/42] io_uring: don't use int for ABI __kernel_rwf_t is defined as int, the actual size of which is implementation defined. It won't go well if some compiler / archs ever defines it as i64, so replace it with __u32, hoping that there is no one using i16 for it. Cc: stable@vger.kernel.org Fixes: 2b188cc1bb857 ("Add io_uring IO interface") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/47c666c4ee1df2018863af3a2028af18feef11ed.1751412511.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b6be063693c8..b8a0e70ee2fd 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -50,7 +50,7 @@ struct io_uring_sqe { }; __u32 len; /* buffer size or number of iovecs */ union { - __kernel_rwf_t rw_flags; + __u32 rw_flags; __u32 fsync_flags; __u16 poll_events; /* compatibility */ __u32 poll32_events; /* word-reversed for BE */ From 825aea662b492571877b32aeeae13689fd9fbee4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jul 2025 16:46:30 -0600 Subject: [PATCH 20/42] io_uring/rw: cast rw->flags assignment to rwf_t kernel test robot reports that a recent change of the sqe->rw_flags field throws a sparse warning on 32-bit archs: >> io_uring/rw.c:291:19: sparse: sparse: incorrect type in assignment (different base types) @@ expected restricted __kernel_rwf_t [usertype] flags @@ got unsigned int @@ io_uring/rw.c:291:19: sparse: expected restricted __kernel_rwf_t [usertype] flags io_uring/rw.c:291:19: sparse: got unsigned int Force cast it to rwf_t to silence that new sparse warning. 
Fixes: cf73d9970ea4 ("io_uring: don't use int for ABI") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507032211.PwSNPNSP-lkp@intel.com/ Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 710d8cd53ebb..52a5b950b2e5 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -288,7 +288,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); - rw->flags = READ_ONCE(sqe->rw_flags); + rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags); attr_type_mask = READ_ONCE(sqe->attr_type_mask); if (attr_type_mask) { From e9a9ddb15b092eb4dc0d34a3e043e73f2510a6b0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 15:29:04 +0100 Subject: [PATCH 21/42] io_uring/zcrx: always pass page to io_zcrx_copy_chunk io_zcrx_copy_chunk() currently takes either a page or virtual address. Unify the parameters, make it take pages and resolve the linear part into a page the same way general networking code does that. Signed-off-by: Pavel Begunkov Reviewed-by: David Wei Link: https://lore.kernel.org/r/b8f9f4bac027f5f44a9ccf85350912d1db41ceb8.1751466461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 085eeed8cd50..e94a4647d409 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -945,8 +945,8 @@ static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) } static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, - void *src_base, struct page *src_page, - unsigned int src_offset, size_t len) + struct page *src_page, unsigned int src_offset, + size_t len) { struct io_zcrx_area *area = ifq->area; size_t copied = 0; @@ -960,7 +960,7 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, const int dst_off = 0; struct net_iov *niov; struct page *dst_page; - void *dst_addr; + void *dst_addr, *src_addr; niov = io_zcrx_alloc_fallback(area); if (!niov) { @@ -970,13 +970,11 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, dst_page = io_zcrx_iov_page(niov); dst_addr = kmap_local_page(dst_page); - if (src_page) - src_base = kmap_local_page(src_page); + src_addr = kmap_local_page(src_page); - memcpy(dst_addr, src_base + src_offset, copy_size); + memcpy(dst_addr, src_addr + src_offset, copy_size); - if (src_page) - kunmap_local(src_base); + kunmap_local(src_addr); kunmap_local(dst_addr); if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) { @@ -1005,7 +1003,7 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, skb_frag_foreach_page(frag, off, len, page, p_off, p_len, t) { - ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len); + ret = io_zcrx_copy_chunk(req, ifq, page, p_off, p_len); if (ret < 0) return copied ? 
copied : ret; copied += ret; @@ -1067,8 +1065,9 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, size_t to_copy; to_copy = min_t(size_t, skb_headlen(skb) - offset, len); - copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL, - offset, to_copy); + copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data), + offset_in_page(skb->data) + offset, + to_copy); if (copied < 0) { ret = copied; goto out; From 06897ddfc523cea415bd139148c5276b8b61b016 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 15:29:05 +0100 Subject: [PATCH 22/42] io_uring/zcrx: return error from io_zcrx_map_area_* io_zcrx_map_area_*() helpers return the number of processed niovs, which we use to unroll some of the mappings for user memory areas. It's unhandy, and dmabuf doesn't care about it. Return an error code instead and move failure partial unmapping into io_zcrx_map_area_umem(). Signed-off-by: Pavel Begunkov Reviewed-by: David Wei Link: https://lore.kernel.org/r/42668e82be3a84b07ee8fc76d1d6d5ac0f137fe5.1751466461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e94a4647d409..6fb7c9bedfcb 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -141,13 +141,13 @@ static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area struct net_iov *niov = &area->nia.niovs[niov_idx]; if (net_mp_niov_set_dma_addr(niov, dma)) - return 0; + return -EFAULT; sg_len -= PAGE_SIZE; dma += PAGE_SIZE; niov_idx++; } } - return niov_idx; + return 0; } static int io_import_umem(struct io_zcrx_ifq *ifq, @@ -256,29 +256,30 @@ static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *a break; } } - return i; + + if (i != area->nia.num_niovs) { + __io_zcrx_unmap_area(ifq, area, i); + return -EINVAL; + } + return 0; } static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { - unsigned nr; + int ret; guard(mutex)(&ifq->dma_lock); if (area->is_mapped) return 0; if (area->mem.is_dmabuf) - nr = io_zcrx_map_area_dmabuf(ifq, area); + ret = io_zcrx_map_area_dmabuf(ifq, area); else - nr = io_zcrx_map_area_umem(ifq, area); + ret = io_zcrx_map_area_umem(ifq, area); - if (nr != area->nia.num_niovs) { - __io_zcrx_unmap_area(ifq, area, nr); - return -EINVAL; - } - - area->is_mapped = true; - return 0; + if (ret == 0) + area->is_mapped = true; + return ret; } static void io_zcrx_sync_for_device(const struct page_pool *pool, From 54e89a93ef05d1a7c9996ff12e42eeecb4f66697 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 15:29:06 +0100 Subject: [PATCH 23/42] io_uring/zcrx: introduce io_populate_area_dma Add a helper that initialises page-pool dma addresses from a sg table. It'll be reused in following patches. 
Signed-off-by: Pavel Begunkov Reviewed-by: David Wei Link: https://lore.kernel.org/r/a8972a77be9b5675abc585d6e2e6e30f9c7dbd85.1751466461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 56 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 6fb7c9bedfcb..172eb67ddc62 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -47,6 +47,35 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) return area->mem.pages[net_iov_idx(niov)]; } +static int io_populate_area_dma(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, + struct sg_table *sgt, unsigned long off) +{ + struct scatterlist *sg; + unsigned i, niov_idx = 0; + + for_each_sgtable_dma_sg(sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + unsigned long sg_off = min(sg_len, off); + + off -= sg_off; + sg_len -= sg_off; + dma += sg_off; + + while (sg_len && niov_idx < area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return -EFAULT; + sg_len -= PAGE_SIZE; + dma += PAGE_SIZE; + niov_idx++; + } + } + return 0; +} + static void io_release_dmabuf(struct io_zcrx_mem *mem) { if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) @@ -121,33 +150,10 @@ err: static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { - unsigned long off = area->mem.dmabuf_offset; - struct scatterlist *sg; - unsigned i, niov_idx = 0; - if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) return -EINVAL; - - for_each_sgtable_dma_sg(area->mem.sgt, sg, i) { - dma_addr_t dma = sg_dma_address(sg); - unsigned long sg_len = sg_dma_len(sg); - unsigned long sg_off = min(sg_len, off); - - off -= sg_off; - sg_len -= sg_off; - dma += sg_off; - - while (sg_len && niov_idx < area->nia.num_niovs) { - struct net_iov *niov = &area->nia.niovs[niov_idx]; - - if (net_mp_niov_set_dma_addr(niov, dma)) - return -EFAULT; - sg_len -= PAGE_SIZE; - dma += PAGE_SIZE; - niov_idx++; - } - } - return 0; + return io_populate_area_dma(ifq, area, area->mem.sgt, + area->mem.dmabuf_offset); } static int io_import_umem(struct io_zcrx_ifq *ifq, From b84621d96ee0221e0bfbf9f477bbec7a5077c464 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 15:29:07 +0100 Subject: [PATCH 24/42] io_uring/zcrx: allocate sgtable for umem areas Currently, dma addresses for umem areas are stored directly in niovs. It's memory efficient but inconvenient. I need a better format 1) to share code with dmabuf areas, and 2) for disentangling page, folio and niov sizes. dmabuf already provides sg_table, create one for user memory as well. 
Signed-off-by: Pavel Begunkov Reviewed-by: David Wei Link: https://lore.kernel.org/r/f3c15081827c1bf5427d3a2e693bc526476b87ee.1751466461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 80 +++++++++++++++++-------------------------------- io_uring/zcrx.h | 1 + 2 files changed, 29 insertions(+), 52 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 172eb67ddc62..3f3c8cbde61e 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -161,7 +161,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_area_reg *area_reg) { struct page **pages; - int nr_pages; + int nr_pages, ret; if (area_reg->dmabuf_fd) return -EINVAL; @@ -172,6 +172,12 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, if (IS_ERR(pages)) return PTR_ERR(pages); + ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, + 0, nr_pages << PAGE_SHIFT, + GFP_KERNEL_ACCOUNT); + if (ret) + return ret; + mem->pages = pages; mem->nr_folios = nr_pages; mem->size = area_reg->len; @@ -186,6 +192,7 @@ static void io_release_area_mem(struct io_zcrx_mem *mem) } if (mem->pages) { unpin_user_pages(mem->pages, mem->nr_folios); + sg_free_table(&mem->page_sg_table); kvfree(mem->pages); } } @@ -207,67 +214,36 @@ static int io_import_area(struct io_zcrx_ifq *ifq, return io_import_umem(ifq, mem, area_reg); } -static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, - struct io_zcrx_area *area, int nr_mapped) +static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) { int i; - for (i = 0; i < nr_mapped; i++) { - netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]); - dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem); - - dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); - } -} - -static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, - struct io_zcrx_area *area, int nr_mapped) -{ - int i; - - if (area->mem.is_dmabuf) - io_release_dmabuf(&area->mem); - else - io_zcrx_unmap_umem(ifq, area, nr_mapped); + guard(mutex)(&ifq->dma_lock); + if (!area->is_mapped) + return; + area->is_mapped = false; for (i = 0; i < area->nia.num_niovs; i++) net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); + + if (area->mem.is_dmabuf) { + io_release_dmabuf(&area->mem); + } else { + dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + } } -static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { - guard(mutex)(&ifq->dma_lock); + int ret; - if (area->is_mapped) - __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); - area->is_mapped = false; -} - -static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) -{ - int i; - - for (i = 0; i < area->nia.num_niovs; i++) { - struct net_iov *niov = &area->nia.niovs[i]; - dma_addr_t dma; - - dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0, - PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); - if (dma_mapping_error(ifq->dev, dma)) - break; - if (net_mp_niov_set_dma_addr(niov, dma)) { - dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); - break; - } - } - - if (i != area->nia.num_niovs) { - __io_zcrx_unmap_area(ifq, area, i); - return -EINVAL; - } - return 0; + ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (ret < 0) + return ret; + return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0); } static int io_zcrx_map_area(struct 
io_zcrx_ifq *ifq, struct io_zcrx_area *area) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 2f5e26389f22..89015b923911 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -14,6 +14,7 @@ struct io_zcrx_mem { struct page **pages; unsigned long nr_folios; + struct sg_table page_sg_table; struct dma_buf_attachment *attach; struct dma_buf *dmabuf; From 1b4dc1ff0a8887c2fbb83a48e87284375ab4b02a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 15:29:08 +0100 Subject: [PATCH 25/42] io_uring/zcrx: assert area type in io_zcrx_iov_page Add a simple debug assertion to io_zcrx_iov_page() making sure it's not trying to return pages for a dmabuf area. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c3c30a926a18436a399a1768f3cc86c76cd17fa7.1751466461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 3f3c8cbde61e..208d1943a9fd 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -44,6 +44,8 @@ static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + lockdep_assert(!area->mem.is_dmabuf); + return area->mem.pages[net_iov_idx(niov)]; } From e67645bb7f3f48e0dd794ca813ede75f61e1b31b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Jul 2025 15:29:09 +0100 Subject: [PATCH 26/42] io_uring/zcrx: prepare fallback for larger pages io_zcrx_copy_chunk() processes one page at a time, which won't be sufficient when the net_iov size grows. Introduce a structure keeping the target niov page and other parameters; it's more convenient and can be reused later. And add a helper function that can efficiently copy buffers of arbitrary length. For 64-bit archs the loop inside should be compiled out. 
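As a standalone illustration of the per-step bound such a copy loop needs when either side may only be kmap'ed one page at a time (not the kernel helper itself; DEMO_PAGE_SIZE and demo_chunk_len are made-up names), each step can cover at most the distance to the nearer page boundary of source and destination:

#define DEMO_PAGE_SIZE 4096UL

/* largest safe length for one step of a page-wise copy */
static size_t demo_chunk_len(size_t src_off, size_t dst_off, size_t len)
{
	size_t n = DEMO_PAGE_SIZE - (src_off % DEMO_PAGE_SIZE);
	size_t d = DEMO_PAGE_SIZE - (dst_off % DEMO_PAGE_SIZE);

	if (d < n)
		n = d;
	return len < n ? len : n;
}

When neither folio needs partial kmap, no such bound applies and the whole request can be copied in a single step.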
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e84bc705a4e1edeb9aefff470d96558d8232388f.1751466461.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 83 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 208d1943a9fd..7d7396ce876c 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -929,6 +929,51 @@ static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) return niov; } +struct io_copy_cache { + struct page *page; + unsigned long offset; + size_t size; +}; + +static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, + unsigned int src_offset, size_t len) +{ + size_t copied = 0; + + len = min(len, cc->size); + + while (len) { + void *src_addr, *dst_addr; + struct page *dst_page = cc->page; + unsigned dst_offset = cc->offset; + size_t n = len; + + if (folio_test_partial_kmap(page_folio(dst_page)) || + folio_test_partial_kmap(page_folio(src_page))) { + dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE); + dst_offset = offset_in_page(dst_offset); + src_page = nth_page(src_page, src_offset / PAGE_SIZE); + src_offset = offset_in_page(src_offset); + n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); + n = min(n, len); + } + + dst_addr = kmap_local_page(dst_page) + dst_offset; + src_addr = kmap_local_page(src_page) + src_offset; + + memcpy(dst_addr, src_addr, n); + + kunmap_local(src_addr); + kunmap_local(dst_addr); + + cc->size -= n; + cc->offset += n; + len -= n; + copied += n; + } + return copied; +} + static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct page *src_page, unsigned int src_offset, size_t len) @@ -941,11 +986,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, return -EFAULT; while (len) { - size_t copy_size = min_t(size_t, PAGE_SIZE, len); - const int dst_off = 0; + struct io_copy_cache cc; struct net_iov *niov; - struct page *dst_page; - void *dst_addr, *src_addr; + size_t n; niov = io_zcrx_alloc_fallback(area); if (!niov) { @@ -953,25 +996,22 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, break; } - dst_page = io_zcrx_iov_page(niov); - dst_addr = kmap_local_page(dst_page); - src_addr = kmap_local_page(src_page); + cc.page = io_zcrx_iov_page(niov); + cc.offset = 0; + cc.size = PAGE_SIZE; - memcpy(dst_addr, src_addr + src_offset, copy_size); + n = io_copy_page(&cc, src_page, src_offset, len); - kunmap_local(src_addr); - kunmap_local(dst_addr); - - if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) { + if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) { io_zcrx_return_niov(niov); ret = -ENOSPC; break; } io_zcrx_get_niov_uref(niov); - src_offset += copy_size; - len -= copy_size; - copied += copy_size; + src_offset += n; + len -= n; + copied += n; } return copied ? copied : ret; @@ -981,19 +1021,8 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, const skb_frag_t *frag, int off, int len) { struct page *page = skb_frag_page(frag); - u32 p_off, p_len, t, copied = 0; - int ret = 0; - off += skb_frag_off(frag); - - skb_frag_foreach_page(frag, off, len, - page, p_off, p_len, t) { - ret = io_zcrx_copy_chunk(req, ifq, page, p_off, p_len); - if (ret < 0) - return copied ? 
copied : ret; - copied += ret; - } - return copied; + return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len); } static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, From e227c8cdb47b586ebf20b6b4caca0a30bb7e6b68 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jul 2025 19:36:57 -0600 Subject: [PATCH 27/42] io_uring/net: use passed in 'len' in io_recv_buf_select() len is a pointer to the desired len, use that rather than grab it from sr->len again. No functional changes as of this patch, but it does prepare io_recv_buf_select() for getting passed in a value that differs from sr->len. Link: https://lore.kernel.org/io-uring/20250709203420.1321689-3-axboe@kernel.dk Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 43a43522f406..b448b165ad96 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1076,7 +1076,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } if (kmsg->msg.msg_inq > 1) - arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + arg.max_len = min_not_zero(*len, kmsg->msg.msg_inq); ret = io_buffers_peek(req, &arg); if (unlikely(ret < 0)) From 3919b695932dd1990b5c7fd44fc52361f8e2ac5f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jul 2025 16:54:12 -0600 Subject: [PATCH 28/42] io_uring/net: move io_sr_msg->retry_flags to io_sr_msg->flags There's plenty of space left, as sr->flags is a 16-bit type. The UAPI bits are the lower 8 bits, as that's all that sqe->ioprio can carry in the SQE anyway. Use a few of the upper 8 bits for internal uses, rather than have two separate flags entries. Link: https://lore.kernel.org/io-uring/20250709203420.1321689-2-axboe@kernel.dk Signed-off-by: Jens Axboe --- io_uring/net.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b448b165ad96..08309b5ed45e 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -75,15 +75,21 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - unsigned short retry_flags; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; +/* + * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold + * anyway. Use the upper 8 bits for internal uses. 
+ */ enum sr_retry_flags { - IO_SR_MSG_RETRY = 1, - IO_SR_MSG_PARTIAL_MAP = 2, + IORING_RECV_RETRY = (1U << 15), + IORING_RECV_PARTIAL_MAP = (1U << 14), + + IORING_RECV_RETRY_CLEAR = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP, + IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP, }; /* @@ -192,7 +198,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; - sr->retry_flags = 0; + sr->flags &= ~IORING_RECV_RETRY_CLEAR; sr->len = 0; /* get from the provided buffer */ } @@ -402,7 +408,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry_flags = 0; sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -756,7 +761,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry_flags = 0; if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -828,7 +832,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), issue_flags); - if (sr->retry_flags & IO_SR_MSG_RETRY) + if (sr->flags & IORING_RECV_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) @@ -837,12 +841,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * If more is available AND it was a full transfer, retry and * append to this one */ - if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 && + if (!(sr->flags & IORING_RECV_NO_RETRY) && + kmsg->msg.msg_inq > 1 && this_ret > 0 && !iov_iter_count(&kmsg->msg.msg_iter)) { req->cqe.flags = cflags & ~CQE_F_MASK; sr->len = kmsg->msg.msg_inq; sr->done_io += this_ret; - sr->retry_flags |= IO_SR_MSG_RETRY; + sr->flags |= IORING_RECV_RETRY; return false; } } else { @@ -1088,7 +1093,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg req->flags |= REQ_F_NEED_CLEANUP; } if (arg.partial_map) - sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP; + sr->flags |= IORING_RECV_PARTIAL_MAP; /* special case 1 vec, can be a fast path */ if (ret == 1) { @@ -1283,7 +1288,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; zc->done_io = 0; - zc->retry_flags = 0; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; From 6a8afb9fff6478e7944794f089181e93df1c728a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jul 2025 19:38:45 -0600 Subject: [PATCH 29/42] io_uring/net: allow multishot receive per-invocation cap If an application is handling multiple receive streams using recv multishot, then the amount of retries and buffer peeking for multishot and bundles can process too much per socket before moving on. This isn't directly controllable by the application. By default, io_uring will retry a recv MULTISHOT_MAX_RETRY (32) times, if the socket keeps having data to receive. And if using bundles, then each bundle peek will potentially map up to PEEK_MAX_IMPORT (256) iovecs of data. Once these limits are hit, then a requeue operation will be done, where the request will get retried after other pending requests have had a time to get executed. Add support for capping the per-invocation receive length, before a requeue condition is considered for each receive. 
This is done by setting sqe->mshot_len to the byte value. For example, if this is set to 1024, then each receive will be requeued by 1024 bytes received. Link: https://lore.kernel.org/io-uring/20250709203420.1321689-4-axboe@kernel.dk Signed-off-by: Jens Axboe --- io_uring/net.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 08309b5ed45e..40f4ac0ab151 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -75,6 +75,7 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; + unsigned mshot_len; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -87,9 +88,11 @@ struct io_sr_msg { enum sr_retry_flags { IORING_RECV_RETRY = (1U << 15), IORING_RECV_PARTIAL_MAP = (1U << 14), + IORING_RECV_MSHOT_CAP = (1U << 13), IORING_RECV_RETRY_CLEAR = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP, - IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP, + IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP | + IORING_RECV_MSHOT_CAP, }; /* @@ -199,7 +202,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; sr->flags &= ~IORING_RECV_RETRY_CLEAR; - sr->len = 0; /* get from the provided buffer */ + sr->len = sr->mshot_len; } static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -787,13 +790,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->buf_group = req->buf_index; req->buf_list = NULL; } + sr->mshot_len = 0; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) return -EINVAL; - if (req->opcode == IORING_OP_RECV && sr->len) - return -EINVAL; + if (req->opcode == IORING_OP_RECV) + sr->mshot_len = sr->len; req->flags |= REQ_F_APOLL_MULTISHOT; } if (sr->flags & IORING_RECVSEND_BUNDLE) { @@ -834,6 +838,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, issue_flags); if (sr->flags & IORING_RECV_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); + if (sr->mshot_len && *ret >= sr->mshot_len) + sr->flags |= IORING_RECV_MSHOT_CAP; /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) goto finish; @@ -864,10 +870,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { - if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) + if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY && + !(sr->flags & IORING_RECV_MSHOT_CAP)) { return false; + } /* mshot retries exceeded, force a requeue */ sr->nr_multishot_loops = 0; + sr->flags &= ~IORING_RECV_MSHOT_CAP; if (issue_flags & IO_URING_F_MULTISHOT) *ret = IOU_REQUEUE; } @@ -1080,7 +1089,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg arg.mode |= KBUF_MODE_FREE; } - if (kmsg->msg.msg_inq > 1) + if (*len) + arg.max_len = *len; + else if (kmsg->msg.msg_inq > 1) arg.max_len = min_not_zero(*len, kmsg->msg.msg_inq); ret = io_buffers_peek(req, &arg); From 6e4098382b667e6ef485fbf09cd7ddd2e54fe6aa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Jul 2025 15:41:02 -0600 Subject: [PATCH 30/42] io_uring/poll: cleanup apoll freeing No point having REQ_F_POLLED in both IO_REQ_CLEAN_FLAGS and in IO_REQ_CLEAN_SLOW_FLAGS, and having both io_free_batch_list() 
and then io_clean_op() check for it and clean it. Move REQ_F_POLLED to IO_REQ_CLEAN_SLOW_FLAGS and drop it from IO_REQ_CLEAN_FLAGS, and have only io_free_batch_list() do the check and freeing. Link: https://lore.kernel.org/io-uring/20250712000344.1579663-2-axboe@kernel.dk Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 886368cd2c40..06b9a0f25eee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -114,11 +114,11 @@ #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) + REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ - REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) + REQ_F_REISSUE | REQ_F_POLLED | \ + IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -392,11 +392,6 @@ static void io_clean_op(struct io_kiocb *req) if (def->cleanup) def->cleanup(req); } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } if (req->flags & REQ_F_INFLIGHT) atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) From b1915b18e1d00eb4e8babcdc2ca3a64b43e20e9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Jul 2025 16:36:08 -0600 Subject: [PATCH 31/42] io_uring/net: cast min_not_zero() type kernel test robot reports that xtensa complains about different signedness for a min_not_zero() comparison. Cast the int part to size_t to avoid this issue. Fixes: e227c8cdb47b ("io_uring/net: use passed in 'len' in io_recv_buf_select()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507150504.zO5FsCPm-lkp@intel.com/ Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 40f4ac0ab151..639f111408a1 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1092,7 +1092,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg if (*len) arg.max_len = *len; else if (kmsg->msg.msg_inq > 1) - arg.max_len = min_not_zero(*len, kmsg->msg.msg_inq); + arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq); ret = io_buffers_peek(req, &arg); if (unlikely(ret < 0)) From 8723c146ad4ca17d340213f3676ce1829668b79b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Jul 2025 12:20:06 -0600 Subject: [PATCH 32/42] io_uring: deduplicate wakeup handling Both io_poll_wq_wake() and io_cqring_wake() contain the exact same code, and most of the comment in the latter applies equally to both. Move the test and wakeup handling into a basic helper that they can both use, and move part of the comment that applies generically to this new helper. Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dc17162e7af1..abc6de227f74 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -294,11 +294,22 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void __io_wq_wake(struct wait_queue_head *wq) +{ + /* + * + * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. 
The latter + * set in the mask so that if we recurse back into our own poll + * waitqueue handlers, we know we have a dependency between eventfd or + * epoll and should terminate multishot poll at that point. + */ + if (wq_has_sleeper(wq)) + __wake_up(wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); +} + static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) { - if (wq_has_sleeper(&ctx->poll_wq)) - __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, - poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + __io_wq_wake(&ctx->poll_wq); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) @@ -307,15 +318,9 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx) * Trigger waitqueue handler on all waiters on our waitqueue. This * won't necessarily wake up all the tasks, io_should_wake() will make * that decision. - * - * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter - * set in the mask so that if we recurse back into our own poll - * waitqueue handlers, we know we have a dependency between eventfd or - * epoll and should terminate multishot poll at that point. */ - if (wq_has_sleeper(&ctx->cq_wait)) - __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, - poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + + __io_wq_wake(&ctx->cq_wait); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) From 0ebc9a7ecf6acecf8bdf3a3cb02b6073df4a2288 Mon Sep 17 00:00:00 2001 From: Norman Maurer Date: Tue, 15 Jul 2025 16:02:50 +0200 Subject: [PATCH 33/42] io_uring/net: Support multishot receive len cap At the moment it's very hard to do fine-grained backpressure when using multishot as the kernel might produce a lot of completions before the user has a chance to cancel a previously submitted multishot recv. This change adds support to issue a multishot recv that is capped by a len, which means the kernel will only rearm until X amount of data is received. When the limit is reached the completion will signal to the user that a re-arm needs to happen manually by not setting the IORING_CQE_F_MORE flag. 
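A hedged userspace sketch of how the cap could be consumed (assumes liburing, a connected socket and a provided-buffer ring registered as group 0; the overall byte cap goes into sqe->optlen as wired up by this patch):

#include <liburing.h>

static void arm_capped_multishot_recv(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = 0;
	/* stop generating completions once roughly 64KB have arrived */
	sqe->optlen = 64 * 1024;
	io_uring_sqe_set_data64(sqe, 1);
}

In the completion loop, a CQE without IORING_CQE_F_MORE then means the recv is no longer armed (cap reached, buffers exhausted or an error) and has to be re-armed explicitly, which is the natural point for the application to apply backpressure.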
Signed-off-by: Norman Maurer Link: https://lore.kernel.org/r/20250715140249.31186-1-norman_maurer@apple.com Signed-off-by: Jens Axboe --- io_uring/net.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 639f111408a1..ba2d0abea349 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -75,7 +75,10 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; + /* per-invocation mshot limit */ unsigned mshot_len; + /* overall mshot byte limit */ + unsigned mshot_total_len; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -89,10 +92,12 @@ enum sr_retry_flags { IORING_RECV_RETRY = (1U << 15), IORING_RECV_PARTIAL_MAP = (1U << 14), IORING_RECV_MSHOT_CAP = (1U << 13), + IORING_RECV_MSHOT_LIM = (1U << 12), + IORING_RECV_MSHOT_DONE = (1U << 11), IORING_RECV_RETRY_CLEAR = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP, IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP | - IORING_RECV_MSHOT_CAP, + IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE, }; /* @@ -765,7 +770,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->done_io = 0; - if (unlikely(sqe->file_index || sqe->addr2)) + if (unlikely(sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -790,16 +795,25 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->buf_group = req->buf_index; req->buf_list = NULL; } - sr->mshot_len = 0; + sr->mshot_total_len = sr->mshot_len = 0; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) return -EINVAL; - if (req->opcode == IORING_OP_RECV) + if (req->opcode == IORING_OP_RECV) { sr->mshot_len = sr->len; + sr->mshot_total_len = READ_ONCE(sqe->optlen); + if (sr->mshot_total_len) + sr->flags |= IORING_RECV_MSHOT_LIM; + } else if (sqe->optlen) { + return -EINVAL; + } req->flags |= REQ_F_APOLL_MULTISHOT; + } else if (sqe->optlen) { + return -EINVAL; } + if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_RECVMSG) return -EINVAL; @@ -831,6 +845,19 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { + /* + * If sr->len hits zero, the limit has been reached. Mark + * mshot as finished, and flag MSHOT_DONE as well to prevent + * a potential bundle from being retried. + */ + sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len); + if (!sr->mshot_total_len) { + sr->flags |= IORING_RECV_MSHOT_DONE; + mshot_finished = true; + } + } + if (sr->flags & IORING_RECVSEND_BUNDLE) { size_t this_ret = *ret - sr->done_io; @@ -1094,6 +1121,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg else if (kmsg->msg.msg_inq > 1) arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq); + /* if mshot limited, ensure we don't go over */ + if (sr->flags & IORING_RECV_MSHOT_LIM) + arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len); ret = io_buffers_peek(req, &arg); if (unlikely(ret < 0)) return ret; From 11fbada7184f9e19bcdfa2f6b15828a78b8897a6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 16 Jul 2025 22:04:08 +0100 Subject: [PATCH 34/42] io_uring: export io_[un]account_mem Export pinned memory accounting helpers, they'll be used by zcrx shortly. 
Cc: stable@vger.kernel.org Fixes: cf96310c5f9a0 ("io_uring/zcrx: add io_zcrx_area") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a61e54bd89289b39570ae02fe620e12487439e4.1752699568.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 4 ++-- io_uring/rsrc.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 043018bc9b9c..f75f5e43fa4a 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -55,7 +55,7 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); @@ -64,7 +64,7 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 25e7e998dcfd..a3ca6ba66596 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -120,6 +120,8 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int __io_account_mem(struct user_struct *user, unsigned long nr_pages); +int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); +void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) From 262ab205180d2ba3ab6110899a4dbe439c51dfaa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 16 Jul 2025 22:04:09 +0100 Subject: [PATCH 35/42] io_uring/zcrx: account area memory zcrx areas can be quite large and need to be accounted and checked against RLIMIT_MEMLOCK. In practice it shouldn't be a big issue as the interface already requires cap_net_admin. 
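For application authors the practical consequence is that registering a large area can now fail if the locked-memory limit is too small. A hedged pre-flight check in plain POSIX C (demo_check_memlock is a made-up helper, nothing io_uring specific):

#include <sys/resource.h>
#include <stdio.h>

static int demo_check_memlock(size_t area_bytes)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
		return -1;
	if (rl.rlim_cur != RLIM_INFINITY && rl.rlim_cur < area_bytes) {
		fprintf(stderr, "RLIMIT_MEMLOCK (%llu) below area size (%zu)\n",
			(unsigned long long)rl.rlim_cur, area_bytes);
		return -1;
	}
	return 0;
}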
Cc: stable@vger.kernel.org Fixes: cf96310c5f9a0 ("io_uring/zcrx: add io_zcrx_area") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4b53f0c575bd062f63d12bec6cac98037fc66aeb.1752699568.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 27 +++++++++++++++++++++++++++ io_uring/zcrx.h | 1 + 2 files changed, 28 insertions(+) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 7d7396ce876c..dabce3ee0e8b 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -158,6 +158,23 @@ static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area area->mem.dmabuf_offset); } +static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages) +{ + struct folio *last_folio = NULL; + unsigned long res = 0; + int i; + + for (i = 0; i < nr_pages; i++) { + struct folio *folio = page_folio(pages[i]); + + if (folio == last_folio) + continue; + last_folio = folio; + res += 1UL << folio_order(folio); + } + return res; +} + static int io_import_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_mem *mem, struct io_uring_zcrx_area_reg *area_reg) @@ -180,6 +197,13 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, if (ret) return ret; + mem->account_pages = io_count_account_pages(pages, nr_pages); + ret = io_account_mem(ifq->ctx, mem->account_pages); + if (ret < 0) { + mem->account_pages = 0; + return ret; + } + mem->pages = pages; mem->nr_folios = nr_pages; mem->size = area_reg->len; @@ -357,6 +381,9 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) io_zcrx_unmap_area(area->ifq, area); io_release_area_mem(&area->mem); + if (area->mem.account_pages) + io_unaccount_mem(area->ifq->ctx, area->mem.account_pages); + kvfree(area->freelist); kvfree(area->nia.niovs); kvfree(area->user_refs); diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 89015b923911..109c4ca36434 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -15,6 +15,7 @@ struct io_zcrx_mem { struct page **pages; unsigned long nr_folios; struct sg_table page_sg_table; + unsigned long account_pages; struct dma_buf_attachment *attach; struct dma_buf *dmabuf; From 733c43f1df34f9185b945e6f12ac00c8556c6dfe Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 8 Jul 2025 14:22:10 -0600 Subject: [PATCH 36/42] io_uring/cmd: introduce IORING_URING_CMD_REISSUE flag Add a flag IORING_URING_CMD_REISSUE that ->uring_cmd() implementations can use to tell whether this is the first or subsequent issue of the uring_cmd. This will allow ->uring_cmd() implementations to store information in the io_uring_cmd's pdu across issues. 
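A hedged sketch of the intended pattern in a ->uring_cmd() handler (the demo_* names are made up; only the flag check and the pdu accessor reflect existing interfaces):

struct demo_cmd_pdu {
	void *state;
};

static int demo_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct demo_cmd_pdu *pdu = io_uring_cmd_to_pdu(cmd, struct demo_cmd_pdu);

	if (!(cmd->flags & IORING_URING_CMD_REISSUE)) {
		/* first issue: parse the SQE and stash state in the pdu */
		pdu->state = demo_parse_sqe(cmd->sqe);
		if (IS_ERR(pdu->state))
			return PTR_ERR(pdu->state);
	}
	/*
	 * On -EAGAIN the core marks the command with
	 * IORING_URING_CMD_REISSUE and calls the handler again with the
	 * same pdu, so the state set up on the first pass is still valid.
	 */
	return demo_do_io(cmd, pdu->state, issue_flags);
}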
Signed-off-by: Caleb Sander Mateos Acked-by: David Sterba Link: https://lore.kernel.org/r/20250708202212.2851548-3-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 ++ io_uring/uring_cmd.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 53408124c1e5..29892f54e0ac 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -8,6 +8,8 @@ /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ #define IORING_URING_CMD_CANCELABLE (1U << 30) +/* io_uring_cmd is being issued again */ +#define IORING_URING_CMD_REISSUE (1U << 31) struct io_uring_cmd { struct file *file; diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index b228b84a510f..58964a2f8582 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -261,7 +261,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN || ret == -EIOCBQUEUED) + if (ret == -EAGAIN) { + ioucmd->flags |= IORING_URING_CMD_REISSUE; + return ret; + } + if (ret == -EIOCBQUEUED) return ret; if (ret < 0) req_set_fail(req); From 9aad72b4e3f0233e747bb6b1ec05ea71365f4246 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 8 Jul 2025 14:22:11 -0600 Subject: [PATCH 37/42] btrfs/ioctl: store btrfs_uring_encoded_data in io_btrfs_cmd btrfs is the only user of struct io_uring_cmd_data and its op_data field. Switch its ->uring_cmd() implementations to store the struct btrfs_uring_encoded_data * in the struct io_btrfs_cmd, overlayed with io_uring_cmd's pdu field. This avoids having to touch another cache line to access the struct btrfs_uring_encoded_data *, and allows op_data and struct io_uring_cmd_data to be removed. Signed-off-by: Caleb Sander Mateos Acked-by: David Sterba Link: https://lore.kernel.org/r/20250708202212.2851548-4-csander@purestorage.com Signed-off-by: Jens Axboe --- fs/btrfs/ioctl.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 913acef3f0a9..9eb06ae79362 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4629,6 +4629,13 @@ out_acct: return ret; } +struct btrfs_uring_encoded_data { + struct btrfs_ioctl_encoded_io_args args; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov; + struct iov_iter iter; +}; + /* * Context that's attached to an encoded read io_uring command, in cmd->pdu. 
It * contains the fields in btrfs_uring_read_extent that are necessary to finish @@ -4650,6 +4657,7 @@ struct btrfs_uring_priv { }; struct io_btrfs_cmd { + struct btrfs_uring_encoded_data *data; struct btrfs_uring_priv *priv; }; @@ -4708,6 +4716,7 @@ out: kfree(priv->pages); kfree(priv->iov); kfree(priv); + kfree(bc->data); } void btrfs_uring_read_extent_endio(void *ctx, int err) @@ -4791,13 +4800,6 @@ out_fail: return ret; } -struct btrfs_uring_encoded_data { - struct btrfs_ioctl_encoded_io_args args; - struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; - struct iov_iter iter; -}; - static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); @@ -4813,7 +4815,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; - struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4842,7 +4848,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue goto out_acct; } - io_uring_cmd_get_async_data(cmd)->op_data = data; + bc->data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -4940,6 +4946,9 @@ out_acct: add_rchar(current, ret); inc_syscr(current); + if (ret != -EIOCBQUEUED && ret != -EAGAIN) + kfree(data); + return ret; } @@ -4950,7 +4959,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu struct file *file; ssize_t ret; void __user *sqe_addr; - struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; + struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); + struct btrfs_uring_encoded_data *data = NULL; + + if (cmd->flags & IORING_URING_CMD_REISSUE) + data = bc->data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; @@ -4972,7 +4985,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu goto out_acct; } - io_uring_cmd_get_async_data(cmd)->op_data = data; + bc->data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) @@ -5062,6 +5075,9 @@ out_acct: if (ret > 0) add_wchar(current, ret); inc_syscw(current); + + if (ret != -EAGAIN) + kfree(data); return ret; } From 2e6dbb25ea15844c8b617260d635731c37c85ac9 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 8 Jul 2025 14:22:12 -0600 Subject: [PATCH 38/42] io_uring/cmd: remove struct io_uring_cmd_data There are no more users of struct io_uring_cmd_data and its op_data field. Remove it to shave 8 bytes from struct io_async_cmd and eliminate a store and load for every uring_cmd. 
Signed-off-by: Caleb Sander Mateos Acked-by: David Sterba Link: https://lore.kernel.org/r/20250708202212.2851548-5-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 9 --------- io_uring/uring_cmd.c | 12 +----------- io_uring/uring_cmd.h | 1 - 3 files changed, 1 insertion(+), 21 deletions(-) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 29892f54e0ac..cfa6d0c0c322 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -21,10 +21,6 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; -struct io_uring_cmd_data { - void *op_data; -}; - static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) { return sqe->cmd; @@ -137,11 +133,6 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd return cmd_to_io_kiocb(cmd)->tctx->task; } -static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd) -{ - return cmd_to_io_kiocb(cmd)->async_data; -} - /* * Return uring_cmd's context reference as its context handle for driver to * track per-context resource, such as registered kernel IO buffer diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 58964a2f8582..053bac89b6c0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -26,12 +26,6 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_async_cmd *ac = req->async_data; - struct io_uring_cmd_data *cache = &ac->data; - - if (cache->op_data) { - kfree(cache->op_data); - cache->op_data = NULL; - } if (issue_flags & IO_URING_F_UNLOCKED) return; @@ -40,7 +34,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP) io_vec_free(&ac->vec); - if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) { + if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) { ioucmd->sqe = NULL; req->async_data = NULL; req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); @@ -193,9 +187,6 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_async_cmd *ac; - /* see io_uring_cmd_get_async_data() */ - BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0); - if (sqe->__pad1) return -EINVAL; @@ -211,7 +202,6 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); if (!ac) return -ENOMEM; - ac->data.op_data = NULL; ioucmd->sqe = sqe; return 0; } diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 9e11da10ecab..041aef8a8aa3 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -4,7 +4,6 @@ #include struct io_async_cmd { - struct io_uring_cmd_data data; struct iou_vec vec; struct io_uring_sqe sqes[2]; }; From d1fbe1ebf4a12cabd7945335d5e47718cb2bef99 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 19 Jul 2025 18:04:56 -0700 Subject: [PATCH 39/42] io_uring: fix breakage in EXPERT menu Add a dependency for IO_URING for the GCOV_PROFILE_URING symbol. Without this patch the EXPERT config menu ends with "Enable IO uring support" and the menu prompts for GCOV_PROFILE_URING and IO_URING_MOCK_FILE are not subordinate to it. This causes all of the EXPERT Kconfig options that follow GCOV_PROFILE_URING to be display in the "upper" menu (General setup), just following the EXPERT menu. 
Fixes: 1802656ef890 ("io_uring: add GCOV_PROFILE_URING Kconfig option") Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: Andrew Morton Cc: Masahiro Yamada Cc: io-uring@vger.kernel.org Link: https://lore.kernel.org/r/20250720010456.2945344-1-rdunlap@infradead.org Signed-off-by: Jens Axboe --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index c40a7c65fb4c..0fa05ba08442 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1790,7 +1790,7 @@ config IO_URING config GCOV_PROFILE_URING bool "Enable GCOV profiling on the io_uring subsystem" - depends on GCOV_KERNEL + depends on IO_URING && GCOV_KERNEL help Enable GCOV profiling on the io_uring subsystem, to facilitate code coverage testing. From 720df2310b89cf76c1dc1a05902536282506f8bf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Jul 2025 10:56:20 +0100 Subject: [PATCH 40/42] io_uring/zcrx: fix null ifq on area destruction Dan reports that ifq can be null when inferring arguments for io_unaccount_mem() from io_zcrx_free_area(). Fix it by always setting a correct ifq. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202507180628.gBxrOgqr-lkp@intel.com/ Fixes: 262ab205180d2 ("io_uring/zcrx: account area memory") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/20670d163bb90dba2a81a4150f1125603cefb101.1753091564.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index dabce3ee0e8b..6b4bdefb40c4 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -377,8 +377,7 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) static void io_zcrx_free_area(struct io_zcrx_area *area) { - if (area->ifq) - io_zcrx_unmap_area(area->ifq, area); + io_zcrx_unmap_area(area->ifq, area); io_release_area_mem(&area->mem); if (area->mem.account_pages) @@ -411,6 +410,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) goto err; + area->ifq = ifq; ret = io_import_area(ifq, &area->mem, area_reg); if (ret) @@ -445,7 +445,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, } area->free_count = nr_iovs; - area->ifq = ifq; /* we're only supporting one area per ifq for now */ area->area_id = 0; area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; From 6bbd3411ff87df1ca38ff32d36eb5dc673ca8021 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Jul 2025 10:56:21 +0100 Subject: [PATCH 41/42] io_uring/zcrx: don't leak pages on account failure Someone needs to release pinned pages in io_import_umem() if accounting fails. Assign them to the area but return an error; the following io_zcrx_free_area() will clean them up. 
Fixes: 262ab205180d2 ("io_uring/zcrx: account area memory") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e19f283a912f200c0d427e376cb789fc3f3d69bc.1753091564.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 6b4bdefb40c4..6a983f1ab592 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -199,15 +199,13 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, mem->account_pages = io_count_account_pages(pages, nr_pages); ret = io_account_mem(ifq->ctx, mem->account_pages); - if (ret < 0) { + if (ret < 0) mem->account_pages = 0; - return ret; - } mem->pages = pages; mem->nr_folios = nr_pages; mem->size = area_reg->len; - return 0; + return ret; } static void io_release_area_mem(struct io_zcrx_mem *mem) From d9f595b9a65e9c9eb03e21f3db98fde158d128db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 21 Jul 2025 10:56:22 +0100 Subject: [PATCH 42/42] io_uring/zcrx: fix leaking pages on sg init fail If sg_alloc_table_from_pages() fails, io_import_umem() returns without cleaning up pinned pages first. Fix it. Fixes: b84621d96ee02 ("io_uring/zcrx: allocate sgtable for umem areas") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9fd94d1bc8c316611eccfec7579799182ff3fb0a.1753091564.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 6a983f1ab592..2d8bc4219463 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -194,8 +194,10 @@ static int io_import_umem(struct io_zcrx_ifq *ifq, ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, 0, nr_pages << PAGE_SHIFT, GFP_KERNEL_ACCOUNT); - if (ret) + if (ret) { + unpin_user_pages(pages, nr_pages); return ret; + } mem->account_pages = io_count_account_pages(pages, nr_pages); ret = io_account_mem(ifq->ctx, mem->account_pages);