From f0243d2b86b97a575a7a013370e934f70ee77dd3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 13 Nov 2025 10:46:09 +0000
Subject: [PATCH 01/10] io_uring/zcrx: convert to use netmem_desc

Convert zcrx to struct netmem_desc, and use struct net_iov::desc to
access its fields instead of the struct net_iov inner union aliases.
zcrx only directly reads niov->pp, so with this patch it no longer
depends on the union.

Signed-off-by: Pavel Begunkov
Reviewed-by: Byungchul Park
Signed-off-by: Jens Axboe
---
 io_uring/zcrx.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index c57ab332acbd..635ee4eb5d8d 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -696,12 +696,12 @@ static void io_zcrx_return_niov(struct net_iov *niov)
 {
 	netmem_ref netmem = net_iov_to_netmem(niov);
 
-	if (!niov->pp) {
+	if (!niov->desc.pp) {
 		/* copy fallback allocated niovs */
 		io_zcrx_return_niov_freelist(niov);
 		return;
 	}
-	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
+	page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
 }
 
 static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
@@ -815,7 +815,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
 		if (!page_pool_unref_and_test(netmem))
 			continue;
 
-		if (unlikely(niov->pp != pp)) {
+		if (unlikely(niov->desc.pp != pp)) {
 			io_zcrx_return_niov(niov);
 			continue;
 		}
@@ -1082,13 +1082,15 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 			     const skb_frag_t *frag, int off, int len)
 {
 	struct net_iov *niov;
+	struct page_pool *pp;
 
 	if (unlikely(!skb_frag_is_net_iov(frag)))
 		return io_zcrx_copy_frag(req, ifq, frag, off, len);
 
 	niov = netmem_to_net_iov(frag->netmem);
-	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    io_pp_to_ifq(niov->pp) != ifq)
+	pp = niov->desc.pp;
+
+	if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
 		return -EFAULT;
 
 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))

From a0169c3a62875d1bafa0caffa42e1d1cf6aa40e6 Mon Sep 17 00:00:00 2001
From: Pedro Demarchi Gomes
Date: Thu, 13 Nov 2025 10:46:10 +0000
Subject: [PATCH 02/10] io_uring/zcrx: use folio_nr_pages() instead of shift operation

folio_nr_pages() is a faster helper for getting the number of pages in
a folio when NR_PAGES_IN_LARGE_FOLIO is enabled.

Signed-off-by: Pedro Demarchi Gomes
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/zcrx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 635ee4eb5d8d..149bf9d5b983 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -170,7 +170,7 @@ static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pag
 		if (folio == last_folio)
 			continue;
 		last_folio = folio;
-		res += 1UL << folio_order(folio);
+		res += folio_nr_pages(folio);
 	}
 	return res;
 }

From 1b8b5d0316da7468ae4d40f6c2102d559d9e3ca2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 13 Nov 2025 10:46:11 +0000
Subject: [PATCH 03/10] io_uring/zcrx: elide passing msg flags

The zcrx sqe->msg_flags field has never been defined and is checked to
be zero, so it doesn't need to be a MSG_* bitmask. Keep it undefined,
don't mix it with MSG_DONTWAIT, and don't pass it into io_zcrx_recv(),
where it's ignored anyway.
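For reference, a recvzc request under this scheme could be prepared as
below (a hypothetical raw-sqe sketch, not official liburing usage; it
assumes the uapi names IORING_OP_RECV_ZC, IORING_RECV_MULTISHOT and
sqe->zcrx_ifq_idx):

	/* sketch: zero the sqe, then set only what recvzc defines */
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECV_ZC;
	sqe->fd = sockfd;			/* connected TCP socket */
	sqe->ioprio = IORING_RECV_MULTISHOT;
	sqe->zcrx_ifq_idx = zcrx_id;		/* id of the registered ifq */
	/* sqe->msg_flags stays 0; a nonzero value gets -EINVAL */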
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/net.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/io_uring/net.c b/io_uring/net.c
index a95cc9ca2a4d..69f901fa3040 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -110,7 +110,6 @@ enum sr_retry_flags {
 
 struct io_recvzc {
 	struct file			*file;
-	unsigned			msg_flags;
 	u16				flags;
 	u32				len;
 	struct io_zcrx_ifq		*ifq;
@@ -1253,8 +1252,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	zc->len = READ_ONCE(sqe->len);
 	zc->flags = READ_ONCE(sqe->ioprio);
-	zc->msg_flags = READ_ONCE(sqe->msg_flags);
-	if (zc->msg_flags)
+	if (READ_ONCE(sqe->msg_flags))
 		return -EINVAL;
 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
 		return -EINVAL;
@@ -1283,8 +1281,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
 		return -ENOTSOCK;
 
 	len = zc->len;
-	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
-			   issue_flags, &zc->len);
+	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
 
 	if (len && zc->len == 0) {
 		io_req_set_res(req, 0, 0);

From d663976dad68de9b2e3df59cc31f0a24ee4c4511 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 13 Nov 2025 10:46:12 +0000
Subject: [PATCH 04/10] io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL

Implementing every new zcrx feature as a separate io_uring register
opcode would be annoying and take a fair amount of boilerplate code.
Introduce IORING_REGISTER_ZCRX_CTRL, which will multiplex such calls
to zcrx. Note, there are no real users of the opcode in this patch.

Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 13 +++++++++++++
 io_uring/register.c           |  3 +++
 io_uring/zcrx.c               | 21 +++++++++++++++++++++
 io_uring/zcrx.h               |  6 ++++++
 4 files changed, 43 insertions(+)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e96080db3e4d..0e1d353fab1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,6 +697,9 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY			= 35,
 
+	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+	IORING_REGISTER_ZCRX_CTRL		= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -1078,6 +1081,16 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+enum zcrx_ctrl_op {
+	__ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl {
+	__u32	zcrx_id;
+	__u32	op;		/* see enum zcrx_ctrl_op */
+	__u64	__resv[8];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/register.c b/io_uring/register.c
index 334a457da3f7..fc66a5364483 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -815,6 +815,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_QUERY:
 		ret = io_query(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_ZCRX_CTRL:
+		ret = io_zcrx_ctrl(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 149bf9d5b983..0b5f4320c7a9 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -941,6 +941,27 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
 	.uninstall		= io_pp_uninstall,
 };
 
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+	struct zcrx_ctrl ctrl;
+	struct io_zcrx_ifq *zcrx;
+
+	if (nr_args)
+		return -EINVAL;
+	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+		return -EFAULT;
+	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
+		return -EFAULT;
+
+	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+	if (!zcrx)
+		return -ENXIO;
+	if (ctrl.op >= __ZCRX_CTRL_LAST)
+		return -EOPNOTSUPP;
+
+	return -EINVAL;
+}
+
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
 			      struct io_zcrx_ifq *ifq, int off, int len)
 {
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index c9b9bfae0547..f29edc22c91f 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -65,6 +65,7 @@ struct io_zcrx_ifq {
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg);
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -93,6 +94,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct
 {
 	return NULL;
 }
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+			       void __user *arg, unsigned nr_arg)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);

From 475eb39b00478b1898bc9080344dcd8e86c53c7a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 13 Nov 2025 10:46:13 +0000
Subject: [PATCH 05/10] io_uring/zcrx: add sync refill queue flushing

Add a zcrx interface via IORING_REGISTER_ZCRX_CTRL that forces the
kernel to flush / consume entries from the refill queue. Just as with
the IORING_REGISTER_ZCRX_REFILL attempt, the motivation is to address
cases where the refill queue becomes full and the user can't return
buffers and needs to stash them. It's still a slow path, and the user
should size the refill queue appropriately, but it should be helpful
for handling temporary traffic spikes and other unpredictable
conditions.

The interface is simpler compared to ZCRX_REFILL as it doesn't need
temporary refill entry arrays and gives natural batching, whereas
ZCRX_REFILL requires even more user logic to be somewhat efficient.

Also, add a structure for the operation. It's not currently used but
can serve for future improvements like limiting the number of buffers
to process, etc.
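For illustration, a flush could then be issued as below (hypothetical
userspace sketch; per the kernel side, nr_args must be 0 and all
reserved fields zero):

	struct zcrx_ctrl ctrl = {
		.zcrx_id = zcrx_id,	/* id of the registered zcrx ifq */
		.op = ZCRX_CTRL_FLUSH_RQ,
	};
	int ret = syscall(__NR_io_uring_register, ring_fd,
			  IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);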
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 10 ++++-
 io_uring/zcrx.c               | 74 +++++++++++++++++++++++++++++++++--
 2 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0e1d353fab1d..db47fced2cc6 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1082,13 +1082,21 @@ struct io_uring_zcrx_ifq_reg {
 };
 
 enum zcrx_ctrl_op {
+	ZCRX_CTRL_FLUSH_RQ,
+
 	__ZCRX_CTRL_LAST,
 };
 
+struct zcrx_ctrl_flush_rq {
+	__u64	__resv[6];
+};
+
 struct zcrx_ctrl {
 	__u32	zcrx_id;
 	__u32	op;		/* see enum zcrx_ctrl_op */
-	__u64	__resv[8];
+	__u64	__resv[2];
+
+	struct zcrx_ctrl_flush_rq	zc_flush;
 };
 
 #ifdef __cplusplus
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 0b5f4320c7a9..08c103af69bc 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -941,6 +941,71 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
 	.uninstall		= io_pp_uninstall,
 };
 
+static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
+			      struct io_zcrx_ifq *zcrx)
+{
+	unsigned int mask = zcrx->rq_entries - 1;
+	unsigned int i;
+
+	guard(spinlock_bh)(&zcrx->rq_lock);
+
+	nr = min(nr, io_zcrx_rqring_entries(zcrx));
+	for (i = 0; i < nr; i++) {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+		struct net_iov *niov;
+
+		if (!io_parse_rqe(rqe, zcrx, &niov))
+			break;
+		netmem_array[i] = net_iov_to_netmem(niov);
+	}
+
+	smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+	return i;
+}
+
+#define ZCRX_FLUSH_BATCH	32
+
+static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
+{
+	unsigned i;
+
+	for (i = 0; i < nr; i++) {
+		netmem_ref netmem = netmems[i];
+		struct net_iov *niov = netmem_to_net_iov(netmem);
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		if (!page_pool_unref_and_test(netmem))
+			continue;
+		io_zcrx_return_niov(niov);
+	}
+}
+
+static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+			 struct zcrx_ctrl *ctrl)
+{
+	struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
+	netmem_ref netmems[ZCRX_FLUSH_BATCH];
+	unsigned total = 0;
+	unsigned nr;
+
+	if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
+		return -EINVAL;
+
+	do {
+		nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+
+		zcrx_return_buffers(netmems, nr);
+		total += nr;
+
+		if (fatal_signal_pending(current))
+			break;
+		cond_resched();
+	} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+
+	return 0;
+}
+
 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 {
 	struct zcrx_ctrl ctrl;
@@ -956,10 +1021,13 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
 	if (!zcrx)
 		return -ENXIO;
-	if (ctrl.op >= __ZCRX_CTRL_LAST)
-		return -EOPNOTSUPP;
 
-	return -EINVAL;
+	switch (ctrl.op) {
+	case ZCRX_CTRL_FLUSH_RQ:
+		return zcrx_flush_rq(ctx, zcrx, &ctrl);
+	}
+
+	return -EOPNOTSUPP;
 }
 
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,

From 39c9676f789eb71ce1005a22eebe2be80a00de6a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 13 Nov 2025 10:46:14 +0000
Subject: [PATCH 06/10] io_uring/zcrx: count zcrx users

zcrx tries to detach the ifq / terminate page pools when the io_uring
ctx owning it is being destroyed. There will be multiple io_uring
instances attached to it in the future, so add a separate counter to
track the users.
Note, refs can't be reused for this purpose as it is only used to
prevent destruction of the zcrx and the rings, and is also taken by
page pools to keep the ifq alive.

Signed-off-by: David Wei
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/zcrx.c | 7 +++++--
 io_uring/zcrx.h | 2 ++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 08c103af69bc..2335f140ff19 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -482,6 +482,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	spin_lock_init(&ifq->rq_lock);
 	mutex_init(&ifq->pp_lock);
 	refcount_set(&ifq->refs, 1);
+	refcount_set(&ifq->user_refs, 1);
 	return ifq;
 }
 
@@ -742,8 +743,10 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 		if (!ifq)
 			break;
 
-		io_close_queue(ifq);
-		io_zcrx_scrub(ifq);
+		if (refcount_dec_and_test(&ifq->user_refs)) {
+			io_close_queue(ifq);
+			io_zcrx_scrub(ifq);
+		}
 		io_put_zcrx_ifq(ifq);
 	}
 
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index f29edc22c91f..32ab95b2cb81 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -55,6 +55,8 @@ struct io_zcrx_ifq {
 	struct net_device		*netdev;
 	netdevice_tracker		netdev_tracker;
 	refcount_t			refs;
+	/* counts userspace facing users like io_uring */
+	refcount_t			user_refs;
 
 	/*
 	 * Page pool and net configuration lock, can be taken deeper in the

From 742cb2e14ecb059cd4a77b92aa4945c20f85d414 Mon Sep 17 00:00:00 2001
From: David Wei
Date: Thu, 13 Nov 2025 10:46:15 +0000
Subject: [PATCH 07/10] io_uring/zcrx: move io_zcrx_scrub() and dependencies up

In preparation for adding zcrx ifq exporting and importing, move
io_zcrx_scrub() and its dependencies up the file to be closer to
io_close_queue().

Signed-off-by: David Wei
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/zcrx.c | 84 ++++++++++++++++++++++++-------------------------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 2335f140ff19..e60c5c00a611 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -544,6 +544,48 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
 		io_zcrx_ifq_free(ifq);
 }
 
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+	if (!niov->desc.pp) {
+		/* copy fallback allocated niovs */
+		io_zcrx_return_niov_freelist(niov);
+		return;
+	}
+	page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	if (!area)
+		return;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int nr;
+
+		if (!atomic_read(io_get_user_counter(niov)))
+			continue;
+		nr = atomic_xchg(io_get_user_counter(niov), 0);
+		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+			io_zcrx_return_niov(niov);
+	}
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
@@ -684,48 +726,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
 	return &area->nia.niovs[niov_idx];
 }
 
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
-	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
-	spin_lock_bh(&area->freelist_lock);
-	area->freelist[area->free_count++] = net_iov_idx(niov);
-	spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
-	netmem_ref netmem = net_iov_to_netmem(niov);
-
-	if (!niov->desc.pp) {
-		/* copy fallback allocated niovs */
-		io_zcrx_return_niov_freelist(niov);
-		return;
-	}
-	page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
-	struct io_zcrx_area *area = ifq->area;
-	int i;
-
-	if (!area)
-		return;
-
-	/* Reclaim back all buffers given to the user space. */
-	for (i = 0; i < area->nia.num_niovs; i++) {
-		struct net_iov *niov = &area->nia.niovs[i];
-		int nr;
-
-		if (!atomic_read(io_get_user_counter(niov)))
-			continue;
-		nr = atomic_xchg(io_get_user_counter(niov), 0);
-		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
-			io_zcrx_return_niov(niov);
-	}
-}
-
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	struct io_zcrx_ifq *ifq;

From d7af80b213e5675664b14f12240cb282e81773d5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 13 Nov 2025 10:46:16 +0000
Subject: [PATCH 08/10] io_uring/zcrx: export zcrx via a file

Add an option to wrap a zcrx instance into a file and expose it to
user space. Currently, users can't do anything meaningful with the
file, but it'll be used in a later patch to import the instance into
another io_uring ring. It's implemented as a new op called
ZCRX_CTRL_EXPORT for the IORING_REGISTER_ZCRX_CTRL registration opcode.
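A hypothetical userspace sketch of exporting; the kernel writes the
resulting fd back into the argument, so zc_export must be zeroed on
input:

	struct zcrx_ctrl ctrl = {
		.zcrx_id = zcrx_id,
		.op = ZCRX_CTRL_EXPORT,
	};
	int ret = syscall(__NR_io_uring_register, ring_fd,
			  IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
	if (!ret)
		exported_fd = ctrl.zc_export.zcrx_fd;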
Signed-off-by: David Wei
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 11 +++++-
 io_uring/zcrx.c               | 68 +++++++++++++++++++++++++++++++----
 2 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index db47fced2cc6..4bedc0310a55 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1083,6 +1083,7 @@ struct io_uring_zcrx_ifq_reg {
 
 enum zcrx_ctrl_op {
 	ZCRX_CTRL_FLUSH_RQ,
+	ZCRX_CTRL_EXPORT,
 
 	__ZCRX_CTRL_LAST,
 };
@@ -1091,12 +1092,20 @@ struct zcrx_ctrl_flush_rq {
 	__u64	__resv[6];
 };
 
+struct zcrx_ctrl_export {
+	__u32	zcrx_fd;
+	__u32	__resv1[11];
+};
+
 struct zcrx_ctrl {
 	__u32	zcrx_id;
 	__u32	op;		/* see enum zcrx_ctrl_op */
 	__u64	__resv[2];
 
-	struct zcrx_ctrl_flush_rq	zc_flush;
+	union {
+		struct zcrx_ctrl_export		zc_export;
+		struct zcrx_ctrl_flush_rq	zc_flush;
+	};
 };
 
 #ifdef __cplusplus
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index e60c5c00a611..815992aff246 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include <linux/anon_inodes.h>
 #include
 #include
 
@@ -586,6 +587,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
 	}
 }
 
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+	if (refcount_dec_and_test(&ifq->user_refs)) {
+		io_close_queue(ifq);
+		io_zcrx_scrub(ifq);
+	}
+	io_put_zcrx_ifq(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
@@ -596,6 +606,55 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 	return ifq ? &ifq->region : NULL;
 }
 
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+	struct io_zcrx_ifq *ifq = file->private_data;
+
+	if (WARN_ON_ONCE(!ifq))
+		return -EFAULT;
+	zcrx_unregister(ifq);
+	return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+	.owner		= THIS_MODULE,
+	.release	= zcrx_box_release,
+};
+
+static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+		       struct zcrx_ctrl *ctrl, void __user *arg)
+{
+	struct zcrx_ctrl_export *ce = &ctrl->zc_export;
+	struct file *file;
+	int fd = -1;
+
+	if (!mem_is_zero(ce, sizeof(*ce)))
+		return -EINVAL;
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	ce->zcrx_fd = fd;
+	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+		put_unused_fd(fd);
+		return -EFAULT;
+	}
+
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+					 ifq, O_CLOEXEC, NULL);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		zcrx_unregister(ifq);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	return 0;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -742,12 +801,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 		}
 		if (!ifq)
 			break;
-
-		if (refcount_dec_and_test(&ifq->user_refs)) {
-			io_close_queue(ifq);
-			io_zcrx_scrub(ifq);
-		}
-		io_put_zcrx_ifq(ifq);
+		zcrx_unregister(ifq);
 	}
 
 	xa_destroy(&ctx->zcrx_ctxs);
@@ -1028,6 +1082,8 @@ int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
 	switch (ctrl.op) {
 	case ZCRX_CTRL_FLUSH_RQ:
 		return zcrx_flush_rq(ctx, zcrx, &ctrl);
+	case ZCRX_CTRL_EXPORT:
+		return zcrx_export(ctx, zcrx, &ctrl, arg);
 	}
 
 	return -EOPNOTSUPP;

From 0926f94ab36a6d76d07fa8f0934e65f5f66647ec Mon Sep 17 00:00:00 2001
From: David Wei
Date: Thu, 13 Nov 2025 10:46:17 +0000
Subject: [PATCH 09/10] io_uring/zcrx: add io_fill_zcrx_offsets()

Add a helper, io_fill_zcrx_offsets(), that sets the constant offsets
in struct io_uring_zcrx_offsets returned to userspace.

Signed-off-by: David Wei
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 io_uring/zcrx.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 815992aff246..da7e556c349e 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -345,6 +345,13 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
 	atomic_inc(io_get_user_counter(niov));
 }
 
+static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
+{
+	offsets->head = offsetof(struct io_uring, head);
+	offsets->tail = offsetof(struct io_uring, tail);
+	offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+}
+
 static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 				 struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
@@ -356,7 +363,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 	void *ptr;
 	int ret;
 
-	off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+	io_fill_zcrx_offsets(&reg->offsets);
+	off = reg->offsets.rqes;
 	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
 	if (size > rd->size)
 		return -EINVAL;
@@ -372,9 +380,6 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 	ifq->rq_ring = (struct io_uring *)ptr;
 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 
-	reg->offsets.head = offsetof(struct io_uring, head);
-	reg->offsets.tail = offsetof(struct io_uring, tail);
-	reg->offsets.rqes = off;
 	return 0;
 }
 

From 00d91481279fb2df8c46d19090578afd523ca630 Mon Sep 17 00:00:00 2001
From: David Wei
Date: Thu, 13 Nov 2025 10:46:18 +0000
Subject: [PATCH 10/10] io_uring/zcrx: share an ifq between rings

Add a way to share an ifq from a source ring that is real (i.e. bound
to a HW RX queue) with other rings. This is done by passing a new flag
ZCRX_REG_IMPORT in the registration struct io_uring_zcrx_ifq_reg,
alongside the fd of an exported zcrx ifq in the if_idx field.
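A hypothetical sketch of importing on a second ring, which must be
created with IORING_SETUP_DEFER_TASKRUN and one of IORING_SETUP_CQE32 /
IORING_SETUP_CQE_MIXED; the exported fd is carried in if_idx, all other
fields are left zero, and the kernel fills in zcrx_id and the offsets
(assuming the usual nr_args of 1 for IORING_REGISTER_ZCRX_IFQ):

	struct io_uring_zcrx_ifq_reg reg = {
		.flags = ZCRX_REG_IMPORT,
		.if_idx = exported_fd,	/* fd from ZCRX_CTRL_EXPORT */
	};
	int ret = syscall(__NR_io_uring_register, ring2_fd,
			  IORING_REGISTER_ZCRX_IFQ, &reg, 1);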
Signed-off-by: David Wei
Signed-off-by: Pavel Begunkov
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h |  4 ++++
 io_uring/zcrx.c               | 63 +++++++++++++++++++++++++++++++++--
 2 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4bedc0310a55..deb772222b6d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
 	__u64	__resv2[2];
 };
 
+enum zcrx_reg_flags {
+	ZCRX_REG_IMPORT		= 1,
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index da7e556c349e..b99cf2c6670a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -660,6 +660,63 @@ static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
 	return 0;
 }
 
+static int import_zcrx(struct io_ring_ctx *ctx,
+		       struct io_uring_zcrx_ifq_reg __user *arg,
+		       struct io_uring_zcrx_ifq_reg *reg)
+{
+	struct io_zcrx_ifq *ifq;
+	struct file *file;
+	int fd, ret;
+	u32 id;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+		return -EINVAL;
+	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+		return -EINVAL;
+
+	fd = reg->if_idx;
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	file = fd_file(f);
+	if (file->f_op != &zcrx_box_fops || !file->private_data)
+		return -EBADF;
+
+	ifq = file->private_data;
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	reg->zcrx_id = id;
+	io_fill_zcrx_offsets(&reg->offsets);
+	if (copy_to_user(arg, reg, sizeof(*reg))) {
+		ret = -EFAULT;
+		goto err_xa_erase;
+	}
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err_xa_erase;
+	}
+
+	return 0;
+err_xa_erase:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+err:
+	zcrx_unregister(ifq);
+	return ret;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -685,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EINVAL;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
-		return -EFAULT;
 	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
 	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
+	if (reg.flags & ZCRX_REG_IMPORT)
+		return import_zcrx(ctx, arg, &reg);
+	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+		return -EFAULT;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {