From 88a80066af1617fab444776135d840467414beb6 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Mon, 23 Jun 2025 19:02:18 +0800 Subject: [PATCH 1/7] io_uring: make fallocate be hashed work Like ftruncate and write, fallocate operations on the same file cannot be executed in parallel, so it is better to make fallocate be hashed work. Signed-off-by: Fengnan Chang Link: https://lore.kernel.org/r/20250623110218.61490-1-changfengnan@bytedance.com Signed-off-by: Jens Axboe --- io_uring/opdef.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 6e0882b051f9..6de6229207a8 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .hash_reg_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, From 5afb4bf9fc62d828647647ec31745083637132e4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Jun 2025 14:40:33 +0100 Subject: [PATCH 2/7] io_uring/rsrc: fix folio unpinning syzbot complains about an unmapping failure: [ 108.070381][ T14] kernel BUG at mm/gup.c:71! 
[ 108.070502][ T14] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP [ 108.123672][ T14] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20250221-8.fc42 02/21/2025 [ 108.127458][ T14] Workqueue: iou_exit io_ring_exit_work [ 108.174205][ T14] Call trace: [ 108.175649][ T14] sanity_check_pinned_pages+0x7cc/0x7d0 (P) [ 108.178138][ T14] unpin_user_page+0x80/0x10c [ 108.180189][ T14] io_release_ubuf+0x84/0xf8 [ 108.182196][ T14] io_free_rsrc_node+0x250/0x57c [ 108.184345][ T14] io_rsrc_data_free+0x148/0x298 [ 108.186493][ T14] io_sqe_buffers_unregister+0x84/0xa0 [ 108.188991][ T14] io_ring_ctx_free+0x48/0x480 [ 108.191057][ T14] io_ring_exit_work+0x764/0x7d8 [ 108.193207][ T14] process_one_work+0x7e8/0x155c [ 108.195431][ T14] worker_thread+0x958/0xed8 [ 108.197561][ T14] kthread+0x5fc/0x75c [ 108.199362][ T14] ret_from_fork+0x10/0x20 We can pin a tail page of a folio, but then io_uring will try to unpin the head page of the folio. While it should be fine in terms of keeping the page actually alive, mm folks say it's wrong and triggers a debug warning. Use unpin_user_folio() instead of unpin_user_page*. 
Cc: stable@vger.kernel.org Debugged-by: David Hildenbrand Reported-by: syzbot+1d335893772467199ab6@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/683f1551.050a0220.55ceb.0017.GAE@google.com Fixes: a8edbb424b139 ("io_uring/rsrc: enable multi-hugepage buffer coalescing") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/io-uring/a28b0f87339ac2acf14a645dad1e95bbcbf18acd.1750771718.git.asml.silence@gmail.com/ [axboe: adapt to current tree, massage commit message] Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d724602697e7..0c09e38784c9 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -112,8 +112,11 @@ static void io_release_ubuf(void *priv) struct io_mapped_ubuf *imu = priv; unsigned int i; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); + for (i = 0; i < imu->nr_bvecs; i++) { + struct folio *folio = page_folio(imu->bvec[i].bv_page); + + unpin_user_folio(folio, 1); + } } static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, @@ -840,8 +843,10 @@ done: if (ret) { if (imu) io_free_imu(ctx, imu); - if (pages) - unpin_user_pages(pages, nr_pages); + if (pages) { + for (i = 0; i < nr_pages; i++) + unpin_user_folio(page_folio(pages[i]), 1); + } io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } From 3a3c6d61577dbb23c09df3e21f6f9eda1ecd634b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Jun 2025 14:40:34 +0100 Subject: [PATCH 3/7] io_uring/rsrc: don't rely on user vaddr alignment There is no guaranteed alignment for user pointers. However, the calculation of the offset of the first page into a folio after coalescing relies on bit mask logic that assumes such alignment. Get rid of it.
Cc: stable@vger.kernel.org Reported-by: David Hildenbrand Fixes: a8edbb424b139 ("io_uring/rsrc: enable multi-hugepage buffer coalescing") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/io-uring/e387b4c78b33f231105a601d84eefd8301f57954.1750771718.git.asml.silence@gmail.com/ Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 7 ++++++- io_uring/rsrc.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0c09e38784c9..afc67530f912 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -734,6 +734,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, data->nr_pages_mid = folio_nr_pages(folio); data->folio_shift = folio_shift(folio); + data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); /* * Check if pages are contiguous inside a folio, and all folios have @@ -827,7 +828,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); + + off = (unsigned long)iov->iov_base & ~PAGE_MASK; + if (coalesced) + off += data.first_folio_page_idx << PAGE_SHIFT; + node->buf = imu; ret = 0; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 0d2138f16322..25e7e998dcfd 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -49,6 +49,7 @@ struct io_imu_folio_data { unsigned int nr_pages_mid; unsigned int folio_shift; unsigned int nr_folios; + unsigned long first_folio_page_idx; }; bool io_rsrc_cache_init(struct io_ring_ctx *ctx); From e1d7727b73a1f78035316ac35ee184d477059f0b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Jun 2025 14:40:35 +0100 Subject: [PATCH 4/7] io_uring: don't assume uaddr alignment in io_vec_fill_bvec There is no guaranteed alignment for user pointers. Don't use mask trickery; instead, compute the offset relative to the start of the buffer and adjust it by the first bvec's bv_offset.
Cc: stable@vger.kernel.org Reported-by: David Hildenbrand Fixes: 9ef4cbbcb4ac3 ("io_uring: add infra for importing vectored reg buffers") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/io-uring/19530391f5c361a026ac9b401ff8e123bde55d98.1750771718.git.asml.silence@gmail.com/ Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index afc67530f912..f2b31fb68992 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1339,7 +1339,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, { unsigned long folio_size = 1 << imu->folio_shift; unsigned long folio_mask = folio_size - 1; - u64 folio_addr = imu->ubuf & ~folio_mask; struct bio_vec *res_bvec = vec->bvec; size_t total_len = 0; unsigned bvec_idx = 0; @@ -1361,8 +1360,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) return -EOVERFLOW; - /* by using folio address it also accounts for bvec offset */ - offset = buf_addr - folio_addr; + offset = buf_addr - imu->ubuf; + /* + * Only the first bvec can have non zero bv_offset, account it + * here and work with full folios below. + */ + offset += imu->bvec[0].bv_offset; + src_bvec = imu->bvec + (offset >> imu->folio_shift); offset &= folio_mask; From 7cac633a42a7b3c8146eb1db76fb80dc652998de Mon Sep 17 00:00:00 2001 From: Penglei Jiang Date: Wed, 25 Jun 2025 03:27:03 -0700 Subject: [PATCH 5/7] io_uring: fix resource leak in io_import_dmabuf() Replace the return statement with setting ret = -EINVAL and jumping to the err label to ensure resources are released via io_release_dmabuf. 
Fixes: a5c98e942457 ("io_uring/zcrx: dmabuf backed zerocopy receive") Signed-off-by: Penglei Jiang Link: https://lore.kernel.org/r/20250625102703.68336-1-superman.xpt@gmail.com Signed-off-by: Jens Axboe --- io_uring/zcrx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 21c816c3bfe0..ade4da9c4e31 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -106,8 +106,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq, for_each_sgtable_dma_sg(mem->sgt, sg, i) total_size += sg_dma_len(sg); - if (total_size < off + len) - return -EINVAL; + if (total_size < off + len) { + ret = -EINVAL; + goto err; + } mem->dmabuf_offset = off; mem->size = len; From 9a709b7e98e6fa51600b5f2d24c5068efa6d39de Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 Jun 2025 10:17:06 -0600 Subject: [PATCH 6/7] io_uring/net: mark iov as dynamically allocated even for single segments A bigger array of vecs could've been allocated, but io_ring_buffers_peek() still decided to cap the mapped range depending on how much data was available. Hence don't rely on the segment count to know if the request should be marked as needing cleanup, always check upfront if the iov array is different than the fast_iov array. 
Fixes: 26ec15e4b0c1 ("io_uring/kbuf: don't truncate end buffer for multiple buffer peeks") Signed-off-by: Jens Axboe --- io_uring/net.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 9550d4c8f866..5c1e8c4ba468 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1077,6 +1077,12 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg if (unlikely(ret < 0)) return ret; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + /* special case 1 vec, can be a fast path */ if (ret == 1) { sr->buf = arg.iovs[0].iov_base; @@ -1085,11 +1091,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { - kmsg->vec.nr = ret; - kmsg->vec.iovec = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } } else { void __user *buf; From 178b8ff66ff827c41b4fa105e9aabb99a0b5c537 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2025 12:17:48 -0600 Subject: [PATCH 7/7] io_uring/kbuf: flag partial buffer mappings A previous commit aborted mapping more for a non-incremental ring for bundle peeking, but depending on where in the process this peeking happened, it would not necessarily prevent a retry by the user. That can create gaps in the received/read data. Add struct buf_sel_arg->partial_map, which can pass this information back. The networking side can then map that to internal state and use it to gate retry as well. Since this necessitates a new flag, change io_sr_msg->retry to a retry_flags member, and store both the retry and partial map condition in there. 
Cc: stable@vger.kernel.org Fixes: 26ec15e4b0c1 ("io_uring/kbuf: don't truncate end buffer for multiple buffer peeks") Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 1 + io_uring/kbuf.h | 3 ++- io_uring/net.c | 23 +++++++++++++++-------- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index ce95e3af44a9..f2d2cc319faa 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -271,6 +271,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, if (len > arg->max_len) { len = arg->max_len; if (!(bl->flags & IOBL_INC)) { + arg->partial_map = 1; if (iov != arg->iovs) break; buf->len = len; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 5d83c7adc739..723d0361898e 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -58,7 +58,8 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; - unsigned buf_group; + unsigned short buf_group; + unsigned short partial_map; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, diff --git a/io_uring/net.c b/io_uring/net.c index 5c1e8c4ba468..43a43522f406 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -75,12 +75,17 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - bool retry; + unsigned short retry_flags; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; +enum sr_retry_flags { + IO_SR_MSG_RETRY = 1, + IO_SR_MSG_PARTIAL_MAP = 2, +}; + /* * Number of times we'll try and do receives if there's more data. 
If we * exceed this limit, then add us to the back of the queue and retry from @@ -187,7 +192,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; sr->len = 0; /* get from the provided buffer */ } @@ -397,7 +402,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -751,7 +756,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -823,7 +828,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), issue_flags); - if (sr->retry) + if (sr->retry_flags & IO_SR_MSG_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) @@ -832,12 +837,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * If more is available AND it was a full transfer, retry and * append to this one */ - if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 && + if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 && !iov_iter_count(&kmsg->msg.msg_iter)) { req->cqe.flags = cflags & ~CQE_F_MASK; sr->len = kmsg->msg.msg_inq; sr->done_io += this_ret; - sr->retry = true; + sr->retry_flags |= IO_SR_MSG_RETRY; return false; } } else { @@ -1082,6 +1087,8 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg kmsg->vec.iovec = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } + if (arg.partial_map) + sr->retry_flags |= 
IO_SR_MSG_PARTIAL_MAP; /* special case 1 vec, can be a fast path */ if (ret == 1) { @@ -1276,7 +1283,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; zc->done_io = 0; - zc->retry = false; + zc->retry_flags = 0; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL;