io_uring-6.19-20251208

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmk3KXIQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpo91EACGlORRzg4FJXox8DcItdOQsZGFIqCXts9p
 SVtbxV6sPdsHwRB/xGTzHWP2iWUjA4+i5l3n4mt8vzGAmQU50gtdaIsJEMq7SOfB
 nJW0wNi905qcLihOfTpQ/2xpE5Am/iWPavFkAqOF7qo6GlS7aN47TIaHCPmAm3Nx
 Kla2XMDnneFhl8xCdnJHaLrzyD94xlArywG5UPjkgFGCmLEu2ZE6T9ivq86DHQZJ
 Ujy3ueMO/7SErfoDbY4I/gPs4ONxBaaieKycuyljQQB3n6sj15EBNB0TMDPA/Rwx
 Aq4WD/MC48titpxV2BT9RKCjYvJ4wsBww4uFLkCTKDlFCRH0pqclzgtd2iB46kge
 tj9KfTS9tkLBp9steMcw45FStu0iiHBwqqTcqUr1q/wzIPbPAQ/L/Mu6AlUOheW/
 MmedhtPP22IShpkKYWSv923P2Qp2HhKa6LtoKJzxOK9rb6yoYvHl0zEQlKbWtPgq
 lpGzjbBoCtjqwlQKTpcH8diwaZ/fafrIP4h80Hg1pRiQEwzBgDpA3/N0EcfigkmU
 2IgyH3k6F9v/IgyVPkpzNh4w6hrr9RnxVA8yaf2ItkfWKwajWJAtPLUBuING8qqa
 3xg1MZ27NS6gUKEdCEy/mAaz8Vt2SGRUc3szHYrZHy7OFEW94WoiKAYK9qsZXGzX
 ms2VldIiQA==
 =Mbok
 -----END PGP SIGNATURE-----

Merge tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:
 "Followup set of fixes for io_uring for this merge window. These are
  either later fixes, or cleanups that don't make sense to defer. This
  pull request contains:

   - Fix for a recent regression in io-wq worker creation

   - Tracing cleanup

   - Use READ_ONCE/WRITE_ONCE consistently for ring mapped kbufs. Mostly
     for documentation purposes, indicating that they are shared with
     userspace

   - Fix for POLL_ADD losing a completion, if the request is updated and
     now is triggerable - eg, if POLLIN is set with the update, and the
     polled file is readable

   - In conjunction with the above fix, also unify how poll wait queue
     entries are deleted with the head update. We had 3 different spots
     doing both the list deletion and head write, with one of them
     nicely documented. Abstract that into a helper and use it
     consistently

   - Small series from Joanne fixing an issue with buffer cloning, and
     cleaning up the arg validation"

* tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  io_uring/poll: unify poll waitqueue entry and list removal
  io_uring/kbuf: use WRITE_ONCE() for userspace-shared buffer ring fields
  io_uring/kbuf: use READ_ONCE() for userspace-mapped memory
  io_uring/rsrc: fix lost entries after cloned range
  io_uring/rsrc: rename misleading src_node variable in io_clone_buffers()
  io_uring/rsrc: clean up buffer cloning arg validation
  io_uring/trace: rename io_uring_queue_async_work event "rw" field
  io_uring/io-wq: always retry worker create on ERESTART*
  io_uring/poll: correctly handle io_poll_add() return value on update
Linus Torvalds 2025-12-09 09:07:28 +09:00
commit cfd4039213
5 changed files with 67 additions and 65 deletions

--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h

@@ -133,15 +133,15 @@ TRACE_EVENT(io_uring_file_get,
  * io_uring_queue_async_work - called before submitting a new async work
  *
  * @req: pointer to a submitted request
- * @rw: type of workqueue, hashed or normal
+ * @hashed: whether async work is hashed
  *
  * Allows to trace asynchronous work submission.
  */
 TRACE_EVENT(io_uring_queue_async_work,
 
-	TP_PROTO(struct io_kiocb *req, int rw),
+	TP_PROTO(struct io_kiocb *req, bool hashed),
 
-	TP_ARGS(req, rw),
+	TP_ARGS(req, hashed),
 
 	TP_STRUCT__entry (
 		__field( void *, ctx )
@@ -150,7 +150,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field( u8, opcode )
 		__field( unsigned long long, flags )
 		__field( struct io_wq_work *, work )
-		__field( int, rw )
+		__field( bool, hashed )
 
 		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
@@ -162,7 +162,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->flags = (__force unsigned long long) req->flags;
 		__entry->opcode = req->opcode;
 		__entry->work = &req->work;
-		__entry->rw = rw;
+		__entry->hashed = hashed;
 
 		__assign_str(op_str);
 	),
@@ -170,7 +170,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
 		__get_str(op_str), __entry->flags,
-		__entry->rw ? "hashed" : "normal", __entry->work)
+		__entry->hashed ? "hashed" : "normal", __entry->work)
 );
 
 /**

--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c

@@ -805,11 +805,12 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
 	 */
 	if (fatal_signal_pending(current))
 		return false;
+	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
+		return false;
 
-	worker->init_retries++;
 	switch (err) {
 	case -EAGAIN:
-		return worker->init_retries <= WORKER_INIT_LIMIT;
+	/* Analogous to a fork() syscall, always retry on a restartable error */
 	case -ERESTARTSYS:
 	case -ERESTARTNOINTR:
 	case -ERESTARTNOHAND:
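
The hunk above checks the retry budget once, up front, and then treats the restartable errors exactly like -EAGAIN, with the comment noting the fork() analogy. A minimal userspace sketch of that shape, assuming a hypothetical should_retry_create(), an illustrative retry limit, and local definitions for the kernel-internal ERESTART* values:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Kernel-internal restart codes; defined here only for illustration. */
#define ERESTARTSYS	512
#define ERESTARTNOINTR	513
#define ERESTARTNOHAND	514

/* Illustrative retry budget; not necessarily the kernel's value. */
#define WORKER_INIT_LIMIT	3

struct worker {
	int init_retries;
};

/* One budget check up front, then restartable errors retry like -EAGAIN. */
static bool should_retry_create(struct worker *worker, long err)
{
	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
		return false;

	switch (err) {
	case -EAGAIN:
	/* analogous to a fork() syscall, retry on any restartable error */
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
		return true;
	default:
		return false;
	}
}

int main(void)
{
	struct worker w = { 0 };

	while (should_retry_create(&w, -ERESTARTSYS))
		printf("retrying after restartable error (attempt %d)\n",
		       w.init_retries);
	return 0;
}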

--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c

@@ -44,11 +44,11 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
 		buf_len -= this_len;
 		/* Stop looping for invalid buffer length of 0 */
 		if (buf_len || !this_len) {
-			buf->addr += this_len;
-			buf->len = buf_len;
+			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
+			WRITE_ONCE(buf->len, buf_len);
 			return false;
 		}
-		buf->len = 0;
+		WRITE_ONCE(buf->len, 0);
 		bl->head++;
 		len -= this_len;
 	}
@@ -198,9 +198,9 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 	if (*len == 0 || *len > buf_len)
 		*len = buf_len;
 	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
-	req->buf_index = buf->bid;
+	req->buf_index = READ_ONCE(buf->bid);
 	sel.buf_list = bl;
-	sel.addr = u64_to_user_ptr(buf->addr);
+	sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
 
 	if (io_should_commit(req, issue_flags)) {
 		io_kbuf_commit(req, sel.buf_list, *len, 1);
@@ -280,7 +280,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 	if (!arg->max_len)
 		arg->max_len = INT_MAX;
 
-	req->buf_index = buf->bid;
+	req->buf_index = READ_ONCE(buf->bid);
 	do {
 		u32 len = READ_ONCE(buf->len);
 
@@ -291,11 +291,11 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 				arg->partial_map = 1;
 				if (iov != arg->iovs)
 					break;
-				buf->len = len;
+				WRITE_ONCE(buf->len, len);
 			}
 		}
 
-		iov->iov_base = u64_to_user_ptr(buf->addr);
+		iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr));
 		iov->iov_len = len;
 		iov++;
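
The kbuf hunks above do not change behaviour; they mark every load and store of the ring-mapped buffer fields, documenting that this memory is shared with userspace and must be accessed as single, untorn reads and writes. A minimal userspace sketch of the same annotation pattern, assuming simplified volatile-based READ_ONCE/WRITE_ONCE stand-ins (the real kernel macros do more) and a hypothetical ring_buf struct loosely modelled on the real entry:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel macros, which do more than this. */
#define READ_ONCE(x)		(*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

/* Shape of a shared ring entry, loosely modelled on struct io_uring_buf. */
struct ring_buf {
	uint64_t addr;
	uint32_t len;
	uint16_t bid;
};

/* Consume 'used' bytes from an entry the other side may observe at any time. */
static void consume(struct ring_buf *buf, uint32_t used)
{
	uint32_t len = READ_ONCE(buf->len);

	if (used >= len) {
		WRITE_ONCE(buf->len, 0);
		return;
	}
	/* Partial consumption: advance addr and shrink len, one marked access each. */
	WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + used);
	WRITE_ONCE(buf->len, len - used);
}

int main(void)
{
	struct ring_buf b = { .addr = 0x1000, .len = 128, .bid = 7 };

	consume(&b, 32);
	printf("addr=0x%llx len=%u bid=%u\n",
	       (unsigned long long)READ_ONCE(b.addr),
	       (unsigned)READ_ONCE(b.len), (unsigned)READ_ONCE(b.bid));
	return 0;
}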

--- a/io_uring/poll.c
+++ b/io_uring/poll.c

@@ -138,14 +138,32 @@ static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }
 
+static void io_poll_remove_waitq(struct io_poll *poll)
+{
+	/*
+	 * If the waitqueue is being freed early but someone is already holds
+	 * ownership over it, we have to tear down the request as best we can.
+	 * That means immediately removing the request from its waitqueue and
+	 * preventing all further accesses to the waitqueue via the request.
+	 */
+	list_del_init(&poll->wait.entry);
+
+	/*
+	 * Careful: this *must* be the last step, since as soon as req->head is
+	 * NULL'ed out, the request can be completed and freed, since
+	 * io_poll_remove_entry() will no longer need to take the waitqueue
+	 * lock.
+	 */
+	smp_store_release(&poll->head, NULL);
+}
+
 static inline void io_poll_remove_entry(struct io_poll *poll)
 {
 	struct wait_queue_head *head = smp_load_acquire(&poll->head);
 
 	if (head) {
 		spin_lock_irq(&head->lock);
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
+		io_poll_remove_waitq(poll);
 		spin_unlock_irq(&head->lock);
 	}
 }
@@ -368,23 +386,7 @@ static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
 	io_poll_mark_cancelled(req);
 	/* we have to kick tw in case it's not already */
 	io_poll_execute(req, 0);
+	io_poll_remove_waitq(poll);
-
-	/*
-	 * If the waitqueue is being freed early but someone is already
-	 * holds ownership over it, we have to tear down the request as
-	 * best we can. That means immediately removing the request from
-	 * its waitqueue and preventing all further accesses to the
-	 * waitqueue via the request.
-	 */
-	list_del_init(&poll->wait.entry);
-
-	/*
-	 * Careful: this *must* be the last step, since as soon
-	 * as req->head is NULL'ed out, the request can be
-	 * completed and freed, since aio_poll_complete_work()
-	 * will no longer need to take the waitqueue lock.
-	 */
-	smp_store_release(&poll->head, NULL);
 
 	return 1;
 }
@@ -413,8 +415,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 
 		/* optional, saves extra locking for removal in tw handler */
 		if (mask && poll->events & EPOLLONESHOT) {
-			list_del_init(&poll->wait.entry);
-			poll->head = NULL;
+			io_poll_remove_waitq(poll);
 			if (wqe_is_double(wait))
 				req->flags &= ~REQ_F_DOUBLE_POLL;
 			else
@@ -937,12 +938,17 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
 		/* successfully updated, don't complete poll request */
-		if (!ret2 || ret2 == -EIOCBQUEUED)
+		if (ret2 == IOU_ISSUE_SKIP_COMPLETE)
 			goto out;
+		/* request completed as part of the update, complete it */
+		else if (ret2 == IOU_COMPLETE)
+			goto complete;
 	}
 
-	req_set_fail(preq);
 	io_req_set_res(preq, -ECANCELED, 0);
+complete:
+	if (preq->cqe.res < 0)
+		req_set_fail(preq);
 	preq->io_task_work.func = io_req_task_complete;
 	io_req_task_work_add(preq);
 out:
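
The new io_poll_remove_waitq() helper encodes an ordering rule: unlink the wait entry first, then clear poll->head with a release store, because the acquire load of head in io_poll_remove_entry() is what tells the other side whether it still needs the waitqueue lock at all. A minimal C11 sketch of that release/acquire publication pattern, using stand-in types rather than the io_uring structures (the real io_poll_remove_entry() path also holds the waitqueue spinlock):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct waitq { int dummy; };	/* stand-in for struct wait_queue_head */

struct poll_req {
	_Atomic(struct waitq *) head;	/* NULL once the entry is torn down */
	bool on_list;			/* stand-in for poll->wait.entry linkage */
};

/* Teardown side: unlink first, then publish "gone" with a release store. */
static void poll_remove_waitq(struct poll_req *p)
{
	p->on_list = false;	/* models list_del_init(&poll->wait.entry) */
	/*
	 * Must be last: once head reads back as NULL, the other side is free
	 * to complete and free the request without touching the waitqueue.
	 */
	atomic_store_explicit(&p->head, NULL, memory_order_release);
}

/* Removal side: an acquire load of head decides whether locking is needed. */
static void poll_remove_entry(struct poll_req *p)
{
	struct waitq *head = atomic_load_explicit(&p->head, memory_order_acquire);

	if (head) {
		/* the kernel takes head->lock here before calling the helper */
		poll_remove_waitq(p);
	}
	/* head == NULL: the waitqueue side already performed the teardown */
}

int main(void)
{
	static struct waitq wq;
	struct poll_req p;

	p.on_list = true;
	atomic_init(&p.head, &wq);
	poll_remove_entry(&p);	/* takes the head != NULL path */
	poll_remove_waitq(&p);	/* a second teardown is a harmless no-op here */
	return 0;
}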

--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c

@@ -1186,12 +1186,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		return -EBUSY;
 
 	nbufs = src_ctx->buf_table.nr;
+	if (!nbufs)
+		return -ENXIO;
 	if (!arg->nr)
 		arg->nr = nbufs;
 	else if (arg->nr > nbufs)
 		return -EINVAL;
 	else if (arg->nr > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
+	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
+		return -EOVERFLOW;
 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
 		return -EOVERFLOW;
 	if (nbufs > IORING_MAX_REG_BUFFERS)
@@ -1201,31 +1205,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	if (ret)
 		return ret;
 
-	/* Fill entries in data from dst that won't overlap with src */
+	/* Copy original dst nodes from before the cloned range */
 	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
-		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
 
-		if (src_node) {
-			data.nodes[i] = src_node;
-			src_node->refs++;
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
 		}
 	}
 
-	ret = -ENXIO;
-	nbufs = src_ctx->buf_table.nr;
-	if (!nbufs)
-		goto out_free;
-	ret = -EINVAL;
-	if (!arg->nr)
-		arg->nr = nbufs;
-	else if (arg->nr > nbufs)
-		goto out_free;
-	ret = -EOVERFLOW;
-	if (check_add_overflow(arg->nr, arg->src_off, &off))
-		goto out_free;
-	if (off > nbufs)
-		goto out_free;
-
 	off = arg->dst_off;
 	i = arg->src_off;
 	nr = arg->nr;
@@ -1238,8 +1227,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		} else {
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
-				ret = -ENOMEM;
-				goto out_free;
+				io_rsrc_data_free(ctx, &data);
+				return -ENOMEM;
 			}
 
 			refcount_inc(&src_node->buf->refs);
@@ -1249,6 +1238,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		i++;
 	}
 
+	/* Copy original dst nodes from after the cloned range */
+	for (i = nbufs; i < ctx->buf_table.nr; i++) {
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
+		}
+	}
+
 	/*
 	 * If asked for replace, put the old table. data->nodes[] holds both
 	 * old and new nodes at this point.
@@ -1265,10 +1264,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	WARN_ON_ONCE(ctx->buf_table.nr);
 	ctx->buf_table = data;
 	return 0;
-
-out_free:
-	io_rsrc_data_free(ctx, &data);
-	return ret;
 }
 
 /*
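
The buffer-cloning fix above is the second copy loop: when cloning into the middle of an existing destination table, the entries before the cloned range were preserved but those after it were dropped. A minimal sketch of the resulting three-phase copy (before-range, cloned range, after-range) over plain arrays, using a hypothetical clone_range() with refcounting and error handling omitted:

#include <stdio.h>

#define TABLE_MAX 16

/*
 * Build a new destination table of dst_nr entries: keep dst[0..dst_off),
 * splice in src[src_off..src_off+nr), and keep dst[dst_off+nr..dst_nr).
 * Entries are plain ints here; the kernel deals in refcounted rsrc nodes.
 */
static void clone_range(const int *dst, int dst_nr,
			const int *src, int src_off,
			int dst_off, int nr, int *out)
{
	int i;

	/* Copy original dst entries from before the cloned range */
	for (i = 0; i < dst_off && i < dst_nr; i++)
		out[i] = dst[i];

	/* The cloned range itself */
	for (i = 0; i < nr; i++)
		out[dst_off + i] = src[src_off + i];

	/* Copy original dst entries from after the cloned range (the fix) */
	for (i = dst_off + nr; i < dst_nr; i++)
		out[i] = dst[i];
}

int main(void)
{
	int dst[] = { 10, 11, 12, 13, 14, 15 };
	int src[] = { 100, 101, 102, 103 };
	int out[TABLE_MAX] = { 0 };
	int i;

	/* Replace dst entries 2..3 with src entries 1..2; entries 4..5 must survive. */
	clone_range(dst, 6, src, 1, 2, 2, out);
	for (i = 0; i < 6; i++)
		printf("%d ", out[i]);
	printf("\n");	/* prints: 10 11 101 102 14 15 */
	return 0;
}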