io_uring-6.19-20251208
-----BEGIN PGP SIGNATURE-----
iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmk3KXIQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpo91EACGlORRzg4FJXox8DcItdOQsZGFIqCXts9p
SVtbxV6sPdsHwRB/xGTzHWP2iWUjA4+i5l3n4mt8vzGAmQU50gtdaIsJEMq7SOfB
nJW0wNi905qcLihOfTpQ/2xpE5Am/iWPavFkAqOF7qo6GlS7aN47TIaHCPmAm3Nx
Kla2XMDnneFhl8xCdnJHaLrzyD94xlArywG5UPjkgFGCmLEu2ZE6T9ivq86DHQZJ
Ujy3ueMO/7SErfoDbY4I/gPs4ONxBaaieKycuyljQQB3n6sj15EBNB0TMDPA/Rwx
Aq4WD/MC48titpxV2BT9RKCjYvJ4wsBww4uFLkCTKDlFCRH0pqclzgtd2iB46kge
tj9KfTS9tkLBp9steMcw45FStu0iiHBwqqTcqUr1q/wzIPbPAQ/L/Mu6AlUOheW/
MmedhtPP22IShpkKYWSv923P2Qp2HhKa6LtoKJzxOK9rb6yoYvHl0zEQlKbWtPgq
lpGzjbBoCtjqwlQKTpcH8diwaZ/fafrIP4h80Hg1pRiQEwzBgDpA3/N0EcfigkmU
2IgyH3k6F9v/IgyVPkpzNh4w6hrr9RnxVA8yaf2ItkfWKwajWJAtPLUBuING8qqa
3xg1MZ27NS6gUKEdCEy/mAaz8Vt2SGRUc3szHYrZHy7OFEW94WoiKAYK9qsZXGzX
ms2VldIiQA==
=Mbok
-----END PGP SIGNATURE-----
Merge tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
"Followup set of fixes for io_uring for this merge window. These are
either later fixes, or cleanups that don't make sense to defer. This
pull request contains:
- Fix for a recent regression in io-wq worker creation
- Tracing cleanup
- Use READ_ONCE/WRITE_ONCE consistently for ring mapped kbufs. Mostly
for documentation purposes, indicating that they are shared with
userspace
- Fix for POLL_ADD losing a completion if the request is updated and
is now triggerable - eg, if POLLIN is set with the update and the
polled file is readable (a usage sketch follows the commit list below)
- In conjunction with the above fix, also unify how poll wait queue
entries are deleted with the head update. We had 3 different spots
doing both the list deletion and head write, with one of them
nicely documented. Abstract that into a helper and use it
consistently
- Small series from Joanne fixing an issue with buffer cloning, and
cleaning up the arg validation"
* tag 'io_uring-6.19-20251208' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
io_uring/poll: unify poll waitqueue entry and list removal
io_uring/kbuf: use WRITE_ONCE() for userspace-shared buffer ring fields
io_uring/kbuf: use READ_ONCE() for userspace-mapped memory
io_uring/rsrc: fix lost entries after cloned range
io_uring/rsrc: rename misleading src_node variable in io_clone_buffers()
io_uring/rsrc: clean up buffer cloning arg validation
io_uring/trace: rename io_uring_queue_async_work event "rw" field
io_uring/io-wq: always retry worker create on ERESTART*
io_uring/poll: correctly handle io_poll_add() return value on update
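
As a hedged illustration of the POLL_ADD update scenario fixed above, here is a
minimal userspace sketch using liburing. It assumes the liburing 2.1+ prototype of
io_uring_prep_poll_update() (u64 user_data values); the pipe-based setup and the
user_data values are invented for illustration, not taken from the patches. The
idea: a poll armed with a mask that never triggers is updated to POLLIN while the
fd is already readable; with the fix, the updated poll should post its completion
instead of losing it.

/* sketch only: build with -luring against a recent liburing */
#include <liburing.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fds[2];

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	if (pipe(fds))
		return 1;

	/* arm a poll on the read end for POLLOUT only, so it never triggers */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_add(sqe, fds[0], POLLOUT);
	sqe->user_data = 1;
	io_uring_submit(&ring);

	/* make the read end readable, then switch the armed poll over to POLLIN */
	write(fds[1], "x", 1);
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_poll_update(sqe, 1, 1, POLLIN, IORING_POLL_UPDATE_EVENTS);
	sqe->user_data = 2;
	io_uring_submit(&ring);

	/* expect a completion for the update and one for the now-readable poll */
	while (!io_uring_wait_cqe(&ring, &cqe)) {
		unsigned long long ud = cqe->user_data;

		printf("user_data=%llu res=%d\n", ud, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
		if (ud == 1)	/* the poll request itself completed */
			break;
	}
	io_uring_queue_exit(&ring);
	return 0;
}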
commit cfd4039213
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
@@ -133,15 +133,15 @@ TRACE_EVENT(io_uring_file_get,
  * io_uring_queue_async_work - called before submitting a new async work
  *
  * @req: pointer to a submitted request
- * @rw: type of workqueue, hashed or normal
+ * @hashed: whether async work is hashed
  *
  * Allows to trace asynchronous work submission.
  */
 TRACE_EVENT(io_uring_queue_async_work,
 
-	TP_PROTO(struct io_kiocb *req, int rw),
+	TP_PROTO(struct io_kiocb *req, bool hashed),
 
-	TP_ARGS(req, rw),
+	TP_ARGS(req, hashed),
 
 	TP_STRUCT__entry (
 		__field( void *, ctx )
@@ -150,7 +150,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__field( u8, opcode )
 		__field( unsigned long long, flags )
 		__field( struct io_wq_work *, work )
-		__field( int, rw )
+		__field( bool, hashed )
 
 		__string( op_str, io_uring_get_opcode(req->opcode) )
 	),
@@ -162,7 +162,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->flags = (__force unsigned long long) req->flags;
 		__entry->opcode = req->opcode;
 		__entry->work = &req->work;
-		__entry->rw = rw;
+		__entry->hashed = hashed;
 
 		__assign_str(op_str);
 	),
@@ -170,7 +170,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
 		__entry->ctx, __entry->req, __entry->user_data,
 		__get_str(op_str), __entry->flags,
-		__entry->rw ? "hashed" : "normal", __entry->work)
+		__entry->hashed ? "hashed" : "normal", __entry->work)
 );
 
 /**
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
@@ -805,11 +805,12 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err)
 	 */
 	if (fatal_signal_pending(current))
 		return false;
-	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
-		return false;
 
+	worker->init_retries++;
 	switch (err) {
 	case -EAGAIN:
+		return worker->init_retries <= WORKER_INIT_LIMIT;
+	/* Analogous to a fork() syscall, always retry on a restartable error */
 	case -ERESTARTSYS:
 	case -ERESTARTNOINTR:
 	case -ERESTARTNOHAND:
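
The io-wq hunk above changes the retry policy for worker creation: -EAGAIN remains
bounded by the init-retry limit, while the restartable errors are always retried.
Below is a standalone sketch of that decision, assuming invented error constants
and limit value purely for illustration; the real io_should_retry_thread() also
bails out when a fatal signal is pending.

#include <stdbool.h>
#include <stdio.h>

#define WORKER_INIT_LIMIT	3	/* stand-in value, not the kernel's */

enum {
	ERR_AGAIN = 1,		/* stand-in for -EAGAIN */
	ERR_RESTARTSYS = 2,	/* stand-in for -ERESTARTSYS and friends */
};

struct worker {
	int init_retries;
};

static bool should_retry_worker_create(struct worker *w, int err)
{
	w->init_retries++;
	switch (err) {
	case ERR_AGAIN:
		/* genuine resource pressure: give up after a few attempts */
		return w->init_retries <= WORKER_INIT_LIMIT;
	case ERR_RESTARTSYS:
		/* interrupted rather than failed: always worth retrying */
		return true;
	default:
		return false;
	}
}

int main(void)
{
	struct worker w = { 0 };
	int i;

	for (i = 0; i < 5; i++)
		printf("EAGAIN attempt %d: retry=%d\n", i + 1,
		       should_retry_worker_create(&w, ERR_AGAIN));

	w.init_retries = 0;
	for (i = 0; i < 5; i++)
		printf("ERESTARTSYS attempt %d: retry=%d\n", i + 1,
		       should_retry_worker_create(&w, ERR_RESTARTSYS));
	return 0;
}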
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
@@ -44,11 +44,11 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
 		buf_len -= this_len;
 		/* Stop looping for invalid buffer length of 0 */
 		if (buf_len || !this_len) {
-			buf->addr += this_len;
-			buf->len = buf_len;
+			WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
+			WRITE_ONCE(buf->len, buf_len);
 			return false;
 		}
-		buf->len = 0;
+		WRITE_ONCE(buf->len, 0);
 		bl->head++;
 		len -= this_len;
 	}
@@ -198,9 +198,9 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 	if (*len == 0 || *len > buf_len)
 		*len = buf_len;
 	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
-	req->buf_index = buf->bid;
+	req->buf_index = READ_ONCE(buf->bid);
 	sel.buf_list = bl;
-	sel.addr = u64_to_user_ptr(buf->addr);
+	sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
 
 	if (io_should_commit(req, issue_flags)) {
 		io_kbuf_commit(req, sel.buf_list, *len, 1);
@@ -280,7 +280,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 	if (!arg->max_len)
 		arg->max_len = INT_MAX;
 
-	req->buf_index = buf->bid;
+	req->buf_index = READ_ONCE(buf->bid);
 	do {
 		u32 len = READ_ONCE(buf->len);
 
@@ -291,11 +291,11 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 				arg->partial_map = 1;
 				if (iov != arg->iovs)
 					break;
-				buf->len = len;
+				WRITE_ONCE(buf->len, len);
 			}
 		}
 
-		iov->iov_base = u64_to_user_ptr(buf->addr);
+		iov->iov_base = u64_to_user_ptr(READ_ONCE(buf->addr));
 		iov->iov_len = len;
 		iov++;
 
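
The kbuf hunks above convert plain loads and stores of buffer-ring entries to
READ_ONCE()/WRITE_ONCE(), documenting that these fields live in memory mapped into
userspace and must be accessed as single, untorn operations. A minimal userspace
sketch of the same discipline follows, assuming simplified stand-ins for the kernel
macros and a struct that mirrors the UAPI struct io_uring_buf; it is an
illustration, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

/* simplified stand-ins for the kernel's READ_ONCE/WRITE_ONCE */
#define READ_ONCE(x)		(*(const volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))

struct ring_buf {		/* layout mirroring struct io_uring_buf (UAPI) */
	uint64_t addr;
	uint32_t len;
	uint16_t bid;
	uint16_t resv;
};

/* consume 'used' bytes of a partially used entry, in the spirit of io_kbuf_inc_commit() */
static void commit_partial(struct ring_buf *buf, uint32_t used)
{
	/* one load and one store per field; the other side may access them concurrently */
	WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + used);
	WRITE_ONCE(buf->len, READ_ONCE(buf->len) - used);
}

int main(void)
{
	struct ring_buf b = { .addr = 0x1000, .len = 4096, .bid = 7 };

	commit_partial(&b, 512);
	printf("bid=%u addr=0x%llx len=%u\n", (unsigned) b.bid,
	       (unsigned long long) b.addr, b.len);
	return 0;
}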
diff --git a/io_uring/poll.c b/io_uring/poll.c
@@ -138,14 +138,32 @@ static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
 	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }
 
+static void io_poll_remove_waitq(struct io_poll *poll)
+{
+	/*
+	 * If the waitqueue is being freed early but someone is already holds
+	 * ownership over it, we have to tear down the request as best we can.
+	 * That means immediately removing the request from its waitqueue and
+	 * preventing all further accesses to the waitqueue via the request.
+	 */
+	list_del_init(&poll->wait.entry);
+
+	/*
+	 * Careful: this *must* be the last step, since as soon as req->head is
+	 * NULL'ed out, the request can be completed and freed, since
+	 * io_poll_remove_entry() will no longer need to take the waitqueue
+	 * lock.
+	 */
+	smp_store_release(&poll->head, NULL);
+}
+
 static inline void io_poll_remove_entry(struct io_poll *poll)
 {
 	struct wait_queue_head *head = smp_load_acquire(&poll->head);
 
 	if (head) {
 		spin_lock_irq(&head->lock);
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
+		io_poll_remove_waitq(poll);
 		spin_unlock_irq(&head->lock);
 	}
 }
@@ -368,23 +386,7 @@ static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll)
 	io_poll_mark_cancelled(req);
 	/* we have to kick tw in case it's not already */
 	io_poll_execute(req, 0);
-
-	/*
-	 * If the waitqueue is being freed early but someone is already
-	 * holds ownership over it, we have to tear down the request as
-	 * best we can. That means immediately removing the request from
-	 * its waitqueue and preventing all further accesses to the
-	 * waitqueue via the request.
-	 */
-	list_del_init(&poll->wait.entry);
-
-	/*
-	 * Careful: this *must* be the last step, since as soon
-	 * as req->head is NULL'ed out, the request can be
-	 * completed and freed, since aio_poll_complete_work()
-	 * will no longer need to take the waitqueue lock.
-	 */
-	smp_store_release(&poll->head, NULL);
+	io_poll_remove_waitq(poll);
 	return 1;
 }
 
@@ -413,8 +415,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 
 	/* optional, saves extra locking for removal in tw handler */
 	if (mask && poll->events & EPOLLONESHOT) {
-		list_del_init(&poll->wait.entry);
-		poll->head = NULL;
+		io_poll_remove_waitq(poll);
 		if (wqe_is_double(wait))
 			req->flags &= ~REQ_F_DOUBLE_POLL;
 		else
@@ -937,12 +938,17 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret2 = io_poll_add(preq, issue_flags & ~IO_URING_F_UNLOCKED);
 		/* successfully updated, don't complete poll request */
-		if (!ret2 || ret2 == -EIOCBQUEUED)
+		if (ret2 == IOU_ISSUE_SKIP_COMPLETE)
 			goto out;
+		/* request completed as part of the update, complete it */
+		else if (ret2 == IOU_COMPLETE)
+			goto complete;
 	}
 
-	req_set_fail(preq);
 	io_req_set_res(preq, -ECANCELED, 0);
+complete:
+	if (preq->cqe.res < 0)
+		req_set_fail(preq);
 	preq->io_task_work.func = io_req_task_complete;
 	io_req_task_work_add(preq);
 out:
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
@@ -1186,12 +1186,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		return -EBUSY;
 
 	nbufs = src_ctx->buf_table.nr;
+	if (!nbufs)
+		return -ENXIO;
 	if (!arg->nr)
 		arg->nr = nbufs;
 	else if (arg->nr > nbufs)
 		return -EINVAL;
 	else if (arg->nr > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
+	if (check_add_overflow(arg->nr, arg->src_off, &off) || off > nbufs)
+		return -EOVERFLOW;
 	if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
 		return -EOVERFLOW;
 	if (nbufs > IORING_MAX_REG_BUFFERS)
@@ -1201,31 +1205,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	if (ret)
 		return ret;
 
-	/* Fill entries in data from dst that won't overlap with src */
+	/* Copy original dst nodes from before the cloned range */
 	for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
-		struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
 
-		if (src_node) {
-			data.nodes[i] = src_node;
-			src_node->refs++;
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
 		}
 	}
 
-	ret = -ENXIO;
-	nbufs = src_ctx->buf_table.nr;
-	if (!nbufs)
-		goto out_free;
-	ret = -EINVAL;
-	if (!arg->nr)
-		arg->nr = nbufs;
-	else if (arg->nr > nbufs)
-		goto out_free;
-	ret = -EOVERFLOW;
-	if (check_add_overflow(arg->nr, arg->src_off, &off))
-		goto out_free;
-	if (off > nbufs)
-		goto out_free;
-
 	off = arg->dst_off;
 	i = arg->src_off;
 	nr = arg->nr;
@@ -1238,8 +1227,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		} else {
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 			if (!dst_node) {
-				ret = -ENOMEM;
-				goto out_free;
+				io_rsrc_data_free(ctx, &data);
+				return -ENOMEM;
 			}
 
 			refcount_inc(&src_node->buf->refs);
@@ -1249,6 +1238,16 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		i++;
 	}
 
+	/* Copy original dst nodes from after the cloned range */
+	for (i = nbufs; i < ctx->buf_table.nr; i++) {
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+
+		if (node) {
+			data.nodes[i] = node;
+			node->refs++;
+		}
+	}
+
 	/*
 	 * If asked for replace, put the old table. data->nodes[] holds both
 	 * old and new nodes at this point.
@@ -1265,10 +1264,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	WARN_ON_ONCE(ctx->buf_table.nr);
 	ctx->buf_table = data;
 	return 0;
-
-out_free:
-	io_rsrc_data_free(ctx, &data);
-	return ret;
 }
 
 /*