From c03ce4173c7bffe1e7477f905a09b015d4000d3c Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Mon, 13 Apr 2026 09:08:14 +0800 Subject: [PATCH 01/17] fs: aio: set VMA_DONTCOPY_BIT in mmap to fix NULL-pointer-dereference error [BUG] Recently, our internal syzkaller testing uncovered a null pointer dereference issue: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... [ 51.111664] filemap_read_folio+0x25/0xe0 [ 51.112410] filemap_fault+0xad7/0x1250 [ 51.113112] __do_fault+0x4b/0x460 [ 51.113699] do_pte_missing+0x5bc/0x1db0 [ 51.114250] ? __pte_offset_map+0x23/0x170 [ 51.114822] __handle_mm_fault+0x9f8/0x1680 [ 51.115408] handle_mm_fault+0x24c/0x570 [ 51.115958] do_user_addr_fault+0x226/0xa50 ... Crash analysis showed the file involved was an AIO ring file. [CAUSE] PARENT process CHILD process t=0 io_setup(1, &ctx) [access ctx addr] fork() io_destroy vm_munmap // not affect child vma percpu_ref_put ... put_aio_ring_file t=1 [access ctx addr] // pagefault ... __do_fault filemap_fault max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) t=2 truncate_setsize truncate_pagecache t=3 filemap_get_folio // no folio, create folio __filemap_get_folio(..., FGP_CREAT, ...) // page_not_uptodate filemap_read_folio(file, mapping->a_ops->read_folio, folio) // oops! At t=0, the parent process calls io_setup and then fork. The child process gets its own VMA but without any PTEs. The parent then calls io_destroy. Before i_size is truncated to 0, at t=1 the child process accesses this AIO ctx address and triggers a pagefault. After the max_idx check passes, at t=2 the parent calls truncate_setsize and truncate_pagecache. At t=3 the child fails to obtain the folio, falls into the "page_not_uptodate" path, and hits this problem because AIO does not implement "read_folio". [Fix] Fix this by marking the AIO ring buffer VMA with VM_DONTCOPY so that fork()'s dup_mmap() skips it entirely. 
These are the correct semantics because: 1) The child's ioctx_table is already reset to NULL by mm_init_aio() during fork(), so the child has no AIO context and no way to perform any AIO operations on this mapping. 2) The AIO ring VMA is only meaningful in conjunction with its associated kioctx, which is never inherited across fork(). So a child process with no AIO context has no legitimate reason to access the ring buffer. Delivering SIGSEGV on such an erroneous access is preferable to a kernel crash. Signed-off-by: Zizhi Wo Link: https://patch.msgid.link/20260413010814.548568-1-wozizhi@huawei.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index ba9b9fa2446b..d7910c7c93a6 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -447,7 +447,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = { static int aio_ring_mmap_prepare(struct vm_area_desc *desc) { - vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT); + vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTCOPY_BIT); desc->vm_ops = &aio_ring_vm_ops; return 0; } From 6689f01d6740cf358932b3e97ee968c6099800d9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Apr 2026 11:36:19 +0200 Subject: [PATCH 02/17] writeback: Fix use after free in inode_switch_wbs_work_fn() inode_switch_wbs_work_fn() has a loop like: wb_get(new_wb); while (1) { list = llist_del_all(&new_wb->switch_wbs_ctxs); /* Nothing to do? */ if (!list) break; ... process the items ... } Now adding of items to the list looks like: wb_queue_isw() if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) queue_work(isw_wq, &wb->switch_work); Because inode_switch_wbs_work_fn() loops when processing isw items, it can happen that wb->switch_work is pending while wb->switch_wbs_ctxs is empty. This is a problem because in that case wb can get freed (no isw items -> no wb reference) while the work is still pending causing use-after-free issues.
We cannot just fix this by cancelling work when freeing wb because that could still trigger problematic 0 -> 1 transitions on wb refcount due to wb_get() in inode_switch_wbs_work_fn(). It could be all handled with more careful code but that seems unnecessarily complex so let's avoid that until it is proven that the looping actually brings practical benefit. Just remove the loop from inode_switch_wbs_work_fn() instead. That way when wb_queue_isw() queues work, we are guaranteed we have added the first item to wb->switch_wbs_ctxs and nobody is going to remove it (and drop the wb reference it holds) until the queued work runs. Fixes: e1b849cfa6b6 ("writeback: Avoid contention on wb->list_lock when switching inodes") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260413093618.17244-2-jack@suse.cz Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3c75ee025bda..d63baa1b6fec 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -570,28 +570,30 @@ void inode_switch_wbs_work_fn(struct work_struct *work) struct inode_switch_wbs_context *isw, *next_isw; struct llist_node *list; + list = llist_del_all(&new_wb->switch_wbs_ctxs); /* - * Grab out reference to wb so that it cannot get freed under us + * Nothing to do? That would be a problem as references held by isw + * items protect wb from freeing... + */ + if (WARN_ON_ONCE(!list)) + return; + + /* + * Grab our reference to wb so that it cannot get freed under us * after we process all the isw items. */ wb_get(new_wb); - while (1) { - list = llist_del_all(&new_wb->switch_wbs_ctxs); - /* Nothing to do? */ - if (!list) - break; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH - * tells the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. 
- * Let's continue after I_WB_SWITCH is guaranteed to be - * visible. - */ - synchronize_rcu(); + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible. + */ + synchronize_rcu(); - llist_for_each_entry_safe(isw, next_isw, list, list) - process_inode_switch_wbs(new_wb, isw); - } + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); wb_put(new_wb); } From 51a8de6c50bf947c8f534cd73da4c8f0a13e7bed Mon Sep 17 00:00:00 2001 From: Samuel Page Date: Mon, 20 Apr 2026 11:01:37 +0200 Subject: [PATCH 03/17] fuse: reject oversized dirents in page cache fuse_add_dirent_to_cache() computes a serialized dirent size from the server-controlled namelen field and copies the dirent into a single page-cache page. The existing logic only checks whether the dirent fits in the remaining space of the current page and advances to a fresh page if not. It never checks whether the dirent itself exceeds PAGE_SIZE. As a result, a malicious FUSE server can return a dirent with namelen=4095, producing a serialized record size of 4120 bytes. On 4 KiB page systems this causes memcpy() to overflow the cache page by 24 bytes into the following kernel page. Reject dirents that cannot fit in a single page before copying them into the readdir cache. 
Fixes: 69e34551152a ("fuse: allow caching readdir") Cc: stable@vger.kernel.org # v6.16+ Assisted-by: Bynario AI Signed-off-by: Samuel Page Reported-by: Qi Tang Reported-by: Zijun Hu Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260420090139.662772-1-mszeredi@redhat.com Signed-off-by: Christian Brauner --- fs/fuse/readdir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index c2aae2eef086..aae657fd56c0 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -41,6 +41,10 @@ static void fuse_add_dirent_to_cache(struct file *file, unsigned int offset; void *addr; + /* Dirent doesn't fit in readdir cache page? Skip caching. */ + if (reclen > PAGE_SIZE) + return; + spin_lock(&fi->rdc.lock); /* * Is cache already completed? Or this entry does not go at the end of From 3adf7ae18bf42601246031002287c103a27df307 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Sat, 18 Apr 2026 14:06:34 +0800 Subject: [PATCH 04/17] fs: aio: reject partial mremap to avoid Null-pointer-dereference error [BUG] Recently, our internal syzkaller testing uncovered a null pointer dereference issue: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... [ 51.111664] filemap_read_folio+0x25/0xe0 [ 51.112410] filemap_fault+0xad7/0x1250 [ 51.113112] __do_fault+0x4b/0x460 [ 51.113699] do_pte_missing+0x5bc/0x1db0 [ 51.114250] ? __pte_offset_map+0x23/0x170 [ 51.114822] __handle_mm_fault+0x9f8/0x1680 ... Crash analysis showed the file involved was an AIO ring file. The phenomenon triggered is the same as the issue described in [1]. [CAUSE] Consider the following scenario: userspace sets up an AIO context via io_setup(), which creates a VMA covering the entire ring buffer. Then userspace calls mremap() with the AIO ring address as the source, a smaller old_len (less than the full ring size), MREMAP_MAYMOVE set, and without MREMAP_DONTUNMAP. The kernel will relocate the requested portion to a new destination address. 
During this move, __split_vma() splits the original AIO ring VMA. The requested portion is unmapped from the source and re-established at the destination, while the remainder stays at the original source address as an orphan VMA. The aio_ring_mremap() callback fires on the new destination VMA, updating ctx->mmap_base to the destination address. But the callback is unaware that only a partial region was moved and that an orphan VMA still exists at the source: source(AIO): +-------------------+---------------------+ | moved to dest | orphan VMA (AIO) | +-------------------+---------------------+ A A+partial_len A+ctx->mmap_size dest: +-------------------+ | moved VMA (AIO) | +-------------------+ B B+partial_len Later, io_destroy() calls vm_munmap(ctx->mmap_base, ctx->mmap_size), which unmaps the destination. This not only fails to unmap the orphan VMA at the source, but also overshoots the destination VMA and may unmap unrelated mappings adjacent to it! After put_aio_ring_file() calls truncate_setsize() to remove all pages from the pagecache, any subsequent access to the orphan VMA triggers filemap_fault(), which calls a_ops->read_folio(). Since aio does not implement read_folio, this results in a NULL pointer dereference. [FIX] Note that expanding mremap (new_len > old_len) is already rejected because AIO ring VMAs are created with VM_DONTEXPAND. The only problematic case is a partial move where "old_len == new_len" but both are smaller than the full ring size. Fix this by checking in aio_ring_mremap() that the new VMA covers the entire ring. This ensures the AIO ring is always moved as a whole, preventing orphan VMAs and the subsequent crash. 
[1]: https://lore.kernel.org/all/20260413010814.548568-1-wozizhi@huawei.com/ Signed-off-by: Zizhi Wo Link: https://patch.msgid.link/20260418060634.3713620-1-wozizhi@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/aio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index d7910c7c93a6..722476560848 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -422,7 +422,8 @@ static int aio_ring_mremap(struct vm_area_struct *vma) ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { - if (!atomic_read(&ctx->dead)) { + if (!atomic_read(&ctx->dead) && + (ctx->mmap_size == (vma->vm_end - vma->vm_start))) { ctx->user_id = ctx->mmap_base = vma->vm_start; res = 0; } From 43eb354ecb471426e97b0ce6a0c922ec20f82027 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 16 Apr 2026 14:54:29 -0700 Subject: [PATCH 05/17] nstree: fix func. parameter kernel-doc warnings Use the correct parameter name ("__ns") for function parameter kernel-doc to avoid 3 warnings: Warning: include/linux/nstree.h:68 function parameter '__ns' not described in 'ns_tree_add_raw' Warning: include/linux/nstree.h:77 function parameter '__ns' not described in 'ns_tree_add' Warning: include/linux/nstree.h:88 function parameter '__ns' not described in 'ns_tree_remove' Fixes: 885fc8ac0a4d ("nstree: make iterator generic") Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260416215429.948898-1-rdunlap@infradead.org Signed-off-by: Christian Brauner --- include/linux/nstree.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 175e4625bfa6..5b64d4572881 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -61,7 +61,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t /** * ns_tree_add_raw - Add a namespace to a namespace - * @ns: Namespace to add + * @__ns: Namespace to add * * This function adds a 
namespace to the appropriate namespace tree * without assigning a id. @@ -70,7 +70,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t /** * ns_tree_add - Add a namespace to a namespace tree - * @ns: Namespace to add + * @__ns: Namespace to add * * This function assigns a new id to the namespace and adds it to the * appropriate namespace tree and list. @@ -81,7 +81,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t /** * ns_tree_remove - Remove a namespace from a namespace tree - * @ns: Namespace to remove + * @__ns: Namespace to remove * * This function removes a namespace from the appropriate namespace * tree and list. From 9a466382c5e1ab706e155914e5532c80c2f3f76c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Apr 2026 11:03:12 +0200 Subject: [PATCH 06/17] fs: Handle multiply claimed blocks more gracefully with mmb When a metadata block is referenced by multiple inodes and tracked by metadata bh infrastructure (which is forbidden and generally indicates filesystem corruption), it can happen that mmb_mark_buffer_dirty() is called for two different mmb structures in parallel. This can lead to a corruption of mmb linked list. Handle that situation gracefully (at least from mmb POV) by serializing on setting bh->b_mmb. Reported-by: Ruikai Peng Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260423090311.10955-2-jack@suse.cz Signed-off-by: Christian Brauner --- fs/buffer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index e6980dab1a7f..770a5d89277c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -719,8 +719,15 @@ void mmb_mark_buffer_dirty(struct buffer_head *bh, mark_buffer_dirty(bh); if (!bh->b_mmb) { spin_lock(&mmb->lock); + /* + * For a corrupted filesystem with multiply claimed blocks this + * can fail. Avoid corrupting the linked list in that case. 
+ */ + if (cmpxchg(&bh->b_mmb, NULL, mmb) != NULL) { + spin_unlock(&mmb->lock); + return; + } list_move_tail(&bh->b_assoc_buffers, &mmb->list); - bh->b_mmb = mmb; spin_unlock(&mmb->lock); } } From 3d9fd0abc94d8cd430cc7cd7d37ce5e5aae2cd2b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:04 +0200 Subject: [PATCH 07/17] eventpoll: use hlist_is_singular_node() in __ep_remove() Replace the open-coded "epi is the only entry in file->f_ep" check with hlist_is_singular_node(). Same semantics, and the helper avoids the head-cacheline access in the common false case. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-1-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 23f3c6ac0bad..4e8440994277 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -856,7 +856,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; - if (head->first == &epi->fllink && !epi->fllink.next) { + if (hlist_is_singular_node(&epi->fllink, head)) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { From 0f7bdfd413000985de09fc39eb9efa1e091a3ce0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:05 +0200 Subject: [PATCH 08/17] eventpoll: split __ep_remove() Split __ep_remove() to delineate file removal from epoll item removal. 
Suggested-by: Linus Torvalds Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-2-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4e8440994277..27839a4446be 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -826,6 +826,9 @@ static void ep_free(struct eventpoll *ep) kfree_rcu(ep, rcu); } +static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file); +static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi); + /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. @@ -837,8 +840,6 @@ static void ep_free(struct eventpoll *ep) static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; - struct epitems_head *to_free; - struct hlist_head *head; lockdep_assert_irqs_enabled(); @@ -854,8 +855,21 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) return false; } - to_free = NULL; - head = file->f_ep; + __ep_remove_file(ep, epi, file); + return __ep_remove_epi(ep, epi); +} + +/* + * Called with &file->f_lock held, + * returns with it released + */ +static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file) +{ + struct epitems_head *to_free = NULL; + struct hlist_head *head = file->f_ep; + + lockdep_assert_held(&ep->mtx); + if (hlist_is_singular_node(&epi->fllink, head)) { /* See eventpoll_release() for details. 
*/ WRITE_ONCE(file->f_ep, NULL); @@ -869,6 +883,11 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); +} + +static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi) +{ + lockdep_assert_held(&ep->mtx); rb_erase_cached(&epi->rbn, &ep->rbr); From e9e5cd40d7c403e19f21d0f7b8b8ba3a76b58330 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:06 +0200 Subject: [PATCH 09/17] eventpoll: kill __ep_remove() Remove the boolean conditional in __ep_remove() and restructure the code so the checks for racing with eventpoll_release_file() are only done in the ep_remove_safe() path where they belong. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-3-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 67 ++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 27839a4446be..aae1ef7a3f16 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -826,49 +826,18 @@ static void ep_free(struct eventpoll *ep) kfree_rcu(ep, rcu); } -static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file); -static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi); - -/* - * Removes a "struct epitem" from the eventpoll RB tree and deallocates - * all the associated resources. Must be called with "mtx" held. - * If the dying flag is set, do the removal only if force is true. - * This prevents ep_clear_and_put() from dropping all the ep references - * while running concurrently with eventpoll_release_file(). - * Returns true if the eventpoll can be disposed. - */ -static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) -{ - struct file *file = epi->ffd.file; - - lockdep_assert_irqs_enabled(); - - /* - * Removes poll wait queue hooks.
- */ - ep_unregister_pollwait(ep, epi); - - /* Remove the current item from the list of epoll hooks */ - spin_lock(&file->f_lock); - if (epi->dying && !force) { - spin_unlock(&file->f_lock); - return false; - } - - __ep_remove_file(ep, epi, file); - return __ep_remove_epi(ep, epi); -} - /* * Called with &file->f_lock held, * returns with it released */ -static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file) +static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, + struct file *file) { struct epitems_head *to_free = NULL; struct hlist_head *head = file->f_ep; lockdep_assert_held(&ep->mtx); + lockdep_assert_held(&file->f_lock); if (hlist_is_singular_node(&epi->fllink, head)) { /* See eventpoll_release() for details. */ @@ -915,7 +884,25 @@ static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi) */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { - if (__ep_remove(ep, epi, false)) + struct file *file = epi->ffd.file; + + lockdep_assert_irqs_enabled(); + lockdep_assert_held(&ep->mtx); + + ep_unregister_pollwait(ep, epi); + + /* sync with eventpoll_release_file() */ + if (unlikely(READ_ONCE(epi->dying))) + return; + + spin_lock(&file->f_lock); + if (epi->dying) { + spin_unlock(&file->f_lock); + return; + } + __ep_remove_file(ep, epi, file); + + if (__ep_remove_epi(ep, epi)) WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } @@ -1147,7 +1134,7 @@ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); - epi->dying = true; + WRITE_ONCE(epi->dying, true); spin_unlock(&file->f_lock); /* @@ -1156,7 +1143,13 @@ again: */ ep = epi->ep; mutex_lock(&ep->mtx); - dispose = __ep_remove(ep, epi, true); + + ep_unregister_pollwait(ep, epi); + + spin_lock(&file->f_lock); + __ep_remove_file(ep, epi, file); + dispose = __ep_remove_epi(ep, epi); + mutex_unlock(&ep->mtx); if (dispose && ep_refcount_dec_and_test(ep)) From 
0feaf644f7180c4a91b6b405a881afbfd958f1cf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 24 Apr 2026 00:23:18 +0200 Subject: [PATCH 10/17] eventpoll: drop vestigial __ prefix from ep_remove_{file,epi}() With __ep_remove() gone, the double-underscore on __ep_remove_file() and __ep_remove_epi() no longer contrasts with a __-less parent and just reads as noise. Rename both to ep_remove_file() and ep_remove_epi(). No functional change. Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aae1ef7a3f16..c9940d50c3fe 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -830,7 +830,7 @@ static void ep_free(struct eventpoll *ep) * Called with &file->f_lock held, * returns with it released */ -static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, +static void ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file) { struct epitems_head *to_free = NULL; @@ -854,7 +854,7 @@ static void __ep_remove_file(struct eventpoll *ep, struct epitem *epi, free_ephead(to_free); } -static bool __ep_remove_epi(struct eventpoll *ep, struct epitem *epi) +static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) { lockdep_assert_held(&ep->mtx); @@ -900,9 +900,9 @@ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) spin_unlock(&file->f_lock); return; } - __ep_remove_file(ep, epi, file); + ep_remove_file(ep, epi, file); - if (__ep_remove_epi(ep, epi)) + if (ep_remove_epi(ep, epi)) WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } @@ -1147,8 +1147,8 @@ again: ep_unregister_pollwait(ep, epi); spin_lock(&file->f_lock); - __ep_remove_file(ep, epi, file); - dispose = __ep_remove_epi(ep, epi); + ep_remove_file(ep, epi, file); + dispose = ep_remove_epi(ep, epi); mutex_unlock(&ep->mtx); From 0bade234723e40e4937be912e105785d6a51464e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 
11:56:07 +0200 Subject: [PATCH 11/17] eventpoll: rename ep_remove_safe() back to ep_remove() The current name is just confusing and doesn't clarify anything. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-4-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c9940d50c3fe..f9b601f5c0ad 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -882,7 +882,7 @@ static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) /* * ep_remove variant for callers owing an additional reference to the ep */ -static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) +static void ep_remove(struct eventpoll *ep, struct epitem *epi) { struct file *file = epi->ffd.file; @@ -929,7 +929,7 @@ static void ep_clear_and_put(struct eventpoll *ep) /* * Walks through the whole tree and try to free each "struct epitem". - * Note that ep_remove_safe() will not remove the epitem in case of a + * Note that ep_remove() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. * Since we still own a reference to the eventpoll struct, the loop can't @@ -938,7 +938,7 @@ static void ep_clear_and_put(struct eventpoll *ep) for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); - ep_remove_safe(ep, epi); + ep_remove(ep, epi); cond_resched(); } @@ -1631,21 +1631,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_unlock(&tep->mtx); /* - * ep_remove_safe() calls in the later error paths can't lead to + * ep_remove() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. 
*/ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { - ep_remove_safe(ep, epi); + ep_remove(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { - ep_remove_safe(ep, epi); + ep_remove(ep, epi); return error; } } @@ -1669,7 +1669,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, * high memory pressure. */ if (unlikely(!epq.epi)) { - ep_remove_safe(ep, epi); + ep_remove(ep, epi); return -ENOMEM; } @@ -2364,7 +2364,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, * The eventpoll itself is still alive: the refcount * can't go to zero here. */ - ep_remove_safe(ep, epi); + ep_remove(ep, epi); error = 0; } else { error = -ENOENT; From 86e87059e6d1fd5115a31949726450ed03c1073b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:08 +0200 Subject: [PATCH 12/17] eventpoll: move epi_fget() up We'll need it when removing files so move it up. No functional change. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-5-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 56 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index f9b601f5c0ad..5ee4398a6cb8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -826,6 +826,34 @@ static void ep_free(struct eventpoll *ep) kfree_rcu(ep, rcu); } +/* + * The ffd.file pointer may be in the process of being torn down due to + * being closed, but we may not have finished eventpoll_release() yet. + * + * Normally, even with the atomic_long_inc_not_zero, the file may have + * been free'd and then gotten re-allocated to something else (since + * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). 
+ * + * But for epoll, users hold the ep->mtx mutex, and as such any file in + * the process of being free'd will block in eventpoll_release_file() + * and thus the underlying file allocation will not be free'd, and the + * file re-use cannot happen. + * + * For the same reason we can avoid a rcu_read_lock() around the + * operation - 'ffd.file' cannot go away even if the refcount has + * reached zero (but we must still not call out to ->poll() functions + * etc). + */ +static struct file *epi_fget(const struct epitem *epi) +{ + struct file *file; + + file = epi->ffd.file; + if (!file_ref_get(&file->f_ref)) + file = NULL; + return file; +} + /* * Called with &file->f_lock held, * returns with it released @@ -1018,34 +1046,6 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep return res; } -/* - * The ffd.file pointer may be in the process of being torn down due to - * being closed, but we may not have finished eventpoll_release() yet. - * - * Normally, even with the atomic_long_inc_not_zero, the file may have - * been free'd and then gotten re-allocated to something else (since - * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). - * - * But for epoll, users hold the ep->mtx mutex, and as such any file in - * the process of being free'd will block in eventpoll_release_file() - * and thus the underlying file allocation will not be free'd, and the - * file re-use cannot happen. - * - * For the same reason we can avoid a rcu_read_lock() around the - * operation - 'ffd.file' cannot go away even if the refcount has - * reached zero (but we must still not call out to ->poll() functions - * etc). 
- */ -static struct file *epi_fget(const struct epitem *epi) -{ - struct file *file; - - file = epi->ffd.file; - if (!file_ref_get(&file->f_ref)) - file = NULL; - return file; -} - /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() From a6dc643c69311677c574a0f17a3f4d66a5f3744b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:09 +0200 Subject: [PATCH 13/17] eventpoll: fix ep_remove struct eventpoll / struct file UAF ep_remove() (via ep_remove_file()) cleared file->f_ep under file->f_lock but then kept using @file inside the critical section (is_file_epoll(), hlist_del_rcu() through the head, spin_unlock). A concurrent __fput() taking the eventpoll_release() fastpath in that window observed the transient NULL, skipped eventpoll_release_file() and ran to f_op->release / file_free(). For the epoll-watches-epoll case, f_op->release is ep_eventpoll_release() -> ep_clear_and_put() -> ep_free(), which kfree()s the watched struct eventpoll. Its embedded ->refs hlist_head is exactly where epi->fllink.pprev points, so the subsequent hlist_del_rcu()'s "*pprev = next" scribbles into freed kmalloc-192 memory. In addition, struct file is SLAB_TYPESAFE_BY_RCU, so the slot backing @file could be recycled by alloc_empty_file() -- reinitializing f_lock and f_ep -- while ep_remove() is still nominally inside that lock. The upshot is an attacker-controllable kmem_cache_free() against the wrong slab cache. Pin @file via epi_fget() at the top of ep_remove() and gate the critical section on the pin succeeding. With the pin held @file cannot reach refcount zero, which holds __fput() off and transitively keeps the watched struct eventpoll alive across the hlist_del_rcu() and the f_lock use, closing both UAFs. If the pin fails @file has already reached refcount zero and its __fput() is in flight. 
Because we bailed before clearing f_ep, that path takes the eventpoll_release() slow path into eventpoll_release_file() and blocks on ep->mtx until the waiter side's ep_clear_and_put() drops it. The bailed epi's share of ep->refcount stays intact, so the trailing ep_refcount_dec_and_test() in ep_clear_and_put() cannot free the eventpoll out from under eventpoll_release_file(); the orphaned epi is then cleaned up there. A successful pin also proves we are not racing eventpoll_release_file() on this epi, so drop the now-redundant re-check of epi->dying under f_lock. The cheap lockless READ_ONCE(epi->dying) fast-path bailout stays. Fixes: 58c9b016e128 ("epoll: use refcount to reduce ep_mutex contention") Reported-by: Jaeyoung Chung Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-6-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 5ee4398a6cb8..0f785c0a1544 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -912,22 +912,26 @@ static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) */ static void ep_remove(struct eventpoll *ep, struct epitem *epi) { - struct file *file = epi->ffd.file; + struct file *file __free(fput) = NULL; lockdep_assert_irqs_enabled(); lockdep_assert_held(&ep->mtx); ep_unregister_pollwait(ep, epi); - /* sync with eventpoll_release_file() */ + /* cheap sync with eventpoll_release_file() */ if (unlikely(READ_ONCE(epi->dying))) return; - spin_lock(&file->f_lock); - if (epi->dying) { - spin_unlock(&file->f_lock); + /* + * If we manage to grab a reference it means we're not in + * eventpoll_release_file() and aren't going to be. 
+ */ + file = epi_fget(epi); + if (!file) return; - } + + spin_lock(&file->f_lock); ep_remove_file(ep, epi, file); if (ep_remove_epi(ep, epi)) From d30deeb8b0cf6259785c1fb79b87905d281b0a5a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:10 +0200 Subject: [PATCH 14/17] eventpoll: move f_lock acquisition into ep_remove_file() Let the helper own its critical section end-to-end: take &file->f_lock at the top, read file->f_ep inside the lock, release on exit. Callers (ep_remove() and eventpoll_release_file()) no longer need to wrap the call, and the function-comment lock-handoff contract is gone. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-7-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0f785c0a1544..3f99ff54626f 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -855,18 +855,18 @@ static struct file *epi_fget(const struct epitem *epi) } /* - * Called with &file->f_lock held, - * returns with it released + * Takes &file->f_lock; returns with it released. */ static void ep_remove_file(struct eventpoll *ep, struct epitem *epi, struct file *file) { struct epitems_head *to_free = NULL; - struct hlist_head *head = file->f_ep; + struct hlist_head *head; lockdep_assert_held(&ep->mtx); - lockdep_assert_held(&file->f_lock); + spin_lock(&file->f_lock); + head = file->f_ep; if (hlist_is_singular_node(&epi->fllink, head)) { /* See eventpoll_release() for details. 
*/ WRITE_ONCE(file->f_ep, NULL); @@ -931,7 +931,6 @@ static void ep_remove(struct eventpoll *ep, struct epitem *epi) if (!file) return; - spin_lock(&file->f_lock); ep_remove_file(ep, epi, file); if (ep_remove_epi(ep, epi)) @@ -1150,7 +1149,6 @@ again: ep_unregister_pollwait(ep, epi); - spin_lock(&file->f_lock); ep_remove_file(ep, epi, file); dispose = ep_remove_epi(ep, epi); From 33e92e9ecf48c08cb4807e9a36f9eb01619c1a1e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:11 +0200 Subject: [PATCH 15/17] eventpoll: refresh eventpoll_release() fast-path comment The old comment justified the lockless READ_ONCE(file->f_ep) check with "False positives simply cannot happen because the file is on the way to be removed and nobody ( but eventpoll ) has still a reference to this file." That reasoning was the root of the UAF fixed in "eventpoll: fix ep_remove struct eventpoll / struct file UAF": __ep_remove() could clear f_ep while another close raced past the fast path and freed the watched eventpoll / recycled the struct file slot. With ep_remove() now pinning @file via epi_fget() across the f_ep clear and hlist_del_rcu(), the invariant is re-established for the right reason: anyone who might clear f_ep holds @file alive for the duration, so a NULL observation really does mean no concurrent eventpoll path has work left on this file. Refresh the comment accordingly so the next reader doesn't inherit the broken model. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-8-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- include/linux/eventpoll.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index ea9ca0e4172a..728fb5dee5ed 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -39,12 +39,16 @@ static inline void eventpoll_release(struct file *file) { /* - * Fast check to avoid the get/release of the semaphore. 
Since - * we're doing this outside the semaphore lock, it might return - * false negatives, but we don't care. It'll help in 99.99% of cases - * to avoid the semaphore lock. False positives simply cannot happen - * because the file in on the way to be removed and nobody ( but - * eventpoll ) has still a reference to this file. + * Fast check to skip the slow path in the common case where the + * file was never attached to an epoll. Safe without file->f_lock + * because every f_ep writer excludes a concurrent __fput() on + * @file: + * - ep_insert() requires the file alive (refcount > 0); + * - ep_remove() holds @file pinned via epi_fget() across the + * write; + * - eventpoll_release_file() runs from __fput() itself. + * We are in __fput() here, so none of those can race us: a NULL + * observation truly means no epoll path has work left on @file. */ if (likely(!READ_ONCE(file->f_ep))) return; From 3a4551ea9c042502019b1d8a986e962cb9015366 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:12 +0200 Subject: [PATCH 16/17] eventpoll: drop dead bool return from ep_remove_epi() ep_remove_epi() always returns true -- the "can be disposed" answer was meaningful back when the dying-check lived inside the pre-split __ep_remove(), but after that check moved to ep_remove() the return value is just noise. Both callers gate on it unconditionally: if (ep_remove_epi(ep, epi)) WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); dispose = ep_remove_epi(ep, epi); ... if (dispose && ep_refcount_dec_and_test(ep)) ep_free(ep); Make ep_remove_epi() return void, drop the dispose local in eventpoll_release_file(), and the useless conditionals at both callers. No functional change. 
Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-9-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3f99ff54626f..eeaadb000eee 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -882,7 +882,7 @@ static void ep_remove_file(struct eventpoll *ep, struct epitem *epi, free_ephead(to_free); } -static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) +static void ep_remove_epi(struct eventpoll *ep, struct epitem *epi) { lockdep_assert_held(&ep->mtx); @@ -904,7 +904,6 @@ static bool ep_remove_epi(struct eventpoll *ep, struct epitem *epi) kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); - return true; } /* @@ -932,9 +931,8 @@ static void ep_remove(struct eventpoll *ep, struct epitem *epi) return; ep_remove_file(ep, epi, file); - - if (ep_remove_epi(ep, epi)) - WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); + ep_remove_epi(ep, epi); + WARN_ON_ONCE(ep_refcount_dec_and_test(ep)); } static void ep_clear_and_put(struct eventpoll *ep) @@ -1126,7 +1124,6 @@ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; - bool dispose; /* * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from @@ -1150,11 +1147,11 @@ again: ep_unregister_pollwait(ep, epi); ep_remove_file(ep, epi, file); - dispose = ep_remove_epi(ep, epi); + ep_remove_epi(ep, epi); mutex_unlock(&ep->mtx); - if (dispose && ep_refcount_dec_and_test(ep)) + if (ep_refcount_dec_and_test(ep)) ep_free(ep); goto again; } From 07422c948f4bdf15567a129a0983f7c12e57ba8e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 23 Apr 2026 11:56:13 +0200 Subject: [PATCH 17/17] eventpoll: drop vestigial epi->dying flag With ep_remove() now pinning @file via epi_fget() across the f_ep clear and hlist_del_rcu(), the dying flag no longer orchestrates anything: it was set in eventpoll_release_file() 
(which only runs from __fput(), i.e. after @file's refcount has reached zero) and read in __ep_remove() / ep_remove() as a cheap bail before attempting the same synchronization epi_fget() now provides unconditionally. The implication is simple: epi->dying == true always coincides with file_ref_get(&file->f_ref) == false, because __fput() is reachable only once the refcount hits zero and the refcount is monotone in that state. The READ_ONCE(epi->dying) in ep_remove() therefore selects exactly the same callers that epi_fget() would reject, just one atomic cheaper. That's not worth a struct field, a second coordination mechanism, and the comments on both. Refresh the eventpoll_release_file() comment to describe what actually makes the path race-free now (the pin in ep_remove()). No functional change: the correctness argument is unchanged, only the mechanism is now a single one instead of two. Link: https://patch.msgid.link/20260423-work-epoll-uaf-v1-10-2470f9eec0f5@kernel.org Signed-off-by: Christian Brauner (Amutable) --- fs/eventpoll.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eeaadb000eee..a3090b446af1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -148,13 +148,6 @@ struct epitem { /* The file descriptor information this item refers to */ struct epoll_filefd ffd; - /* - * Protected by file->f_lock, true for to-be-released epitem already - * removed from the "struct file" items list; together with - * eventpoll->refcount orchestrates "struct eventpoll" disposal - */ - bool dying; - /* List containing poll wait queues */ struct eppoll_entry *pwqlist; @@ -220,10 +213,7 @@ struct eventpoll { struct hlist_head refs; u8 loop_check_depth; - /* - * usage count, used together with epitem->dying to - * orchestrate the disposal of this struct - */ + /* usage count, orchestrates "struct eventpoll" disposal */ refcount_t refcount; /* used to defer freeing past 
ep_get_upwards_depth_proc() RCU walk */ @@ -918,13 +908,10 @@ static void ep_remove(struct eventpoll *ep, struct epitem *epi) ep_unregister_pollwait(ep, epi); - /* cheap sync with eventpoll_release_file() */ - if (unlikely(READ_ONCE(epi->dying))) - return; - /* * If we manage to grab a reference it means we're not in - * eventpoll_release_file() and aren't going to be. + * eventpoll_release_file() and aren't going to be: once @file's + * refcount has reached zero, file_ref_get() cannot bring it back. */ file = epi_fget(epi); if (!file) @@ -1126,15 +1113,15 @@ void eventpoll_release_file(struct file *file) struct epitem *epi; /* - * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from - * touching the epitems list before eventpoll_release_file() can access - * the ep->mtx. + * A concurrent ep_remove() cannot outrace us: it pins @file via + * epi_fget(), which fails once __fput() has dropped the refcount + * to zero -- the path we're on. So any racing ep_remove() bails + * and leaves the epi for us to clean up here. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); - WRITE_ONCE(epi->dying, true); spin_unlock(&file->f_lock); /*