vfs-7.1-rc1.fixes

Please consider pulling these changes from the signed vfs-7.1-rc1.fixes tag.

Thanks!
Christian
-----BEGIN PGP SIGNATURE-----

iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaeqfYAAKCRCRxhvAZXjc
oltyAP4y1SFYvmoy2mPM3jrSbYuT2rX0q4OZ/GDbuWOvir/bcgEAoPI9JHraS1+2
xFj/7JJFWzuDXlFoaX6g+nv42pfatgU=
=BnjA
-----END PGP SIGNATURE-----

Merge tag 'vfs-7.1-rc1.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs fixes from Christian Brauner:

 - eventpoll: fix ep_remove() UAF and follow-up cleanup
 - fs: aio: set VMA_DONTCOPY_BIT in mmap to fix NULL-pointer-dereference error
 - writeback: Fix use after free in inode_switch_wbs_work_fn()
 - fuse: reject oversized dirents in page cache
 - fs: aio: reject partial mremap to avoid Null-pointer-dereference error
 - nstree: fix func. parameter kernel-doc warnings
 - fs: Handle multiply claimed blocks more gracefully with mmb

* tag 'vfs-7.1-rc1.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  eventpoll: drop vestigial epi->dying flag
  eventpoll: drop dead bool return from ep_remove_epi()
  eventpoll: refresh eventpoll_release() fast-path comment
  eventpoll: move f_lock acquisition into ep_remove_file()
  eventpoll: fix ep_remove struct eventpoll / struct file UAF
  eventpoll: move epi_fget() up
  eventpoll: rename ep_remove_safe() back to ep_remove()
  eventpoll: drop vestigial __ prefix from ep_remove_{file,epi}()
  eventpoll: kill __ep_remove()
  eventpoll: split __ep_remove()
  eventpoll: use hlist_is_singular_node() in __ep_remove()
  fs: Handle multiply claimed blocks more gracefully with mmb
  nstree: fix func. parameter kernel-doc warnings
  fs: aio: reject partial mremap to avoid Null-pointer-dereference error
  fuse: reject oversized dirents in page cache
  writeback: Fix use after free in inode_switch_wbs_work_fn()
  fs: aio: set VMA_DONTCOPY_BIT in mmap to fix NULL-pointer-dereference error
2026-04-23 17:08:04 -07:00 · 2026-04-23 17:08:04 -07:00 · dd6c438c3e
parent bd1886d6e4 ac8777cc36
commit dd6c438c3e
7 changed files with 125 additions and 109 deletions
--- a/fs/aio.c
+++ b/fs/aio.c
@ -422,7 +422,8 @@ static int aio_ring_mremap(struct vm_area_struct *vma)

 		ctx = rcu_dereference(table->table[i]);
 		if (ctx && ctx->aio_ring_file == file) {
-			if (!atomic_read(&ctx->dead)) {
+			if (!atomic_read(&ctx->dead) &&
+			    (ctx->mmap_size == (vma->vm_end - vma->vm_start))) {
 				ctx->user_id = ctx->mmap_base = vma->vm_start;
 				res = 0;
 			}
@ -447,7 +448,7 @@ static const struct vm_operations_struct aio_ring_vm_ops = {

 static int aio_ring_mmap_prepare(struct vm_area_desc *desc)
 {
-	vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT);
+	vma_desc_set_flags(desc, VMA_DONTEXPAND_BIT, VMA_DONTCOPY_BIT);
 	desc->vm_ops = &aio_ring_vm_ops;
 	return 0;
 }
--- a/fs/buffer.c
+++ b/fs/buffer.c
@ -719,8 +719,15 @@ void mmb_mark_buffer_dirty(struct buffer_head *bh,
 	mark_buffer_dirty(bh);
 	if (!bh->b_mmb) {
 		spin_lock(&mmb->lock);
+		/*
+		 * For a corrupted filesystem with multiply claimed blocks this
+		 * can fail. Avoid corrupting the linked list in that case.
+		 */
+		if (cmpxchg(&bh->b_mmb, NULL, mmb) != NULL) {
+			spin_unlock(&mmb->lock);
+			return;
+		}
 		list_move_tail(&bh->b_assoc_buffers, &mmb->list);
-		bh->b_mmb = mmb;
 		spin_unlock(&mmb->lock);
 	}
 }
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@ -148,13 +148,6 @@ struct epitem {
 	/* The file descriptor information this item refers to */
 	struct epoll_filefd ffd;

-	/*
-	 * Protected by file->f_lock, true for to-be-released epitem already
-	 * removed from the "struct file" items list; together with
-	 * eventpoll->refcount orchestrates "struct eventpoll" disposal
-	 */
-	bool dying;
-
 	/* List containing poll wait queues */
 	struct eppoll_entry *pwqlist;

@ -220,10 +213,7 @@ struct eventpoll {
 	struct hlist_head refs;
 	u8 loop_check_depth;

-	/*
-	 * usage count, used together with epitem->dying to
-	 * orchestrate the disposal of this struct
-	 */
+	/* usage count, orchestrates "struct eventpoll" disposal */
 	refcount_t refcount;

 	/* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */
@ -827,36 +817,47 @@ static void ep_free(struct eventpoll *ep)
 }

 /*
- * Removes a "struct epitem" from the eventpoll RB tree and deallocates
- * all the associated resources. Must be called with "mtx" held.
- * If the dying flag is set, do the removal only if force is true.
- * This prevents ep_clear_and_put() from dropping all the ep references
- * while running concurrently with eventpoll_release_file().
- * Returns true if the eventpoll can be disposed.
+ * The ffd.file pointer may be in the process of being torn down due to
+ * being closed, but we may not have finished eventpoll_release() yet.
+ *
+ * Normally, even with the atomic_long_inc_not_zero, the file may have
+ * been free'd and then gotten re-allocated to something else (since
+ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
+ *
+ * But for epoll, users hold the ep->mtx mutex, and as such any file in
+ * the process of being free'd will block in eventpoll_release_file()
+ * and thus the underlying file allocation will not be free'd, and the
+ * file re-use cannot happen.
+ *
+ * For the same reason we can avoid a rcu_read_lock() around the
+ * operation - 'ffd.file' cannot go away even if the refcount has
+ * reached zero (but we must still not call out to ->poll() functions
+ * etc).
 */
-static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
+static struct file *epi_fget(const struct epitem *epi)
 {
-	struct file *file = epi->ffd.file;
-	struct epitems_head *to_free;
+	struct file *file;
+
+	file = epi->ffd.file;
+	if (!file_ref_get(&file->f_ref))
+		file = NULL;
+	return file;
+}
+
+/*
+ * Takes &file->f_lock; returns with it released.
+ */
+static void ep_remove_file(struct eventpoll *ep, struct epitem *epi,
+			     struct file *file)
+{
+	struct epitems_head *to_free = NULL;
 	struct hlist_head *head;

-	lockdep_assert_irqs_enabled();
+	lockdep_assert_held(&ep->mtx);

-	/*
-	 * Removes poll wait queue hooks.
-	 */
-	ep_unregister_pollwait(ep, epi);
-
-	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_lock);
-	if (epi->dying && !force) {
-		spin_unlock(&file->f_lock);
-		return false;
-	}
-
-	to_free = NULL;
 	head = file->f_ep;
-	if (head->first == &epi->fllink && !epi->fllink.next) {
+	if (hlist_is_singular_node(&epi->fllink, head)) {
 		/* See eventpoll_release() for details. */
 		WRITE_ONCE(file->f_ep, NULL);
 		if (!is_file_epoll(file)) {
@ -869,6 +870,11 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	hlist_del_rcu(&epi->fllink);
 	spin_unlock(&file->f_lock);
 	free_ephead(to_free);
+}
+
+static void ep_remove_epi(struct eventpoll *ep, struct epitem *epi)
+{
+	lockdep_assert_held(&ep->mtx);

 	rb_erase_cached(&epi->rbn, &ep->rbr);

@ -888,16 +894,32 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	kfree_rcu(epi, rcu);

 	percpu_counter_dec(&ep->user->epoll_watches);
-	return true;
 }

 /*
 * ep_remove variant for callers owning an additional reference to the ep
 */
-static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
+static void ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
-	if (__ep_remove(ep, epi, false))
-		WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
+	struct file *file __free(fput) = NULL;
+
+	lockdep_assert_irqs_enabled();
+	lockdep_assert_held(&ep->mtx);
+
+	ep_unregister_pollwait(ep, epi);
+
+	/*
+	 * If we manage to grab a reference it means we're not in
+	 * eventpoll_release_file() and aren't going to be: once @file's
+	 * refcount has reached zero, file_ref_get() cannot bring it back.
+	 */
+	file = epi_fget(epi);
+	if (!file)
+		return;
+
+	ep_remove_file(ep, epi, file);
+	ep_remove_epi(ep, epi);
+	WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
 }

 static void ep_clear_and_put(struct eventpoll *ep)
@ -923,7 +945,7 @@ static void ep_clear_and_put(struct eventpoll *ep)

 	/*
 	 * Walks through the whole tree and try to free each "struct epitem".
-	 * Note that ep_remove_safe() will not remove the epitem in case of a
+	 * Note that ep_remove() will not remove the epitem in case of a
 	 * racing eventpoll_release_file(); the latter will do the removal.
 	 * At this point we are sure no poll callbacks will be lingering around.
 	 * Since we still own a reference to the eventpoll struct, the loop can't
@ -932,7 +954,7 @@ static void ep_clear_and_put(struct eventpoll *ep)
 	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
 		next = rb_next(rbp);
 		epi = rb_entry(rbp, struct epitem, rbn);
-		ep_remove_safe(ep, epi);
+		ep_remove(ep, epi);
 		cond_resched();
 	}

@ -1012,34 +1034,6 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
 	return res;
 }

-/*
- * The ffd.file pointer may be in the process of being torn down due to
- * being closed, but we may not have finished eventpoll_release() yet.
- *
- * Normally, even with the atomic_long_inc_not_zero, the file may have
- * been free'd and then gotten re-allocated to something else (since
- * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
- *
- * But for epoll, users hold the ep->mtx mutex, and as such any file in
- * the process of being free'd will block in eventpoll_release_file()
- * and thus the underlying file allocation will not be free'd, and the
- * file re-use cannot happen.
- *
- * For the same reason we can avoid a rcu_read_lock() around the
- * operation - 'ffd.file' cannot go away even if the refcount has
- * reached zero (but we must still not call out to ->poll() functions
- * etc).
- */
-static struct file *epi_fget(const struct epitem *epi)
-{
-	struct file *file;
-
-	file = epi->ffd.file;
-	if (!file_ref_get(&file->f_ref))
-		file = NULL;
-	return file;
-}
-
 /*
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
@ -1117,18 +1111,17 @@ void eventpoll_release_file(struct file *file)
 {
 	struct eventpoll *ep;
 	struct epitem *epi;
-	bool dispose;

 	/*
-	 * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
-	 * touching the epitems list before eventpoll_release_file() can access
-	 * the ep->mtx.
+	 * A concurrent ep_remove() cannot outrace us: it pins @file via
+	 * epi_fget(), which fails once __fput() has dropped the refcount
+	 * to zero -- the path we're on. So any racing ep_remove() bails
+	 * and leaves the epi for us to clean up here.
 	 */
 again:
 	spin_lock(&file->f_lock);
 	if (file->f_ep && file->f_ep->first) {
 		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
-		epi->dying = true;
 		spin_unlock(&file->f_lock);

 		/*
@ -1137,10 +1130,15 @@ again:
 		 */
 		ep = epi->ep;
 		mutex_lock(&ep->mtx);
-		dispose = __ep_remove(ep, epi, true);
+
+		ep_unregister_pollwait(ep, epi);
+
+		ep_remove_file(ep, epi, file);
+		ep_remove_epi(ep, epi);
+
 		mutex_unlock(&ep->mtx);

-		if (dispose && ep_refcount_dec_and_test(ep))
+		if (ep_refcount_dec_and_test(ep))
 			ep_free(ep);
 		goto again;
 	}
@ -1619,21 +1617,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 		mutex_unlock(&tep->mtx);

 	/*
-	 * ep_remove_safe() calls in the later error paths can't lead to
+	 * ep_remove() calls in the later error paths can't lead to
 	 * ep_free() as the ep file itself still holds an ep reference.
 	 */
 	ep_get(ep);

 	/* now check if we've created too many backpaths */
 	if (unlikely(full_check && reverse_path_check())) {
-		ep_remove_safe(ep, epi);
+		ep_remove(ep, epi);
 		return -EINVAL;
 	}

 	if (epi->event.events & EPOLLWAKEUP) {
 		error = ep_create_wakeup_source(epi);
 		if (error) {
-			ep_remove_safe(ep, epi);
+			ep_remove(ep, epi);
 			return error;
 		}
 	}
@ -1657,7 +1655,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 	 * high memory pressure.
 	 */
 	if (unlikely(!epq.epi)) {
-		ep_remove_safe(ep, epi);
+		ep_remove(ep, epi);
 		return -ENOMEM;
 	}

@ -2352,7 +2350,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
 			 * The eventpoll itself is still alive: the refcount
 			 * can't go to zero here.
 			 */
-			ep_remove_safe(ep, epi);
+			ep_remove(ep, epi);
 			error = 0;
 		} else {
 			error = -ENOENT;
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@ -568,28 +568,30 @@ void inode_switch_wbs_work_fn(struct work_struct *work)
 	struct inode_switch_wbs_context *isw, *next_isw;
 	struct llist_node *list;

+	list = llist_del_all(&new_wb->switch_wbs_ctxs);
 	/*
-	 * Grab out reference to wb so that it cannot get freed under us
+	 * Nothing to do? That would be a problem as references held by isw
+	 * items protect wb from freeing...
+	 */
+	if (WARN_ON_ONCE(!list))
+		return;
+
+	/*
+	 * Grab our reference to wb so that it cannot get freed under us
 	 * after we process all the isw items.
 	 */
 	wb_get(new_wb);
-	while (1) {
-		list = llist_del_all(&new_wb->switch_wbs_ctxs);
-		/* Nothing to do? */
-		if (!list)
-			break;
-		/*
-		 * In addition to synchronizing among switchers, I_WB_SWITCH
-		 * tells the RCU protected stat update paths to grab the i_page
-		 * lock so that stat transfer can synchronize against them.
-		 * Let's continue after I_WB_SWITCH is guaranteed to be
-		 * visible.
-		 */
-		synchronize_rcu();
+	/*
+	 * In addition to synchronizing among switchers, I_WB_SWITCH
+	 * tells the RCU protected stat update paths to grab the i_page
+	 * lock so that stat transfer can synchronize against them.
+	 * Let's continue after I_WB_SWITCH is guaranteed to be
+	 * visible.
+	 */
+	synchronize_rcu();

-		llist_for_each_entry_safe(isw, next_isw, list, list)
-			process_inode_switch_wbs(new_wb, isw);
-	}
+	llist_for_each_entry_safe(isw, next_isw, list, list)
+		process_inode_switch_wbs(new_wb, isw);
 	wb_put(new_wb);
 }

--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@ -41,6 +41,10 @@ static void fuse_add_dirent_to_cache(struct file *file,
 	unsigned int offset;
 	void *addr;

+	/* Dirent doesn't fit in readdir cache page?  Skip caching. */
+	if (reclen > PAGE_SIZE)
+		return;
+
 	spin_lock(&fi->rdc.lock);
 	/*
 	 * Is cache already completed?  Or this entry does not go at the end of
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@ -39,12 +39,16 @@ static inline void eventpoll_release(struct file *file)
 {

 	/*
-	 * Fast check to avoid the get/release of the semaphore. Since
-	 * we're doing this outside the semaphore lock, it might return
-	 * false negatives, but we don't care. It'll help in 99.99% of cases
-	 * to avoid the semaphore lock. False positives simply cannot happen
-	 * because the file in on the way to be removed and nobody ( but
-	 * eventpoll ) has still a reference to this file.
+	 * Fast check to skip the slow path in the common case where the
+	 * file was never attached to an epoll. Safe without file->f_lock
+	 * because every f_ep writer excludes a concurrent __fput() on
+	 * @file:
+	 *   - ep_insert() requires the file alive (refcount > 0);
+	 *   - ep_remove() holds @file pinned via epi_fget() across the
+	 *     write;
+	 *   - eventpoll_release_file() runs from __fput() itself.
+	 * We are in __fput() here, so none of those can race us: a NULL
+	 * observation truly means no epoll path has work left on @file.
 	 */
 	if (likely(!READ_ONCE(file->f_ep)))
 		return;
--- a/include/linux/nstree.h
+++ b/include/linux/nstree.h
@ -61,7 +61,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t

 /**
 * ns_tree_add_raw - Add a namespace to a namespace tree
- * @ns: Namespace to add
+ * @__ns: Namespace to add
 *
 * This function adds a namespace to the appropriate namespace tree
 * without assigning a id.
@ -70,7 +70,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t

 /**
 * ns_tree_add - Add a namespace to a namespace tree
- * @ns: Namespace to add
+ * @__ns: Namespace to add
 *
 * This function assigns a new id to the namespace and adds it to the
 * appropriate namespace tree and list.
@ -81,7 +81,7 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_t

 /**
 * ns_tree_remove - Remove a namespace from a namespace tree
- * @ns: Namespace to remove
+ * @__ns: Namespace to remove
 *
 * This function removes a namespace from the appropriate namespace
 * tree and list.