From 87213b0d847cd300285b5545598e0548baeb5208 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Mon, 1 Dec 2025 14:41:44 -0700
Subject: [PATCH 1/8] ublk: allow non-blocking ctrl cmds in IO_URING_F_NONBLOCK issue

Handling most of the ublksrv_ctrl_cmd opcodes requires locking a mutex,
so ublk_ctrl_uring_cmd() bails out with EAGAIN when called with the
IO_URING_F_NONBLOCK issue flag. However, several opcodes can be handled
without blocking:
- UBLK_CMD_GET_QUEUE_AFFINITY
- UBLK_CMD_GET_DEV_INFO
- UBLK_CMD_GET_DEV_INFO2
- UBLK_U_CMD_GET_FEATURES

Handle these opcodes synchronously instead of returning EAGAIN so
io_uring doesn't need to issue the command via the worker thread pool.

Signed-off-by: Caleb Sander Mateos
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 2c715df63f23..3ecaafacfd20 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -3673,6 +3673,19 @@ exit:
 	return ret;
 }
 
+static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op)
+{
+	switch (_IOC_NR(cmd_op)) {
+	case UBLK_CMD_GET_QUEUE_AFFINITY:
+	case UBLK_CMD_GET_DEV_INFO:
+	case UBLK_CMD_GET_DEV_INFO2:
+	case _IOC_NR(UBLK_U_CMD_GET_FEATURES):
+		return false;
+	default:
+		return true;
+	}
+}
+
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
@@ -3681,7 +3694,8 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	u32 cmd_op = cmd->cmd_op;
 	int ret = -EINVAL;
 
-	if (issue_flags & IO_URING_F_NONBLOCK)
+	if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) &&
+	    issue_flags & IO_URING_F_NONBLOCK)
 		return -EAGAIN;
 
 	ublk_ctrl_cmd_dump(cmd);

From 53280e398471f0bddbb17b798a63d41264651325 Mon Sep 17 00:00:00 2001
From: Shida Zhang
Date: Tue, 9 Dec 2025 17:01:56 +0800
Subject: [PATCH 2/8] bcache: fix improper use of bi_end_io

Don't call bio->bi_end_io() directly. Use the bio_endio() helper
function instead, which handles completion more safely and uniformly.

Suggested-by: Christoph Hellwig
Reviewed-by: Christoph Hellwig
Signed-off-by: Shida Zhang
Signed-off-by: Jens Axboe
---
 drivers/md/bcache/request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index af345dc6fde1..82fdea7dea7a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1104,7 +1104,7 @@ static void detached_dev_end_io(struct bio *bio)
 	}
 
 	kfree(ddip);
-	bio->bi_end_io(bio);
+	bio_endio(bio);
 }
 
 static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
@@ -1121,7 +1121,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
 	ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
 	if (!ddip) {
 		bio->bi_status = BLK_STS_RESOURCE;
-		bio->bi_end_io(bio);
+		bio_endio(bio);
 		return;
 	}
 
@@ -1136,7 +1136,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio,
 
 	if ((bio_op(bio) == REQ_OP_DISCARD) &&
 	    !bdev_max_discard_sectors(dc->bdev))
-		bio->bi_end_io(bio);
+		detached_dev_end_io(bio);
 	else
 		submit_bio_noacct(bio);
 }

From cfdeb588ae1dff5d52da37d2797d0203e8605480 Mon Sep 17 00:00:00 2001
From: Shida Zhang
Date: Tue, 9 Dec 2025 17:01:57 +0800
Subject: [PATCH 3/8] block: prohibit calls to bio_chain_endio

Now that all potential callers of bio_chain_endio have been eliminated,
completely prohibit any future calls to this function.
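
For background, a minimal sketch of the pattern this guards (illustrative,
not part of this patch): bio_chain() installs bio_chain_endio as the
child's bi_end_io, and bio_endio() recognizes that value internally to
complete the parent, so completion must only ever flow through
bio_endio():

	struct bio *split = bio_split(bio, sectors, GFP_NOIO, &fs_bio_set);

	bio_chain(split, bio);		/* sets split->bi_end_io = bio_chain_endio */
	submit_bio_noacct(split);	/* split completes via bio_endio(split) */
	/*
	 * Calling split->bi_end_io() or bio_chain_endio() directly would
	 * bypass bio_endio()'s chaining logic, hence the BUG() below.
	 */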
Suggested-by: Ming Lei
Suggested-by: Andreas Gruenbacher
Suggested-by: Christoph Hellwig
Reviewed-by: Christoph Hellwig
Signed-off-by: Shida Zhang
Signed-off-by: Jens Axboe
---
 block/bio.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index fa5ff36b443f..e726c0e280a8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -321,9 +321,13 @@ static struct bio *__bio_chain_endio(struct bio *bio)
 	return parent;
 }
 
+/*
+ * This function should only be used as a flag and must never be called.
+ * If execution reaches here, it indicates a serious programming error.
+ */
 static void bio_chain_endio(struct bio *bio)
 {
-	bio_endio(__bio_chain_endio(bio));
+	BUG();
 }
 
 /**

From db339b4067eccb7fa3d9787d5d3ab5d466fd9efa Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Mon, 8 Dec 2025 20:14:23 -0700
Subject: [PATCH 4/8] ublk: don't mutate struct bio_vec in iteration

__bio_for_each_segment() uses the returned struct bio_vec's bv_len
field to advance the struct bvec_iter at the end of each loop iteration,
so it's incorrect to modify it during the loop. Don't assign to bv_len
(or bv_offset, for that matter) in ublk_copy_user_pages().

Signed-off-by: Caleb Sander Mateos
Fixes: e87d66ab27ac ("ublk: use rq_for_each_segment() for user copy")
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 3ecaafacfd20..df9831783a13 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -926,6 +926,7 @@ static size_t ublk_copy_user_pages(const struct request *req,
 	size_t done = 0;
 
 	rq_for_each_segment(bv, req, iter) {
+		unsigned len;
 		void *bv_buf;
 		size_t copied;
 
@@ -934,18 +935,17 @@ static size_t ublk_copy_user_pages(const struct request *req,
 			continue;
 		}
 
-		bv.bv_offset += offset;
-		bv.bv_len -= offset;
-		bv_buf = bvec_kmap_local(&bv);
+		len = bv.bv_len - offset;
+		bv_buf = kmap_local_page(bv.bv_page) + bv.bv_offset + offset;
 		if (dir == ITER_DEST)
-			copied = copy_to_iter(bv_buf, bv.bv_len, uiter);
+			copied = copy_to_iter(bv_buf, len, uiter);
 		else
-			copied = copy_from_iter(bv_buf, bv.bv_len, uiter);
+			copied = copy_from_iter(bv_buf, len, uiter);
 		kunmap_local(bv_buf);
 
 		done += copied;
-		if (copied < bv.bv_len)
+		if (copied < len)
 			break;
 
 		offset = 0;

From 59e25ef2b413c72da6686d431e7759302cfccafa Mon Sep 17 00:00:00 2001
From: Mohamed Khalfella
Date: Fri, 5 Dec 2025 13:17:02 -0800
Subject: [PATCH 5/8] block: Use RCU in blk_mq_[un]quiesce_tagset() instead of set->tag_list_lock

blk_mq_{add,del}_queue_tag_set() add and remove queues from a tagset;
they make sure that the tagset and its queues are marked as shared when
two or more queues are attached to the same tagset. Initially a tagset
starts as unshared, and when the number of added queues reaches two,
blk_mq_add_queue_tag_set() marks it as shared along with all the queues
attached to it. When the number of attached queues drops to 1,
blk_mq_del_queue_tag_set() needs to mark both the tagset and the
remaining queue as unshared.

Both functions need to freeze the current queues in the tagset before
setting or unsetting the BLK_MQ_F_TAG_QUEUE_SHARED flag. While doing
so, both functions hold the set->tag_list_lock mutex, which makes sense
as we do not want queues to be added or deleted in the process. This
used to work fine until commit 98d81f0df70c ("nvme: use
blk_mq_[un]quiesce_tagset") made the nvme driver quiesce the tagset
instead of quiescing individual queues.
blk_mq_quiesce_tagset() does the job and quiesces the queues in
set->tag_list while also holding set->tag_list_lock. This results in a
deadlock between two threads with these stack traces:

 __schedule+0x47c/0xbb0
 ? timerqueue_add+0x66/0xb0
 schedule+0x1c/0xa0
 schedule_preempt_disabled+0xa/0x10
 __mutex_lock.constprop.0+0x271/0x600
 blk_mq_quiesce_tagset+0x25/0xc0
 nvme_dev_disable+0x9c/0x250
 nvme_timeout+0x1fc/0x520
 blk_mq_handle_expired+0x5c/0x90
 bt_iter+0x7e/0x90
 blk_mq_queue_tag_busy_iter+0x27e/0x550
 ? __blk_mq_complete_request_remote+0x10/0x10
 ? __blk_mq_complete_request_remote+0x10/0x10
 ? __call_rcu_common.constprop.0+0x1c0/0x210
 blk_mq_timeout_work+0x12d/0x170
 process_one_work+0x12e/0x2d0
 worker_thread+0x288/0x3a0
 ? rescuer_thread+0x480/0x480
 kthread+0xb8/0xe0
 ? kthread_park+0x80/0x80
 ret_from_fork+0x2d/0x50
 ? kthread_park+0x80/0x80
 ret_from_fork_asm+0x11/0x20

 __schedule+0x47c/0xbb0
 ? xas_find+0x161/0x1a0
 schedule+0x1c/0xa0
 blk_mq_freeze_queue_wait+0x3d/0x70
 ? destroy_sched_domains_rcu+0x30/0x30
 blk_mq_update_tag_set_shared+0x44/0x80
 blk_mq_exit_queue+0x141/0x150
 del_gendisk+0x25a/0x2d0
 nvme_ns_remove+0xc9/0x170
 nvme_remove_namespaces+0xc7/0x100
 nvme_remove+0x62/0x150
 pci_device_remove+0x23/0x60
 device_release_driver_internal+0x159/0x200
 unbind_store+0x99/0xa0
 kernfs_fop_write_iter+0x112/0x1e0
 vfs_write+0x2b1/0x3d0
 ksys_write+0x4e/0xb0
 do_syscall_64+0x5b/0x160
 entry_SYSCALL_64_after_hwframe+0x4b/0x53

The top stack trace shows nvme_timeout() being called to handle an nvme
command timeout. The timeout handler tries to disable the controller,
and as a first step it calls blk_mq_quiesce_tagset() to tell blk-mq not
to call queue callback handlers. The thread is stuck waiting for
set->tag_list_lock as it tries to walk the queues in set->tag_list. The
lock is held by the second thread, in the bottom stack, which is
waiting for one of the queues to be frozen. The queue usage counter
would drop to zero only after nvme_timeout() finishes, which will never
happen because that thread waits for this mutex forever.

Given that [un]quiescing a queue is an operation that does not need to
sleep, update blk_mq_[un]quiesce_tagset() to use RCU instead of taking
set->tag_list_lock, and update blk_mq_{add,del}_queue_tag_set() to use
RCU-safe list operations. Also, delete INIT_LIST_HEAD(&q->tag_set_list)
in blk_mq_del_queue_tag_set() because we cannot re-initialize it while
the list is being traversed under RCU. The deleted queue will not be
added to or deleted from a tagset again, and it will be freed in
blk_free_queue() after the end of an RCU grace period.
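
The result is the classic RCU list pattern; a minimal sketch for
illustration (the actual changes are in the diff below):

	/* readers (quiesce/unquiesce): lockless and non-blocking */
	rcu_read_lock();
	list_for_each_entry_rcu(q, &set->tag_list, tag_set_list)
		blk_mq_quiesce_queue_nowait(q);
	rcu_read_unlock();

	/* writers (add/del queue): still serialized by the mutex */
	mutex_lock(&set->tag_list_lock);
	list_del_rcu(&q->tag_set_list);
	mutex_unlock(&set->tag_list_lock);
	/*
	 * q is freed in blk_free_queue() only after an RCU grace period,
	 * so concurrent readers never walk freed entries.
	 */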
Signed-off-by: Mohamed Khalfella
Fixes: 98d81f0df70c ("nvme: use blk_mq_[un]quiesce_tagset")
Reviewed-by: Ming Lei
Reviewed-by: Bart Van Assche
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index bd8b11c472a2..049e9dce1149 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -336,12 +336,12 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
 {
 	struct request_queue *q;
 
-	mutex_lock(&set->tag_list_lock);
-	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(q, &set->tag_list, tag_set_list) {
 		if (!blk_queue_skip_tagset_quiesce(q))
 			blk_mq_quiesce_queue_nowait(q);
 	}
-	mutex_unlock(&set->tag_list_lock);
+	rcu_read_unlock();
 
 	blk_mq_wait_quiesce_done(set);
 }
@@ -351,12 +351,12 @@ void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
 {
 	struct request_queue *q;
 
-	mutex_lock(&set->tag_list_lock);
-	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(q, &set->tag_list, tag_set_list) {
 		if (!blk_queue_skip_tagset_quiesce(q))
 			blk_mq_unquiesce_queue(q);
 	}
-	mutex_unlock(&set->tag_list_lock);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
 
@@ -4311,7 +4311,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 	struct blk_mq_tag_set *set = q->tag_set;
 
 	mutex_lock(&set->tag_list_lock);
-	list_del(&q->tag_set_list);
+	list_del_rcu(&q->tag_set_list);
 	if (list_is_singular(&set->tag_list)) {
 		/* just transitioned to unshared */
 		set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
@@ -4319,7 +4319,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 		blk_mq_update_tag_set_shared(set, false);
 	}
 	mutex_unlock(&set->tag_list_lock);
-	INIT_LIST_HEAD(&q->tag_set_list);
 }
 
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
@@ -4338,7 +4337,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 	}
 	if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
 		queue_set_hctx_shared(q, true);
-	list_add_tail(&q->tag_set_list, &set->tag_list);
+	list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
 
 	mutex_unlock(&set->tag_list_lock);
 }

From 2c38ec934ddfe2d35c813edea2674356bea0fabe Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Wed, 10 Dec 2025 03:10:37 +0100
Subject: [PATCH 6/8] block: fix cached zone reports on devices with native zone append

When mounting a btrfs filesystem on a virtio-blk device that supports
native Zone Append, a WARN has been triggering in btrfs' space
management code.
A further look into btrfs' zoned statistics uncovered that the
filesystem expects the zones to be in use, but the write pointers are 0:

 # cat /sys/fs/btrfs/8eabd2e7-3294-4f9e-9b58-7e64135c8bf4/zoned_stats
 active block-groups: 4
 reclaimable: 0 unused: 0
 need reclaim: false
 data relocation block-group: 1342177280
 active zones:
 start: 1073741824, wp: 0 used: 0, reserved: 0, unusable: 0
 start: 1342177280, wp: 0 used: 0, reserved: 0, unusable: 0
 start: 1610612736, wp: 0 used: 16384, reserved: 0, unusable: 18446744073709535232
 start: 1879048192, wp: 0 used: 131072, reserved: 0, unusable: 18446744073709420544

Looking at the blkzone report output for the zone in question
(1610612736), the write pointer on the device has moved, but the
filesystem did not see the change:

 # blkzone report -c 1 -o 0x300000 /dev/vda
 start: 0x000300000, len 0x080000, cap 0x080000, wptr 0x000040 reset:0
 non-seq:0, zcond: 2(oi) [type: 2(SEQ_WRITE_REQUIRED)]

The zone write pointer btrfs sees is 0 because btrfs uses the cached
version of blkdev_report_zones(), and since virtio-blk supports native
zone append, blk_revalidate_disk_zones() does not initialize the zone
write plugs in this case.

Not skipping the revalidation of sequential zones in the
blk_revalidate_disk_zones() call chain fixes this issue.

Signed-off-by: Johannes Thumshirn
Reviewed-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Fixes: a6aa36e957a1 ("block: Remove zone write plugs when handling native zone append writes")
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 394d8d74bba9..1c54678fae6b 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -2100,7 +2100,7 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
 	 * we have a zone write plug for such zone if the device has a zone
 	 * write plug hash table.
 	 */
-	if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
+	if (!disk->zone_wplugs_hash)
 		return 0;
 
 	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);

From f22ecf9c14c12918e30f2179ef516e99eb8b2e49 Mon Sep 17 00:00:00 2001
From: Fengnan Chang
Date: Wed, 10 Dec 2025 16:55:00 +0800
Subject: [PATCH 7/8] blk-mq: delete task running check in blk_hctx_poll()

blk_hctx_poll() always checks whether the task is running, and returns
1 if it is. This is a leftover from when polled IO was purely for
synchronous IO, and doesn't make sense anymore now that polled IO is
purely asynchronous. Similarly, marking the task as TASK_RUNNING is
also superfluous, as the task very much has to be running to enter the
function in the first place.

The check seems to have survived for historical reasons: in very early
versions of this function, the caller would set the process state to
TASK_UNINTERRUPTIBLE.

Signed-off-by: Diangang Li
Signed-off-by: Fengnan Chang
[axboe: kill all remnants of task running, pointless now;
 massage message]
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 049e9dce1149..1978eef95dca 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -5192,27 +5192,19 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 			 struct io_comp_batch *iob, unsigned int flags)
 {
-	long state = get_current_state();
 	int ret;
 
 	do {
 		ret = q->mq_ops->poll(hctx, iob);
-		if (ret > 0) {
-			__set_current_state(TASK_RUNNING);
+		if (ret > 0)
 			return ret;
-		}
-
-		if (signal_pending_state(state, current))
-			__set_current_state(TASK_RUNNING);
-		if (task_is_running(current))
+		if (task_sigpending(current))
 			return 1;
-
 		if (ret < 0 || (flags & BLK_POLL_ONESHOT))
 			break;
 		cpu_relax();
 	} while (!need_resched());
 
-	__set_current_state(TASK_RUNNING);
 	return 0;
 }

From a0750fae73c55112ea11a4867bee40f11e679405 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 10 Dec 2025 02:43:46 -0800
Subject: [PATCH 8/8] blk-mq-dma: always initialize dma state

Ensure the dma state is initialized when we're not using the contiguous
iova; otherwise the caller may be using a stale state from a previous
request that may have used the coalesced iova allocation.

Fixes: 2f6b2565d43cdb5 ("block: accumulate memory segment gaps per bio")
Reported-by: Sebastian Ott
Tested-by: Sebastian Ott
Signed-off-by: Keith Busch
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-mq-dma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index e9108ccaf4b0..6dc7a3c23ac8 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -199,6 +199,7 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev,
 
 	if (blk_can_dma_map_iova(req, dma_dev) &&
 	    dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len))
 		return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec);
+	memset(state, 0, sizeof(*state));
 	return blk_dma_map_direct(req, dma_dev, iter, &vec);
 }
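
For context, a sketch of the failure mode this closes (illustrative
only; the unmap-side check is taken from the dma_iova API, while
mapped_len, dir and attrs stand in for hypothetical caller state):

	/* request A took the coalesced path and left *state populated */
	dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len);

	/*
	 * Request B reuses the same state but falls back to direct
	 * mapping. Without the memset() above, dma_use_iova(state) may
	 * still report true at unmap time and the wrong teardown runs:
	 */
	if (dma_use_iova(state))
		dma_iova_destroy(dma_dev, state, mapped_len, dir, attrs);
	else
		/* ... per-segment dma_unmap_page() teardown ... */;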