From ba59e59b8b4dc0cb4882aa6f91fe16361dd95b99 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 11 Oct 2025 11:10:42 -0700 Subject: [PATCH 01/29] dm-crypt: Use MD5 library instead of crypto_shash The lmk IV mode, which dm-crypt supports for Loop-AES compatibility, involves an MD5 computation. Update its implementation to use the MD5 library API instead of crypto_shash. This has many benefits, such as: - Simpler code. Notably, much of the error-handling code is no longer needed, since the library functions can't fail. - Reduced stack usage. crypt_iv_lmk_one() now allocates only 112 bytes on the stack instead of 520 bytes. - The library functions are strongly typed, preventing bugs like https://lore.kernel.org/r/f1625ddc-e82e-4b77-80c2-dc8e45b54848@gmail.com - Slightly improved performance, as the library provides direct access to the MD5 code without unnecessary overhead such as indirect calls. To preserve the existing behavior of lmk support being disabled when the kernel is booted with "fips=1", make crypt_iv_lmk_ctr() check fips_enabled itself. Previously it relied on crypto_alloc_shash("md5") failing. (I don't know for sure that lmk *actually* needs to be disallowed in FIPS mode; this just preserves the existing behavior.) Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka Acked-by: Ard Biesheuvel --- drivers/md/Kconfig | 1 + drivers/md/dm-crypt.c | 76 ++++++++++++------------------------------- 2 files changed, 22 insertions(+), 55 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 104aa5355090..dcd232a2ca24 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -299,6 +299,7 @@ config DM_CRYPT select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV + select CRYPTO_LIB_MD5 # needed by lmk IV mode help This device-mapper target allows you to create a device that transparently encrypts the data on it. You'll need to activate diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5ef43231fe77..04a553529dc2 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -120,7 +121,6 @@ struct iv_benbi_private { #define LMK_SEED_SIZE 64 /* hash + 0 */ struct iv_lmk_private { - struct crypto_shash *hash_tfm; u8 *seed; }; @@ -465,10 +465,6 @@ static void crypt_iv_lmk_dtr(struct crypt_config *cc) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - if (lmk->hash_tfm && !IS_ERR(lmk->hash_tfm)) - crypto_free_shash(lmk->hash_tfm); - lmk->hash_tfm = NULL; - kfree_sensitive(lmk->seed); lmk->seed = NULL; } @@ -483,11 +479,10 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - lmk->hash_tfm = crypto_alloc_shash("md5", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(lmk->hash_tfm)) { - ti->error = "Error initializing LMK hash"; - return PTR_ERR(lmk->hash_tfm); + if (fips_enabled) { + ti->error = "LMK support is disabled due to FIPS"; + /* ... because it uses MD5. 
*/ + return -EINVAL; } /* No seed in LMK version 2 */ @@ -498,7 +493,6 @@ static int crypt_iv_lmk_ctr(struct crypt_config *cc, struct dm_target *ti, lmk->seed = kzalloc(LMK_SEED_SIZE, GFP_KERNEL); if (!lmk->seed) { - crypt_iv_lmk_dtr(cc); ti->error = "Error kmallocing seed storage in LMK"; return -ENOMEM; } @@ -514,7 +508,7 @@ static int crypt_iv_lmk_init(struct crypt_config *cc) /* LMK seed is on the position of LMK_KEYS + 1 key */ if (lmk->seed) memcpy(lmk->seed, cc->key + (cc->tfms_count * subkey_size), - crypto_shash_digestsize(lmk->hash_tfm)); + MD5_DIGEST_SIZE); return 0; } @@ -529,55 +523,31 @@ static int crypt_iv_lmk_wipe(struct crypt_config *cc) return 0; } -static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, - struct dm_crypt_request *dmreq, - u8 *data) +static void crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq, u8 *data) { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; - SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); - union { - struct md5_state md5state; - u8 state[CRYPTO_MD5_STATESIZE]; - } u; + struct md5_ctx ctx; __le32 buf[4]; - int i, r; - desc->tfm = lmk->hash_tfm; + md5_init(&ctx); - r = crypto_shash_init(desc); - if (r) - return r; - - if (lmk->seed) { - r = crypto_shash_update(desc, lmk->seed, LMK_SEED_SIZE); - if (r) - return r; - } + if (lmk->seed) + md5_update(&ctx, lmk->seed, LMK_SEED_SIZE); /* Sector is always 512B, block size 16, add data of blocks 1-31 */ - r = crypto_shash_update(desc, data + 16, 16 * 31); - if (r) - return r; + md5_update(&ctx, data + 16, 16 * 31); /* Sector is cropped to 56 bits here */ buf[0] = cpu_to_le32(dmreq->iv_sector & 0xFFFFFFFF); buf[1] = cpu_to_le32((((u64)dmreq->iv_sector >> 32) & 0x00FFFFFF) | 0x80000000); buf[2] = cpu_to_le32(4024); buf[3] = 0; - r = crypto_shash_update(desc, (u8 *)buf, sizeof(buf)); - if (r) - return r; + md5_update(&ctx, (u8 *)buf, sizeof(buf)); /* No MD5 padding here */ - r = crypto_shash_export(desc, &u.md5state); - if (r) - return r; - - for (i = 0; i < MD5_HASH_WORDS; i++) - __cpu_to_le32s(&u.md5state.hash[i]); - memcpy(iv, &u.md5state.hash, cc->iv_size); - - return 0; + cpu_to_le32_array(ctx.state.h, ARRAY_SIZE(ctx.state.h)); + memcpy(iv, ctx.state.h, cc->iv_size); } static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, @@ -585,17 +555,15 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *src; - int r = 0; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { sg = crypt_get_sg_data(cc, dmreq->sg_in); src = kmap_local_page(sg_page(sg)); - r = crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); + crypt_iv_lmk_one(cc, iv, dmreq, src + sg->offset); kunmap_local(src); } else memset(iv, 0, cc->iv_size); - - return r; + return 0; } static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, @@ -603,21 +571,19 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *dst; - int r; if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) return 0; sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); - r = crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); + crypt_iv_lmk_one(cc, iv, dmreq, dst + sg->offset); /* Tweak the first block of plaintext sector */ - if (!r) - crypto_xor(dst + sg->offset, iv, cc->iv_size); + crypto_xor(dst + sg->offset, iv, cc->iv_size); kunmap_local(dst); - return r; + return 0; } static void crypt_iv_tcw_dtr(struct crypt_config *cc) From 9a746ee0fb399b2021e801c5f724e75c7468fcf6 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Sat, 4 Oct 
2025 00:19:12 -0300 Subject: [PATCH 02/29] dm-crypt: use folio_nr_pages() instead of shift operation folio_nr_pages() is a faster helper function to get the number of pages when NR_PAGES_IN_LARGE_FOLIO is enabled. Signed-off-by: Pedro Demarchi Gomes Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 04a553529dc2..5eace7d4a67a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1747,7 +1747,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) bio_for_each_folio_all(fi, clone) { if (folio_test_large(fi.folio)) { percpu_counter_sub(&cc->n_allocated_pages, - 1 << folio_order(fi.folio)); + folio_nr_pages(fi.folio)); folio_put(fi.folio); } else { mempool_free(&fi.folio->page, &cc->page_pool); From 27cecacbe88f22aa2e0454cc516d64a55e9002d4 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Wed, 8 Oct 2025 23:24:11 -0400 Subject: [PATCH 03/29] dm: remove useless md->nr_zones variable md->nr_zones is no longer used for anything. Remove it. Signed-off-by: Benjamin Marzinski Reviewed-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- drivers/md/dm-core.h | 1 - drivers/md/dm-zone.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index a3c9f74fe2dc..1cda8618d74d 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -139,7 +139,6 @@ struct mapped_device { struct srcu_struct io_barrier; #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_zones; void *zone_revalidate_map; struct task_struct *revalidate_map_task; #endif diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 78e17dd4d01b..cf4553e863f0 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -193,8 +193,6 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) return ret; } - md->nr_zones = disk->nr_zones; - return 0; } @@ -442,7 +440,6 @@ void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim) set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - md->nr_zones = 0; md->disk->nr_zones = 0; } } From 4929ba5c5bd75dc28971f0909902e4624e92ad59 Mon Sep 17 00:00:00 2001 From: Vivek BalachandharTN Date: Wed, 15 Oct 2025 07:45:02 +0000 Subject: [PATCH 04/29] dm: sysfs: use sysfs_emit() in dm-sysfs.c Replace sprintf()+strlen() with sysfs_emit(), the preferred helper for sysfs show() routines. sysfs_emit() returns the number of bytes written, guarantees NUL-termination, and clamps to PAGE_SIZE-1. Reference: Documentation/filesystems/sysfs.rst. No functional change intended. 
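For reference, a minimal sketch of the show() pattern this encourages, using a generic read-only device attribute (the attribute name and value here are hypothetical, not part of this patch):

#include <linux/device.h>
#include <linux/sysfs.h>

/* Hypothetical read-only attribute illustrating the sysfs_emit() idiom. */
static ssize_t example_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{
	int value = 42;	/* stands in for real device state */

	/*
	 * sysfs_emit() formats into the page-sized buffer that sysfs
	 * provides and returns the number of bytes written, so there is
	 * no separate strlen() and no risk of overrunning the page.
	 */
	return sysfs_emit(buf, "%d\n", value);
}
static DEVICE_ATTR_RO(example);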
Signed-off-by: Vivek BalachandharTN Signed-off-by: Mikulas Patocka --- drivers/md/dm-sysfs.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index bfaef27ca79f..22bc70923a83 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -86,17 +86,13 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf) static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) { - sprintf(buf, "%d\n", dm_suspended_md(md)); - - return strlen(buf); + return sysfs_emit(buf, "%d\n", dm_suspended_md(md)); } static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf) { /* Purely for userspace compatibility */ - sprintf(buf, "%d\n", true); - - return strlen(buf); + return sysfs_emit(buf, "%d\n", true); } static DM_ATTR_RO(name); From be4addb1914f00c60599495acacf4e24e9cb8237 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Wed, 8 Oct 2025 23:04:31 -0400 Subject: [PATCH 05/29] dm: Fix deadlock when reloading a multipath table Request-based devices (dm-multipath) queue I/O in blk-mq on noflush suspends. Any queued IO will make it impossible to freeze the queue. If a process attempts to update the queue limits while there is queued IO, it can be get stuck holding the limits lock, while unable to freeze the queue. If device-mapper then attempts to update the limits during a table swap, it will deadlock trying to grab the limits lock while making it impossible to flush the IO. Disallow updating the queue limits during a table swap, when updating an immutable request-based dm device (dm-multipath) during a noflush suspend. It is userspace's responsibility to make sure that the new table uses the same limits as the existing table if it asks for a noflush suspend. Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-table.c | 4 ++++ drivers/md/dm-thin.c | 7 ++----- drivers/md/dm.c | 35 +++++++++++++++++++++++------------ 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ad0a60a07b93..0522cd700e0e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2043,6 +2043,10 @@ bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, return true; } +/* + * This function will be skipped by noflush reloads of immutable request + * based devices (dm-mpath). + */ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c84149ba4e38..6f98936f0e05 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4383,11 +4383,8 @@ static void thin_postsuspend(struct dm_target *ti) { struct thin_c *tc = ti->private; - /* - * The dm_noflush_suspending flag has been cleared by now, so - * unfortunately we must always run this. 
- */ - noflush_work(tc, do_noflush_stop); + if (dm_noflush_suspending(ti)) + noflush_work(tc, do_noflush_stop); } static int thin_preresume(struct dm_target *ti) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f5e5e59b232b..bff3ab4a3bd8 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2439,7 +2439,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, { struct dm_table *old_map; sector_t size, old_size; - int ret; lockdep_assert_held(&md->suspend_lock); @@ -2454,11 +2453,13 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, set_capacity(md->disk, size); - ret = dm_table_set_restrictions(t, md->queue, limits); - if (ret) { - set_capacity(md->disk, old_size); - old_map = ERR_PTR(ret); - goto out; + if (limits) { + int ret = dm_table_set_restrictions(t, md->queue, limits); + if (ret) { + set_capacity(md->disk, old_size); + old_map = ERR_PTR(ret); + goto out; + } } /* @@ -2836,6 +2837,7 @@ static void dm_wq_work(struct work_struct *work) static void dm_queue_flush(struct mapped_device *md) { + clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); smp_mb__after_atomic(); queue_work(md->wq, &md->work); @@ -2848,6 +2850,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) { struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL); struct queue_limits limits; + bool update_limits = true; int r; mutex_lock(&md->suspend_lock); @@ -2856,20 +2859,31 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) if (!dm_suspended_md(md)) goto out; + /* + * To avoid a potential deadlock locking the queue limits, disallow + * updating the queue limits during a table swap, when updating an + * immutable request-based dm device (dm-multipath) during a noflush + * suspend. It is userspace's responsibility to make sure that the new + * table uses the same limits as the existing table, if it asks for a + * noflush suspend. + */ + if (dm_request_based(md) && md->immutable_target && + __noflush_suspending(md)) + update_limits = false; /* * If the new table has no data devices, retain the existing limits. * This helps multipath with queue_if_no_path if all paths disappear, * then new I/O is queued based on these limits, and then some paths * reappear. */ - if (dm_table_has_no_data_devices(table)) { + else if (dm_table_has_no_data_devices(table)) { live_map = dm_get_live_table_fast(md); if (live_map) limits = md->queue->limits; dm_put_live_table_fast(md); } - if (!live_map) { + if (update_limits && !live_map) { r = dm_calculate_queue_limits(table, &limits); if (r) { map = ERR_PTR(r); @@ -2877,7 +2891,7 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) } } - map = __bind(md, table, &limits); + map = __bind(md, table, update_limits ? &limits : NULL); dm_issue_global_event(); out: @@ -2930,7 +2944,6 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. - * This flag is cleared before dm_suspend returns. 
*/ if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -2993,8 +3006,6 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, if (!r) set_bit(dmf_suspended_flag, &md->flags); - if (noflush) - clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); if (map) synchronize_srcu(&md->io_barrier); From 3ee6c4bc5307d9fcc681dc7ee15822a54b94b39c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 14 Oct 2025 14:16:54 -0700 Subject: [PATCH 06/29] dm-verity: remove log message with shash driver name I added this log message in commit bbf6a566920e ("dm verity: log the hash algorithm implementation"), to help people debug issues where they forgot to enable the architecture-optimized SHA-256 code in their kconfig or accidentally enabled a slow hardware offload driver (such as QCE) that overrode the faster CPU-accelerated code. However: - The crypto layer now always enables the architecture-optimized SHA-1, SHA-256, and SHA-512 code. Moreover, for simplicity the driver name is now fixed at "sha1-lib", "sha256-lib", etc. - dm-verity now uses crypto_shash instead of crypto_ahash, preventing the mistake of accidentally using a slow driver such as QCE. Therefore, this log message generally no longer provides useful information. Remove it. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 66a00a8ccb39..20ddf560d22e 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1252,7 +1252,6 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) } v->shash_tfm = shash; v->digest_size = crypto_shash_digestsize(shash); - DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash)); if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { ti->error = "Digest size too big"; return -EINVAL; From ba0f428c9b40364f1af92523860c787068987b8c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 14 Oct 2025 14:16:55 -0700 Subject: [PATCH 07/29] dm-verity: use SHA-256 library for SHA-256 When the hash algorithm is SHA-256 and the verity version is not 0, use the SHA-256 library instead of crypto_shash. This is a prerequisite for making dm-verity interleave the computation of SHA-256 hashes for increased performance. That optimization is available in the SHA-256 library but not in crypto_shash. Even without interleaved hashing, switching to the library also slightly improves performance by itself because it avoids the overhead of crypto_shash, including indirect calls and other API overhead. (Benchmark on x86_64, AMD Zen 5: hashing 4K blocks gets 2.1% faster.) SHA-256 is by far the most common hash algorithm used with dm-verity. 
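As an aside, the pre-salted fast path added below boils down to the following pattern (an illustrative sketch; the helper is hypothetical, while the context type and calls are the library API used in the diff): the salt is hashed once into an initial context, and each data block then resumes from a copy of that context instead of re-hashing the salt.

#include <crypto/sha2.h>

/*
 * Illustrative only: finish a SHA-256 hash over one data block,
 * starting from a context that has already absorbed the salt.
 */
static void sha256_block_with_salt(const struct sha256_ctx *salted_state,
				   const u8 *data, size_t len,
				   u8 digest[SHA256_DIGEST_SIZE])
{
	struct sha256_ctx ctx = *salted_state;	/* resume after the salt */

	sha256_update(&ctx, data, len);
	sha256_final(&ctx, digest);
}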
It makes sense to optimize for the common case and fall back to the generic crypto layer for uncommon cases, as suggested by Linus: https://lore.kernel.org/r/CAHk-=wgp-fOSsZsYrbyzqCAfEvrt5jQs1jL-97Wc4seMNTUyng@mail.gmail.com Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 1 + drivers/md/dm-verity-target.c | 61 +++++++++++++++++++++++++++-------- drivers/md/dm-verity.h | 20 +++++++++--- 3 files changed, 64 insertions(+), 18 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index dcd232a2ca24..239c1744a926 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -547,6 +547,7 @@ config DM_VERITY depends on BLK_DEV_DM select CRYPTO select CRYPTO_HASH + select CRYPTO_LIB_SHA256 select DM_BUFIO help This device-mapper target creates a read-only device that diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 20ddf560d22e..bba981080563 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -117,11 +117,25 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest) { - struct shash_desc *desc = &io->hash_desc; + struct shash_desc *desc; int r; + if (likely(v->use_sha256_lib)) { + struct sha256_ctx *ctx = &io->hash_ctx.sha256; + + /* + * Fast path using SHA-256 library. This is enabled only for + * verity version 1, where the salt is at the beginning. + */ + *ctx = *v->initial_hashstate.sha256; + sha256_update(ctx, data, len); + sha256_final(ctx, digest); + return 0; + } + + desc = &io->hash_ctx.shash; desc->tfm = v->shash_tfm; - if (unlikely(v->initial_hashstate == NULL)) { + if (unlikely(v->initial_hashstate.shash == NULL)) { /* Version 0: salt at end */ r = crypto_shash_init(desc) ?: crypto_shash_update(desc, data, len) ?: @@ -129,7 +143,7 @@ int verity_hash(struct dm_verity *v, struct dm_verity_io *io, crypto_shash_final(desc, digest); } else { /* Version 1: salt at beginning */ - r = crypto_shash_import(desc, v->initial_hashstate) ?: + r = crypto_shash_import(desc, v->initial_hashstate.shash) ?: crypto_shash_finup(desc, data, len, digest); } if (unlikely(r)) @@ -1004,7 +1018,7 @@ static void verity_dtr(struct dm_target *ti) kvfree(v->validated_blocks); kfree(v->salt); - kfree(v->initial_hashstate); + kfree(v->initial_hashstate.shash); kfree(v->root_digest); kfree(v->zero_digest); verity_free_sig(v); @@ -1069,8 +1083,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) if (!v->zero_digest) return r; - io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm), - GFP_KERNEL); + io = kmalloc(v->ti->per_io_data_size, GFP_KERNEL); if (!io) return r; /* verity_dtr will free zero_digest */ @@ -1256,6 +1269,20 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) ti->error = "Digest size too big"; return -EINVAL; } + if (likely(v->version && strcmp(alg_name, "sha256") == 0)) { + /* + * Fast path: use the library API for reduced overhead and + * interleaved hashing support. + */ + v->use_sha256_lib = true; + ti->per_io_data_size = + offsetofend(struct dm_verity_io, hash_ctx.sha256); + } else { + /* Fallback case: use the generic crypto API. 
*/ + ti->per_io_data_size = + offsetofend(struct dm_verity_io, hash_ctx.shash) + + crypto_shash_descsize(shash); + } return 0; } @@ -1276,7 +1303,18 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) return -EINVAL; } } - if (v->version) { /* Version 1: salt at beginning */ + if (likely(v->use_sha256_lib)) { + /* Implies version 1: salt at beginning */ + v->initial_hashstate.sha256 = + kmalloc(sizeof(struct sha256_ctx), GFP_KERNEL); + if (!v->initial_hashstate.sha256) { + ti->error = "Cannot allocate initial hash state"; + return -ENOMEM; + } + sha256_init(v->initial_hashstate.sha256); + sha256_update(v->initial_hashstate.sha256, + v->salt, v->salt_size); + } else if (v->version) { /* Version 1: salt at beginning */ SHASH_DESC_ON_STACK(desc, v->shash_tfm); int r; @@ -1284,16 +1322,16 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) * Compute the pre-salted hash state that can be passed to * crypto_shash_import() for each block later. */ - v->initial_hashstate = kmalloc( + v->initial_hashstate.shash = kmalloc( crypto_shash_statesize(v->shash_tfm), GFP_KERNEL); - if (!v->initial_hashstate) { + if (!v->initial_hashstate.shash) { ti->error = "Cannot allocate initial hash state"; return -ENOMEM; } desc->tfm = v->shash_tfm; r = crypto_shash_init(desc) ?: crypto_shash_update(desc, v->salt, v->salt_size) ?: - crypto_shash_export(desc, v->initial_hashstate); + crypto_shash_export(desc, v->initial_hashstate.shash); if (r) { ti->error = "Cannot set up initial hash state"; return r; @@ -1555,9 +1593,6 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - ti->per_io_data_size = sizeof(struct dm_verity_io) + - crypto_shash_descsize(v->shash_tfm); - r = verity_fec_ctr(v); if (r) goto bad; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 6d141abd965c..cdcee68a4bc0 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -16,6 +16,7 @@ #include #include #include +#include #define DM_VERITY_MAX_LEVELS 63 @@ -42,7 +43,10 @@ struct dm_verity { struct crypto_shash *shash_tfm; u8 *root_digest; /* digest of the root block */ u8 *salt; /* salt: its size is salt_size */ - u8 *initial_hashstate; /* salted initial state, if version >= 1 */ + union { + struct sha256_ctx *sha256; /* for use_sha256_lib=1 */ + u8 *shash; /* for use_sha256_lib=0 */ + } initial_hashstate; /* salted initial state, if version >= 1 */ u8 *zero_digest; /* digest for a zero block */ #ifdef CONFIG_SECURITY u8 *root_digest_sig; /* signature of the root digest */ @@ -59,6 +63,7 @@ struct dm_verity { unsigned char version; bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ + bool use_sha256_lib:1; /* use SHA-256 library instead of generic crypto API */ unsigned int digest_size; /* digest size for the current hash algorithm */ enum verity_mode mode; /* mode for handling verification errors */ enum verity_mode error_mode;/* mode for handling I/O errors */ @@ -98,11 +103,16 @@ struct dm_verity_io { u8 want_digest[HASH_MAX_DIGESTSIZE]; /* - * Temporary space for hashing. This is variable-length and must be at - * the end of the struct. struct shash_desc is just the fixed part; - * it's followed by a context of size crypto_shash_descsize(shash_tfm). + * Temporary space for hashing. Either sha256 or shash is used, + * depending on the value of use_sha256_lib. 
If shash is used, + * then this field is variable-length, with total size + * sizeof(struct shash_desc) + crypto_shash_descsize(shash_tfm). + * For this reason, this field must be the end of the struct. */ - struct shash_desc hash_desc; + union { + struct sha256_ctx sha256; + struct shash_desc shash; + } hash_ctx; }; static inline u8 *verity_io_real_digest(struct dm_verity *v, From 379475dc88fc44f57760e6057b038073e352aaea Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 14 Oct 2025 14:16:56 -0700 Subject: [PATCH 08/29] dm-verity: reduce scope of real and wanted digests In preparation for supporting interleaved hashing where dm-verity will need to keep track of the real and wanted digests for multiple data blocks simultaneously, stop using the want_digest and real_digest fields of struct dm_verity_io from so many different places. Specifically: - Make various functions take want_digest as a parameter rather than having it be implicitly passed via the struct dm_verity_io. - Add a new tmp_digest field, and use this instead of real_digest when computing a digest solely for the purpose of immediately checking it. The result is that real_digest and want_digest are used only by verity_verify_io(). Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-fec.c | 21 +++++++++---------- drivers/md/dm-verity-fec.h | 5 +++-- drivers/md/dm-verity-target.c | 38 ++++++++++++++++++----------------- drivers/md/dm-verity.h | 1 + 4 files changed, 34 insertions(+), 31 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index d382a390d39a..301a9c01bf86 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -188,14 +188,13 @@ error: * Locate data block erasures using verity hashes. */ static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, - u8 *want_digest, u8 *data) + const u8 *want_digest, const u8 *data) { if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)))) + io->tmp_digest))) return 0; - return memcmp(verity_io_real_digest(v, io), want_digest, - v->digest_size) != 0; + return memcmp(io->tmp_digest, want_digest, v->digest_size) != 0; } /* @@ -366,7 +365,7 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) */ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, struct dm_verity_fec_io *fio, u64 rsb, u64 offset, - bool use_erasures) + const u8 *want_digest, bool use_erasures) { int r, neras = 0; unsigned int pos; @@ -392,12 +391,11 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Always re-validate the corrected block against the expected hash */ r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)); + io->tmp_digest); if (unlikely(r < 0)) return r; - if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), - v->digest_size)) { + if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", v->data_dev->name, (unsigned long long)rsb, neras); return -EILSEQ; @@ -408,7 +406,8 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, /* Correct errors in a block. Copies corrected block to dest. 
*/ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, - enum verity_block_type type, sector_t block, u8 *dest) + enum verity_block_type type, const u8 *want_digest, + sector_t block, u8 *dest) { int r; struct dm_verity_fec_io *fio = fec_io(io); @@ -451,9 +450,9 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, * them first. Do a second attempt with erasures if the corruption is * bad enough. */ - r = fec_decode_rsb(v, io, fio, rsb, offset, false); + r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false); if (r < 0) { - r = fec_decode_rsb(v, io, fio, rsb, offset, true); + r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true); if (r < 0) goto done; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 09123a612953..a6689cdc489d 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -68,8 +68,8 @@ struct dm_verity_fec_io { extern bool verity_fec_is_enabled(struct dm_verity *v); extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, - enum verity_block_type type, sector_t block, - u8 *dest); + enum verity_block_type type, const u8 *want_digest, + sector_t block, u8 *dest); extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, char *result, unsigned int maxlen); @@ -99,6 +99,7 @@ static inline bool verity_fec_is_enabled(struct dm_verity *v) static inline int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, enum verity_block_type type, + const u8 *want_digest, sector_t block, u8 *dest) { return -EOPNOTSUPP; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bba981080563..af9f1544af3e 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -229,12 +229,12 @@ out: * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * - * On successful return, verity_io_want_digest(v, io) contains the hash value - * for a lower tree level or for the data block (if we're at the lowest level). + * On successful return, want_digest contains the hash value for a lower tree + * level or for the data block (if we're at the lowest level). * * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. * If "skip_unverified" is false, unverified buffer is hashed and verified - * against current value of verity_io_want_digest(v, io). + * against current value of want_digest. 
*/ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, sector_t block, int level, bool skip_unverified, @@ -273,7 +273,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (IS_ERR(data)) return r; if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block, data) == 0) { + want_digest, hash_block, data) == 0) { aux = dm_bufio_get_aux_data(buf); aux->hash_verified = 1; goto release_ok; @@ -293,11 +293,11 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, - verity_io_real_digest(v, io)); + io->tmp_digest); if (unlikely(r < 0)) goto release_ret_r; - if (likely(memcmp(verity_io_real_digest(v, io), want_digest, + if (likely(memcmp(io->tmp_digest, want_digest, v->digest_size) == 0)) aux->hash_verified = 1; else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { @@ -308,7 +308,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, r = -EAGAIN; goto release_ret_r; } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, - hash_block, data) == 0) + want_digest, hash_block, data) == 0) aux->hash_verified = 1; else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, @@ -372,7 +372,8 @@ out: } static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, - sector_t cur_block, u8 *dest) + const u8 *want_digest, sector_t cur_block, + u8 *dest) { struct page *page; void *buffer; @@ -396,12 +397,11 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, goto free_ret; r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, - verity_io_real_digest(v, io)); + io->tmp_digest); if (unlikely(r)) goto free_ret; - if (memcmp(verity_io_real_digest(v, io), - verity_io_want_digest(v, io), v->digest_size)) { + if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { r = -EIO; goto free_ret; } @@ -416,8 +416,9 @@ free_ret: static int verity_handle_data_hash_mismatch(struct dm_verity *v, struct dm_verity_io *io, - struct bio *bio, sector_t blkno, - u8 *data) + struct bio *bio, + const u8 *want_digest, + sector_t blkno, u8 *data) { if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* @@ -426,14 +427,14 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, */ return -EAGAIN; } - if (verity_recheck(v, io, blkno, data) == 0) { + if (verity_recheck(v, io, want_digest, blkno, data) == 0) { if (v->validated_blocks) set_bit(blkno, v->validated_blocks); return 0; } #if defined(CONFIG_DM_VERITY_FEC) - if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno, - data) == 0) + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, want_digest, + blkno, data) == 0) return 0; #endif if (bio->bi_status) @@ -525,8 +526,9 @@ static int verity_verify_io(struct dm_verity_io *io) kunmap_local(data); continue; } - r = verity_handle_data_hash_mismatch(v, io, bio, cur_block, - data); + r = verity_handle_data_hash_mismatch(v, io, bio, + verity_io_want_digest(v, io), + cur_block, data); kunmap_local(data); if (unlikely(r)) return r; diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index cdcee68a4bc0..cf7973ed3059 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -99,6 +99,7 @@ struct dm_verity_io { struct work_struct work; struct work_struct bh_work; + u8 tmp_digest[HASH_MAX_DIGESTSIZE]; u8 real_digest[HASH_MAX_DIGESTSIZE]; u8 want_digest[HASH_MAX_DIGESTSIZE]; From 23f57ed9d26e309010996a6809e410ed59c7ec7c Mon Sep 17 
00:00:00 2001 From: Eric Biggers Date: Tue, 14 Oct 2025 14:16:57 -0700 Subject: [PATCH 09/29] dm-verity: use 2-way interleaved SHA-256 hashing when supported When the crypto library provides an optimized implementation of sha256_finup_2x(), use it to interleave the hashing of pairs of data blocks. On some CPUs this nearly doubles hashing performance. The increase in overall throughput of cold-cache dm-verity reads that I'm seeing on arm64 and x86_64 is roughly 35% (though this metric is hard to measure as it jumps around a lot). For now this is done only on data blocks, not Merkle tree blocks. We could use sha256_finup_2x() on Merkle tree blocks too, but that is less important as there aren't as many Merkle tree blocks as data blocks, and that would require some additional code restructuring. Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/dm-verity-target.c | 113 ++++++++++++++++++++++++++-------- drivers/md/dm-verity.h | 31 +++++----- 2 files changed, 103 insertions(+), 41 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index af9f1544af3e..bf0aee73b074 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -417,9 +417,12 @@ free_ret: static int verity_handle_data_hash_mismatch(struct dm_verity *v, struct dm_verity_io *io, struct bio *bio, - const u8 *want_digest, - sector_t blkno, u8 *data) + struct pending_block *block) { + const u8 *want_digest = block->want_digest; + sector_t blkno = block->blkno; + u8 *data = block->data; + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* * Error handling code (FEC included) cannot be run in the @@ -448,6 +451,58 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, return 0; } +static void verity_clear_pending_blocks(struct dm_verity_io *io) +{ + int i; + + for (i = io->num_pending - 1; i >= 0; i--) { + kunmap_local(io->pending_blocks[i].data); + io->pending_blocks[i].data = NULL; + } + io->num_pending = 0; +} + +static int verity_verify_pending_blocks(struct dm_verity *v, + struct dm_verity_io *io, + struct bio *bio) +{ + const unsigned int block_size = 1 << v->data_dev_block_bits; + int i, r; + + if (io->num_pending == 2) { + /* num_pending == 2 implies that the algorithm is SHA-256 */ + sha256_finup_2x(v->initial_hashstate.sha256, + io->pending_blocks[0].data, + io->pending_blocks[1].data, block_size, + io->pending_blocks[0].real_digest, + io->pending_blocks[1].real_digest); + } else { + for (i = 0; i < io->num_pending; i++) { + r = verity_hash(v, io, io->pending_blocks[i].data, + block_size, + io->pending_blocks[i].real_digest); + if (unlikely(r)) + return r; + } + } + + for (i = 0; i < io->num_pending; i++) { + struct pending_block *block = &io->pending_blocks[i]; + + if (likely(memcmp(block->real_digest, block->want_digest, + v->digest_size) == 0)) { + if (v->validated_blocks) + set_bit(block->blkno, v->validated_blocks); + } else { + r = verity_handle_data_hash_mismatch(v, io, bio, block); + if (unlikely(r)) + return r; + } + } + verity_clear_pending_blocks(io); + return 0; +} + /* * Verify one "dm_verity_io" structure. */ @@ -455,10 +510,14 @@ static int verity_verify_io(struct dm_verity_io *io) { struct dm_verity *v = io->v; const unsigned int block_size = 1 << v->data_dev_block_bits; + const int max_pending = v->use_sha256_finup_2x ? 
2 : 1; struct bvec_iter iter_copy; struct bvec_iter *iter; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); unsigned int b; + int r; + + io->num_pending = 0; if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { /* @@ -472,21 +531,22 @@ static int verity_verify_io(struct dm_verity_io *io) for (b = 0; b < io->n_blocks; b++, bio_advance_iter(bio, iter, block_size)) { - int r; - sector_t cur_block = io->block + b; + sector_t blkno = io->block + b; + struct pending_block *block; bool is_zero; struct bio_vec bv; void *data; if (v->validated_blocks && bio->bi_status == BLK_STS_OK && - likely(test_bit(cur_block, v->validated_blocks))) + likely(test_bit(blkno, v->validated_blocks))) continue; - r = verity_hash_for_block(v, io, cur_block, - verity_io_want_digest(v, io), + block = &io->pending_blocks[io->num_pending]; + + r = verity_hash_for_block(v, io, blkno, block->want_digest, &is_zero); if (unlikely(r < 0)) - return r; + goto error; bv = bio_iter_iovec(bio, *iter); if (unlikely(bv.bv_len < block_size)) { @@ -497,7 +557,8 @@ static int verity_verify_io(struct dm_verity_io *io) * data block size to be greater than PAGE_SIZE. */ DMERR_LIMIT("unaligned io (data block spans pages)"); - return -EIO; + r = -EIO; + goto error; } data = bvec_kmap_local(&bv); @@ -511,30 +572,26 @@ static int verity_verify_io(struct dm_verity_io *io) kunmap_local(data); continue; } - - r = verity_hash(v, io, data, block_size, - verity_io_real_digest(v, io)); - if (unlikely(r < 0)) { - kunmap_local(data); - return r; + block->data = data; + block->blkno = blkno; + if (++io->num_pending == max_pending) { + r = verity_verify_pending_blocks(v, io, bio); + if (unlikely(r)) + goto error; } + } - if (likely(memcmp(verity_io_real_digest(v, io), - verity_io_want_digest(v, io), v->digest_size) == 0)) { - if (v->validated_blocks) - set_bit(cur_block, v->validated_blocks); - kunmap_local(data); - continue; - } - r = verity_handle_data_hash_mismatch(v, io, bio, - verity_io_want_digest(v, io), - cur_block, data); - kunmap_local(data); + if (io->num_pending) { + r = verity_verify_pending_blocks(v, io, bio); if (unlikely(r)) - return r; + goto error; } return 0; + +error: + verity_clear_pending_blocks(io); + return r; } /* @@ -1277,6 +1334,8 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) * interleaved hashing support. 
*/ v->use_sha256_lib = true; + if (sha256_finup_2x_is_optimized()) + v->use_sha256_finup_2x = true; ti->per_io_data_size = offsetofend(struct dm_verity_io, hash_ctx.sha256); } else { diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index cf7973ed3059..f975a9e5c5d6 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -64,6 +64,7 @@ struct dm_verity { bool hash_failed:1; /* set if hash of any block failed */ bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ bool use_sha256_lib:1; /* use SHA-256 library instead of generic crypto API */ + bool use_sha256_finup_2x:1; /* use interleaved hashing optimization */ unsigned int digest_size; /* digest size for the current hash algorithm */ enum verity_mode mode; /* mode for handling verification errors */ enum verity_mode error_mode;/* mode for handling I/O errors */ @@ -83,6 +84,13 @@ struct dm_verity { mempool_t recheck_pool; }; +struct pending_block { + void *data; + sector_t blkno; + u8 want_digest[HASH_MAX_DIGESTSIZE]; + u8 real_digest[HASH_MAX_DIGESTSIZE]; +}; + struct dm_verity_io { struct dm_verity *v; @@ -100,8 +108,15 @@ struct dm_verity_io { struct work_struct bh_work; u8 tmp_digest[HASH_MAX_DIGESTSIZE]; - u8 real_digest[HASH_MAX_DIGESTSIZE]; - u8 want_digest[HASH_MAX_DIGESTSIZE]; + + /* + * This is the queue of data blocks that are pending verification. When + * the crypto layer supports interleaved hashing, we allow multiple + * blocks to be queued up in order to utilize it. This can improve + * performance significantly vs. sequential hashing of each block. + */ + int num_pending; + struct pending_block pending_blocks[2]; /* * Temporary space for hashing. Either sha256 or shash is used, @@ -116,18 +131,6 @@ struct dm_verity_io { } hash_ctx; }; -static inline u8 *verity_io_real_digest(struct dm_verity *v, - struct dm_verity_io *io) -{ - return io->real_digest; -} - -static inline u8 *verity_io_want_digest(struct dm_verity *v, - struct dm_verity_io *io) -{ - return io->want_digest; -} - extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, const u8 *data, size_t len, u8 *digest); From c82faa893418f584da8f38f9cbdda4533f49fd55 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Fri, 31 Oct 2025 15:34:55 +0000 Subject: [PATCH 10/29] dm: Don't warn if IMA_DISABLE_HTABLE is not enabled Commit f1cd6cb24b6b ("dm ima: add a warning in dm_init if duplicate ima events are not measured") added a warning message if CONFIG_IMA is enabled but CONFIG_IMA_DISABLE_HTABLE is not to inform users. When enabling CONFIG_IMA, CONFIG_IMA_DISABLE_HTABLE is disabled by default and so warning is seen. Therefore, it seems more appropriate to make this an INFO level message than warning. If this truly is a warning, then maybe CONFIG_IMA_DISABLE_HTABLE should default to y if CONFIG_IMA is enabled. Signed-off-by: Jon Hunter Signed-off-by: Mikulas Patocka --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index bff3ab4a3bd8..557f3f52edf4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -272,7 +272,7 @@ static int __init dm_init(void) int r, i; #if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) - DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled." + DMINFO("CONFIG_IMA_DISABLE_HTABLE is disabled." 
" Duplicate IMA measurements will not be recorded in the IMA log."); #endif From ae97648e14f7907f4b0e0b295eb2fdcf43806f9d Mon Sep 17 00:00:00 2001 From: Shubhankar Mishra Date: Wed, 5 Nov 2025 14:06:44 +0000 Subject: [PATCH 11/29] dm verity fec: Expose corrected block count via status Enhance visibility into dm-verity Forward Error Correction (FEC) activity. While FEC can correct on-disk corruptions, the number of successful correction events is not readily exposed through a standard interface. This change integrates FEC statistics into the verity target's .status handler for STATUSTYPE_INFO. The info output now includes count of corrected block by FEC. The counter is a per-device instance atomic64_t, maintained within the struct dm_verity_fec, tracking blocks successfully repaired by FEC on this specific device instance since it was created. This approach aligns with the standard Device Mapper mechanism for targets to report runtime information, as used by other targets like dm-integrity. This patch also updates Documentation/admin-guide/device-mapper/verity.rst to reflect the new status information. Tested: Induced single-bit errors on a block device protected by dm-verity with FEC on android phone. Confirmed 'dmctl status ' on Android reports an incrementing 'fec_corrected_blocks' count after the corrupted blocks were accessed. Signed-off-by: Shubhankar Mishra Signed-off-by: Mikulas Patocka --- Documentation/admin-guide/device-mapper/verity.rst | 6 ++++-- drivers/md/dm-verity-fec.c | 4 +++- drivers/md/dm-verity-fec.h | 1 + drivers/md/dm-verity-target.c | 4 ++++ 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index 8c3f1f967a3c..3ecab1cff9c6 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -236,8 +236,10 @@ is available at the cryptsetup project's wiki page Status ====== -V (for Valid) is returned if every check performed so far was valid. -If any check failed, C (for Corruption) is returned. +1. V (for Valid) is returned if every check performed so far was valid. + If any check failed, C (for Corruption) is returned. +2. Number of corrected blocks by Forward Error Correction. + '-' if Forward Error Correction is not enabled. 
Example ======= diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 301a9c01bf86..d792eaed0792 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -177,9 +177,11 @@ error: if (r < 0 && neras) DMERR_LIMIT("%s: FEC %llu: failed to correct: %d", v->data_dev->name, (unsigned long long)rsb, r); - else if (r > 0) + else if (r > 0) { DMWARN_LIMIT("%s: FEC %llu: corrected %d errors", v->data_dev->name, (unsigned long long)rsb, r); + atomic64_inc(&v->fec->corrected); + } return r; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index a6689cdc489d..dd55037377b6 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -48,6 +48,7 @@ struct dm_verity_fec { mempool_t extra_pool; /* mempool for extra buffers */ mempool_t output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ + atomic64_t corrected; /* corrected errors */ }; /* per-bio data */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index bf0aee73b074..52a0e052a5e8 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -848,6 +848,10 @@ static void verity_status(struct dm_target *ti, status_type_t type, switch (type) { case STATUSTYPE_INFO: DMEMIT("%c", v->hash_failed ? 'C' : 'V'); + if (verity_fec_is_enabled(v)) + DMEMIT(" %lld", atomic64_read(&v->fec->corrected)); + else + DMEMIT(" -"); break; case STATUSTYPE_TABLE: DMEMIT("%u %s %s %u %u %llu %llu %s ", From 61c73e8de99370ad0ee96ef6d65d8e35d302c5c1 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 28 Oct 2025 15:08:05 +0000 Subject: [PATCH 12/29] dm mpath: enable DM_TARGET_ATOMIC_WRITES Both the bio- and rq-based paths have no problem supporting REQ_ATOMIC, so enable DM_TARGET_ATOMIC_WRITES. Signed-off-by: John Garry Signed-off-by: Mikulas Patocka --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index aaf4a0a4b0eb..5dd90b2cdb9b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -2307,7 +2307,7 @@ static struct target_type multipath_target = { .name = "multipath", .version = {1, 15, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | - DM_TARGET_PASSES_INTEGRITY, + DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ATOMIC_WRITES, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, From 7fa3e7d114abc9cc71cc35d768e116641074ddb4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Mon, 17 Nov 2025 11:59:45 +0100 Subject: [PATCH 13/29] dm-ebs: Mark full buffer dirty even on partial write When performing a read-modify-write(RMW) operation, any modification to a buffered block must cause the entire buffer to be marked dirty. Marking only a subrange as dirty is incorrect because the underlying device block size(ubs) defines the minimum read/write granularity. A lower device can perform I/O only on regions which are fully aligned and sized to ubs. This change ensures that write-back operations always occur in full ubs-sized chunks, matching the intended emulation semantics of the EBS target. As for user space visible impact, submitting sub-ubs and misaligned I/O for devices which are tuned to ubs sizes only, will reject such requests, therefore it can lead to losing data. 
Example: 1) Create a 8K nvme device in qemu by adding -device nvme,drive=drv0,serial=foo,logical_block_size=8192,physical_block_size=8192 2) Setup dm-ebs to emulate 512B to 8K mapping urezki@pc638:~/bin$ cat dmsetup.sh lower=/dev/nvme0n1 len=$(blockdev --getsz "$lower") echo "0 $len ebs $lower 0 1 16" | dmsetup create nvme-8k urezki@pc638:~/bin$ offset 0, ebs=1 and ubs=16(in sectors). 3) Create an ext4 filesystem(default 4K block size) urezki@pc638:~/bin$ sudo mkfs.ext4 -F /dev/dm-0 mke2fs 1.47.0 (5-Feb-2023) Discarding device blocks: done Creating filesystem with 2072576 4k blocks and 518144 inodes Filesystem UUID: bd0b6ca6-0506-4e31-86da-8d22c9d50b63 Superblock backups stored on blocks: 32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632 Allocating group tables: done Writing inode tables: done Creating journal (16384 blocks): done Writing superblocks and filesystem accounting information: mkfs.ext4: Input/output error while writing out and closing file system urezki@pc638:~/bin$ dmesg [ 1618.875449] buffer_io_error: 1028 callbacks suppressed [ 1618.875456] Buffer I/O error on dev dm-0, logical block 0, lost async page write [ 1618.875527] Buffer I/O error on dev dm-0, logical block 1, lost async page write [ 1618.875602] Buffer I/O error on dev dm-0, logical block 2, lost async page write [ 1618.875620] Buffer I/O error on dev dm-0, logical block 3, lost async page write [ 1618.875639] Buffer I/O error on dev dm-0, logical block 4, lost async page write [ 1618.894316] Buffer I/O error on dev dm-0, logical block 5, lost async page write [ 1618.894358] Buffer I/O error on dev dm-0, logical block 6, lost async page write [ 1618.894380] Buffer I/O error on dev dm-0, logical block 7, lost async page write [ 1618.894405] Buffer I/O error on dev dm-0, logical block 8, lost async page write [ 1618.894427] Buffer I/O error on dev dm-0, logical block 9, lost async page write Many I/O errors because the lower 8K device rejects sub-ubs/misaligned requests. with a patch: urezki@pc638:~/bin$ sudo mkfs.ext4 -F /dev/dm-0 mke2fs 1.47.0 (5-Feb-2023) Discarding device blocks: done Creating filesystem with 2072576 4k blocks and 518144 inodes Filesystem UUID: 9b54f44f-ef55-4bd4-9e40-c8b775a616ac Superblock backups stored on blocks: 32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632 Allocating group tables: done Writing inode tables: done Creating journal (16384 blocks): done Writing superblocks and filesystem accounting information: done urezki@pc638:~/bin$ sudo mount /dev/dm-0 /mnt/ urezki@pc638:~/bin$ ls -al /mnt/ total 24 drwxr-xr-x 3 root root 4096 Oct 17 15:13 . drwxr-xr-x 19 root root 4096 Jul 10 19:42 .. drwx------ 2 root root 16384 Oct 17 15:13 lost+found urezki@pc638:~/bin$ After this change: mkfs completes; mount succeeds. 
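To make the rule the fix enforces concrete, here is a simplified sketch (a hypothetical helper, not the actual dm-ebs code; "b" is the dm-bufio buffer backing one ubs-sized block and "ba" its data):

#include <linux/dm-bufio.h>
#include <linux/string.h>

/*
 * Apply a sub-ubs write to a buffer that covers one full underlying
 * block.  Only [offset, offset + len) changes, but the whole buffer is
 * marked dirty, so write-back is issued as one full, aligned ubs-sized
 * I/O -- the only granularity the lower device accepts.
 */
static void ebs_apply_partial_write(struct dm_buffer *b, void *ba,
				    unsigned int offset, unsigned int len,
				    const void *src)
{
	memcpy(ba + offset, src, len);	/* modify only a sub-range */
	dm_bufio_mark_buffer_dirty(b);	/* but dirty the full buffer */
	dm_bufio_release(b);
}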
Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-ebs-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 6abb31ca9662..b354e74a670e 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -103,7 +103,7 @@ static int __ebs_rw_bvec(struct ebs_c *ec, enum req_op op, struct bio_vec *bv, } else { flush_dcache_page(bv->bv_page); memcpy(ba, pa, cur_len); - dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); + dm_bufio_mark_buffer_dirty(b); } dm_bufio_release(b); From d9f3e47d3fae0c101d9094bc956ed24e7a0ee801 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Nov 2025 16:54:01 +0100 Subject: [PATCH 14/29] dm-verity: disable recursive forward error correction There are two problems with the recursive correction: 1. It may cause denial-of-service. In fec_read_bufs, there is a loop that has 253 iterations. For each iteration, we may call verity_hash_for_block recursively. There is a limit of 4 nested recursions - that means that there may be at most 253^4 (4 billion) iterations. Red Hat QE team actually created an image that pushes dm-verity to this limit - and this image just makes the udev-worker process get stuck in the 'D' state. 2. It doesn't work. In fec_read_bufs we store data into the variable "fio->bufs", but fio bufs is shared between recursive invocations, if "verity_hash_for_block" invoked correction recursively, it would overwrite partially filled fio->bufs. Signed-off-by: Mikulas Patocka Reported-by: Guangwu Zhang Reviewed-by: Sami Tolvanen Reviewed-by: Eric Biggers --- drivers/md/dm-verity-fec.c | 4 +--- drivers/md/dm-verity-fec.h | 3 --- drivers/md/dm-verity-target.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index d792eaed0792..9dbb68c9afe4 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -418,10 +418,8 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, if (!verity_fec_is_enabled(v)) return -EOPNOTSUPP; - if (fio->level >= DM_VERITY_FEC_MAX_RECURSION) { - DMWARN_LIMIT("%s: FEC: recursion too deep", v->data_dev->name); + if (fio->level) return -EIO; - } fio->level++; diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index dd55037377b6..f0e991a613f0 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -23,9 +23,6 @@ #define DM_VERITY_FEC_BUF_MAX \ (1 << (PAGE_SHIFT - DM_VERITY_FEC_BUF_RS_BITS)) -/* maximum recursion level for verity_fec_decode */ -#define DM_VERITY_FEC_MAX_RECURSION 4 - #define DM_VERITY_OPT_FEC_DEV "use_fec_from_device" #define DM_VERITY_OPT_FEC_BLOCKS "fec_blocks" #define DM_VERITY_OPT_FEC_START "fec_start" diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 52a0e052a5e8..5c17472d7896 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1789,7 +1789,7 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 12, 0}, + .version = {1, 13, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, From b9dd1f71e6fca46c9efed7e1328d1b2f4dacd19b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 17 Nov 2025 21:43:54 +0100 Subject: [PATCH 15/29] dm-verity: remove useless mempool v->fec->extra_pool 
has zero reserved entries, so we can remove it and use the kernel cache directly. Signed-off-by: Mikulas Patocka Reviewed-by: Eric Biggers --- drivers/md/dm-verity-fec.c | 12 +++--------- drivers/md/dm-verity-fec.h | 1 - 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 9dbb68c9afe4..5f3cb4f05d72 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -333,7 +333,7 @@ static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) if (fio->bufs[n]) continue; - fio->bufs[n] = mempool_alloc(&v->fec->extra_pool, GFP_NOWAIT); + fio->bufs[n] = kmem_cache_alloc(v->fec->cache, GFP_NOWAIT); /* we can manage with even one buffer if necessary */ if (unlikely(!fio->bufs[n])) break; @@ -482,7 +482,8 @@ void verity_fec_finish_io(struct dm_verity_io *io) mempool_free(fio->bufs[n], &f->prealloc_pool); fec_for_each_extra_buffer(fio, n) - mempool_free(fio->bufs[n], &f->extra_pool); + if (fio->bufs[n]) + kmem_cache_free(f->cache, fio->bufs[n]); mempool_free(fio->output, &f->output_pool); } @@ -534,7 +535,6 @@ void verity_fec_dtr(struct dm_verity *v) mempool_exit(&f->rs_pool); mempool_exit(&f->prealloc_pool); - mempool_exit(&f->extra_pool); mempool_exit(&f->output_pool); kmem_cache_destroy(f->cache); @@ -787,12 +787,6 @@ int verity_fec_ctr(struct dm_verity *v) return ret; } - ret = mempool_init_slab_pool(&f->extra_pool, 0, f->cache); - if (ret) { - ti->error = "Cannot allocate FEC buffer extra pool"; - return ret; - } - /* Preallocate an output buffer for each thread */ ret = mempool_init_kmalloc_pool(&f->output_pool, num_online_cpus(), 1 << v->data_dev_block_bits); diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index f0e991a613f0..5fd267873812 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -42,7 +42,6 @@ struct dm_verity_fec { unsigned char rsn; /* N of RS(M, N) */ mempool_t rs_pool; /* mempool for fio->rs */ mempool_t prealloc_pool; /* mempool for preallocated buffers */ - mempool_t extra_pool; /* mempool for extra buffers */ mempool_t output_pool; /* mempool for output */ struct kmem_cache *cache; /* cache for buffers */ atomic64_t corrected; /* corrected errors */ From de67c139b3846ece6b8bbb62abf1f010ae85c083 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 5 Nov 2025 16:01:33 +0100 Subject: [PATCH 16/29] dm: test for REQ_ATOMIC in dm_accept_partial_bio() Any bio with REQ_ATOMIC flag set should never be split or partially completed, so BUG_ON() on this scenario in dm_accept_partial_bio() (whose intent is to allow partial completions). Also, we must reject atomic bio to targets that don't support them, otherwise this BUG could be triggered by stray bios that have the REQ_ATOMIC set. 
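For illustration, the pattern a bio-based target is now expected to follow (a hypothetical, simplified helper mirroring the dm-crypt change in a later patch; max_sectors stands for whatever per-target limit applies):

#include <linux/bio.h>
#include <linux/device-mapper.h>

/*
 * An atomic write must be mapped in one piece: a too-large one is
 * failed rather than split, because dm_accept_partial_bio() now
 * BUG()s on REQ_ATOMIC.
 */
static int example_map(struct dm_target *ti, struct bio *bio,
		       unsigned int max_sectors)
{
	if (bio_sectors(bio) > max_sectors) {
		if (bio->bi_opf & REQ_ATOMIC)
			return DM_MAPIO_KILL;	/* never split or complete partially */
		dm_accept_partial_bio(bio, max_sectors);
	}
	return DM_MAPIO_REMAPPED;
}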
Signed-off-by: John Garry Signed-off-by: Mikulas Patocka Tested-by: John Garry --- drivers/md/dm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 557f3f52edf4..44be646574b7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1321,6 +1321,7 @@ void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors) BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); BUG_ON(bio_sectors > *tio->len_ptr); BUG_ON(n_sectors > bio_sectors); + BUG_ON(bio->bi_opf & REQ_ATOMIC); if (static_branch_unlikely(&zoned_enabled) && unlikely(bdev_is_zoned(bio->bi_bdev))) { @@ -1735,8 +1736,12 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) ci->submit_as_polled = !!(ci->bio->bi_opf & REQ_POLLED); len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); - if (ci->bio->bi_opf & REQ_ATOMIC && len != ci->sector_count) - return BLK_STS_IOERR; + if (ci->bio->bi_opf & REQ_ATOMIC) { + if (unlikely(!dm_target_supports_atomic_writes(ti->type))) + return BLK_STS_IOERR; + if (unlikely(len != ci->sector_count)) + return BLK_STS_IOERR; + } setup_split_accounting(ci, len); From ce51c6963a91cc6d5c9cf6c3735991882f72587d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 5 Nov 2025 16:02:36 +0100 Subject: [PATCH 17/29] dm-crypt: enable DM_TARGET_ATOMIC_WRITES Allow handling of bios with REQ_ATOMIC flag set. Don't split these bios and fail them if they overrun the hard limit "BIO_MAX_VECS << PAGE_SHIFT". In order to simplify the code, this commit joins the logic that avoids splitting emulated zone append bios with the logic that avoids splitting atomic write bios. Signed-off-by: John Garry Signed-off-by: Mikulas Patocka Tested-by: John Garry --- drivers/md/dm-crypt.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 5eace7d4a67a..79704fbc523b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -254,22 +254,15 @@ static unsigned int max_write_size = 0; module_param(max_write_size, uint, 0644); MODULE_PARM_DESC(max_write_size, "Maximum size of a write request"); -static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio) +static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio, bool no_split) { struct crypt_config *cc = ti->private; unsigned val, sector_align; bool wrt = op_is_write(bio_op(bio)); - if (wrt) { - /* - * For zoned devices, splitting write operations creates the - * risk of deadlocking queue freeze operations with zone write - * plugging BIO work when the reminder of a split BIO is - * issued. So always allow the entire BIO to proceed. - */ - if (ti->emulate_zone_append) - return bio_sectors(bio); - + if (no_split) { + val = -1; + } else if (wrt) { val = min_not_zero(READ_ONCE(max_write_size), DM_CRYPT_DEFAULT_MAX_WRITE_SIZE); } else { @@ -3462,6 +3455,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) struct dm_crypt_io *io; struct crypt_config *cc = ti->private; unsigned max_sectors; + bool no_split; /* * If bio is REQ_PREFLUSH or REQ_OP_DISCARD, just bypass crypt queues. @@ -3479,10 +3473,20 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) /* * Check if bio is too large, split as needed. + * + * For zoned devices, splitting write operations creates the + * risk of deadlocking queue freeze operations with zone write + * plugging BIO work when the reminder of a split BIO is + * issued. 
So always allow the entire BIO to proceed. */ - max_sectors = get_max_request_sectors(ti, bio); - if (unlikely(bio_sectors(bio) > max_sectors)) + no_split = (ti->emulate_zone_append && op_is_write(bio_op(bio))) || + (bio->bi_opf & REQ_ATOMIC); + max_sectors = get_max_request_sectors(ti, bio, no_split); + if (unlikely(bio_sectors(bio) > max_sectors)) { + if (unlikely(no_split)) + return DM_MAPIO_KILL; dm_accept_partial_bio(bio, max_sectors); + } /* * Ensure that bio is a multiple of internal sector encryption size @@ -3728,15 +3732,20 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) if (ti->emulate_zone_append) limits->max_hw_sectors = min(limits->max_hw_sectors, BIO_MAX_VECS << PAGE_SECTORS_SHIFT); + + limits->atomic_write_hw_unit_max = min(limits->atomic_write_hw_unit_max, + BIO_MAX_VECS << PAGE_SHIFT); + limits->atomic_write_hw_max = min(limits->atomic_write_hw_max, + BIO_MAX_VECS << PAGE_SHIFT); } static struct target_type crypt_target = { .name = "crypt", - .version = {1, 28, 0}, + .version = {1, 29, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, - .features = DM_TARGET_ZONED_HM, + .features = DM_TARGET_ZONED_HM | DM_TARGET_ATOMIC_WRITES, .report_zones = crypt_report_zones, .map = crypt_map, .status = crypt_status, From d0ac06ae53be0cdb61f5fe6b62d25d3317c51657 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 20 Oct 2025 14:48:13 +0200 Subject: [PATCH 18/29] dm-bufio: align write boundary on physical block size There may be devices with physical block size larger than 4k. If dm-bufio sends I/O that is not aligned on physical block size, performance is degraded. The 4k minimum alignment limit is there because some SSDs report logical and physical block size 512 despite having 4k internally - so dm-bufio shouldn't send I/Os not aligned on 4k boundary, because they perform badly (the SSD does read-modify-write for them). Signed-off-by: Mikulas Patocka Reported-by: Uladzislau Rezki (Sony) Cc: stable@vger.kernel.org --- drivers/md/dm-bufio.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index e6d28be11c5c..5235f3e4924b 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1374,7 +1374,7 @@ static void submit_io(struct dm_buffer *b, enum req_op op, unsigned short ioprio { unsigned int n_sectors; sector_t sector; - unsigned int offset, end; + unsigned int offset, end, align; b->end_io = end_io; @@ -1388,9 +1388,11 @@ static void submit_io(struct dm_buffer *b, enum req_op op, unsigned short ioprio b->c->write_callback(b); offset = b->write_start; end = b->write_end; - offset &= -DM_BUFIO_WRITE_ALIGN; - end += DM_BUFIO_WRITE_ALIGN - 1; - end &= -DM_BUFIO_WRITE_ALIGN; + align = max(DM_BUFIO_WRITE_ALIGN, + bdev_physical_block_size(b->c->bdev)); + offset &= -align; + end += align - 1; + end &= -align; if (unlikely(end > b->c->block_size)) end = b->c->block_size; From 4efe85b0c442a47d8063fdc8ce5f31e9b33f046d Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Fri, 21 Nov 2025 17:51:04 -0500 Subject: [PATCH 19/29] dm vdo: fix kerneldoc warnings Fix kerneldoc warnings across the dm-vdo target. Also remove some unhelpful or inaccurate doc comments, and fix some format inconsistencies that did not produce warnings. No functional changes. 
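For reference, the kernel-doc form these fixes converge on is the standard one (illustrative comment only, not a hunk from this patch; the function name is made up):

/**
 * example_start_operation() - Attempt to start an operation.
 * @state: The current admin state.
 * @operation: The operation to attempt to start.
 *
 * Each parameter gets an "@name: Description." line matching the function
 * signature, and the return value is documented in a "Return:" section.
 *
 * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not.
 */
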
Suggested-by: Sunday Adelodun Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/action-manager.c | 2 +- drivers/md/dm-vdo/admin-state.c | 75 ++++++++++++++-------- drivers/md/dm-vdo/block-map.c | 51 ++++++++++++--- drivers/md/dm-vdo/completion.c | 5 ++ drivers/md/dm-vdo/data-vio.c | 34 +++++++++- drivers/md/dm-vdo/dedupe.c | 42 ++++++------ drivers/md/dm-vdo/dm-vdo-target.c | 5 +- drivers/md/dm-vdo/encodings.c | 26 +++++++- drivers/md/dm-vdo/flush.c | 6 +- drivers/md/dm-vdo/funnel-workqueue.c | 7 ++ drivers/md/dm-vdo/io-submitter.c | 26 ++++---- drivers/md/dm-vdo/logical-zone.c | 20 ++---- drivers/md/dm-vdo/packer.c | 15 ++--- drivers/md/dm-vdo/physical-zone.c | 5 +- drivers/md/dm-vdo/recovery-journal.c | 30 +++++---- drivers/md/dm-vdo/slab-depot.c | 96 ++++++++++++++++------------ drivers/md/dm-vdo/vdo.c | 9 ++- drivers/md/dm-vdo/vdo.h | 4 +- drivers/md/dm-vdo/vio.c | 3 +- drivers/md/dm-vdo/vio.h | 6 +- 20 files changed, 298 insertions(+), 169 deletions(-) diff --git a/drivers/md/dm-vdo/action-manager.c b/drivers/md/dm-vdo/action-manager.c index a0e5e7077d13..e3bba0b28aad 100644 --- a/drivers/md/dm-vdo/action-manager.c +++ b/drivers/md/dm-vdo/action-manager.c @@ -43,7 +43,7 @@ struct action { * @actions: The two action slots. * @current_action: The current action slot. * @zones: The number of zones in which an action is to be applied. - * @Scheduler: A function to schedule a default next action. + * @scheduler: A function to schedule a default next action. * @get_zone_thread_id: A function to get the id of the thread on which to apply an action to a * zone. * @initiator_thread_id: The ID of the thread on which actions may be initiated. diff --git a/drivers/md/dm-vdo/admin-state.c b/drivers/md/dm-vdo/admin-state.c index 3f9dba525154..da153fef085e 100644 --- a/drivers/md/dm-vdo/admin-state.c +++ b/drivers/md/dm-vdo/admin-state.c @@ -149,7 +149,8 @@ const struct admin_state_code *VDO_ADMIN_STATE_RESUMING = &VDO_CODE_RESUMING; /** * get_next_state() - Determine the state which should be set after a given operation completes * based on the operation and the current state. - * @operation The operation to be started. + * @state: The current admin state. + * @operation: The operation to be started. * * Return: The state to set when the operation completes or NULL if the operation can not be * started in the current state. @@ -187,6 +188,8 @@ static const struct admin_state_code *get_next_state(const struct admin_state *s /** * vdo_finish_operation() - Finish the current operation. + * @state: The current admin state. + * @result: The result of the operation. * * Will notify the operation waiter if there is one. This method should be used for operations * started with vdo_start_operation(). For operations which were started with vdo_start_draining(), @@ -214,8 +217,10 @@ bool vdo_finish_operation(struct admin_state *state, int result) /** * begin_operation() - Begin an operation if it may be started given the current state. - * @waiter A completion to notify when the operation is complete; may be NULL. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The operation to be started. + * @waiter: A completion to notify when the operation is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: VDO_SUCCESS or an error. 
*/ @@ -259,8 +264,10 @@ static int __must_check begin_operation(struct admin_state *state, /** * start_operation() - Start an operation if it may be started given the current state. - * @waiter A completion to notify when the operation is complete. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The operation to be started. + * @waiter: A completion to notify when the operation is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the operation was started. */ @@ -274,10 +281,10 @@ static inline bool __must_check start_operation(struct admin_state *state, /** * check_code() - Check the result of a state validation. - * @valid true if the code is of an appropriate type. - * @code The code which failed to be of the correct type. - * @what What the code failed to be, for logging. - * @waiter The completion to notify of the error; may be NULL. + * @valid: True if the code is of an appropriate type. + * @code: The code which failed to be of the correct type. + * @what: What the code failed to be, for logging. + * @waiter: The completion to notify of the error; may be NULL. * * If the result failed, log an invalid state error and, if there is a waiter, notify it. * @@ -301,7 +308,8 @@ static bool check_code(bool valid, const struct admin_state_code *code, const ch /** * assert_vdo_drain_operation() - Check that an operation is a drain. - * @waiter The completion to finish with an error if the operation is not a drain. + * @operation: The operation to check. + * @waiter: The completion to finish with an error if the operation is not a drain. * * Return: true if the specified operation is a drain. */ @@ -313,9 +321,10 @@ static bool __must_check assert_vdo_drain_operation(const struct admin_state_cod /** * vdo_start_draining() - Initiate a drain operation if the current state permits it. - * @operation The type of drain to initiate. - * @waiter The completion to notify when the drain is complete. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The type of drain to initiate. + * @waiter: The completion to notify when the drain is complete. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the drain was initiated, if not the waiter will be notified. */ @@ -345,6 +354,7 @@ bool vdo_start_draining(struct admin_state *state, /** * vdo_finish_draining() - Finish a drain operation if one was in progress. + * @state: The current admin state. * * Return: true if the state was draining; will notify the waiter if so. */ @@ -355,6 +365,8 @@ bool vdo_finish_draining(struct admin_state *state) /** * vdo_finish_draining_with_result() - Finish a drain operation with a status code. + * @state: The current admin state. + * @result: The result of the drain operation. * * Return: true if the state was draining; will notify the waiter if so. */ @@ -365,7 +377,8 @@ bool vdo_finish_draining_with_result(struct admin_state *state, int result) /** * vdo_assert_load_operation() - Check that an operation is a load. - * @waiter The completion to finish with an error if the operation is not a load. + * @operation: The operation to check. + * @waiter: The completion to finish with an error if the operation is not a load. * * Return: true if the specified operation is a load. 
*/ @@ -377,9 +390,10 @@ bool vdo_assert_load_operation(const struct admin_state_code *operation, /** * vdo_start_loading() - Initiate a load operation if the current state permits it. - * @operation The type of load to initiate. - * @waiter The completion to notify when the load is complete (may be NULL). - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The type of load to initiate. + * @waiter: The completion to notify when the load is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the load was initiated, if not the waiter will be notified. */ @@ -393,6 +407,7 @@ bool vdo_start_loading(struct admin_state *state, /** * vdo_finish_loading() - Finish a load operation if one was in progress. + * @state: The current admin state. * * Return: true if the state was loading; will notify the waiter if so. */ @@ -403,7 +418,8 @@ bool vdo_finish_loading(struct admin_state *state) /** * vdo_finish_loading_with_result() - Finish a load operation with a status code. - * @result The result of the load operation. + * @state: The current admin state. + * @result: The result of the load operation. * * Return: true if the state was loading; will notify the waiter if so. */ @@ -414,7 +430,8 @@ bool vdo_finish_loading_with_result(struct admin_state *state, int result) /** * assert_vdo_resume_operation() - Check whether an admin_state_code is a resume operation. - * @waiter The completion to notify if the operation is not a resume operation; may be NULL. + * @operation: The operation to check. + * @waiter: The completion to notify if the operation is not a resume operation; may be NULL. * * Return: true if the code is a resume operation. */ @@ -427,9 +444,10 @@ static bool __must_check assert_vdo_resume_operation(const struct admin_state_co /** * vdo_start_resuming() - Initiate a resume operation if the current state permits it. - * @operation The type of resume to start. - * @waiter The completion to notify when the resume is complete (may be NULL). - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The type of resume to start. + * @waiter: The completion to notify when the resume is complete; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: true if the resume was initiated, if not the waiter will be notified. */ @@ -443,6 +461,7 @@ bool vdo_start_resuming(struct admin_state *state, /** * vdo_finish_resuming() - Finish a resume operation if one was in progress. + * @state: The current admin state. * * Return: true if the state was resuming; will notify the waiter if so. */ @@ -453,7 +472,8 @@ bool vdo_finish_resuming(struct admin_state *state) /** * vdo_finish_resuming_with_result() - Finish a resume operation with a status code. - * @result The result of the resume operation. + * @state: The current admin state. + * @result: The result of the resume operation. * * Return: true if the state was resuming; will notify the waiter if so. */ @@ -465,6 +485,7 @@ bool vdo_finish_resuming_with_result(struct admin_state *state, int result) /** * vdo_resume_if_quiescent() - Change the state to normal operation if the current state is * quiescent. + * @state: The current admin state. * * Return: VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise. 
*/ @@ -479,6 +500,8 @@ int vdo_resume_if_quiescent(struct admin_state *state) /** * vdo_start_operation() - Attempt to start an operation. + * @state: The current admin state. + * @operation: The operation to attempt to start. * * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not */ @@ -490,8 +513,10 @@ int vdo_start_operation(struct admin_state *state, /** * vdo_start_operation_with_waiter() - Attempt to start an operation. - * @waiter the completion to notify when the operation completes or fails to start; may be NULL. - * @initiator The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. + * @state: The current admin state. + * @operation: The operation to attempt to start. + * @waiter: The completion to notify when the operation completes or fails to start; may be NULL. + * @initiator: The vdo_admin_initiator_fn to call if the operation may begin; may be NULL. * * Return: VDO_SUCCESS if the operation was started, VDO_INVALID_ADMIN_STATE if not */ diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index baf683cabb1b..a7db5b41155e 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -174,6 +174,7 @@ static inline struct vdo_page_completion *page_completion_from_waiter(struct vdo /** * initialize_info() - Initialize all page info structures and put them on the free list. + * @cache: The page cache. * * Return: VDO_SUCCESS or an error. */ @@ -209,6 +210,7 @@ static int initialize_info(struct vdo_page_cache *cache) /** * allocate_cache_components() - Allocate components of the cache which require their own * allocation. + * @cache: The page cache. * * The caller is responsible for all clean up on errors. * @@ -238,6 +240,8 @@ static int __must_check allocate_cache_components(struct vdo_page_cache *cache) /** * assert_on_cache_thread() - Assert that a function has been called on the VDO page cache's * thread. + * @cache: The page cache. + * @function_name: The funtion name to report if the assertion fails. */ static inline void assert_on_cache_thread(struct vdo_page_cache *cache, const char *function_name) @@ -271,6 +275,7 @@ static void report_cache_pressure(struct vdo_page_cache *cache) /** * get_page_state_name() - Return the name of a page state. + * @state: The page state to describe. * * If the page state is invalid a static string is returned and the invalid state is logged. * @@ -342,6 +347,8 @@ static void update_lru(struct page_info *info) /** * set_info_state() - Set the state of a page_info and put it on the right list, adjusting * counters. + * @info: The page info to update. + * @new_state: The new state to set. */ static void set_info_state(struct page_info *info, enum vdo_page_buffer_state new_state) { @@ -416,6 +423,7 @@ static int reset_page_info(struct page_info *info) /** * find_free_page() - Find a free page. + * @cache: The page cache. * * Return: A pointer to the page info structure (if found), NULL otherwise. */ @@ -433,6 +441,7 @@ static struct page_info * __must_check find_free_page(struct vdo_page_cache *cac /** * find_page() - Find the page info (if any) associated with a given pbn. + * @cache: The page cache. * @pbn: The absolute physical block number of the page. * * Return: The page info for the page if available, or NULL if not. @@ -449,6 +458,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache, /** * select_lru_page() - Determine which page is least recently used. + * @cache: The page cache. 
* * Picks the least recently used from among the non-busy entries at the front of each of the lru * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely @@ -523,6 +533,8 @@ static void complete_waiter_with_page(struct vdo_waiter *waiter, void *page_info /** * distribute_page_over_waitq() - Complete a waitq of VDO page completions with a page result. + * @info: The loaded page info. + * @waitq: The list of waiting data_vios. * * Upon completion the waitq will be empty. * @@ -548,7 +560,9 @@ static unsigned int distribute_page_over_waitq(struct page_info *info, /** * set_persistent_error() - Set a persistent error which all requests will receive in the future. + * @cache: The page cache. * @context: A string describing what triggered the error. + * @result: The error result to set on the cache. * * Once triggered, all enqueued completions will get this error. Any future requests will result in * this error as well. @@ -581,6 +595,7 @@ static void set_persistent_error(struct vdo_page_cache *cache, const char *conte /** * validate_completed_page() - Check that a page completion which is being freed to the cache * referred to a valid page and is in a valid state. + * @completion: The page completion to check. * @writable: Whether a writable page is required. * * Return: VDO_SUCCESS if the page was valid, otherwise as error @@ -758,6 +773,8 @@ static void load_cache_page_endio(struct bio *bio) /** * launch_page_load() - Begin the process of loading a page. + * @info: The page info to launch. + * @pbn: The absolute physical block number of the page to load. * * Return: VDO_SUCCESS or an error code. */ @@ -836,6 +853,7 @@ static void save_pages(struct vdo_page_cache *cache) /** * schedule_page_save() - Add a page to the outgoing list of pages waiting to be saved. + * @info: The page info to save. * * Once in the list, a page may not be used until it has been written out. */ @@ -854,6 +872,7 @@ static void schedule_page_save(struct page_info *info) /** * launch_page_save() - Add a page to outgoing pages waiting to be saved, and then start saving * pages if another save is not in progress. + * @info: The page info to save. */ static void launch_page_save(struct page_info *info) { @@ -864,6 +883,7 @@ static void launch_page_save(struct page_info *info) /** * completion_needs_page() - Determine whether a given vdo_page_completion (as a waiter) is * requesting a given page number. + * @waiter: The page completion waiter to check. * @context: A pointer to the pbn of the desired page. * * Implements waiter_match_fn. @@ -880,6 +900,7 @@ static bool completion_needs_page(struct vdo_waiter *waiter, void *context) /** * allocate_free_page() - Allocate a free page to the first completion in the waiting queue, and * any other completions that match it in page number. + * @info: The page info to allocate a page for. */ static void allocate_free_page(struct page_info *info) { @@ -925,6 +946,7 @@ static void allocate_free_page(struct page_info *info) /** * discard_a_page() - Begin the process of discarding a page. + * @cache: The page cache. * * If no page is discardable, increments a count of deferred frees so that the next release of a * page which is no longer busy will kick off another discard cycle. This is an indication that the @@ -955,10 +977,6 @@ static void discard_a_page(struct vdo_page_cache *cache) launch_page_save(info); } -/** - * discard_page_for_completion() - Helper used to trigger a discard so that the completion can get - * a different page. 
- */ static void discard_page_for_completion(struct vdo_page_completion *vdo_page_comp) { struct vdo_page_cache *cache = vdo_page_comp->cache; @@ -1132,6 +1150,7 @@ static void write_pages(struct vdo_completion *flush_completion) /** * vdo_release_page_completion() - Release a VDO Page Completion. + * @completion: The page completion to release. * * The page referenced by this completion (if any) will no longer be held busy by this completion. * If a page becomes discardable and there are completions awaiting free pages then a new round of @@ -1172,10 +1191,6 @@ void vdo_release_page_completion(struct vdo_completion *completion) } } -/** - * load_page_for_completion() - Helper function to load a page as described by a VDO Page - * Completion. - */ static void load_page_for_completion(struct page_info *info, struct vdo_page_completion *vdo_page_comp) { @@ -1319,6 +1334,7 @@ int vdo_get_cached_page(struct vdo_completion *completion, /** * vdo_invalidate_page_cache() - Invalidate all entries in the VDO page cache. + * @cache: The page cache. * * There must not be any dirty pages in the cache. * @@ -1345,6 +1361,10 @@ int vdo_invalidate_page_cache(struct vdo_page_cache *cache) /** * get_tree_page_by_index() - Get the tree page for a given height and page index. + * @forest: The block map forest. + * @root_index: The root index of the tree to search. + * @height: The height in the tree. + * @page_index: The page index. * * Return: The requested page. */ @@ -2211,6 +2231,7 @@ static void allocate_block_map_page(struct block_map_zone *zone, /** * vdo_find_block_map_slot() - Find the block map slot in which the block map entry for a data_vio * resides and cache that result in the data_vio. + * @data_vio: The data vio. * * All ancestors in the tree will be allocated or loaded, as needed. */ @@ -2435,6 +2456,7 @@ static void deforest(struct forest *forest, size_t first_page_segment) /** * make_forest() - Make a collection of trees for a block_map, expanding the existing forest if * there is one. + * @map: The block map. * @entries: The number of entries the block map will hold. * * Return: VDO_SUCCESS or an error. @@ -2476,6 +2498,7 @@ static int make_forest(struct block_map *map, block_count_t entries) /** * replace_forest() - Replace a block_map's forest with the already-prepared larger forest. + * @map: The block map. */ static void replace_forest(struct block_map *map) { @@ -2492,6 +2515,7 @@ static void replace_forest(struct block_map *map) /** * finish_cursor() - Finish the traversal of a single tree. If it was the last cursor, finish the * traversal. + * @cursor: The cursor to complete. */ static void finish_cursor(struct cursor *cursor) { @@ -2549,6 +2573,7 @@ static void traversal_endio(struct bio *bio) /** * traverse() - Traverse a single block map tree. + * @cursor: A cursor tracking traversal progress. * * This is the recursive heart of the traversal process. */ @@ -2619,6 +2644,7 @@ static void traverse(struct cursor *cursor) /** * launch_cursor() - Start traversing a single block map tree now that the cursor has a VIO with * which to load pages. + * @waiter: The parent of the cursor to launch. * @context: The pooled_vio just acquired. * * Implements waiter_callback_fn. @@ -2636,6 +2662,8 @@ static void launch_cursor(struct vdo_waiter *waiter, void *context) /** * compute_boundary() - Compute the number of pages used at each level of the given root's tree. + * @map: The block map. + * @root_index: The tree root index. * * Return: The list of page counts as a boundary structure. 
*/ @@ -2668,6 +2696,7 @@ static struct boundary compute_boundary(struct block_map *map, root_count_t root /** * vdo_traverse_forest() - Walk the entire forest of a block map. + * @map: The block map. * @callback: A function to call with the pbn of each allocated node in the forest. * @completion: The completion to notify on each traversed PBN, and when traversal completes. */ @@ -2707,6 +2736,9 @@ void vdo_traverse_forest(struct block_map *map, vdo_entry_callback_fn callback, /** * initialize_block_map_zone() - Initialize the per-zone portions of the block map. + * @map: The block map. + * @zone_number: The zone to initialize. + * @cache_size: The total block map cache size. * @maximum_age: The number of journal blocks before a dirtied page is considered old and must be * written out. */ @@ -3091,6 +3123,7 @@ static void fetch_mapping_page(struct data_vio *data_vio, bool modifiable, /** * clear_mapped_location() - Clear a data_vio's mapped block location, setting it to be unmapped. + * @data_vio: The data vio. * * This indicates the block map entry for the logical block is either unmapped or corrupted. */ @@ -3104,6 +3137,8 @@ static void clear_mapped_location(struct data_vio *data_vio) /** * set_mapped_location() - Decode and validate a block map entry, and set the mapped location of a * data_vio. + * @data_vio: The data vio. + * @entry: The new mapped entry to set. * * Return: VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid or an error code for any * other failure diff --git a/drivers/md/dm-vdo/completion.c b/drivers/md/dm-vdo/completion.c index 5ad85334632d..2f00acbb3b2b 100644 --- a/drivers/md/dm-vdo/completion.c +++ b/drivers/md/dm-vdo/completion.c @@ -65,6 +65,8 @@ static inline void assert_incomplete(struct vdo_completion *completion) /** * vdo_set_completion_result() - Set the result of a completion. + * @completion: The completion to update. + * @result: The result to set. * * Older errors will not be masked. */ @@ -77,6 +79,7 @@ void vdo_set_completion_result(struct vdo_completion *completion, int result) /** * vdo_launch_completion_with_priority() - Run or enqueue a completion. + * @completion: The completion to launch. * @priority: The priority at which to enqueue the completion. * * If called on the correct thread (i.e. the one specified in the completion's callback_thread_id @@ -125,6 +128,8 @@ void vdo_enqueue_completion(struct vdo_completion *completion, /** * vdo_requeue_completion_if_needed() - Requeue a completion if not called on the specified thread. + * @completion: The completion to requeue. + * @callback_thread_id: The thread on which to requeue the completion. * * Return: True if the completion was requeued; callers may not access the completion in this case. */ diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 262e11581f2d..3333e1e5b02e 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ -227,6 +227,7 @@ static inline u64 get_arrival_time(struct bio *bio) /** * check_for_drain_complete_locked() - Check whether a data_vio_pool has no outstanding data_vios * or waiters while holding the pool's lock. + * @pool: The data_vio pool. */ static bool check_for_drain_complete_locked(struct data_vio_pool *pool) { @@ -387,6 +388,7 @@ struct data_vio_compression_status advance_data_vio_compression_stage(struct dat /** * cancel_data_vio_compression() - Prevent this data_vio from being compressed or packed. + * @data_vio: The data_vio. 
* * Return: true if the data_vio is in the packer and the caller was the first caller to cancel it. */ @@ -483,6 +485,8 @@ static void attempt_logical_block_lock(struct vdo_completion *completion) /** * launch_data_vio() - (Re)initialize a data_vio to have a new logical block number, keeping the * same parent and other state and send it on its way. + * @data_vio: The data_vio to launch. + * @lbn: The logical block number. */ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lbn) { @@ -641,6 +645,7 @@ static void update_limiter(struct limiter *limiter) /** * schedule_releases() - Ensure that release processing is scheduled. + * @pool: The data_vio pool. * * If this call switches the state to processing, enqueue. Otherwise, some other thread has already * done so. @@ -768,6 +773,8 @@ static void initialize_limiter(struct limiter *limiter, struct data_vio_pool *po /** * initialize_data_vio() - Allocate the components of a data_vio. + * @data_vio: The data_vio to initialize. + * @vdo: The vdo containing the data_vio. * * The caller is responsible for cleaning up the data_vio on error. * @@ -880,6 +887,7 @@ int make_data_vio_pool(struct vdo *vdo, data_vio_count_t pool_size, /** * free_data_vio_pool() - Free a data_vio_pool and the data_vios in it. + * @pool: The data_vio pool to free. * * All data_vios must be returned to the pool before calling this function. */ @@ -944,6 +952,8 @@ static void wait_permit(struct limiter *limiter, struct bio *bio) /** * vdo_launch_bio() - Acquire a data_vio from the pool, assign the bio to it, and launch it. + * @pool: The data_vio pool. + * @bio: The bio to launch. * * This will block if data_vios or discard permits are not available. */ @@ -994,6 +1004,7 @@ static void assert_on_vdo_cpu_thread(const struct vdo *vdo, const char *name) /** * drain_data_vio_pool() - Wait asynchronously for all data_vios to be returned to the pool. + * @pool: The data_vio pool. * @completion: The completion to notify when the pool has drained. */ void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion) @@ -1005,6 +1016,7 @@ void drain_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *comp /** * resume_data_vio_pool() - Resume a data_vio pool. + * @pool: The data_vio pool. * @completion: The completion to notify when the pool has resumed. */ void resume_data_vio_pool(struct data_vio_pool *pool, struct vdo_completion *completion) @@ -1024,6 +1036,7 @@ static void dump_limiter(const char *name, struct limiter *limiter) /** * dump_data_vio_pool() - Dump a data_vio pool to the log. + * @pool: The data_vio pool. * @dump_vios: Whether to dump the details of each busy data_vio as well. */ void dump_data_vio_pool(struct data_vio_pool *pool, bool dump_vios) @@ -1114,6 +1127,7 @@ static void perform_cleanup_stage(struct data_vio *data_vio, /** * release_allocated_lock() - Release the PBN lock and/or the reference on the allocated block at * the end of processing a data_vio. + * @completion: The data_vio holding the lock. */ static void release_allocated_lock(struct vdo_completion *completion) { @@ -1194,6 +1208,7 @@ static void transfer_lock(struct data_vio *data_vio, struct lbn_lock *lock) /** * release_logical_lock() - Release the logical block lock and flush generation lock at the end of * processing a data_vio. + * @completion: The data_vio holding the lock. 
*/ static void release_logical_lock(struct vdo_completion *completion) { @@ -1228,6 +1243,7 @@ static void clean_hash_lock(struct vdo_completion *completion) /** * finish_cleanup() - Make some assertions about a data_vio which has finished cleaning up. + * @data_vio: The data_vio. * * If it is part of a multi-block discard, starts on the next block, otherwise, returns it to the * pool. @@ -1342,6 +1358,7 @@ void handle_data_vio_error(struct vdo_completion *completion) /** * get_data_vio_operation_name() - Get the name of the last asynchronous operation performed on a * data_vio. + * @data_vio: The data_vio. */ const char *get_data_vio_operation_name(struct data_vio *data_vio) { @@ -1355,7 +1372,7 @@ const char *get_data_vio_operation_name(struct data_vio *data_vio) /** * data_vio_allocate_data_block() - Allocate a data block. - * + * @data_vio: The data_vio. * @write_lock_type: The type of write lock to obtain on the block. * @callback: The callback which will attempt an allocation in the current zone and continue if it * succeeds. @@ -1379,6 +1396,7 @@ void data_vio_allocate_data_block(struct data_vio *data_vio, /** * release_data_vio_allocation_lock() - Release the PBN lock on a data_vio's allocated block. + * @data_vio: The data_vio. * @reset: If true, the allocation will be reset (i.e. any allocated pbn will be forgotten). * * If the reference to the locked block is still provisional, it will be released as well. @@ -1399,6 +1417,7 @@ void release_data_vio_allocation_lock(struct data_vio *data_vio, bool reset) /** * uncompress_data_vio() - Uncompress the data a data_vio has just read. + * @data_vio: The data_vio. * @mapping_state: The mapping state indicating which fragment to decompress. * @buffer: The buffer to receive the uncompressed data. */ @@ -1519,6 +1538,7 @@ static void complete_zero_read(struct vdo_completion *completion) /** * read_block() - Read a block asynchronously. + * @completion: The data_vio doing the read. * * This is the callback registered in read_block_mapping(). */ @@ -1675,6 +1695,7 @@ static void journal_remapping(struct vdo_completion *completion) /** * read_old_block_mapping() - Get the previous PBN/LBN mapping of an in-progress write. + * @completion: The data_vio doing the read. * * Gets the previous PBN mapped to this LBN from the block map, so as to make an appropriate * journal entry referencing the removal of this LBN->PBN mapping. @@ -1704,6 +1725,7 @@ void update_metadata_for_data_vio_write(struct data_vio *data_vio, struct pbn_lo /** * pack_compressed_data() - Attempt to pack the compressed data_vio into a block. + * @completion: The data_vio. * * This is the callback registered in launch_compress_data_vio(). */ @@ -1725,6 +1747,7 @@ static void pack_compressed_data(struct vdo_completion *completion) /** * compress_data_vio() - Do the actual work of compressing the data on a CPU queue. + * @completion: The data_vio. * * This callback is registered in launch_compress_data_vio(). */ @@ -1754,6 +1777,7 @@ static void compress_data_vio(struct vdo_completion *completion) /** * launch_compress_data_vio() - Continue a write by attempting to compress the data. + * @data_vio: The data_vio. * * This is a re-entry point to vio_write used by hash locks. */ @@ -1796,7 +1820,8 @@ void launch_compress_data_vio(struct data_vio *data_vio) /** * hash_data_vio() - Hash the data in a data_vio and set the hash zone (which also flags the record * name as set). - + * @completion: The data_vio. + * * This callback is registered in prepare_for_dedupe(). 
*/ static void hash_data_vio(struct vdo_completion *completion) @@ -1832,6 +1857,7 @@ static void prepare_for_dedupe(struct data_vio *data_vio) /** * write_bio_finished() - This is the bio_end_io function registered in write_block() to be called * when a data_vio's write to the underlying storage has completed. + * @bio: The bio to update. */ static void write_bio_finished(struct bio *bio) { @@ -1884,6 +1910,7 @@ void write_data_vio(struct data_vio *data_vio) /** * acknowledge_write_callback() - Acknowledge a write to the requestor. + * @completion: The data_vio. * * This callback is registered in allocate_block() and continue_write_with_block_map_slot(). */ @@ -1909,6 +1936,7 @@ static void acknowledge_write_callback(struct vdo_completion *completion) /** * allocate_block() - Attempt to allocate a block in the current allocation zone. + * @completion: The data_vio. * * This callback is registered in continue_write_with_block_map_slot(). */ @@ -1941,6 +1969,7 @@ static void allocate_block(struct vdo_completion *completion) /** * handle_allocation_error() - Handle an error attempting to allocate a block. + * @completion: The data_vio. * * This error handler is registered in continue_write_with_block_map_slot(). */ @@ -1970,6 +1999,7 @@ static int assert_is_discard(struct data_vio *data_vio) /** * continue_data_vio_with_block_map_slot() - Read the data_vio's mapping from the block map. + * @completion: The data_vio to continue. * * This callback is registered in launch_read_data_vio(). */ diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index 4d983092a152..75a26f3f4461 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -917,6 +917,8 @@ static int __must_check acquire_lock(struct hash_zone *zone, /** * enter_forked_lock() - Bind the data_vio to a new hash lock. + * @waiter: The data_vio's waiter link. + * @context: The new hash lock. * * Implements waiter_callback_fn. Binds the data_vio that was waiting to a new hash lock and waits * on that lock. @@ -971,7 +973,7 @@ static void fork_hash_lock(struct hash_lock *old_lock, struct data_vio *new_agen * path. * @lock: The hash lock. * @data_vio: The data_vio to deduplicate using the hash lock. - * @has_claim: true if the data_vio already has claimed an increment from the duplicate lock. + * @has_claim: True if the data_vio already has claimed an increment from the duplicate lock. * * If no increments are available, this will roll over to a new hash lock and launch the data_vio * as the writing agent for that lock. @@ -996,7 +998,7 @@ static void launch_dedupe(struct hash_lock *lock, struct data_vio *data_vio, * true copy of their data on disk. * @lock: The hash lock. * @agent: The data_vio acting as the agent for the lock. - * @agent_is_done: true only if the agent has already written or deduplicated against its data. + * @agent_is_done: True only if the agent has already written or deduplicated against its data. * * If the agent itself needs to deduplicate, an increment for it must already have been claimed * from the duplicate lock, ensuring the hash lock will still have a data_vio holding it. @@ -2146,8 +2148,8 @@ static void start_expiration_timer(struct dedupe_context *context) /** * report_dedupe_timeouts() - Record and eventually report that some dedupe requests reached their * expiration time without getting answers, so we timed them out. - * @zones: the hash zones. - * @timeouts: the number of newly timed out requests. + * @zones: The hash zones. 
+ * @timeouts: The number of newly timed out requests. */ static void report_dedupe_timeouts(struct hash_zones *zones, unsigned int timeouts) { @@ -2509,6 +2511,8 @@ static void initiate_suspend_index(struct admin_state *state) /** * suspend_index() - Suspend the UDS index prior to draining hash zones. + * @context: Not used. + * @completion: The completion for the suspend operation. * * Implements vdo_action_preamble_fn */ @@ -2521,21 +2525,13 @@ static void suspend_index(void *context, struct vdo_completion *completion) initiate_suspend_index); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct hash_zone, state)); } -/** - * drain_hash_zone() - Drain a hash zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void drain_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -2572,6 +2568,8 @@ static void launch_dedupe_state_change(struct hash_zones *zones) /** * resume_index() - Resume the UDS index prior to resuming hash zones. + * @context: Not used. + * @parent: The completion for the resume operation. * * Implements vdo_action_preamble_fn */ @@ -2602,11 +2600,7 @@ static void resume_index(void *context, struct vdo_completion *parent) vdo_finish_completion(parent); } -/** - * resume_hash_zone() - Resume a hash zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void resume_hash_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -2634,7 +2628,7 @@ void vdo_resume_hash_zones(struct hash_zones *zones, struct vdo_completion *pare /** * get_hash_zone_statistics() - Add the statistics for this hash zone to the tally for all zones. * @zone: The hash zone to query. - * @tally: The tally + * @tally: The tally. */ static void get_hash_zone_statistics(const struct hash_zone *zone, struct hash_lock_statistics *tally) @@ -2680,8 +2674,8 @@ static void get_index_statistics(struct hash_zones *zones, /** * vdo_get_dedupe_statistics() - Tally the statistics from all the hash zones and the UDS index. - * @zones: The hash zones to query - * @stats: A structure to store the statistics + * @zones: The hash zones to query. + * @stats: A structure to store the statistics. * * Return: The sum of the hash lock statistics from all hash zones plus the statistics from the UDS * index @@ -2856,9 +2850,9 @@ void vdo_set_dedupe_index_min_timer_interval(unsigned int value) /** * acquire_context() - Acquire a dedupe context from a hash_zone if any are available. - * @zone: the hash zone + * @zone: The hash zone. * - * Return: A dedupe_context or NULL if none are available + * Return: A dedupe_context or NULL if none are available. */ static struct dedupe_context * __must_check acquire_context(struct hash_zone *zone) { diff --git a/drivers/md/dm-vdo/dm-vdo-target.c b/drivers/md/dm-vdo/dm-vdo-target.c index 0e04c2021682..6af40d40f255 100644 --- a/drivers/md/dm-vdo/dm-vdo-target.c +++ b/drivers/md/dm-vdo/dm-vdo-target.c @@ -1144,6 +1144,7 @@ static bool vdo_uses_device(struct vdo *vdo, const void *context) /** * get_thread_id_for_phase() - Get the thread id for the current phase of the admin operation in * progress. + * @vdo: The vdo. 
*/ static thread_id_t __must_check get_thread_id_for_phase(struct vdo *vdo) { @@ -1188,9 +1189,9 @@ static struct vdo_completion *prepare_admin_completion(struct vdo *vdo, /** * advance_phase() - Increment the phase of the current admin operation and prepare the admin * completion to run on the thread for the next phase. - * @vdo: The on which an admin operation is being performed + * @vdo: The vdo on which an admin operation is being performed. * - * Return: The current phase + * Return: The current phase. */ static u32 advance_phase(struct vdo *vdo) { diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index b7cc0f41caca..dd59691be840 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -432,7 +432,10 @@ static void encode_block_map_state_2_0(u8 *buffer, size_t *offset, /** * vdo_compute_new_forest_pages() - Compute the number of pages which must be allocated at each * level in order to grow the forest to a new number of entries. + * @root_count: The number of block map roots. + * @old_sizes: The sizes of the old tree segments. * @entries: The new number of entries the block map must address. + * @new_sizes: The sizes of the new tree segments. * * Return: The total number of non-leaf pages required. */ @@ -462,6 +465,9 @@ block_count_t vdo_compute_new_forest_pages(root_count_t root_count, /** * encode_recovery_journal_state_7_0() - Encode the state of a recovery journal. + * @buffer: A buffer to store the encoding. + * @offset: The offset in the buffer at which to encode. + * @state: The recovery journal state to encode. * * Return: VDO_SUCCESS or an error code. */ @@ -484,6 +490,7 @@ static void encode_recovery_journal_state_7_0(u8 *buffer, size_t *offset, /** * decode_recovery_journal_state_7_0() - Decode the state of a recovery journal saved in a buffer. * @buffer: The buffer containing the saved state. + * @offset: The offset to start decoding from. * @state: A pointer to a recovery journal state to hold the result of a successful decode. * * Return: VDO_SUCCESS or an error code. @@ -544,6 +551,9 @@ const char *vdo_get_journal_operation_name(enum journal_operation operation) /** * encode_slab_depot_state_2_0() - Encode the state of a slab depot into a buffer. + * @buffer: A buffer to store the encoding. + * @offset: The offset in the buffer at which to encode. + * @state: The slab depot state to encode. */ static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, struct slab_depot_state_2_0 state) @@ -570,6 +580,9 @@ static void encode_slab_depot_state_2_0(u8 *buffer, size_t *offset, /** * decode_slab_depot_state_2_0() - Decode slab depot component state version 2.0 from a buffer. + * @buffer: The buffer being decoded. + * @offset: The offset to start decoding from. + * @state: A pointer to a slab depot state to hold the decoded result. * * Return: VDO_SUCCESS or an error code. */ @@ -1156,6 +1169,9 @@ static struct vdo_component unpack_vdo_component_41_0(struct packed_vdo_componen /** * decode_vdo_component() - Decode the component data for the vdo itself out of the super block. + * @buffer: The buffer being decoded. + * @offset: The offset to start decoding from. + * @component: The vdo component structure to decode into. * * Return: VDO_SUCCESS or an error. */ @@ -1290,7 +1306,7 @@ void vdo_destroy_component_states(struct vdo_component_states *states) * understand. * @buffer: The buffer being decoded. * @offset: The offset to start decoding from. - * @geometry: The vdo geometry + * @geometry: The vdo geometry. 
* @states: An object to hold the successfully decoded state. * * Return: VDO_SUCCESS or an error. @@ -1329,7 +1345,7 @@ static int __must_check decode_components(u8 *buffer, size_t *offset, /** * vdo_decode_component_states() - Decode the payload of a super block. * @buffer: The buffer containing the encoded super block contents. - * @geometry: The vdo geometry + * @geometry: The vdo geometry. * @states: A pointer to hold the decoded states. * * Return: VDO_SUCCESS or an error. @@ -1383,6 +1399,9 @@ int vdo_validate_component_states(struct vdo_component_states *states, /** * vdo_encode_component_states() - Encode the state of all vdo components in the super block. + * @buffer: A buffer to store the encoding. + * @offset: The offset into the buffer to start the encoding. + * @states: The component states to encode. */ static void vdo_encode_component_states(u8 *buffer, size_t *offset, const struct vdo_component_states *states) @@ -1402,6 +1421,8 @@ static void vdo_encode_component_states(u8 *buffer, size_t *offset, /** * vdo_encode_super_block() - Encode a super block into its on-disk representation. + * @buffer: A buffer to store the encoding. + * @states: The component states to encode. */ void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states) { @@ -1426,6 +1447,7 @@ void vdo_encode_super_block(u8 *buffer, struct vdo_component_states *states) /** * vdo_decode_super_block() - Decode a super block from its on-disk representation. + * @buffer: The buffer to decode from. */ int vdo_decode_super_block(u8 *buffer) { diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index dd4fdee2ca0c..82a259ef1601 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -522,11 +522,7 @@ static void vdo_complete_flush(struct vdo_flush *flush) vdo_enqueue_completion(completion, BIO_Q_FLUSH_PRIORITY); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct flusher, state)); diff --git a/drivers/md/dm-vdo/funnel-workqueue.c b/drivers/md/dm-vdo/funnel-workqueue.c index 0613c82bbe8e..8a79b33b8b09 100644 --- a/drivers/md/dm-vdo/funnel-workqueue.c +++ b/drivers/md/dm-vdo/funnel-workqueue.c @@ -372,6 +372,13 @@ static int make_simple_work_queue(const char *thread_name_prefix, const char *na /** * vdo_make_work_queue() - Create a work queue; if multiple threads are requested, completions will * be distributed to them in round-robin fashion. + * @thread_name_prefix: A prefix for the thread names to identify them as a vdo thread. + * @name: A base name to identify this queue. + * @owner: The vdo_thread structure to manage this queue. + * @type: The type of queue to create. + * @thread_count: The number of actual threads handling this queue. + * @thread_privates: An array of private contexts, one for each thread; may be NULL. + * @queue_ptr: A pointer to return the new work queue. * * Each queue is associated with a struct vdo_thread which has a single vdo thread id. 
Regardless * of the actual number of queues and threads allocated here, code outside of the queue diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 11d47770b54d..e26d75f8366d 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -118,6 +118,7 @@ static void send_bio_to_device(struct vio *vio, struct bio *bio) /** * vdo_submit_vio() - Submits a vio's bio to the underlying block device. May block if the device * is busy. This callback should be used by vios which did not attempt to merge. + * @completion: The vio to submit. */ void vdo_submit_vio(struct vdo_completion *completion) { @@ -133,7 +134,7 @@ void vdo_submit_vio(struct vdo_completion *completion) * The list will always contain at least one entry (the bio for the vio on which it is called), but * other bios may have been merged with it as well. * - * Return: bio The head of the bio list to submit. + * Return: The head of the bio list to submit. */ static struct bio *get_bio_list(struct vio *vio) { @@ -158,6 +159,7 @@ static struct bio *get_bio_list(struct vio *vio) /** * submit_data_vio() - Submit a data_vio's bio to the storage below along with * any bios that have been merged with it. + * @completion: The vio to submit. * * Context: This call may block and so should only be called from a bio thread. */ @@ -184,7 +186,7 @@ static void submit_data_vio(struct vdo_completion *completion) * There are two types of merging possible, forward and backward, which are distinguished by a flag * that uses kernel elevator terminology. * - * Return: the vio to merge to, NULL if no merging is possible. + * Return: The vio to merge to, NULL if no merging is possible. */ static struct vio *get_mergeable_locked(struct int_map *map, struct vio *vio, bool back_merge) @@ -262,7 +264,7 @@ static int merge_to_next_head(struct int_map *bio_map, struct vio *vio, * * Currently this is only used for data_vios, but is broken out for future use with metadata vios. * - * Return: whether or not the vio was merged. + * Return: Whether or not the vio was merged. */ static bool try_bio_map_merge(struct vio *vio) { @@ -306,7 +308,7 @@ static bool try_bio_map_merge(struct vio *vio) /** * vdo_submit_data_vio() - Submit I/O for a data_vio. - * @data_vio: the data_vio for which to issue I/O. + * @data_vio: The data_vio for which to issue I/O. * * If possible, this I/O will be merged other pending I/Os. Otherwise, the data_vio will be sent to * the appropriate bio zone directly. @@ -321,13 +323,13 @@ void vdo_submit_data_vio(struct data_vio *data_vio) /** * __submit_metadata_vio() - Submit I/O for a metadata vio. - * @vio: the vio for which to issue I/O - * @physical: the physical block number to read or write - * @callback: the bio endio function which will be called after the I/O completes - * @error_handler: the handler for submission or I/O errors (may be NULL) - * @operation: the type of I/O to perform - * @data: the buffer to read or write (may be NULL) - * @size: the I/O amount in bytes + * @vio: The vio for which to issue I/O. + * @physical: The physical block number to read or write. + * @callback: The bio endio function which will be called after the I/O completes. + * @error_handler: The handler for submission or I/O errors; may be NULL. + * @operation: The type of I/O to perform. + * @data: The buffer to read or write; may be NULL. + * @size: The I/O amount in bytes. * * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block * other vdo threads. 
@@ -441,7 +443,7 @@ int vdo_make_io_submitter(unsigned int thread_count, unsigned int rotation_inter /** * vdo_cleanup_io_submitter() - Tear down the io_submitter fields as needed for a physical layer. - * @io_submitter: The I/O submitter data to tear down (may be NULL). + * @io_submitter: The I/O submitter data to tear down; may be NULL. */ void vdo_cleanup_io_submitter(struct io_submitter *io_submitter) { diff --git a/drivers/md/dm-vdo/logical-zone.c b/drivers/md/dm-vdo/logical-zone.c index 026f031ffc9e..0a27e60a9dfd 100644 --- a/drivers/md/dm-vdo/logical-zone.c +++ b/drivers/md/dm-vdo/logical-zone.c @@ -159,21 +159,13 @@ static void check_for_drain_complete(struct logical_zone *zone) vdo_finish_draining(&zone->state); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct logical_zone, state)); } -/** - * drain_logical_zone() - Drain a logical zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void drain_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -192,11 +184,7 @@ void vdo_drain_logical_zones(struct logical_zones *zones, parent); } -/** - * resume_logical_zone() - Resume a logical zone. - * - * Implements vdo_zone_action_fn. - */ +/** Implements vdo_zone_action_fn. */ static void resume_logical_zone(void *context, zone_count_t zone_number, struct vdo_completion *parent) { @@ -356,7 +344,7 @@ struct physical_zone *vdo_get_next_allocation_zone(struct logical_zone *zone) /** * vdo_dump_logical_zone() - Dump information about a logical zone to the log for debugging. - * @zone: The zone to dump + * @zone: The zone to dump. * * Context: the information is dumped in a thread-unsafe fashion. * diff --git a/drivers/md/dm-vdo/packer.c b/drivers/md/dm-vdo/packer.c index f70f5edabc10..666be6d557e1 100644 --- a/drivers/md/dm-vdo/packer.c +++ b/drivers/md/dm-vdo/packer.c @@ -35,10 +35,10 @@ static const struct version_number COMPRESSED_BLOCK_1_0 = { /** * vdo_get_compressed_block_fragment() - Get a reference to a compressed fragment from a compressed * block. - * @mapping_state [in] The mapping state for the look up. - * @compressed_block [in] The compressed block that was read from disk. - * @fragment_offset [out] The offset of the fragment within a compressed block. - * @fragment_size [out] The size of the fragment. + * @mapping_state: The mapping state describing the fragment. + * @block: The compressed block that was read from disk. + * @fragment_offset: The offset of the fragment within the compressed block. + * @fragment_size: The size of the fragment. * * Return: If a valid compressed fragment is found, VDO_SUCCESS; otherwise, VDO_INVALID_FRAGMENT if * the fragment is invalid. @@ -382,6 +382,7 @@ static void initialize_compressed_block(struct compressed_block *block, u16 size * @compression: The agent's compression_state to pack in to. * @data_vio: The data_vio to pack. * @offset: The offset into the compressed block at which to pack the fragment. + * @slot: The slot number in the compressed block. * @block: The compressed block which will be written out when batch is fully packed. * * Return: The new amount of space used. @@ -705,11 +706,7 @@ void vdo_increment_packer_flush_generation(struct packer *packer) vdo_flush_packer(packer); } -/** - * initiate_drain() - Initiate a drain. 
- * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_drain(struct admin_state *state) { struct packer *packer = container_of(state, struct packer, state); diff --git a/drivers/md/dm-vdo/physical-zone.c b/drivers/md/dm-vdo/physical-zone.c index a43b5c45fab7..686eb7d714e6 100644 --- a/drivers/md/dm-vdo/physical-zone.c +++ b/drivers/md/dm-vdo/physical-zone.c @@ -60,7 +60,7 @@ static inline bool has_lock_type(const struct pbn_lock *lock, enum pbn_lock_type * vdo_is_pbn_read_lock() - Check whether a pbn_lock is a read lock. * @lock: The lock to check. * - * Return: true if the lock is a read lock. + * Return: True if the lock is a read lock. */ bool vdo_is_pbn_read_lock(const struct pbn_lock *lock) { @@ -75,6 +75,7 @@ static inline void set_pbn_lock_type(struct pbn_lock *lock, enum pbn_lock_type t /** * vdo_downgrade_pbn_write_lock() - Downgrade a PBN write lock to a PBN read lock. * @lock: The PBN write lock to downgrade. + * @compressed_write: True if the written block was a compressed block. * * The lock holder count is cleared and the caller is responsible for setting the new count. */ @@ -582,7 +583,7 @@ static bool continue_allocating(struct data_vio *data_vio) * that fails try the next if possible. * @data_vio: The data_vio needing an allocation. * - * Return: true if a block was allocated, if not the data_vio will have been dispatched so the + * Return: True if a block was allocated, if not the data_vio will have been dispatched so the * caller must not touch it. */ bool vdo_allocate_block_in_zone(struct data_vio *data_vio) diff --git a/drivers/md/dm-vdo/recovery-journal.c b/drivers/md/dm-vdo/recovery-journal.c index de58184f538f..9cc0f0ff1664 100644 --- a/drivers/md/dm-vdo/recovery-journal.c +++ b/drivers/md/dm-vdo/recovery-journal.c @@ -109,7 +109,7 @@ static atomic_t *get_decrement_counter(struct recovery_journal *journal, * @journal: The recovery journal. * @lock_number: The lock to check. * - * Return: true if the journal zone is locked. + * Return: True if the journal zone is locked. */ static bool is_journal_zone_locked(struct recovery_journal *journal, block_count_t lock_number) @@ -217,7 +217,7 @@ static struct recovery_journal_block * __must_check pop_free_list(struct recover * Indicates it has any uncommitted entries, which includes both entries not written and entries * written but not yet acknowledged. * - * Return: true if the block has any uncommitted entries. + * Return: True if the block has any uncommitted entries. */ static inline bool __must_check is_block_dirty(const struct recovery_journal_block *block) { @@ -228,7 +228,7 @@ static inline bool __must_check is_block_dirty(const struct recovery_journal_blo * is_block_empty() - Check whether a journal block is empty. * @block: The block to check. * - * Return: true if the block has no entries. + * Return: True if the block has no entries. */ static inline bool __must_check is_block_empty(const struct recovery_journal_block *block) { @@ -239,7 +239,7 @@ static inline bool __must_check is_block_empty(const struct recovery_journal_blo * is_block_full() - Check whether a journal block is full. * @block: The block to check. * - * Return: true if the block is full. + * Return: True if the block is full. */ static inline bool __must_check is_block_full(const struct recovery_journal_block *block) { @@ -260,6 +260,8 @@ static void assert_on_journal_thread(struct recovery_journal *journal, /** * continue_waiter() - Release a data_vio from the journal. 
+ * @waiter: The data_vio waiting on journal activity. + * @context: The result of the journal operation. * * Invoked whenever a data_vio is to be released from the journal, either because its entry was * committed to disk, or because there was an error. Implements waiter_callback_fn. @@ -273,7 +275,7 @@ static void continue_waiter(struct vdo_waiter *waiter, void *context) * has_block_waiters() - Check whether the journal has any waiters on any blocks. * @journal: The journal in question. * - * Return: true if any block has a waiter. + * Return: True if any block has a waiter. */ static inline bool has_block_waiters(struct recovery_journal *journal) { @@ -296,7 +298,7 @@ static void notify_commit_waiters(struct recovery_journal *journal); * suspend_lock_counter() - Prevent the lock counter from notifying. * @counter: The counter. * - * Return: true if the lock counter was not notifying and hence the suspend was efficacious. + * Return: True if the lock counter was not notifying and hence the suspend was efficacious. */ static bool suspend_lock_counter(struct lock_counter *counter) { @@ -416,7 +418,7 @@ sequence_number_t vdo_get_recovery_journal_current_sequence_number(struct recove * * The head is the lowest sequence number of the block map head and the slab journal head. * - * Return: the head of the journal. + * Return: The head of the journal. */ static inline sequence_number_t get_recovery_journal_head(const struct recovery_journal *journal) { @@ -535,7 +537,7 @@ static void initialize_journal_state(struct recovery_journal *journal) * vdo_get_recovery_journal_length() - Get the number of usable recovery journal blocks. * @journal_size: The size of the recovery journal in blocks. * - * Return: the number of recovery journal blocks usable for entries. + * Return: The number of recovery journal blocks usable for entries. */ block_count_t vdo_get_recovery_journal_length(block_count_t journal_size) { @@ -1078,6 +1080,8 @@ static void update_usages(struct recovery_journal *journal, struct data_vio *dat /** * assign_entry() - Assign an entry waiter to the active block. + * @waiter: The data_vio. + * @context: The recovery journal block. * * Implements waiter_callback_fn. */ @@ -1165,6 +1169,8 @@ static void recycle_journal_block(struct recovery_journal_block *block) /** * continue_committed_waiter() - invoked whenever a VIO is to be released from the journal because * its entry was committed to disk. + * @waiter: The data_vio waiting on a journal write. + * @context: A pointer to the recovery journal. * * Implements waiter_callback_fn. */ @@ -1362,6 +1368,8 @@ static void add_queued_recovery_entries(struct recovery_journal_block *block) /** * write_block() - Issue a block for writing. + * @waiter: The recovery journal block to write. + * @context: Not used. * * Implements waiter_callback_fn. */ @@ -1611,11 +1619,7 @@ void vdo_release_journal_entry_lock(struct recovery_journal *journal, smp_mb__after_atomic(); } -/** - * initiate_drain() - Initiate a drain. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. 
*/ static void initiate_drain(struct admin_state *state) { check_for_drain_complete(container_of(state, struct recovery_journal, state)); diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index f3d80ff7bef5..034ecaa51f48 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -40,7 +40,7 @@ static const bool NORMAL_OPERATION = true; /** * get_lock() - Get the lock object for a slab journal block by sequence number. - * @journal: vdo_slab journal to retrieve from. + * @journal: The vdo_slab journal to retrieve from. * @sequence_number: Sequence number of the block. * * Return: The lock object for the given sequence number. @@ -110,7 +110,7 @@ static void initialize_journal_state(struct slab_journal *journal) * block_is_full() - Check whether a journal block is full. * @journal: The slab journal for the block. * - * Return: true if the tail block is full. + * Return: True if the tail block is full. */ static bool __must_check block_is_full(struct slab_journal *journal) { @@ -127,10 +127,11 @@ static void release_journal_locks(struct vdo_waiter *waiter, void *context); /** * is_slab_journal_blank() - Check whether a slab's journal is blank. + * @slab: The slab to check. * * A slab journal is blank if it has never had any entries recorded in it. * - * Return: true if the slab's journal has never been modified. + * Return: True if the slab's journal has never been modified. */ static bool is_slab_journal_blank(const struct vdo_slab *slab) { @@ -227,6 +228,7 @@ static u8 __must_check compute_fullness_hint(struct slab_depot *depot, /** * check_summary_drain_complete() - Check whether an allocators summary has finished draining. + * @allocator: The allocator to check. */ static void check_summary_drain_complete(struct block_allocator *allocator) { @@ -349,7 +351,7 @@ static void launch_write(struct slab_summary_block *block) /** * update_slab_summary_entry() - Update the entry for a slab. - * @slab: The slab whose entry is to be updated + * @slab: The slab whose entry is to be updated. * @waiter: The waiter that is updating the summary. * @tail_block_offset: The offset of the slab journal's tail block. * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load. @@ -654,6 +656,7 @@ static void update_tail_block_location(struct slab_journal *journal) /** * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries. + * @slab: The slab to reopen. */ static void reopen_slab_journal(struct vdo_slab *slab) { @@ -839,8 +842,6 @@ static void commit_tail(struct slab_journal *journal) * @sbn: The slab block number of the entry to encode. * @operation: The type of the entry. * @increment: True if this is an increment. - * - * Exposed for unit tests. */ static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header, slab_journal_payload *payload, @@ -951,7 +952,7 @@ static inline block_count_t journal_length(const struct slab_journal *journal) * @parent: The completion to notify when there is space to add the entry if the entry could not be * added immediately. * - * Return: true if the entry was added immediately. + * Return: True if the entry was added immediately. 
*/ bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn, enum journal_operation operation, bool increment, @@ -1003,7 +1004,7 @@ bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t * requires_reaping() - Check whether the journal must be reaped before adding new entries. * @journal: The journal to check. * - * Return: true if the journal must be reaped. + * Return: True if the journal must be reaped. */ static bool requires_reaping(const struct slab_journal *journal) { @@ -1275,6 +1276,8 @@ static void dirty_block(struct reference_block *block) /** * get_reference_block() - Get the reference block that covers the given block index. + * @slab: The slab containing the references. + * @index: The index of the physical block. */ static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab, slab_block_number index) @@ -1379,7 +1382,8 @@ static void prioritize_slab(struct vdo_slab *slab) /** * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab. - * @incremented: true if the free block count went up. + * @slab: The slab. + * @incremented: True if the free block count went up. */ static void adjust_free_block_count(struct vdo_slab *slab, bool incremented) { @@ -1885,6 +1889,7 @@ static void add_entries(struct slab_journal *journal) /** * reset_search_cursor() - Reset the free block search back to the first reference counter in the * first reference block of a slab. + * @slab: The slab. */ static void reset_search_cursor(struct vdo_slab *slab) { @@ -1892,17 +1897,17 @@ static void reset_search_cursor(struct vdo_slab *slab) cursor->block = cursor->first_block; cursor->index = 0; - /* Unit tests have slabs with only one reference block (and it's a runt). */ cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count); } /** * advance_search_cursor() - Advance the search cursor to the start of the next reference block in - * a slab, + * a slab. + * @slab: The slab. * * Wraps around to the first reference block if the current block is the last reference block. * - * Return: true unless the cursor was at the last reference block. + * Return: True unless the cursor was at the last reference block. */ static bool advance_search_cursor(struct vdo_slab *slab) { @@ -1933,6 +1938,9 @@ static bool advance_search_cursor(struct vdo_slab *slab) /** * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild. + * @depot: The slab depot. + * @pbn: The physical block number to adjust. + * @operation: The type of operation. * * Return: VDO_SUCCESS or an error. */ @@ -2038,9 +2046,7 @@ static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr, * @slab: The slab counters to scan. * @index_ptr: A pointer to hold the array index of the free block. * - * Exposed for unit testing. - * - * Return: true if a free block was found in the specified range. + * Return: True if a free block was found in the specified range. */ static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr) { @@ -2097,7 +2103,7 @@ static bool find_free_block(const struct vdo_slab *slab, slab_block_number *inde * @slab: The slab to search. * @free_index_ptr: A pointer to receive the array index of the zero reference count. * - * Return: true if an unreferenced counter was found. + * Return: True if an unreferenced counter was found.
*/ static bool search_current_reference_block(const struct vdo_slab *slab, slab_block_number *free_index_ptr) @@ -2116,7 +2122,7 @@ static bool search_current_reference_block(const struct vdo_slab *slab, * counter index saved in the search cursor and searching up to the end of the last reference * block. The search does not wrap. * - * Return: true if an unreferenced counter was found. + * Return: True if an unreferenced counter was found. */ static bool search_reference_blocks(struct vdo_slab *slab, slab_block_number *free_index_ptr) @@ -2136,6 +2142,8 @@ static bool search_reference_blocks(struct vdo_slab *slab, /** * make_provisional_reference() - Do the bookkeeping for making a provisional reference. + * @slab: The slab. + * @block_number: The index for the physical block to reference. */ static void make_provisional_reference(struct vdo_slab *slab, slab_block_number block_number) @@ -2155,6 +2163,7 @@ static void make_provisional_reference(struct vdo_slab *slab, /** * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty. + * @slab: The slab. */ static void dirty_all_reference_blocks(struct vdo_slab *slab) { @@ -2173,10 +2182,10 @@ static inline bool journal_points_equal(struct journal_point first, /** * match_bytes() - Check an 8-byte word for bytes matching the value specified - * @input: A word to examine the bytes of - * @match: The byte value sought + * @input: A word to examine the bytes of. + * @match: The byte value sought. * - * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise. */ static inline u64 match_bytes(u64 input, u8 match) { @@ -2191,12 +2200,12 @@ static inline u64 match_bytes(u64 input, u8 match) /** * count_valid_references() - Process a newly loaded refcount array - * @counters: the array of counters from a metadata block + * @counters: The array of counters from a metadata block. * - * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't - * cleaned up at shutdown, changing them internally to "empty". + * Scan an 8-byte-aligned array of counters, fixing up any provisional values that + * weren't cleaned up at shutdown, changing them internally to zero. * - * Return: the number of blocks that are referenced (counters not "empty") + * Return: The number of blocks with a non-zero reference count. */ static unsigned int count_valid_references(vdo_refcount_t *counters) { @@ -2351,6 +2360,7 @@ static void load_reference_block_group(struct vdo_waiter *waiter, void *context) /** * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into a * pre-allocated reference counter. + * @slab: The slab. */ static void load_reference_blocks(struct vdo_slab *slab) { @@ -2375,6 +2385,7 @@ static void load_reference_blocks(struct vdo_slab *slab) /** * drain_slab() - Drain all reference count I/O. + * @slab: The slab. * * Depending upon the type of drain being performed (as recorded in the ref_count's vdo_slab), the * reference blocks may be loaded from disk or dirty reference blocks may be written out. @@ -2564,6 +2575,7 @@ static void read_slab_journal_tail(struct vdo_waiter *waiter, void *context) /** * load_slab_journal() - Load a slab's journal by reading the journal's tail. + * @slab: The slab. 
*/ static void load_slab_journal(struct vdo_slab *slab) { @@ -2663,11 +2675,7 @@ static void queue_slab(struct vdo_slab *slab) prioritize_slab(slab); } -/** - * initiate_slab_action() - Initiate a slab action. - * - * Implements vdo_admin_initiator_fn. - */ +/** Implements vdo_admin_initiator_fn. */ static void initiate_slab_action(struct admin_state *state) { struct vdo_slab *slab = container_of(state, struct vdo_slab, state); @@ -2720,7 +2728,7 @@ static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber) * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub. * @scrubber: The scrubber to check. * - * Return: true if the scrubber has slabs to scrub. + * Return: True if the scrubber has slabs to scrub. */ static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber) { @@ -2741,6 +2749,7 @@ static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber) * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because * there's been an error. * @scrubber: The scrubber. + * @result: The result of the scrubbing operation. */ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) { @@ -3132,11 +3141,13 @@ static struct vdo_slab *next_slab(struct slab_iterator *iterator) /** * abort_waiter() - Abort vios waiting to make journal entries when read-only. + * @waiter: A waiting data_vio. + * @context: Not used. * * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone * into read-only mode. Implements waiter_callback_fn. */ -static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused) +static void abort_waiter(struct vdo_waiter *waiter, void __always_unused *context) { struct reference_updater *updater = container_of(waiter, struct reference_updater, waiter); @@ -3536,7 +3547,7 @@ static void initiate_load(struct admin_state *state) /** * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have * been recovered from the recovery journal. - * @completion The allocator completion + * @completion: The allocator completion. */ void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion) { @@ -3775,7 +3786,7 @@ static int initialize_slab_journal(struct vdo_slab *slab) * in the slab. * @allocator: The block allocator to which the slab belongs. * @slab_number: The slab number of the slab. - * @is_new: true if this slab is being allocated as part of a resize. + * @is_new: True if this slab is being allocated as part of a resize. * @slab_ptr: A pointer to receive the new slab. * * Return: VDO_SUCCESS or an error code. @@ -3894,11 +3905,7 @@ void vdo_abandon_new_slabs(struct slab_depot *depot) vdo_free(vdo_forget(depot->new_slabs)); } -/** - * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates. - * - * Implements vdo_zone_thread_getter_fn. - */ +/** Implements vdo_zone_thread_getter_fn. */ static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number) { return ((struct slab_depot *) context)->allocators[zone_number].thread_id; @@ -3911,7 +3918,7 @@ static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_numb * @recovery_lock: The sequence number of the recovery journal block whose locks should be * released. * - * Return: true if the journal does hold a lock on the specified block (which it will release). + * Return: True if the journal released a lock on the specified block. 
*/ static bool __must_check release_recovery_journal_lock(struct slab_journal *journal, sequence_number_t recovery_lock) @@ -3955,6 +3962,8 @@ static void release_tail_block_locks(void *context, zone_count_t zone_number, /** * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks. + * @context: The slab depot. + * @parent: The parent operation. * * Implements vdo_action_preamble_fn. */ @@ -3968,6 +3977,7 @@ static void prepare_for_tail_block_commit(void *context, struct vdo_completion * /** * schedule_tail_block_commit() - Schedule a tail block commit if necessary. + * @context: The slab depot. * * This method should not be called directly. Rather, call vdo_schedule_default_action() on the * depot's action manager. @@ -4361,6 +4371,7 @@ struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot /** * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot. + * @depot: The slab depot. * * Context: This method may be called only before entering normal operation from the load thread. * @@ -4615,7 +4626,9 @@ static void load_summary_endio(struct bio *bio) } /** - * load_slab_summary() - The preamble of a load operation. + * load_slab_summary() - Load the slab summary before the slab data. + * @context: The slab depot. + * @parent: The load operation. * * Implements vdo_action_preamble_fn. */ @@ -4731,7 +4744,7 @@ void vdo_update_slab_depot_size(struct slab_depot *depot) * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to * the given size. * @depot: The depot to prepare to resize. - * @partition: The new depot partition + * @partition: The new depot partition. * * Return: VDO_SUCCESS or an error. */ @@ -4781,6 +4794,7 @@ int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, /** * finish_registration() - Finish registering new slabs now that all of the allocators have * received their new slabs. + * @context: The slab depot. * * Implements vdo_action_conclusion_fn. */ diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index 80b608674022..09fd0628d18c 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -181,6 +181,8 @@ static void assign_thread_ids(struct thread_config *config, /** * initialize_thread_config() - Initialize the thread mapping + * @counts: The number and types of threads to create. + * @config: The thread_config to initialize. * * If the logical, physical, and hash zone counts are all 0, a single thread will be shared by all * three plus the packer and recovery journal. Otherwise, there must be at least one of each type, @@ -884,6 +886,7 @@ const struct admin_state_code *vdo_get_admin_state(const struct vdo *vdo) /** * record_vdo() - Record the state of the VDO for encoding in the super block. + * @vdo: The vdo. */ static void record_vdo(struct vdo *vdo) { @@ -1277,7 +1280,7 @@ void vdo_enter_read_only_mode(struct vdo *vdo, int error_code) * vdo_is_read_only() - Check whether the VDO is read-only. * @vdo: The vdo. * - * Return: true if the vdo is read-only. + * Return: True if the vdo is read-only. * * This method may be called from any thread, as opposed to examining the VDO's state field which * is only safe to check from the admin thread. @@ -1291,7 +1294,7 @@ bool vdo_is_read_only(struct vdo *vdo) * vdo_in_read_only_mode() - Check whether a vdo is in read-only mode. * @vdo: The vdo to query. * - * Return: true if the vdo is in read-only mode. + * Return: True if the vdo is in read-only mode. 
*/ bool vdo_in_read_only_mode(const struct vdo *vdo) { @@ -1302,7 +1305,7 @@ bool vdo_in_read_only_mode(const struct vdo *vdo) * vdo_in_recovery_mode() - Check whether the vdo is in recovery mode. * @vdo: The vdo to query. * - * Return: true if the vdo is in recovery mode. + * Return: True if the vdo is in recovery mode. */ bool vdo_in_recovery_mode(const struct vdo *vdo) { diff --git a/drivers/md/dm-vdo/vdo.h b/drivers/md/dm-vdo/vdo.h index 483ae873e002..1aaba73997b7 100644 --- a/drivers/md/dm-vdo/vdo.h +++ b/drivers/md/dm-vdo/vdo.h @@ -279,8 +279,10 @@ static inline bool vdo_uses_bio_ack_queue(struct vdo *vdo) /** * typedef vdo_filter_fn - Method type for vdo matching methods. + * @vdo: The vdo to match. + * @context: A parameter for the filter to use. * - * A filter function returns false if the vdo doesn't match. + * Return: True if the vdo matches the filter criteria, false if it doesn't. */ typedef bool (*vdo_filter_fn)(struct vdo *vdo, const void *context); diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index 8fc22fb14196..5ffc867d9c5e 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -398,8 +398,9 @@ void free_vio_pool(struct vio_pool *pool) /** * is_vio_pool_busy() - Check whether an vio pool has outstanding entries. + * @pool: The vio pool. * - * Return: true if the pool is busy. + * Return: True if the pool is busy. */ bool is_vio_pool_busy(struct vio_pool *pool) { diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 4bfcb21901f1..7a8a6819aec4 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -156,8 +156,7 @@ static inline enum vdo_completion_priority get_metadata_priority(struct vio *vio /** * continue_vio() - Enqueue a vio to run its next callback. * @vio: The vio to continue. - * - * Return: The result of the current operation. + * @result: The result of the current operation. */ static inline void continue_vio(struct vio *vio, int result) { @@ -172,6 +171,9 @@ void vdo_count_completed_bios(struct bio *bio); /** * continue_vio_after_io() - Continue a vio now that its I/O has returned. + * @vio: The vio to continue. + * @callback: The next operation for this vio. + * @thread: Which thread to run the next operation on. */ static inline void continue_vio_after_io(struct vio *vio, vdo_action_fn callback, thread_id_t thread) From 27f204c215a0f5d91a963a16adb10432fa4ca0e9 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Fri, 21 Nov 2025 18:49:52 -0500 Subject: [PATCH 20/29] dm-mpath: Simplify the setup_scsi_dh code There's no point to the MPATHF_RETAIN_ATTACHED_HW_HANDLER flag any more. The way setup_scsi_dh() worked, if that flag wasn't set, it would attempt to attach any passed in hardware handler. This would always fail if a different hardware handler was attached, which caused setup_scsi_dh() to rerun as if the flag was set. So the code would already retain any attached handler, because attaching a different one would always fail. Also, the code had a bug. If attached_handler_name was NULL but there was a scsi device handler attached (because either scsi_dh_attached_handler_name() failed to allocate a name, or a handler got attached after it was called) the code would loop endlessly. Instead, ignore MPATHF_RETAIN_ATTACHED_HW_HANDLER, and always free the passed in handler if *attached_handler_name is set. This simplifies the code, and avoids the endless loop bug, while keeping the functionality the same.
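For illustration, a hypothetical userspace model of the endless loop described above (attach_handler(), the handler name and the retry bound are stand-ins, not the dm-mpath code; the removed hunk below shows the real loop):

    #include <errno.h>
    #include <stdio.h>

    /* Models scsi_dh_attach() when a different handler is already attached. */
    static int attach_handler(const char *name)
    {
            (void)name;
            return -EBUSY;
    }

    int main(void)
    {
            char *attached_handler_name = NULL;     /* the case that used to spin forever */
            const char *hw_handler_name = "alua";
            int iterations = 0;

    retain:
            if (attached_handler_name)
                    hw_handler_name = attached_handler_name;       /* adopt the attached handler */
            if (hw_handler_name && attach_handler(hw_handler_name) == -EBUSY) {
                    if (++iterations < 5)          /* the old kernel code had no such bound */
                            goto retain;           /* nothing changes, so -EBUSY repeats */
            }
            printf("stopped after %d retries\n", iterations);
            return 0;
    }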
Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-mpath.c | 61 ++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 5dd90b2cdb9b..c18358271618 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -131,7 +131,7 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_QUEUE_IO 0 /* Must we queue all I/O? */ #define MPATHF_QUEUE_IF_NO_PATH 1 /* Queue I/O if last path fails? */ #define MPATHF_SAVED_QUEUE_IF_NO_PATH 2 /* Saved state during suspension */ -#define MPATHF_RETAIN_ATTACHED_HW_HANDLER 3 /* If there's already a hw_handler present, don't change it. */ +/* MPATHF_RETAIN_ATTACHED_HW_HANDLER no longer has any effect */ #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ @@ -237,16 +237,10 @@ static struct multipath *alloc_multipath(struct dm_target *ti) static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) { - if (m->queue_mode == DM_TYPE_NONE) { + if (m->queue_mode == DM_TYPE_NONE) m->queue_mode = DM_TYPE_REQUEST_BASED; - } else if (m->queue_mode == DM_TYPE_BIO_BASED) { + else if (m->queue_mode == DM_TYPE_BIO_BASED) INIT_WORK(&m->process_queued_bios, process_queued_bios); - /* - * bio-based doesn't support any direct scsi_dh management; - * it just discovers if a scsi_dh is attached. - */ - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); - } dm_table_set_type(ti->table, m->queue_mode); @@ -887,36 +881,30 @@ static int setup_scsi_dh(struct block_device *bdev, struct multipath *m, struct request_queue *q = bdev_get_queue(bdev); int r; - if (mpath_double_check_test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, m)) { -retain: - if (*attached_handler_name) { - /* - * Clear any hw_handler_params associated with a - * handler that isn't already attached. - */ - if (m->hw_handler_name && strcmp(*attached_handler_name, m->hw_handler_name)) { - kfree(m->hw_handler_params); - m->hw_handler_params = NULL; - } - - /* - * Reset hw_handler_name to match the attached handler - * - * NB. This modifies the table line to show the actual - * handler instead of the original table passed in. - */ - kfree(m->hw_handler_name); - m->hw_handler_name = *attached_handler_name; - *attached_handler_name = NULL; + if (*attached_handler_name) { + /* + * Clear any hw_handler_params associated with a + * handler that isn't already attached. + */ + if (m->hw_handler_name && strcmp(*attached_handler_name, + m->hw_handler_name)) { + kfree(m->hw_handler_params); + m->hw_handler_params = NULL; } + + /* + * Reset hw_handler_name to match the attached handler + * + * NB. This modifies the table line to show the actual + * handler instead of the original table passed in. 
+ */ + kfree(m->hw_handler_name); + m->hw_handler_name = *attached_handler_name; + *attached_handler_name = NULL; } if (m->hw_handler_name) { r = scsi_dh_attach(q, m->hw_handler_name); - if (r == -EBUSY) { - DMINFO("retaining handler on device %pg", bdev); - goto retain; - } if (r < 0) { *error = "error attaching hardware handler"; return r; @@ -1138,7 +1126,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) } if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { - set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + /* no longer has any effect */ continue; } @@ -1823,7 +1811,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("%u ", test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) + (m->pg_init_retries > 0) * 2 + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + - test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags) + (m->queue_mode != DM_TYPE_REQUEST_BASED) * 2); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) @@ -1832,8 +1819,6 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("pg_init_retries %u ", m->pg_init_retries); if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); - if (test_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags)) - DMEMIT("retain_attached_hw_handler "); if (m->queue_mode != DM_TYPE_REQUEST_BASED) { switch (m->queue_mode) { case DM_TYPE_BIO_BASED: From 20f85a1b1a8f3f5f2a7a215c9cf501ed9093a03a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 26 Nov 2025 16:27:13 +0100 Subject: [PATCH 21/29] MAINTAINERS: add Benjamin Marzinski as a device mapper maintainer Ben will be working with me as a maintainer, so add him to the MAINTAINERS file. Signed-off-by: Mikulas Patocka Acked-by: Mike Snitzer --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 46126ce2f968..a91ddba595a4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7123,6 +7123,7 @@ DEVICE-MAPPER (LVM) M: Alasdair Kergon M: Mike Snitzer M: Mikulas Patocka +M: Benjamin Marzinski L: dm-devel@lists.linux.dev S: Maintained Q: http://patchwork.kernel.org/project/dm-devel/list/ From f4412c7d5a5ab34a6338e15d07fba25185fbb94c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 24 Nov 2025 15:48:03 -0800 Subject: [PATCH 22/29] dm: ignore discard return value __blkdev_issue_discard() always returns 0, making all error checking at call sites dead code. For dm-thin, change the issue_discard() return type to void, drop the assignment of its return value to r in passdown_double_checking_shared_status(), and in end_discard() hardcode r to 0, the only value __blkdev_issue_discard() returns. Reviewed-by: Martin K.
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Mikulas Patocka --- drivers/md/dm-thin.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 6f98936f0e05..52ffb495f5a8 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -395,13 +395,13 @@ static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio * op->bio = NULL; } -static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e) +static void issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e) { struct thin_c *tc = op->tc; sector_t s = block_to_sectors(tc->pool, data_b); sector_t len = block_to_sectors(tc->pool, data_e - data_b); - return __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio); + __blkdev_issue_discard(tc->pool_dev->bdev, s, len, GFP_NOIO, &op->bio); } static void end_discard(struct discard_op *op, int r) @@ -1113,9 +1113,7 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m break; } - r = issue_discard(&op, b, e); - if (r) - goto out; + issue_discard(&op, b, e); b = e; } @@ -1188,8 +1186,8 @@ static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m) struct discard_op op; begin_discard(&op, tc, discard_parent); - r = issue_discard(&op, m->data_block, data_end); - end_discard(&op, r); + issue_discard(&op, m->data_block, data_end); + end_discard(&op, 0); } } From 8581b19eb2c5ccf06c195d3b5468c3c9d17a5020 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 1 Dec 2025 22:13:10 +0100 Subject: [PATCH 23/29] dm-snapshot: fix 'scheduling while atomic' on real-time kernels There is a reported 'scheduling while atomic' bug when using dm-snapshot on real-time kernels. The reason for the bug is that the hlist_bl code does preempt_disable() when taking the lock and the kernel attempts to take other spinlocks while holding the hlist_bl lock. Fix this by converting the hlist_bl lock into a regular spinlock. Signed-off-by: Mikulas Patocka Reported-by: Jiping Ma --- drivers/md/dm-exception-store.h | 2 +- drivers/md/dm-snap.c | 73 +++++++++++++++------------------ 2 files changed, 35 insertions(+), 40 deletions(-) diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h index b67976637538..061b4d310813 100644 --- a/drivers/md/dm-exception-store.h +++ b/drivers/md/dm-exception-store.h @@ -29,7 +29,7 @@ typedef sector_t chunk_t; * chunk within the device. */ struct dm_exception { - struct hlist_bl_node hash_list; + struct hlist_node hash_list; chunk_t old_chunk; chunk_t new_chunk; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index f40c18da4000..dbd148967de4 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -40,10 +40,15 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge"; #define DM_TRACKED_CHUNK_HASH(x) ((unsigned long)(x) & \ (DM_TRACKED_CHUNK_HASH_SIZE - 1)) +struct dm_hlist_head { + struct hlist_head head; + spinlock_t lock; +}; + struct dm_exception_table { uint32_t hash_mask; unsigned int hash_shift; - struct hlist_bl_head *table; + struct dm_hlist_head *table; }; struct dm_snapshot { @@ -628,8 +633,8 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk); /* Lock to protect access to the completed and pending exception hash tables.
*/ struct dm_exception_table_lock { - struct hlist_bl_head *complete_slot; - struct hlist_bl_head *pending_slot; + spinlock_t *complete_slot; + spinlock_t *pending_slot; }; static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, @@ -638,20 +643,20 @@ static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk, struct dm_exception_table *complete = &s->complete; struct dm_exception_table *pending = &s->pending; - lock->complete_slot = &complete->table[exception_hash(complete, chunk)]; - lock->pending_slot = &pending->table[exception_hash(pending, chunk)]; + lock->complete_slot = &complete->table[exception_hash(complete, chunk)].lock; + lock->pending_slot = &pending->table[exception_hash(pending, chunk)].lock; } static void dm_exception_table_lock(struct dm_exception_table_lock *lock) { - hlist_bl_lock(lock->complete_slot); - hlist_bl_lock(lock->pending_slot); + spin_lock_nested(lock->complete_slot, 1); + spin_lock_nested(lock->pending_slot, 2); } static void dm_exception_table_unlock(struct dm_exception_table_lock *lock) { - hlist_bl_unlock(lock->pending_slot); - hlist_bl_unlock(lock->complete_slot); + spin_unlock(lock->pending_slot); + spin_unlock(lock->complete_slot); } static int dm_exception_table_init(struct dm_exception_table *et, @@ -661,13 +666,15 @@ static int dm_exception_table_init(struct dm_exception_table *et, et->hash_shift = hash_shift; et->hash_mask = size - 1; - et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head), + et->table = kvmalloc_array(size, sizeof(struct dm_hlist_head), GFP_KERNEL); if (!et->table) return -ENOMEM; - for (i = 0; i < size; i++) - INIT_HLIST_BL_HEAD(et->table + i); + for (i = 0; i < size; i++) { + INIT_HLIST_HEAD(&et->table[i].head); + spin_lock_init(&et->table[i].lock); + } return 0; } @@ -675,16 +682,17 @@ static int dm_exception_table_init(struct dm_exception_table *et, static void dm_exception_table_exit(struct dm_exception_table *et, struct kmem_cache *mem) { - struct hlist_bl_head *slot; + struct dm_hlist_head *slot; struct dm_exception *ex; - struct hlist_bl_node *pos, *n; + struct hlist_node *pos; int i, size; size = et->hash_mask + 1; for (i = 0; i < size; i++) { slot = et->table + i; - hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) { + hlist_for_each_entry_safe(ex, pos, &slot->head, hash_list) { + hlist_del(&ex->hash_list); kmem_cache_free(mem, ex); cond_resched(); } @@ -700,7 +708,7 @@ static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk) static void dm_remove_exception(struct dm_exception *e) { - hlist_bl_del(&e->hash_list); + hlist_del(&e->hash_list); } /* @@ -710,12 +718,11 @@ static void dm_remove_exception(struct dm_exception *e) static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et, chunk_t chunk) { - struct hlist_bl_head *slot; - struct hlist_bl_node *pos; + struct hlist_head *slot; struct dm_exception *e; - slot = &et->table[exception_hash(et, chunk)]; - hlist_bl_for_each_entry(e, pos, slot, hash_list) + slot = &et->table[exception_hash(et, chunk)].head; + hlist_for_each_entry(e, slot, hash_list) if (chunk >= e->old_chunk && chunk <= e->old_chunk + dm_consecutive_chunk_count(e)) return e; @@ -762,18 +769,17 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe) static void dm_insert_exception(struct dm_exception_table *eh, struct dm_exception *new_e) { - struct hlist_bl_head *l; - struct hlist_bl_node *pos; + struct hlist_head *l; struct dm_exception *e = NULL; - l = &eh->table[exception_hash(eh, 
new_e->old_chunk)]; + l = &eh->table[exception_hash(eh, new_e->old_chunk)].head; /* Add immediately if this table doesn't support consecutive chunks */ if (!eh->hash_shift) goto out; /* List is ordered by old_chunk */ - hlist_bl_for_each_entry(e, pos, l, hash_list) { + hlist_for_each_entry(e, l, hash_list) { /* Insert after an existing chunk? */ if (new_e->old_chunk == (e->old_chunk + dm_consecutive_chunk_count(e) + 1) && @@ -804,13 +810,13 @@ out: * Either the table doesn't support consecutive chunks or slot * l is empty. */ - hlist_bl_add_head(&new_e->hash_list, l); + hlist_add_head(&new_e->hash_list, l); } else if (new_e->old_chunk < e->old_chunk) { /* Add before an existing exception */ - hlist_bl_add_before(&new_e->hash_list, &e->hash_list); + hlist_add_before(&new_e->hash_list, &e->hash_list); } else { /* Add to l's tail: e is the last exception in this slot */ - hlist_bl_add_behind(&new_e->hash_list, &e->hash_list); + hlist_add_behind(&new_e->hash_list, &e->hash_list); } } @@ -820,7 +826,6 @@ out: */ static int dm_add_exception(void *context, chunk_t old, chunk_t new) { - struct dm_exception_table_lock lock; struct dm_snapshot *s = context; struct dm_exception *e; @@ -833,17 +838,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) /* Consecutive_count is implicitly initialised to zero */ e->new_chunk = new; - /* - * Although there is no need to lock access to the exception tables - * here, if we don't then hlist_bl_add_head(), called by - * dm_insert_exception(), will complain about accessing the - * corresponding list without locking it first. - */ - dm_exception_table_lock_init(s, old, &lock); - - dm_exception_table_lock(&lock); dm_insert_exception(&s->complete, e); - dm_exception_table_unlock(&lock); return 0; } @@ -873,7 +868,7 @@ static int calc_max_buckets(void) /* use a fixed size of 2MB */ unsigned long mem = 2 * 1024 * 1024; - mem /= sizeof(struct hlist_bl_head); + mem /= sizeof(struct dm_hlist_head); return mem; } From 2f6cfd6d7cb165a7af8877b838a9f6aab4159324 Mon Sep 17 00:00:00 2001 From: Alexey Simakov Date: Tue, 2 Dec 2025 20:18:38 +0300 Subject: [PATCH 24/29] dm-raid: fix possible NULL dereference with undefined raid type rs->raid_type is assigned from get_raid_type_by_ll(), which may return NULL. This NULL value could be dereferenced later in the condition 'if (!(rs_is_raid10(rs) && rt_is_raid0(rs->raid_type)))'. Add a fail-fast check to return early with an error if raid_type is NULL, similar to other uses of this function. Found by Linux Verification Center (linuxtesting.org) with Svace. 
Fixes: 33e53f06850f ("dm raid: introduce extended superblock and new raid types to support takeover/reshaping") Signed-off-by: Alexey Simakov Signed-off-by: Mikulas Patocka --- drivers/md/dm-raid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c6f7129e43d3..4bacdc499984 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2287,6 +2287,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) mddev->reshape_position = le64_to_cpu(sb->reshape_position); rs->raid_type = get_raid_type_by_ll(mddev->level, mddev->layout); + if (!rs->raid_type) + return -EINVAL; } } else { From ab08f9c8b363297cafaf45475b08f78bf19b88ef Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Mon, 1 Dec 2025 15:41:03 +0800 Subject: [PATCH 25/29] dm log-writes: Add missing set_freezable() for freezable kthread The log_writes_kthread() calls try_to_freeze() but lacks set_freezable(), rendering the freeze attempt ineffective since kernel threads are non-freezable by default. This prevents proper thread suspension during system suspend/hibernate. Add set_freezable() to explicitly mark the thread as freezable. Fixes: 0e9cebe72459 ("dm: add log writes target") Signed-off-by: Haotian Zhang Reviewed-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-log-writes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 7bb7174f8f4f..f0c84e7a5daa 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -432,6 +432,7 @@ static int log_writes_kthread(void *arg) struct log_writes_c *lc = arg; sector_t sector = 0; + set_freezable(); while (!kthread_should_stop()) { bool super = false; bool logging_enabled; From 7799eaecfeb756664be37c079520af67d5d64f70 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 3 Dec 2025 17:19:42 +0100 Subject: [PATCH 26/29] dm raid: add documentation for takeover/reshape raid1 -> raid5 table line examples Also enhance possible takeover/reshape information and do some reformatting. 
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mikulas Patocka --- .../admin-guide/device-mapper/dm-raid.rst | 80 +++++++++++++++++-- 1 file changed, 73 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst index bb17e26e3c1b..e11f10764770 100644 --- a/Documentation/admin-guide/device-mapper/dm-raid.rst +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -20,10 +20,10 @@ The target is named "raid" and it accepts the following parameters:: raid0 RAID0 striping (no resilience) raid1 RAID1 mirroring raid4 RAID4 with dedicated last parity disk - raid5_n RAID5 with dedicated last parity disk supporting takeover + raid5_n RAID5 with dedicated last parity disk supporting takeover from/to raid1 Same as raid4 - - Transitory layout + - Transitory layout for takeover from/to raid1 raid5_la RAID5 left asymmetric - rotating parity 0 with data continuation @@ -48,8 +48,8 @@ The target is named "raid" and it accepts the following parameters:: raid6_n_6 RAID6 with dedicate parity disks - parity and Q-syndrome on the last 2 disks; - layout for takeover from/to raid4/raid5_n - raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk + layout for takeover from/to raid0/raid4/raid5_n + raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk supporting takeover from/to raid5 - layout for takeover from raid5_la from/to raid6 raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk @@ -173,9 +173,9 @@ The target is named "raid" and it accepts the following parameters:: The delta_disks option value (-251 < N < +251) triggers device removal (negative value) or device addition (positive value) to any reshape supporting raid levels 4/5/6 and 10. - RAID levels 4/5/6 allow for addition of devices (metadata - and data device tuple), raid10_near and raid10_offset only - allow for device addition. raid10_far does not support any + RAID levels 4/5/6 allow for addition and removal of devices + (metadata and data device tuple), raid10_near and raid10_offset + only allow for device addition. raid10_far does not support any reshaping at all. A minimum of devices have to be kept to enforce resilience, which is 3 devices for raid4/5 and 4 devices for raid6. @@ -372,6 +372,72 @@ to safely enable discard support for RAID 4/5/6: 'devices_handle_discards_safely' +Takeover/Reshape Support +------------------------ +The target natively supports these two types of MDRAID conversions: + +o Takeover: Converts an array from one RAID level to another + +o Reshape: Changes the internal layout while maintaining the current RAID level + +Each operation is only valid under specific constraints imposed by the existing array's layout and configuration. 
+ + +Takeover: +linear -> raid1 with N >= 2 mirrors +raid0 -> raid4 (add dedicated parity device) +raid0 -> raid5 (add dedicated parity device) +raid0 -> raid10 with near layout and N >= 2 mirror groups (raid0 stripes have to become first member within mirror groups) +raid1 -> linear +raid1 -> raid5 with 2 mirrors +raid4 -> raid5 w/ rotating parity +raid5 with dedicated parity device -> raid4 +raid5 -> raid6 (with dedicated Q-syndrome) +raid6 (with dedicated Q-syndrome) -> raid5 +raid10 with near layout and even number of disks -> raid0 (select any in-sync device from each mirror group) + +Reshape: +linear: not possible +raid0: not possible +raid1: change number of mirrors +raid4: add and remove stripes (minimum 3), change stripesize +raid5: add and remove stripes (minimum 3, special case 2 for raid1 takeover), change rotating parity algorithms, change stripesize +raid6: add and remove stripes (minimum 4), change rotating syndrome algorithms, change stripesize +raid10 near: add stripes (minimum 4), change stripesize, no stripe removal possible, change to offset layout +raid10 offset: add stripes, change stripesize, no stripe removal possible, change to near layout +raid10 far: not possible + +Table line examples: + +### raid1 -> raid5 +# +# 2 devices limitation in raid1. +# raid5 personality is able to just map 2 like raid1. +# Reshape after takeover to change to full raid5 layout + + 0 1960886272 raid raid1 3 0 region_size 2048 2 /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 + +# dm-0 and dm-2 are e.g. 4MiB large metadata devices, dm-1 and dm-3 have to be at least 1960886272 big. +# +# Table line to takeover to raid5 + + 0 1960886272 raid raid5 3 0 region_size 2048 2 /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 + +# Add required out-of-place reshape space to the beginning of the given 2 data devices, +# allocate another metadata/data device tuple with the same sizes for the parity space +# and zero the first 4K of the metadata device. +# +# Example table of the out-of-place reshape space addition for one data device, e.g. dm-1 + + 0 8192 linear 8:0 0 1960903888 # <- must be free space segment + 8192 1960886272 linear 8:0 0 2048 # previous data segment + +# Mapping table for e.g. raid5_rs reshape causing the size of the raid device to double once the reshape finishes. +# Check the status output (e.g. "dmsetup status $RaidDev") for progress. + + 0 $((2 * 1960886272)) raid raid5 7 0 region_size 2048 data_offset 8192 delta_disks 1 2 /dev/dm-0 /dev/dm-1 /dev/dm-2 /dev/dm-3 + + Version History --------------- From ebbb90344a7da2421e4b54668b94e81828b8b308 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Fri, 5 Dec 2025 05:46:18 +0000 Subject: [PATCH 27/29] dm-pcache: advance slot index before writing slot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In dm-pcache, in order to ensure crash-consistency, a dual-copy scheme is used to alternately update metadata, and there is a slot index that records the current slot. However, in the write path the current implementation writes directly to the current slot indexed by slot index, and then advances the slot, which ends up overwriting the existing slot and violating the crash-consistency guarantee. This patch fixes that behavior, preventing metadata from being overwritten incorrectly. In addition, this patch adds a missing pmem_wmb() after memcpy_flushcache().
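As a rough userspace sketch of the dual-copy rule this fix restores (the slot layout and names are illustrative, not the actual pcache structures): always write into the slot that is not currently relied upon, and only then publish the new index, so a crash in the middle of the write leaves the previous copy intact.

    #include <stdio.h>
    #include <string.h>

    #define NR_SLOTS 2
    #define SLOT_SIZE 4096

    struct meta_slots {
            unsigned char data[NR_SLOTS][SLOT_SIZE];
            unsigned int cur;       /* last slot written successfully */
    };

    /* Write new metadata into the slot we are NOT relying on, then switch. */
    static void meta_write(struct meta_slots *m, const void *buf, size_t len)
    {
            unsigned int next = (m->cur + 1) % NR_SLOTS;

            memcpy(m->data[next], buf, len);        /* stand-in for memcpy_flushcache() + pmem_wmb() */
            m->cur = next;                          /* the old copy stays valid until this point */
    }

    int main(void)
    {
            struct meta_slots m = { .cur = 0 };
            const char payload[] = "new metadata";

            meta_write(&m, payload, sizeof(payload));
            printf("current slot: %u\n", m.cur);
            return 0;
    }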
Signed-off-by: Dongsheng Yang Signed-off-by: Mikulas Patocka Reviewed-by: Zheng Gu Cc: stable@vger.kernel.org # 6.18 --- drivers/md/dm-pcache/cache.c | 8 ++++---- drivers/md/dm-pcache/cache_segment.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c index d8e92367d947..d93aeb67c492 100644 --- a/drivers/md/dm-pcache/cache.c +++ b/drivers/md/dm-pcache/cache.c @@ -21,10 +21,10 @@ static void cache_info_write(struct pcache_cache *cache) cache_info->header.crc = pcache_meta_crc(&cache_info->header, sizeof(struct pcache_cache_info)); + cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; memcpy_flushcache(get_cache_info_addr(cache), cache_info, sizeof(struct pcache_cache_info)); - - cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX; + pmem_wmb(); } static void cache_info_init_default(struct pcache_cache *cache); @@ -93,10 +93,10 @@ void cache_pos_encode(struct pcache_cache *cache, pos_onmedia.header.seq = seq; pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia); + *index = (*index + 1) % PCACHE_META_INDEX_MAX; + memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia)); pmem_wmb(); - - *index = (*index + 1) % PCACHE_META_INDEX_MAX; } int cache_pos_decode(struct pcache_cache *cache, diff --git a/drivers/md/dm-pcache/cache_segment.c b/drivers/md/dm-pcache/cache_segment.c index f0b58980806e..ae57cc261422 100644 --- a/drivers/md/dm-pcache/cache_segment.c +++ b/drivers/md/dm-pcache/cache_segment.c @@ -26,11 +26,11 @@ static void cache_seg_info_write(struct pcache_cache_segment *cache_seg) seg_info->header.seq++; seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info)); + cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; + seg_info_addr = get_seg_info_addr(cache_seg); memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info)); pmem_wmb(); - - cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX; mutex_unlock(&cache_seg->info_lock); } @@ -129,10 +129,10 @@ static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg) cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header, sizeof(struct pcache_cache_seg_gen)); + cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; + memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen)); pmem_wmb(); - - cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX; } static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg) From ee7633178321f5d983db3adfdea9322456cfdaaa Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 5 Dec 2025 05:46:19 +0000 Subject: [PATCH 28/29] dm pcache: fix cache info indexing The on-media cache_info index used sizeof(struct) instead of the 4K metadata stride, so gc_percent updates from dmsetup message were written between slots and lost after reboot. Use PCACHE_CACHE_INFO_SIZE in get_cache_info_addr() and align info_index with the slot returned by pcache_meta_find_latest(). 
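A minimal sketch of the addressing rule that this fix and the following one apply (SLOT_STRIDE stands in for PCACHE_CACHE_INFO_SIZE / PCACHE_SEG_INFO_SIZE, and the 4K value is an assumption): slot addresses and indices must be derived from the on-media stride, not from sizeof() of the structure stored in the slot.

    #include <stddef.h>
    #include <stdio.h>

    #define SLOT_STRIDE 4096        /* on-media stride; larger than the struct it holds */

    struct info {                   /* much smaller than the stride */
            unsigned int seq;
            unsigned int crc;
    };

    static void *slot_addr(void *base, unsigned int index)
    {
            return (char *)base + (size_t)index * SLOT_STRIDE;
    }

    static unsigned int slot_index(const void *base, const void *slot)
    {
            return (unsigned int)(((const char *)slot - (const char *)base) / SLOT_STRIDE);
    }

    int main(void)
    {
            static unsigned char media[2 * SLOT_STRIDE];    /* two metadata copies */
            void *second = slot_addr(media, 1);

            /* Dividing by sizeof(struct info) here would point between slots. */
            printf("index of second slot: %u\n", slot_index(media, second));
            return 0;
    }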
Signed-off-by: Li Chen Signed-off-by: Dongsheng Yang Signed-off-by: Mikulas Patocka Reviewed-by: Zheng Gu Cc: stable@vger.kernel.org # 6.18 --- drivers/md/dm-pcache/cache.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-pcache/cache.c b/drivers/md/dm-pcache/cache.c index d93aeb67c492..18b00f6f4fd3 100644 --- a/drivers/md/dm-pcache/cache.c +++ b/drivers/md/dm-pcache/cache.c @@ -10,7 +10,8 @@ struct kmem_cache *key_cache; static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache) { - return cache->cache_info_addr + cache->info_index; + return (struct pcache_cache_info *)((char *)cache->cache_info_addr + + (size_t)cache->info_index * PCACHE_CACHE_INFO_SIZE); } static void cache_info_write(struct pcache_cache *cache) @@ -49,6 +50,8 @@ static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_optio return -EINVAL; } + cache->info_index = ((char *)cache_info_addr - (char *)cache->cache_info_addr) / PCACHE_CACHE_INFO_SIZE; + return 0; } From 13ea55ea20176736516b20b9ea2d8cf97dbe74f5 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Fri, 5 Dec 2025 05:46:20 +0000 Subject: [PATCH 29/29] dm pcache: fix segment info indexing Segment info indexing also used sizeof(struct) instead of the 4K metadata stride, so info_index could point between slots and subsequent writes would advance incorrectly. Derive info_index from the pointer returned by the segment meta search using PCACHE_SEG_INFO_SIZE and advance to the next slot for future updates. Signed-off-by: Li Chen Signed-off-by: Dongsheng Yang Signed-off-by: Mikulas Patocka Reviewed-by: Zheng Gu Cc: stable@vger.kernel.org # 6.18 --- drivers/md/dm-pcache/cache_segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-pcache/cache_segment.c b/drivers/md/dm-pcache/cache_segment.c index ae57cc261422..9d92e2b067ed 100644 --- a/drivers/md/dm-pcache/cache_segment.c +++ b/drivers/md/dm-pcache/cache_segment.c @@ -56,7 +56,10 @@ static int cache_seg_info_load(struct pcache_cache_segment *cache_seg) ret = -EIO; goto out; } - cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base; + + cache_seg->info_index = + ((char *)cache_seg_info_addr - (char *)cache_seg_info_addr_base) / + PCACHE_SEG_INFO_SIZE; out: mutex_unlock(&cache_seg->info_lock);