From 93a358af59c6e8ab00b57cfdb1c437516a4948ca Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 13 Oct 2025 12:28:02 -0700 Subject: [PATCH 001/162] block/mq-deadline: Introduce dd_start_request() Prepare for adding a second caller of this function. No functionality has been changed. Cc: Damien Le Moal Cc: Yu Kuai Cc: chengkaitao Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/mq-deadline.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 3e741d33142d..647a45f6d935 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -306,6 +306,19 @@ static bool started_after(struct deadline_data *dd, struct request *rq, return time_after(start_time, latest_start); } +static struct request *dd_start_request(struct deadline_data *dd, + enum dd_data_dir data_dir, + struct request *rq) +{ + u8 ioprio_class = dd_rq_ioclass(rq); + enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + + dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); + dd->per_prio[prio].stats.dispatched++; + rq->rq_flags |= RQF_STARTED; + return rq; +} + /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc and with a start time <= @latest_start. */ @@ -316,8 +329,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, { struct request *rq, *next_rq; enum dd_data_dir data_dir; - enum dd_prio prio; - u8 ioprio_class; lockdep_assert_held(&dd->lock); @@ -411,12 +422,7 @@ dispatch_request: dd->batching++; deadline_move_request(dd, per_prio, rq); done: - ioprio_class = dd_rq_ioclass(rq); - prio = ioprio_class_to_prio[ioprio_class]; - dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); - dd->per_prio[prio].stats.dispatched++; - rq->rq_flags |= RQF_STARTED; - return rq; + return dd_start_request(dd, data_dir, rq); } /* From d60055cf52703a705b86fb25b9b7931ec7ee399c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 13 Oct 2025 12:28:03 -0700 Subject: [PATCH 002/162] block/mq-deadline: Switch back to a single dispatch list Commit c807ab520fc3 ("block/mq-deadline: Add I/O priority support") modified the behavior of the insert flag BLK_MQ_INSERT_AT_HEAD from dispatching a request before all other requests to dispatching it before other requests with the same I/O priority. This is not correct since BLK_MQ_INSERT_AT_HEAD is used when requeuing requests and also when a flush request is inserted. Both types of requests should be dispatched as soon as possible. Hence, make the mq-deadline I/O scheduler again ignore the I/O priority for BLK_MQ_INSERT_AT_HEAD requests. Cc: Damien Le Moal Cc: Yu Kuai Reported-by: chengkaitao Closes: https://lore.kernel.org/linux-block/20251009155253.14611-1-pilgrimtao@gmail.com/ Fixes: c807ab520fc3 ("block/mq-deadline: Add I/O priority support") Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/mq-deadline.c | 107 +++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 60 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 647a45f6d935..3e3719093aec 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -71,7 +71,6 @@ struct io_stats_per_prio { * present on both sort_list[] and fifo_list[]. */ struct dd_per_prio { - struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; /* Position of the most recently dispatched request. 
*/ @@ -84,6 +83,7 @@ struct deadline_data { * run time data */ + struct list_head dispatch; struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ @@ -332,16 +332,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, lockdep_assert_held(&dd->lock); - if (!list_empty(&per_prio->dispatch)) { - rq = list_first_entry(&per_prio->dispatch, struct request, - queuelist); - if (started_after(dd, rq, latest_start)) - return NULL; - list_del_init(&rq->queuelist); - data_dir = rq_data_dir(rq); - goto done; - } - /* * batches are currently reads XOR writes */ @@ -421,7 +411,6 @@ dispatch_request: */ dd->batching++; deadline_move_request(dd, per_prio, rq); -done: return dd_start_request(dd, data_dir, rq); } @@ -469,6 +458,14 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) enum dd_prio prio; spin_lock(&dd->lock); + + if (!list_empty(&dd->dispatch)) { + rq = list_first_entry(&dd->dispatch, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_start_request(dd, rq_data_dir(rq), rq); + goto unlock; + } + rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; @@ -557,10 +554,10 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) eq->elevator_data = dd; + INIT_LIST_HEAD(&dd->dispatch); for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; - INIT_LIST_HEAD(&per_prio->dispatch); INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); per_prio->sort_list[DD_READ] = RB_ROOT; @@ -664,7 +661,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); if (flags & BLK_MQ_INSERT_AT_HEAD) { - list_add(&rq->queuelist, &per_prio->dispatch); + list_add(&rq->queuelist, &dd->dispatch); rq->fifo_time = jiffies; } else { deadline_add_rq_rb(per_prio, rq); @@ -731,8 +728,7 @@ static void dd_finish_request(struct request *rq) static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) { - return !list_empty_careful(&per_prio->dispatch) || - !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + return !list_empty_careful(&per_prio->fifo_list[DD_READ]) || !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); } @@ -741,6 +737,9 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; + if (!list_empty_careful(&dd->dispatch)) + return true; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; @@ -949,49 +948,39 @@ static int dd_owned_by_driver_show(void *data, struct seq_file *m) return 0; } -#define DEADLINE_DISPATCH_ATTR(prio) \ -static void *deadline_dispatch##prio##_start(struct seq_file *m, \ - loff_t *pos) \ - __acquires(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - spin_lock(&dd->lock); \ - return seq_list_start(&per_prio->dispatch, *pos); \ -} \ - \ -static void *deadline_dispatch##prio##_next(struct seq_file *m, \ - void *v, loff_t *pos) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - return seq_list_next(v, &per_prio->dispatch, pos); \ -} \ - \ -static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ - __releases(&dd->lock) \ -{ \ - struct 
request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - \ - spin_unlock(&dd->lock); \ -} \ - \ -static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ - .start = deadline_dispatch##prio##_start, \ - .next = deadline_dispatch##prio##_next, \ - .stop = deadline_dispatch##prio##_stop, \ - .show = blk_mq_debugfs_rq_show, \ +static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) + __acquires(&dd->lock) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + return seq_list_start(&dd->dispatch, *pos); } -DEADLINE_DISPATCH_ATTR(0); -DEADLINE_DISPATCH_ATTR(1); -DEADLINE_DISPATCH_ATTR(2); -#undef DEADLINE_DISPATCH_ATTR +static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + return seq_list_next(v, &dd->dispatch, pos); +} + +static void deadline_dispatch_stop(struct seq_file *m, void *v) + __releases(&dd->lock) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_unlock(&dd->lock); +} + +static const struct seq_operations deadline_dispatch_seq_ops = { + .start = deadline_dispatch_start, + .next = deadline_dispatch_next, + .stop = deadline_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, \ @@ -1014,9 +1003,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, - {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, - {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, - {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, {}, From e5a82249d88c7063c4ac998704b0ae5784013976 Mon Sep 17 00:00:00 2001 From: Mehdi Ben Hadj Khelifa Date: Sat, 18 Oct 2025 22:38:13 +0100 Subject: [PATCH 003/162] blk-mq: use struct_size() in kmalloc() Change the struct size calculation to use struct_size(), aligning with the recommended practice [1], which says: "Another common case to avoid is calculating the size of a structure with a trailing array of others structures, as in: header = kzalloc(sizeof(*header) + count * sizeof(*header->item), GFP_KERNEL); Instead, use the helper: header = kzalloc(struct_size(header, item, count), GFP_KERNEL);" Signed-off-by: Mehdi Ben Hadj Khelifa Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e0bed16485c3..97b69fbe26f6 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -466,8 +466,7 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, else nr_tags = nr_hw_queues; - et = kmalloc(sizeof(struct elevator_tags) + - nr_tags * sizeof(struct blk_mq_tags *), gfp); + et = kmalloc(struct_size(et, tags, nr_tags), gfp); if (!et) return NULL; From 5c5028ee594ce5f907ca6ad1c32cca6a15098464 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Oct 2025 13:47:15 -0700 Subject: [PATCH 004/162] block: rename min_segment_size Despite its name, the block layer is fine with segments smaller than the "min_segment_size" limit. 
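To make the fast path concrete, here is a minimal hand-written sketch (an illustration, not code from this patch) of the check that this limit enables on a single-vector bio:

	/*
	 * Illustrative only: a one-vector bio whose offset plus length
	 * stays at or below the limit cannot straddle a segment
	 * boundary, so splitting and per-segment checks can be skipped.
	 * Compare the bio_may_need_split() change in the diff below.
	 */
	static inline bool bio_takes_fast_path(const struct bio *bio,
					       unsigned int limit)
	{
		return bio->bi_vcnt == 1 &&
		       bio->bi_io_vec->bv_offset + bio->bi_io_vec->bv_len <= limit;
	}
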
The value is an optimization limit indicating the largest segment that can be used without considering boundary limits. Smaller segments can take a fast path, so give it a name that reflects that: max_fast_segment_size. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-settings.c | 4 ++-- block/blk.h | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 37864c5d287e..c47d18587a0b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -336,7 +336,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= lim->min_segment_size) { + bv.bv_offset + bv.bv_len <= lim->max_fast_segment_size) { nsegs++; bytes += bv.bv_len; } else { diff --git a/block/blk-settings.c b/block/blk-settings.c index 54cffaae4df4..345b6a271cc3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -457,12 +457,12 @@ int blk_validate_limits(struct queue_limits *lim) return -EINVAL; } - /* setup min segment size for building new segment in fast path */ + /* setup max segment size for building new segment in fast path */ if (lim->seg_boundary_mask > lim->max_segment_size - 1) seg_size = lim->max_segment_size; else seg_size = lim->seg_boundary_mask + 1; - lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + lim->max_fast_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); /* * We require drivers to at least do logical block aligned I/O, but diff --git a/block/blk.h b/block/blk.h index 170794632135..32a10024efba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -377,7 +377,7 @@ static inline bool bio_may_need_split(struct bio *bio, if (bio->bi_vcnt != 1) return true; return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > - lim->min_segment_size; + lim->max_fast_segment_size; } /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 70b671a9a7f7..99be263b31ab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -378,7 +378,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; - unsigned int min_segment_size; + unsigned int max_fast_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; From a65988a0ad047dbdb8a1eb6f07540b980858b522 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:00 +0200 Subject: [PATCH 005/162] blktrace: only calculate trace length once De-duplicate the calculation of the trace length instead of doing the calculation twice, once for calling trace_buffer_lock_reserve() and once for calling relay_reserve(). Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6941145b5058..bc4b885f2cec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -76,13 +76,14 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; + size_t trace_len; + trace_len = sizeof(*t) + cgid_len + len; if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -92,7 +93,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); + t = relay_reserve(bt->rchan, trace_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); @@ -228,6 +229,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; + size_t trace_len; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -250,14 +252,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); + trace_len = sizeof(*t) + pdu_len + cgid_len; if (blk_tracer) { tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -273,7 +275,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); + t = relay_reserve(bt->rchan, trace_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); From 472eca538358fc8a56884a8adb0cc6047bf05cf3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:01 +0200 Subject: [PATCH 006/162] blktrace: factor out recording a blktrace event Factor out the recording of a blktrace event into its own function, deduplicating the code. This also enables recording different versions of the blktrace protocol later on. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 85 +++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 38 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc4b885f2cec..25a0a1b09747 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -63,6 +63,34 @@ static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); +static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, + sector_t sector, int bytes, u32 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) + +{ + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->device = dev; + t->error = error; + t->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); +} + /* * Send out a notify message. 
*/ @@ -87,7 +115,12 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + record_blktrace_event(t, pid, cpu, 0, 0, + action | (cgid ? __BLK_TN_CGROUP : 0), + bt->dev, 0, cgid, cgid_len, (void *)data, + len); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (!bt->rchan) @@ -97,18 +130,11 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, t = relay_reserve(bt->rchan, trace_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); -record_it: - t->device = bt->dev; - t->action = action | (cgid ? __BLK_TN_CGROUP : 0); - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len + cgid_len; - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - memcpy((void *) t + sizeof(*t) + cgid_len, data, len); - if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + record_blktrace_event(t, pid, cpu, 0, 0, + action | (cgid ? __BLK_TN_CGROUP : 0), + bt->dev, 0, cgid, cgid_len, (void *)data, + len); } } @@ -263,7 +289,12 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + + record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, + error, cgid, cgid_len, pdu_data, pdu_len); + + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (unlikely(tsk->btrace_seq != blktrace_seq)) @@ -282,32 +313,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len + cgid_len; - - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - if (pdu_len) - memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); - - if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - return; - } + record_blktrace_event(t, pid, cpu, sector, bytes, what, + bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); } local_irq_restore(flags); From 04678e72e95f4165d58442b3ed108e06605984df Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:02 +0200 Subject: [PATCH 007/162] blktrace: split out relaying a blktrace event Split out the code relaying a blktrace event to user-space using relayfs. This enables adding a second relay function supporting a new version of the protocol. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 60 ++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 25a0a1b09747..51745832c713 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -91,6 +91,26 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); } +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u32 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace *t; + size_t trace_len = sizeof(*t) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + /* * Send out a notify message. */ @@ -126,16 +146,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, trace_len); - if (t) { - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); - - record_blktrace_event(t, pid, cpu, 0, 0, - action | (cgid ? __BLK_TN_CGROUP : 0), - bt->dev, 0, cgid, cgid_len, (void *)data, - len); - } + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, + action | (cgid ? __BLK_TN_CGROUP : 0), 0, cgid, + cgid_len, (void *)data, len); } /* @@ -246,7 +259,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; @@ -278,20 +290,21 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); - trace_len = sizeof(*t) + pdu_len + cgid_len; if (blk_tracer) { tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); + trace_len = sizeof(struct blk_io_trace) + pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, trace_len, trace_ctx); if (!event) return; - t = ring_buffer_event_data(event); - record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, - error, cgid, cgid_len, pdu_data, pdu_len); + record_blktrace_event(ring_buffer_event_data(event), + pid, cpu, sector, bytes, what, bt->dev, + error, cgid, cgid_len, pdu_data, + pdu_len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; @@ -306,19 +319,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. 
*/ local_irq_save(flags); - t = relay_reserve(bt->rchan, trace_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); - - record_blktrace_event(t, pid, cpu, sector, bytes, what, - bt->dev, error, cgid, cgid_len, - pdu_data, pdu_len); - } - + sequence = per_cpu_ptr(bt->sequence, cpu); + (*sequence)++; + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, what, + error, cgid, cgid_len, pdu_data, pdu_len); local_irq_restore(flags); } From 70e3c62b891281b94b9d449a381e033ce592acc8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:03 +0200 Subject: [PATCH 008/162] blktrace: untangle if/else sequence in __blk_add_trace Untangle the if/else sequence setting the trace action in __blk_add_trace() and turn it into a switch statement for better extensibility. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 51745832c713..11e264f67851 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -278,10 +278,19 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); - if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: what |= BLK_TC_ACT(BLK_TC_DISCARD); - if (op == REQ_OP_FLUSH) + break; + case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); + break; + default: + break; + } + if (cgid) what |= __BLK_TA_CGROUP; From 370cd70a402f972f6d7a7e54ba5a82d1a72c762f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:04 +0200 Subject: [PATCH 009/162] blktrace: change the internal action to 64bit Change the internal representation of the action in blktrace to 64 bits, although for now only the lower 32 bits will be used. With the upcoming version 2 of the blktrace user-space protocol, the upper 32 bits will also be utilized. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 11e264f67851..15d6788700ca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,7 +64,7 @@ static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, - sector_t sector, int bytes, u32 what, + sector_t sector, int bytes, u64 what, dev_t dev, int error, u64 cgid, ssize_t cgid_len, void *pdu_data, int pdu_len) @@ -80,7 +80,7 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, t->sector = sector; t->bytes = bytes; - t->action = what; + t->action = lower_32_bits(what); t->device = dev; t->error = error; t->pdu_len = pdu_len + cgid_len; @@ -93,7 +93,7 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, pid_t pid, int cpu, sector_t sector, int bytes, - u32 what, int error, u64 cgid, + u64 what, int error, u64 cgid, ssize_t cgid_len, void *pdu_data, int pdu_len) { struct blk_io_trace *t; @@ -114,7 +114,7 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, /* * Send out a notify message. */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, +static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; @@ -127,6 +127,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, size_t trace_len; trace_len = sizeof(*t) + cgid_len + len; + action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0)); if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); @@ -136,9 +137,8 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, return; t = ring_buffer_event_data(event); record_blktrace_event(t, pid, cpu, 0, 0, - action | (cgid ? __BLK_TN_CGROUP : 0), - bt->dev, 0, cgid, cgid_len, (void *)data, - len); + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } @@ -146,8 +146,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - relay_blktrace_event(bt, 0, pid, cpu, 0, 0, - action | (cgid ? __BLK_TN_CGROUP : 0), 0, cgid, + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, cgid_len, (void *)data, len); } @@ -222,7 +221,7 @@ void __blk_trace_note_message(struct blk_trace *bt, } EXPORT_SYMBOL_GPL(__blk_trace_note_message); -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, +static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) @@ -253,7 +252,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. 
*/ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - const blk_opf_t opf, u32 what, int error, + const blk_opf_t opf, u64 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; @@ -311,9 +310,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; record_blktrace_event(ring_buffer_event_data(event), - pid, cpu, sector, bytes, what, bt->dev, - error, cgid, cgid_len, pdu_data, - pdu_len); + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; @@ -330,8 +329,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, local_irq_save(flags); sequence = per_cpu_ptr(bt->sequence, cpu); (*sequence)++; - relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, what, - error, cgid, cgid_len, pdu_data, pdu_len); + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, + lower_32_bits(what), error, cgid, cgid_len, + pdu_data, pdu_len); local_irq_restore(flags); } @@ -818,7 +818,7 @@ blk_trace_request_get_cgid(struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, - unsigned int nr_bytes, u32 what, u64 cgid) + unsigned int nr_bytes, u64 what, u64 cgid) { struct blk_trace *bt; @@ -882,7 +882,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u64 what, int error) { struct blk_trace *bt; @@ -948,7 +948,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); - u32 what; + u64 what; if (explicit) what = BLK_TA_UNPLUG_IO; From 42da88a724d8a3b92ade35ae2ef4d5e5a491df2d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:05 +0200 Subject: [PATCH 010/162] blktrace: split do_blk_trace_setup into two functions Split do_blk_trace_setup() into two functions to prepare for an incoming new BLKTRACESETUP2 ioctl(2) that can receive extended parameters from user-space. Also move the size verification logic to the callers in preparation for using a new internal structure later. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 94 ++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 15d6788700ca..df90422ae613 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -518,9 +518,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q, + char *name, dev_t dev, + u32 buf_size, u32 buf_nr, + struct block_device *bdev) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -528,31 +529,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, lockdep_assert_held(&q->debugfs_mutex); - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - strreplace(buts->name, '/', '_'); - /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. */ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { - pr_warn("Concurrent blktraces are not allowed on %s\n", - buts->name); - return -EBUSY; + pr_warn("Concurrent blktraces are not allowed on %s\n", name); + return ERR_PTR(-EBUSY); } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); @@ -572,7 +561,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else - bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory @@ -580,8 +569,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * files or directories. 
*/ if (IS_ERR_OR_NULL(dir)) { - pr_warn("debugfs_dir not present for %s so skipping\n", - buts->name); + pr_warn("debugfs_dir not present for %s so skipping\n", name); ret = -ENOENT; goto err; } @@ -593,17 +581,38 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); + bt->rchan = relay_open("trace", dir, buf_size, buf_nr, + &blk_relay_callbacks, bt); if (!bt->rchan) goto err; + blk_trace_setup_lba(bt, bdev); + + return bt; + +err: + blk_trace_free(q, bt); + + return ERR_PTR(ret); +} + +static void blk_trace_setup_finalize(struct request_queue *q, + char *name, struct blk_trace *bt, + struct blk_user_trace_setup *buts) + +{ + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + strreplace(buts->name, '/', '_'); + bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; - blk_trace_setup_lba(bt, bdev); - /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; @@ -615,12 +624,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); - - ret = 0; -err: - if (ret) - blk_trace_free(q, bt); - return ret; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, @@ -628,17 +631,25 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg) { struct blk_user_trace_setup buts; + struct blk_trace *bt; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; + if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, bt, &buts); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { blk_trace_remove(q); @@ -655,11 +666,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, { struct blk_user_trace_setup buts; struct compat_blk_user_trace_setup cbuts; - int ret; + struct blk_trace *bt; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; + if (!cbuts.buf_size || !cbuts.buf_nr) + return -EINVAL; + buts = (struct blk_user_trace_setup) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, @@ -670,10 +684,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, bt, &buts); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); From 0d8627cc936de8ea04f3cc1e6921c63fb72cc199 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:06 +0200 Subject: [PATCH 011/162] blktrace: add definitions for blk_user_trace_setup2 Add definitions for a version 2 of the blk_user_trace_setup ioctl. 
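For illustration, a v2-aware tool could set up tracing roughly as follows (a hypothetical user-space sketch; only struct blk_user_trace_setup2 and the BLKTRACESETUP2 number come from this patch, with the kernel-side handler presumably wired up later in the series):

	#include <sys/ioctl.h>
	#include <linux/fs.h>           /* BLKTRACESETUP2 */
	#include <linux/blktrace_api.h> /* struct blk_user_trace_setup2 */

	/* Start a v2 trace session on an open block device fd. */
	static int blktrace_setup_v2(int fd)
	{
		struct blk_user_trace_setup2 buts2 = {
			.act_mask = ~0ULL,     /* trace all categories */
			.buf_size = 512 << 10, /* relay sub-buffer size */
			.buf_nr   = 4,         /* number of sub-buffers */
		};

		/* On success the kernel fills in buts2.name. */
		return ioctl(fd, BLKTRACESETUP2, &buts2);
	}
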
This new ioctl will enable a different struct layout for the binary data passed to user-space when a new version of the blktrace utility requests it. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ include/uapi/linux/fs.h | 1 + kernel/trace/blktrace.c | 3 +++ 3 files changed, 20 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 1bfb635e309b..a6958708d477 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -129,6 +129,7 @@ enum { }; #define BLKTRACE_BDEV_SIZE 32 +#define BLKTRACE_BDEV_SIZE2 64 /* * User setup structure passed with BLKTRACESETUP */ @@ -143,4 +144,19 @@ struct blk_user_trace_setup { __u32 pid; }; +/* + * User setup structure passed with BLKTRACESETUP2 + */ +struct blk_user_trace_setup2 { + char name[BLKTRACE_BDEV_SIZE2]; /* output */ + __u64 act_mask; /* input */ + __u32 buf_size; /* input */ + __u32 buf_nr; /* input */ + __u64 start_lba; + __u64 end_lba; + __u32 pid; + __u32 flags; /* currently unused */ + __u64 reserved[11]; +}; + #endif /* _UAPIBLKTRACE_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index beb4c2d1e41c..957ce3343a4f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,6 +300,7 @@ struct file_attr { #define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ +#define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index df90422ae613..c31b8f433116 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1601,6 +1601,9 @@ static int __init init_blk_tracer(void) return 1; } + BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % + __alignof__(long)); + return 0; } From 113cbd62824afdf62d2f3f092809cf37cc7f1dd8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:07 +0200 Subject: [PATCH 012/162] blktrace: pass blk_user_trace_setup2 to setup functions Pass struct blk_user_trace_setup2 to blk_trace_setup_finalize(). This prepares for the incoming extension of the blktrace protocol with a 64bit act_mask. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 3 ++- kernel/trace/blktrace.c | 31 ++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 122c62e561fc..05c8754456aa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -14,11 +14,12 @@ #include struct blk_trace { + int version; int trace_state; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; - u16 act_mask; + u64 act_mask; u64 start_lba; u64 end_lba; u32 pid; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c31b8f433116..d1532df84cc8 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -597,11 +597,12 @@ err: } static void blk_trace_setup_finalize(struct request_queue *q, - char *name, struct blk_trace *bt, - struct blk_user_trace_setup *buts) + char *name, int version, + struct blk_trace *bt, + struct blk_user_trace_setup2 *buts) { - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2); /* * some device names have larger paths - convert the slashes @@ -609,6 +610,7 @@ static void blk_trace_setup_finalize(struct request_queue *q, */ strreplace(buts->name, '/', '_'); + bt->version = version; bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; @@ -630,6 +632,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { + struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; struct blk_trace *bt; int ret; @@ -641,6 +644,15 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts.buf_size || !buts.buf_nr) return -EINVAL; + buts2 = (struct blk_user_trace_setup2) { + .act_mask = buts.act_mask, + .buf_size = buts.buf_size, + .buf_nr = buts.buf_nr, + .start_lba = buts.start_lba, + .end_lba = buts.end_lba, + .pid = buts.pid, + }; + mutex_lock(&q->debugfs_mutex); bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, bdev); @@ -648,7 +660,8 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, mutex_unlock(&q->debugfs_mutex); return PTR_ERR(bt); } - blk_trace_setup_finalize(q, name, bt, &buts); + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + strcpy(buts.name, buts2.name); mutex_unlock(&q->debugfs_mutex); if (copy_to_user(arg, &buts, sizeof(buts))) { @@ -664,7 +677,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { - struct blk_user_trace_setup buts; + struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; struct blk_trace *bt; @@ -674,7 +687,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, if (!cbuts.buf_size || !cbuts.buf_nr) return -EINVAL; - buts = (struct blk_user_trace_setup) { + buts2 = (struct blk_user_trace_setup2) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, @@ -684,16 +697,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, bdev); if (IS_ERR(bt)) { mutex_unlock(&q->debugfs_mutex); return PTR_ERR(bt); } - blk_trace_setup_finalize(q, name, bt, &buts); + blk_trace_setup_finalize(q, name, 1, bt, &buts2); 
mutex_unlock(&q->debugfs_mutex); - if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { blk_trace_remove(q); return -EFAULT; } From c44347d606260f36a81f6d8415a5af33cb3015fa Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:08 +0200 Subject: [PATCH 013/162] blktrace: add definitions for struct blk_io_trace2 Add definitions for the extended version of the blktrace protocol using a wider action type to be able to record new actions in the kernel. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ kernel/trace/blktrace.c | 1 + 2 files changed, 17 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index a6958708d477..9f9834d76e00 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -94,6 +94,7 @@ enum blktrace_notify { #define BLK_IO_TRACE_MAGIC 0x65617400 #define BLK_IO_TRACE_VERSION 0x07 +#define BLK_IO_TRACE2_VERSION 0x08 /* * The trace itself @@ -113,6 +114,21 @@ struct blk_io_trace { /* cgroup id will be stored here if exists */ }; +struct blk_io_trace2 { + __u32 magic; /* MAGIC << 8 | BLK_IO_TRACE2_VERSION */ + __u32 sequence; /* event number */ + __u64 time; /* in nanoseconds */ + __u64 sector; /* disk offset */ + __u32 bytes; /* transfer length */ + __u32 pid; /* who did it */ + __u64 action; /* what happened */ + __u32 device; /* device number */ + __u32 cpu; /* on what cpu did it happen */ + __u16 error; /* completion error */ + __u16 pdu_len; /* length of data after this trace */ + __u8 pad[12]; + /* cgroup id will be stored here if it exists */ +}; /* * The remap event */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d1532df84cc8..185f19c9f772 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1616,6 +1616,7 @@ static int __init init_blk_tracer(void) BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % __alignof__(long)); + BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long)); return 0; } From 915bb53860c3a6cc3dd2c9a5e0d1988ada0e377d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:09 +0200 Subject: [PATCH 014/162] blktrace: differentiate between blk_io_trace versions Differentiate between blk_io_trace and blk_io_trace2 when relaying to user-space depending on which version has been requested by the blktrace utility. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 185f19c9f772..5f8ecf88a196 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -91,7 +91,29 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); } -static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, +static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, + int pdu_len) +{ + t2->pid = pid; + t2->cpu = cpu; + + t2->sector = sector; + t2->bytes = bytes; + t2->action = what; + t2->device = dev; + t2->error = error; + t2->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence, pid_t pid, int cpu, sector_t sector, int bytes, u64 what, int error, u64 cgid, ssize_t cgid_len, void *pdu_data, int pdu_len) @@ -111,6 +133,40 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, cgid, cgid_len, pdu_data, pdu_len); } +static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, + int bytes, u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace2 *t; + size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + if (bt->version == 2) + return relay_blktrace_event2(bt, sequence, pid, cpu, sector, + bytes, what, error, cgid, cgid_len, + pdu_data, pdu_len); + return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, + lower_32_bits(what), error, cgid, cgid_len, + pdu_data, pdu_len); +} + /* * Send out a notify message. */ From 67bfa74d81bae9271f6ec72d2058d081732949cb Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:10 +0200 Subject: [PATCH 015/162] blktrace: move trace_note to blk_io_trace2 Move trace_note() to the new blk_io_trace2 infrastructure. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5f8ecf88a196..08c7be671c47 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -173,18 +173,18 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { - struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; - size_t trace_len; - trace_len = sizeof(*t) + cgid_len + len; action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0)); if (blk_tracer) { + struct blk_io_trace2 *t; + size_t trace_len = sizeof(*t) + cgid_len + len; + buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, @@ -192,9 +192,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, if (!event) return; t = ring_buffer_event_data(event); - record_blktrace_event(t, pid, cpu, 0, 0, - action, bt->dev, 0, cgid, cgid_len, - (void *)data, len); + record_blktrace_event2(t, pid, cpu, 0, 0, + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } @@ -359,7 +359,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); - trace_len = sizeof(struct blk_io_trace) + pdu_len + cgid_len; + trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, trace_len, trace_ctx); if (!event) From 4d8bc7bd4f73c6b34ba29d3e8277864c6e0a44a7 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:11 +0200 Subject: [PATCH 016/162] blktrace: move ftrace blk_io_tracer to blk_io_trace2 Move ftrace's blk_io_tracer to the new blk_io_trace2 infrastructure. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 08c7be671c47..49f73cb3cb33 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1224,7 +1224,7 @@ static void blk_unregister_tracepoints(void) * struct blk_io_tracer formatting routines */ -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; @@ -1259,9 +1259,9 @@ out: } static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent) { - return (const struct blk_io_trace *)ent; + return (const struct blk_io_trace2 *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) @@ -1320,7 +1320,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act, unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1334,7 +1334,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { @@ -1555,7 +1555,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t; + const struct blk_io_trace2 *t; u16 what; bool long_act; blk_log_action_t *log_action; @@ -1592,8 +1592,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent; + const int offset = offsetof(struct blk_io_trace2, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, From f9ee38bbf70fb20584625849a253c8652176fa66 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:12 +0200 Subject: [PATCH 017/162] blktrace: add block trace commands for zone operations Add block trace commands for zone operations. These commands can only be handled with version 2 of the blktrace protocol. For version 1, warn if a command that does not fit into the 16 bits reserved for the command in this version is passed in. Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 13 +++++++++++-- kernel/trace/blktrace.c | 29 +++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 9f9834d76e00..190a3c5ab0a0 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -26,11 +26,20 @@ enum blktrace_cat { BLK_TC_DRV_DATA = 1 << 14, /* binary per-driver data */ BLK_TC_FUA = 1 << 15, /* fua requests */ - BLK_TC_END = 1 << 15, /* we've run out of bits! */ + BLK_TC_END_V1 = 1 << 15, /* we've run out of bits! */ + + BLK_TC_ZONE_APPEND = 1ull << 16, /* zone append */ + BLK_TC_ZONE_RESET = 1ull << 17, /* zone reset */ + BLK_TC_ZONE_RESET_ALL = 1ull << 18, /* zone reset all */ + BLK_TC_ZONE_FINISH = 1ull << 19, /* zone finish */ + BLK_TC_ZONE_OPEN = 1ull << 20, /* zone open */ + BLK_TC_ZONE_CLOSE = 1ull << 21, /* zone close */ + + BLK_TC_END_V2 = 1ull << 21, }; #define BLK_TC_SHIFT (16) -#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) +#define BLK_TC_ACT(act) ((u64)(act) << BLK_TC_SHIFT) /* * Basic trace actions diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 49f73cb3cb33..fb5935885abc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -163,8 +163,8 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, bytes, what, error, cgid, cgid_len, pdu_data, pdu_len); return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, - lower_32_bits(what), error, cgid, cgid_len, - pdu_data, pdu_len); + what, error, cgid, cgid_len, pdu_data, + pdu_len); } /* @@ -342,10 +342,32 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); break; + case REQ_OP_ZONE_APPEND: + what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND); + break; + case REQ_OP_ZONE_RESET: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET); + break; + case REQ_OP_ZONE_RESET_ALL: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL); + break; + case REQ_OP_ZONE_FINISH: + what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH); + break; + case REQ_OP_ZONE_OPEN: + what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE); + break; default: break; } + if (WARN_ON_ONCE(bt->version == 1 && + (what >> BLK_TC_SHIFT) > BLK_TC_END_V1)) + return; + if (cgid) what |= __BLK_TA_CGROUP; @@ -386,8 +408,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, sequence = per_cpu_ptr(bt->sequence, cpu); (*sequence)++; relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, - lower_32_bits(what), error, cgid, cgid_len, - pdu_data, pdu_len); + what, error, cgid, cgid_len, pdu_data, pdu_len); local_irq_restore(flags); } From 1c164fcc1b08e75f1cad1532718f09cddc0ddebe Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:13 +0200 Subject: [PATCH 018/162] blktrace: expose ZONE APPEND completions to blktrace Expose ZONE APPEND completions as a block trace completion action to blktrace. As tracing of zoned block commands needs the upper 32bit of the widened 64bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 3 +++ kernel/trace/blktrace.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 190a3c5ab0a0..289872e51fc5 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -97,6 +97,9 @@ enum blktrace_notify { #define BLK_TA_ABORT (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA)) +#define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ + BLK_TC_ACT(BLK_TC_ZONE_APPEND)) + #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb5935885abc..c83577096607 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -978,6 +978,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_trace_request_get_cgid(rq)); } +static void blk_add_trace_zone_update_request(void *ignore, struct request *rq) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt) || bt->version < 2) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND, + blk_trace_request_get_cgid(rq)); +} + /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for @@ -1208,6 +1224,9 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); + ret = register_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1227,6 +1246,8 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); From 3f6722816a73e2017599d965683dbe71833afd7a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:14 +0200 Subject: [PATCH 019/162] blktrace: trace zone write plugging operations Trace zone write plugging operations on block devices. As tracing of zoned block commands needs the upper 32bit of the widened 64bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 5 ++++ kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 289872e51fc5..30f3d2589365 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -62,6 +62,8 @@ enum blktrace_act { __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ __BLK_TA_DRV_DATA, /* driver-specific binary data */ + __BLK_TA_ZONE_PLUG, /* zone write plug was plugged */ + __BLK_TA_ZONE_UNPLUG, /* zone write plug was unplugged */ __BLK_TA_CGROUP = 1 << 8, /* from a cgroup*/ }; @@ -99,6 +101,9 @@ enum blktrace_notify { #define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ BLK_TC_ACT(BLK_TC_ZONE_APPEND)) +#define BLK_TA_ZONE_PLUG (__BLK_TA_ZONE_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_ZONE_UNPLUG (__BLK_TA_ZONE_UNPLUG |\ + BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c83577096607..6bfe1b36a1d3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1084,6 +1084,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } +static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + + return; +} + +static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + return; +} + static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; @@ -1227,6 +1258,12 @@ static void blk_register_tracepoints(void) ret = register_trace_blk_zone_append_update_request_bio( blk_add_trace_zone_update_request, NULL); WARN_ON(ret); + ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, + NULL); + WARN_ON(ret); + ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, + NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1246,6 +1283,8 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL); + unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL); unregister_trace_blk_zone_append_update_request_bio( blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); From 4ae8efb4f907383a16abf3c59b353763e31ae106 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:15 +0200 Subject: [PATCH 
020/162] blktrace: handle BLKTRACESETUP2 ioctl Handle the BLKTRACESETUP2 ioctl, requesting an extended version of the blktrace protocol from user-space. Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/ioctl.c | 1 + kernel/trace/blktrace.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/block/ioctl.c b/block/ioctl.c index d7489a56b33c..3927ca4707d0 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -691,6 +691,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) /* Incompatible alignment on i386 */ case BLKTRACESETUP: + case BLKTRACESETUP2: return blk_trace_ioctl(bdev, cmd, argp); default: break; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6bfe1b36a1d3..6ad3807a5b73 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -749,6 +749,38 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } EXPORT_SYMBOL_GPL(blk_trace_setup); +static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) +{ + struct blk_user_trace_setup2 buts2; + struct blk_trace *bt; + + if (copy_from_user(&buts2, arg, sizeof(buts2))) + return -EFAULT; + + if (!buts2.buf_size || !buts2.buf_nr) + return -EINVAL; + + if (buts2.flags != 0) + return -EINVAL; + + mutex_lock(&q->debugfs_mutex); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 2, bt, &buts2); + mutex_unlock(&q->debugfs_mutex); + + if (copy_to_user(arg, &buts2, sizeof(buts2))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; +} + #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, @@ -839,6 +871,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) char b[BDEVNAME_SIZE]; switch (cmd) { + case BLKTRACESETUP2: + snprintf(b, sizeof(b), "%pg", bdev); + ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg); + break; case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); From 4a0940bdcac260be1e3460e99464fa63d317c6a2 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 27 Oct 2025 19:46:19 -0700 Subject: [PATCH 021/162] blktrace: use debug print to report dropped events The WARN_ON_ONCE introduced in commit f9ee38bbf70f ("blktrace: add block trace commands for zone operations") triggers kernel warnings when zone operations are traced with blktrace version 1. This can spam the kernel log during normal operation with zoned block devices when userspace is using the legacy blktrace protocol. Currently the blktrace implementation drops the newly added REQ_OP_ZONE_XXX events when the blktrace userspace version is set to 1. Remove the WARN_ON_ONCE and quietly filter these events. Add a rate-limited debug message to help diagnose potential issues without flooding the kernel log. The debug message can be enabled via dynamic debug when needed for troubleshooting. This approach is more appropriate as encountering zone operations with blktrace v1 is an expected condition that should be handled gracefully rather than warned about, since users may be running older blktrace userspace tools that only support version 1 of the protocol.
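To make the incompatibility concrete: a v2 action is 64 bits wide, with the action opcode in the low 16 bits and the category bits above them, so any category above BLK_TC_END_V1 pushes the shifted value past what the 32-bit v1 action field can hold. A small stand-alone user-space sketch (not part of this patch) decodes the action values that show up in the dynamic debug output below:

    #include <stdint.h>
    #include <stdio.h>

    #define BLK_TC_SHIFT 16

    int main(void)
    {
            /* Sample value taken from the debug log below. */
            uint64_t what = 0x1000190001ULL;
            uint64_t cat = what >> BLK_TC_SHIFT;              /* category bits */
            uint32_t op  = what & ((1U << BLK_TC_SHIFT) - 1); /* action opcode */

            /* A category wider than 16 bits cannot fit the 32-bit v1 action. */
            printf("category=0x%llx opcode=0x%x fits_v1=%d\n",
                   (unsigned long long)cat, op, cat <= 0xffff);
            return 0;
    }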
With this patch :- linux-block (for-next) # git log -1 commit c8966006a0971d2b4bf94c0426eb7e4407c6853f (HEAD -> for-next) Author: Chaitanya Kulkarni Date: Mon Oct 27 19:26:53 2025 -0700 blktrace: use debug print to report dropped events linux-block (for-next) # cdblktests blktests (master) # ./check blktrace blktrace/001 (blktrace zone management command tracing) [passed] runtime 3.805s ... 3.889s blktests (master) # dmesg -c blktests (master) # echo "file kernel/trace/blktrace.c +p" > /sys/kernel/debug/dynamic_debug/control blktests (master) # ./check blktrace blktrace/001 (blktrace zone management command tracing) [passed] runtime 3.889s ... 3.881s blktests (master) # dmesg -c [ 77.826237] blktrace: blktrace v1 cannot trace zone operation 0x1000190001 [ 77.826260] blktrace: blktrace v1 cannot trace zone operation 0x1000190004 [ 77.826282] blktrace: blktrace v1 cannot trace zone operation 0x1001490007 [ 77.826288] blktrace: blktrace v1 cannot trace zone operation 0x1001890008 [ 77.826343] blktrace: blktrace v1 cannot trace zone operation 0x1000190001 [ 77.826347] blktrace: blktrace v1 cannot trace zone operation 0x1000190004 [ 77.826350] blktrace: blktrace v1 cannot trace zone operation 0x1001490007 [ 77.826354] blktrace: blktrace v1 cannot trace zone operation 0x1001890008 [ 77.826373] blktrace: blktrace v1 cannot trace zone operation 0x1000190001 [ 77.826377] blktrace: blktrace v1 cannot trace zone operation 0x1000190004 blktests (master) # echo "file kernel/trace/blktrace.c -p" > /sys/kernel/debug/dynamic_debug/control blktests (master) # ./check blktrace blktrace/001 (blktrace zone management command tracing) [passed] runtime 3.881s ... 3.824s blktests (master) # dmesg -c blktests (master) # Reported-by: syzbot+153e64c0aa875d7e4c37@syzkaller.appspotmail.com Fixes: f9ee38bbf70f ("blktrace: add block trace commands for zone operations") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6ad3807a5b73..776ae4190f36 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -364,9 +364,12 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, break; } - if (WARN_ON_ONCE(bt->version == 1 && - (what >> BLK_TC_SHIFT) > BLK_TC_END_V1)) + /* Drop trace events for zone operations with blktrace v1 */ + if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) { + pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n", + (unsigned long long)what); return; + } if (cgid) what |= __BLK_TA_CGROUP; From e48886b9d668d80be24e37345bd0904e9138473c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 27 Oct 2025 22:50:42 -0700 Subject: [PATCH 022/162] blktrace: for ftrace use correct trace format ver The ftrace blktrace path allocates buffers and writes trace events but was using the wrong recording function. After commit 4d8bc7bd4f73 ("blktrace: move ftrace blk_io_tracer to blk_io_trace2"), the ftrace interface was moved to use blk_io_trace2 format, but __blk_add_trace() still called record_blktrace_event() which writes in blk_io_trace (v1) format. 
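For orientation, the field offsets cited in the corruption analysis below correspond to record layouts of roughly this shape (a sketch: only the fields relevant here are spelled out, the remaining members are elided):

    struct blk_io_trace {            /* v1 record, 48 bytes */
            __u32 magic;             /* offset  0 */
            __u32 sequence;          /* offset  4 */
            __u64 time;              /* offset  8 */
            __u64 sector;            /* offset 16 */
            __u32 bytes;             /* offset 24 */
            __u32 action;            /* offset 28: 32-bit action */
            __u32 pid;               /* offset 32 */
            /* device, cpu, error, pdu_len ... */
    };

    struct blk_io_trace2 {           /* v2 record, 64 bytes */
            /* same leading fields up to offset 28, then: */
            __u32 pid;               /* offset 28 */
            __u64 action;            /* offset 32: widened 64-bit action */
            /* ... */
    };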
This causes critical data corruption: - blk_io_trace (v1) has 32-bit 'action' field at offset 28 - blk_io_trace2 (v2) has 32-bit 'pid' at offset 28 and 64-bit 'action' at offset 32 - When record_blktrace_event() writes to a v2 buffer: * Writing pid (offset 32 in v1) corrupts the v2 action field * Writing action (offset 28 in v1) corrupts the v2 pid field * The 64-bit action is truncated to 32-bit via lower_32_bits() Fix by: 1. Adding version switch to select correct format (v1 vs v2) 2. Calling appropriate recording function based on version 3. Defaulting to v2 for ftrace (as intended by commit 4d8bc7bd4f73) 4. Adding WARN_ONCE for unexpected version values Without this patch :- linux-block (for-next) # sh reproduce_blktrace_bug.sh dd-14242 [033] d..1. 3903.022308: Unknown action 36a2 dd-14242 [033] d..1. 3903.022333: Unknown action 36a2 dd-14242 [033] d..1. 3903.022365: Unknown action 36a2 dd-14242 [033] d..1. 3903.022366: Unknown action 36a2 dd-14242 [033] d..1. 3903.022369: Unknown action 36a2 The action field is corrupted because: - ftrace allocated blk_io_trace2 buffer (64 bytes) - But called record_blktrace_event() (writes v1, 48 bytes) - Field offsets don't match, causing corruption The hex value shown (0x36a2) is actually a PID, not an action code! With this patch :- linux-block (for-next) # sh reproduce_blktrace_bug.sh Trace output looks correct: dd-2420 [019] d..1. 59.641742: 251,0 Q RS 0 + 8 [dd] dd-2420 [019] d..1. 59.641775: 251,0 G RS 0 + 8 [dd] dd-2420 [019] d..1. 59.641784: 251,0 P N [dd] dd-2420 [019] d..1. 59.641785: 251,0 U N [dd] 1 dd-2420 [019] d..1. 59.641788: 251,0 D RS 0 + 8 [dd] Fixes: 4d8bc7bd4f73 ("blktrace: move ftrace blk_io_tracer to blk_io_trace2") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 59 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 776ae4190f36..e4f26ddb7ee2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -384,16 +384,65 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); - trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; + switch (bt->version) { + case 1: + trace_len = sizeof(struct blk_io_trace); + break; + case 2: + default: + /* + * ftrace always uses v2 (blk_io_trace2) format. + * + * For sysfs-enabled tracing path (enabled via + * /sys/block/DEV/trace/enable), blk_trace_setup_queue() + * never initializes bt->version, leaving it 0 from + * kzalloc(). We must handle version==0 safely here. + * + * Fall through to default to ensure we never hit the + * old bug where default set trace_len=0, causing + * buffer underflow and memory corruption. + * + * Always use v2 format for ftrace and normalize + * bt->version to 2 when uninitialized.
+ */ + trace_len = sizeof(struct blk_io_trace2); + if (bt->version == 0) + bt->version = 2; + break; + } + trace_len += pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, trace_len, trace_ctx); if (!event) return; - record_blktrace_event(ring_buffer_event_data(event), - pid, cpu, sector, bytes, - what, bt->dev, error, cgid, cgid_len, - pdu_data, pdu_len); + switch (bt->version) { + case 1: + record_blktrace_event(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + case 2: + default: + /* + * Use v2 recording function (record_blktrace_event2) + * which writes blk_io_trace2 structure with correct + * field layout: + * - 32-bit pid at offset 28 + * - 64-bit action at offset 32 + * + * Fall through to default handles version==0 case + * (from sysfs path), ensuring we always use correct + * v2 recording function to match the v2 buffer + * allocated above. + */ + record_blktrace_event2(ring_buffer_event_data(event), + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); + break; + } trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; From 77220f6d18a22b0b5d73b5d2156609b0aa21a7c5 Mon Sep 17 00:00:00 2001 From: Shi Hao Date: Sat, 1 Nov 2025 11:14:22 +0530 Subject: [PATCH 023/162] drbd: replace kmap() with kmap_local_page() in receiver path Use kmap_local_page() instead of kmap() to avoid CPU contention. kmap() uses a global set of mapping slots that can cause contention between multiple CPUs, while kmap_local_page() uses per-CPU slots eliminating this contention. It also ensures non-sleeping operation and provides better cache locality. Convert kmap() to kmap_local_page() as it aligns with ongoing kernel efforts to modernize kmap() usage for better multi-core scalability. 
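The conversion follows the usual kmap_local pattern: map, access from the same context, then unmap using the returned address rather than the page (a minimal sketch of the pattern, not the drbd code itself):

    void *data;

    data = kmap_local_page(page);   /* per-CPU slot, never sleeps */
    /* ... access the mapping from this task context only ... */
    kunmap_local(data);             /* note: takes the address, not the page */

One behavioral caveat of the switch is that kmap_local mappings are only valid in the task that created them and, when nested, must be released in reverse (LIFO) order; both conditions hold in these call sites, which map and unmap one page per loop iteration.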
Signed-off-by: Shi Hao Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index caaf2781136d..14821420ea50 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1736,13 +1736,13 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, page = peer_req->pages; page_chain_for_each(page) { unsigned len = min_t(int, ds, PAGE_SIZE); - data = kmap(page); + data = kmap_local_page(page); err = drbd_recv_all_warn(peer_device->connection, data, len); if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) { drbd_err(device, "Fault injection: Corrupting data on receive\n"); data[0] = data[0] ^ (unsigned long)-1; } - kunmap(page); + kunmap_local(data); if (err) { drbd_free_peer_req(device, peer_req); return NULL; @@ -1777,7 +1777,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) page = drbd_alloc_pages(peer_device, 1, 1); - data = kmap(page); + data = kmap_local_page(page); while (data_size) { unsigned int len = min_t(int, data_size, PAGE_SIZE); @@ -1786,7 +1786,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) break; data_size -= len; } - kunmap(page); + kunmap_local(data); drbd_free_pages(peer_device->device, page); return err; } From bc49af56eea866c34d21bf582f65b02fc8c06ec3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 28 Oct 2025 20:34:23 -0700 Subject: [PATCH 024/162] blktrace: add support for REQ_OP_WRITE_ZEROES tracing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, REQ_OP_WRITE_ZEROES operations are not handled in the blktrace infrastructure, resulting in incorrect or missing operation labels in ftrace blktrace output. This manifests as write-zeroes operations appearing with incorrect labels like "N" instead of a proper "WZ" designation. This patch adds complete support for REQ_OP_WRITE_ZEROES across the blktrace infrastructure: Add BLK_TC_WRITE_ZEROES trace category in blktrace_api.h and update BLK_TC_END_V2 marker accordingly Map REQ_OP_WRITE_ZEROES to BLK_TC_WRITE_ZEROES in __blk_add_trace() to ensure proper trace event categorization Update fill_rwbs() to generate "WZ" label for write-zeroes operations in ftrace output, making them easily identifiable Add "write-zeroes" string mapping in act_to_str array for debugfs filter interface Update blk_fill_rwbs() to handle REQ_OP_WRITE_ZEROES for block layer event tracing With this fix, write-zeroes operations are now correctly traced and displayed. =========================================================== BEFORE THIS PATCH =========================================================== blkdiscard -z -o 0 -l 40960 /dev/nvme0n1 blkdiscard-3809 [030] ..... 1212.253701: block_bio_queue: 259,0 NS 0 + 80 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253703: block_getrq: 259,0 NS 0 + 80 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253704: block_io_start: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253704: block_plug: [blkdiscard] blkdiscard-3809 [030] ..... 1212.253706: block_unplug: [blkdiscard] 1 blkdiscard-3809 [030] ..... 1212.253706: block_rq_insert: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard] kworker/30:1H-566 [030] ..... 1212.253726: block_rq_issue: 259,0 NS 40960 () 0 + 80 be,0,4 [kworker/30:1H] -0 [030] d.h1. 
1212.253957: block_rq_complete: 259,0 NS () 0 + 80 be,0,4 [0] -0 [030] dNh1. 1212.253960: block_io_done: 259,0 NS 0 () 0 + 0 none,0,0 [swapper/30] Trace Event Breakdown: Event | Device | Op | Sector | Sectors | Byte Size | Calculation block_bio_queue | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_getrq | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_io_start | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_insert | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_issue | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_complete | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_io_done | 259,0 | NS | 0 | 0 | 0 | Completion (no data) Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes =========================================================== AFTER THIS PATCH =========================================================== blkdiscard -z -o 0 -l 40960 /dev/nvme0n1 blkdiscard-2477 [020] ..... 960.989131: block_bio_queue: 259,0 WZS 0 + 80 [blkdiscard] blkdiscard-2477 [020] ..... 960.989134: block_getrq: 259,0 WZS 0 + 80 [blkdiscard] blkdiscard-2477 [020] ..... 960.989135: block_io_start: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard] blkdiscard-2477 [020] ..... 960.989138: block_plug: [blkdiscard] blkdiscard-2477 [020] ..... 960.989140: block_unplug: [blkdiscard] 1 blkdiscard-2477 [020] ..... 960.989141: block_rq_insert: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard] kworker/20:1H-736 [020] ..... 960.989166: block_rq_issue: 259,0 WZS 40960 () 0 + 80 be,0,4 [kworker/20:1H] -0 [020] d.h1. 960.989476: block_rq_complete: 259,0 WZS () 0 + 80 be,0,4 [0] -0 [020] dNh1. 960.989482: block_io_done: 259,0 WZS 0 () 0 + 0 none,0,0 [swapper/20] Trace Event Breakdown: Event | Device | Op | Sector | Sectors | Byte Size | Calculation block_bio_queue | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_getrq | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_io_start | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_insert | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_issue | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_complete | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_io_done | 259,0 | WZS | 0 | 0 | 0 | Completion (no data) Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes Tested with ftrace blktrace on NVMe devices using blkdiscard with the -z (write-zeroes) flag. 
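For reference, the blkdiscard -z invocation used above boils down to the BLKZEROOUT ioctl, which is a convenient way to generate REQ_OP_WRITE_ZEROES traffic on devices that support the operation natively (a minimal sketch with error handling trimmed; the device path and length match the example above):

    #include <fcntl.h>
    #include <linux/fs.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
            /* { offset, length } in bytes, as passed to blkdiscard above */
            uint64_t range[2] = { 0, 40960 };
            int fd = open("/dev/nvme0n1", O_WRONLY);

            if (fd < 0 || ioctl(fd, BLKZEROOUT, &range) < 0)
                    perror("BLKZEROOUT");
            if (fd >= 0)
                    close(fd);
            return 0;
    }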
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 4 +++- kernel/trace/blktrace.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 30f3d2589365..7c092d9f3aa4 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -35,7 +35,9 @@ enum blktrace_cat { BLK_TC_ZONE_OPEN = 1ull << 20, /* zone open */ BLK_TC_ZONE_CLOSE = 1ull << 21, /* zone close */ - BLK_TC_END_V2 = 1ull << 21, + BLK_TC_WRITE_ZEROES = 1ull << 22, /* write-zeroes */ + + BLK_TC_END_V2 = 1ull << 22, }; #define BLK_TC_SHIFT (16) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e4f26ddb7ee2..af8cbc8e1a7c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -360,6 +360,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, case REQ_OP_ZONE_CLOSE: what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE); break; + case REQ_OP_WRITE_ZEROES: + what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES); + break; default: break; } @@ -1408,7 +1411,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t) if (tc & BLK_TC_DISCARD) rwbs[i++] = 'D'; - else if (tc & BLK_TC_WRITE) + else if (tc & BLK_TC_WRITE_ZEROES) { + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + } else if (tc & BLK_TC_WRITE) rwbs[i++] = 'W'; else if (t->bytes) rwbs[i++] = 'R'; @@ -1951,6 +1957,7 @@ static const struct { { BLK_TC_DISCARD, "discard" }, { BLK_TC_DRV_DATA, "drv_data" }, { BLK_TC_FUA, "fua" }, + { BLK_TC_WRITE_ZEROES, "write-zeroes" }, }; static int blk_trace_str2mask(const char *str) @@ -2164,6 +2171,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) rwbs[i++] = 'Z'; rwbs[i++] = 'C'; break; + case REQ_OP_WRITE_ZEROES: + rwbs[i++] = 'W'; + rwbs[i++] = 'Z'; + break; default: rwbs[i++] = 'N'; } From 011af85ccd871526df36988c7ff20ca375fb804d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Nov 2025 21:31:16 +0800 Subject: [PATCH 025/162] ublk: reorder tag_set initialization before queue allocation Move ublk_add_tag_set() before ublk_init_queues() in the device initialization path. This allows us to use the blk-mq CPU-to-queue mapping established by the tag_set to determine the appropriate NUMA node for each queue allocation. The error handling paths are also reordered accordingly. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0c74a41a6753..2569566bf5e6 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3178,17 +3178,17 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.nr_hw_queues, nr_cpu_ids); ublk_align_max_io_size(ub); - ret = ublk_init_queues(ub); + ret = ublk_add_tag_set(ub); if (ret) goto out_free_dev_number; - ret = ublk_add_tag_set(ub); + ret = ublk_init_queues(ub); if (ret) - goto out_deinit_queues; + goto out_free_tag_set; ret = -EFAULT; if (copy_to_user(argp, &ub->dev_info, sizeof(info))) - goto out_free_tag_set; + goto out_deinit_queues; /* * Add the char dev so that ublksrv daemon can be setup. 
@@ -3197,10 +3197,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ret = ublk_add_chdev(ub); goto out_unlock; -out_free_tag_set: - blk_mq_free_tag_set(&ub->tag_set); out_deinit_queues: ublk_deinit_queues(ub); +out_free_tag_set: + blk_mq_free_tag_set(&ub->tag_set); out_free_dev_number: ublk_free_dev_number(ub); out_free_ub: From 529d4d6327880e5c60f4e0def39b3faaa7954e54 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Nov 2025 21:31:17 +0800 Subject: [PATCH 026/162] ublk: implement NUMA-aware memory allocation Implement NUMA-friendly memory allocation for ublk driver to improve performance on multi-socket systems. This commit includes the following changes: 1. Rename __queues to queues, dropping the __ prefix since the field is now accessed directly throughout the codebase rather than only through the ublk_get_queue() helper. 2. Remove the queue_size field from struct ublk_device as it is no longer needed. 3. Move queue allocation and deallocation into ublk_init_queue() and ublk_deinit_queue() respectively, improving encapsulation. This simplifies ublk_init_queues() and ublk_deinit_queues() to just iterate and call the per-queue functions. 4. Add ublk_get_queue_numa_node() helper function to determine the appropriate NUMA node for a queue by finding the first CPU mapped to that queue via tag_set.map[HCTX_TYPE_DEFAULT].mq_map[] and converting it to a NUMA node using cpu_to_node(). This function is called internally by ublk_init_queue() to determine the allocation node. 5. Allocate each queue structure on its local NUMA node using kvzalloc_node() in ublk_init_queue(). 6. Allocate the I/O command buffer on the same NUMA node using alloc_pages_node(). This reduces memory access latency on multi-socket NUMA systems by ensuring each queue's data structures are local to the CPUs that access them. 
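Condensed to its core, the placement logic from points 4-6 looks like this (a sketch of the pattern only; the complete implementation is in the diff below):

    static int queue_home_node(struct blk_mq_tag_set *set, int q_id)
    {
            unsigned int cpu;

            /* Any CPU that blk-mq routes to this queue determines its node. */
            for_each_possible_cpu(cpu)
                    if (set->map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id)
                            return cpu_to_node(cpu);

            return NUMA_NO_NODE;    /* unmapped queue: let the allocator pick */
    }

Callers then hand that node to the node-aware allocators, i.e. kvzalloc_node() for the queue structure and alloc_pages_node() for the I/O command buffer.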
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 84 +++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 2569566bf5e6..d920f7cf4dad 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -209,9 +209,6 @@ struct ublk_queue { struct ublk_device { struct gendisk *ub_disk; - char *__queues; - - unsigned int queue_size; struct ublksrv_ctrl_dev_info dev_info; struct blk_mq_tag_set tag_set; @@ -239,6 +236,8 @@ struct ublk_device { bool canceling; pid_t ublksrv_tgid; struct delayed_work exit_work; + + struct ublk_queue *queues[]; }; /* header of ublk_params */ @@ -781,7 +780,7 @@ static noinline void ublk_put_device(struct ublk_device *ub) static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, int qid) { - return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]); + return dev->queues[qid]; } static inline bool ublk_rq_has_data(const struct request *rq) @@ -2662,9 +2661,13 @@ static const struct file_operations ublk_ch_fops = { static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { - int size = ublk_queue_cmd_buf_size(ub); - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - int i; + struct ublk_queue *ubq = ub->queues[q_id]; + int size, i; + + if (!ubq) + return; + + size = ublk_queue_cmd_buf_size(ub); for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -2676,57 +2679,76 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); + + kvfree(ubq); + ub->queues[q_id] = NULL; +} + +static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id) +{ + unsigned int cpu; + + /* Find first CPU mapped to this queue */ + for_each_possible_cpu(cpu) { + if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[cpu] == q_id) + return cpu_to_node(cpu); + } + + return NUMA_NO_NODE; } static int ublk_init_queue(struct ublk_device *ub, int q_id) { - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + int depth = ub->dev_info.queue_depth; + int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; - void *ptr; + struct ublk_queue *ubq; + struct page *page; + int numa_node; int size; + /* Determine NUMA node based on queue's CPU affinity */ + numa_node = ublk_get_queue_numa_node(ub, q_id); + + /* Allocate queue structure on local NUMA node */ + ubq = kvzalloc_node(ubq_size, GFP_KERNEL, numa_node); + if (!ubq) + return -ENOMEM; + spin_lock_init(&ubq->cancel_lock); ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; - ubq->q_depth = ub->dev_info.queue_depth; + ubq->q_depth = depth; size = ublk_queue_cmd_buf_size(ub); - ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); - if (!ptr) + /* Allocate I/O command buffer on local NUMA node */ + page = alloc_pages_node(numa_node, gfp_flags, get_order(size)); + if (!page) { + kvfree(ubq); return -ENOMEM; + } + ubq->io_cmd_buf = page_address(page); - ubq->io_cmd_buf = ptr; + ub->queues[q_id] = ubq; ubq->dev = ub; return 0; } static void ublk_deinit_queues(struct ublk_device *ub) { - int nr_queues = ub->dev_info.nr_hw_queues; int i; - if (!ub->__queues) - return; - - for (i = 0; i < nr_queues; i++) + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) ublk_deinit_queue(ub, i); - kvfree(ub->__queues); } static int ublk_init_queues(struct ublk_device *ub) { - int nr_queues = 
ub->dev_info.nr_hw_queues; - int depth = ub->dev_info.queue_depth; - int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); - int i, ret = -ENOMEM; + int i, ret; - ub->queue_size = ubq_size; - ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL); - if (!ub->__queues) - return ret; - - for (i = 0; i < nr_queues; i++) { - if (ublk_init_queue(ub, i)) + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + ret = ublk_init_queue(ub, i); + if (ret) goto fail; } @@ -3128,7 +3150,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) goto out_unlock; ret = -ENOMEM; - ub = kzalloc(sizeof(*ub), GFP_KERNEL); + ub = kzalloc(struct_size(ub, queues, info.nr_hw_queues), GFP_KERNEL); if (!ub) goto out_unlock; mutex_init(&ub->mutex); From c28ba6b6c51d090103800eb1c7679c32f6501dbc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Nov 2025 21:31:18 +0800 Subject: [PATCH 027/162] ublk: use struct_size() for allocation Convert ublk_queue to use struct_size() for allocation. Changes in this commit: 1. Update ublk_init_queue() to use struct_size(ubq, ios, depth) instead of manual size calculation (sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io)). This provides better type safety and makes the code more maintainable by using standard kernel macro for flexible array handling. Meantime annotate ublk_queue.ios by __counted_by(). Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d920f7cf4dad..96e07763cd28 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -203,7 +203,7 @@ struct ublk_queue { bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ spinlock_t cancel_lock; struct ublk_device *dev; - struct ublk_io ios[]; + struct ublk_io ios[] __counted_by(q_depth); }; struct ublk_device { @@ -2700,7 +2700,6 @@ static int ublk_get_queue_numa_node(struct ublk_device *ub, int q_id) static int ublk_init_queue(struct ublk_device *ub, int q_id) { int depth = ub->dev_info.queue_depth; - int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; struct ublk_queue *ubq; struct page *page; @@ -2711,7 +2710,8 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) numa_node = ublk_get_queue_numa_node(ub, q_id); /* Allocate queue structure on local NUMA node */ - ubq = kvzalloc_node(ubq_size, GFP_KERNEL, numa_node); + ubq = kvzalloc_node(struct_size(ubq, ios, depth), GFP_KERNEL, + numa_node); if (!ubq) return -ENOMEM; From 0123bb91f464487cc1bbdcc515757dc723496a39 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Nov 2025 21:31:19 +0800 Subject: [PATCH 028/162] selftests: ublk: set CPU affinity before thread initialization Move ublk_thread_set_sched_affinity() call before ublk_thread_init() to ensure memory allocations during thread initialization occur on the correct NUMA node. This leverages Linux's first-touch memory policy for better NUMA locality. Also convert ublk_thread_set_sched_affinity() to use pthread_setaffinity_np() instead of sched_setaffinity(), as the pthread API is the proper interface for setting thread affinity in multithreaded programs. 
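The ordering matters because of the kernel's default first-touch NUMA policy: anonymous pages are placed on the node of the CPU that first faults them in, so the thread must already be pinned before its working memory is first written. A stand-alone sketch of the idea (hypothetical worker, not the selftest code):

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include <stdlib.h>
    #include <string.h>

    static void *worker(void *arg)
    {
            cpu_set_t *cpuset = arg;

            /* Pin first, before any significant allocation. */
            pthread_setaffinity_np(pthread_self(), sizeof(*cpuset), cpuset);

            /*
             * Allocated and first touched after pinning, so these pages
             * land on the local NUMA node under the first-touch policy.
             */
            char *buf = malloc(1 << 20);
            if (buf) {
                    memset(buf, 0, 1 << 20);  /* first touch happens here */
                    /* ... I/O loop using buf ... */
                    free(buf);
            }
            return NULL;
    }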
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 6b8123c12a7a..062537ab8976 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -839,7 +839,7 @@ static int ublk_process_io(struct ublk_thread *t) static void ublk_thread_set_sched_affinity(const struct ublk_thread *t, cpu_set_t *cpuset) { - if (sched_setaffinity(0, sizeof(*cpuset), cpuset) < 0) + if (pthread_setaffinity_np(pthread_self(), sizeof(*cpuset), cpuset) < 0) ublk_err("ublk dev %u thread %u set affinity failed", t->dev->dev_info.dev_id, t->idx); } @@ -862,15 +862,21 @@ static void *ublk_io_handler_fn(void *data) t->dev = info->dev; t->idx = info->idx; + /* + * IO perf is sensitive with queue pthread affinity on NUMA machine + * + * Set sched_affinity at beginning, so following allocated memory/pages + * could be CPU/NUMA aware. + */ + if (info->affinity) + ublk_thread_set_sched_affinity(t, info->affinity); + ret = ublk_thread_init(t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", dev_id, t->idx); return NULL; } - /* IO perf is sensitive with queue pthread affinity on NUMA machine*/ - if (info->affinity) - ublk_thread_set_sched_affinity(t, info->affinity); sem_post(info->ready); ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", From 3f5b1169d2ab20ed14271654799121232b9eb9d7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 1 Nov 2025 21:31:20 +0800 Subject: [PATCH 029/162] selftests: ublk: make ublk_thread thread-local variable Refactor ublk_thread to be a thread-local variable instead of storing it in ublk_dev: - Remove pthread_t thread field from struct ublk_thread and move it to struct ublk_thread_info - Remove struct ublk_thread array from struct ublk_dev, reducing memory footprint - Define struct ublk_thread as local variable in __ublk_io_handler_fn() instead of accessing it from dev->threads[] - Extract main IO handling logic into __ublk_io_handler_fn() which is marked as noinline - Move CPU affinity setup to ublk_io_handler_fn() before calling __ublk_io_handler_fn() - Update ublk_thread_set_sched_affinity() to take struct ublk_thread_info * instead of struct ublk_thread *, and use pthread_setaffinity_np() instead of sched_setaffinity() - Reorder struct ublk_thread fields to group related state together This change makes each thread's ublk_thread structure truly local to the thread, improving cache locality and reducing memory usage. 
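Keeping the per-thread state on the thread's own stack, rather than in an array inside the shared device structure, is what makes it NUMA-local for free: the stack pages are faulted in by the thread itself after its affinity is set. A minimal sketch of the resulting shape (hypothetical names, not the kublk code):

    static void *io_handler_fn(void *arg)
    {
            struct thread_info *info = arg;
            /* Stack-allocated: first touched by this thread, hence local. */
            struct worker ctx = {
                    .dev = info->dev,
                    .idx = info->idx,
            };

            if (worker_init(&ctx) == 0) {
                    worker_run(&ctx);
                    worker_deinit(&ctx);
            }
            return NULL;
    }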
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 82 +++++++++++++++------------- tools/testing/selftests/ublk/kublk.h | 9 +-- 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 062537ab8976..f8fa102a627f 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -836,31 +836,58 @@ static int ublk_process_io(struct ublk_thread *t) return reapped; } -static void ublk_thread_set_sched_affinity(const struct ublk_thread *t, - cpu_set_t *cpuset) -{ - if (pthread_setaffinity_np(pthread_self(), sizeof(*cpuset), cpuset) < 0) - ublk_err("ublk dev %u thread %u set affinity failed", - t->dev->dev_info.dev_id, t->idx); -} - struct ublk_thread_info { struct ublk_dev *dev; + pthread_t thread; unsigned idx; sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; }; -static void *ublk_io_handler_fn(void *data) +static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) { - struct ublk_thread_info *info = data; - struct ublk_thread *t = &info->dev->threads[info->idx]; + if (pthread_setaffinity_np(pthread_self(), sizeof(*info->affinity), info->affinity) < 0) + ublk_err("ublk dev %u thread %u set affinity failed", + info->dev->dev_info.dev_id, info->idx); +} + +static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) +{ + struct ublk_thread t = { + .dev = info->dev, + .idx = info->idx, + }; int dev_id = info->dev->dev_info.dev_id; int ret; - t->dev = info->dev; - t->idx = info->idx; + ret = ublk_thread_init(&t, info->extra_flags); + if (ret) { + ublk_err("ublk dev %d thread %u init failed\n", + dev_id, t.idx); + return ret; + } + sem_post(info->ready); + + ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", + gettid(), dev_id, t.idx); + + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(&t); + do { + if (ublk_process_io(&t) < 0) + break; + } while (1); + + ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n", + gettid(), dev_id, t.idx); + ublk_thread_deinit(&t); + return 0; +} + +static void *ublk_io_handler_fn(void *data) +{ + struct ublk_thread_info *info = data; /* * IO perf is sensitive with queue pthread affinity on NUMA machine @@ -869,29 +896,10 @@ static void *ublk_io_handler_fn(void *data) * could be CPU/NUMA aware. 
*/ if (info->affinity) - ublk_thread_set_sched_affinity(t, info->affinity); + ublk_thread_set_sched_affinity(info); - ret = ublk_thread_init(t, info->extra_flags); - if (ret) { - ublk_err("ublk dev %d thread %u init failed\n", - dev_id, t->idx); - return NULL; - } - sem_post(info->ready); + __ublk_io_handler_fn(info); - ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", - gettid(), dev_id, t->idx); - - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(t); - do { - if (ublk_process_io(t) < 0) - break; - } while (1); - - ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %d exiting\n", - gettid(), dev_id, t->idx); - ublk_thread_deinit(t); return NULL; } @@ -989,14 +997,13 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) */ if (dev->nthreads == dinfo->nr_hw_queues) tinfo[i].affinity = &affinity_buf[i]; - pthread_create(&dev->threads[i].thread, NULL, + pthread_create(&tinfo[i].thread, NULL, ublk_io_handler_fn, &tinfo[i]); } for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); - free(tinfo); free(affinity_buf); /* everything is fine now, start us */ @@ -1019,7 +1026,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) /* wait until we are terminated */ for (i = 0; i < dev->nthreads; i++) - pthread_join(dev->threads[i].thread, &thread_ret); + pthread_join(tinfo[i].thread, &thread_ret); + free(tinfo); fail: for (i = 0; i < dinfo->nr_hw_queues; i++) ublk_queue_deinit(&dev->q[i]); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 5e55484fb0aa..fe42705c6d42 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -175,23 +175,20 @@ struct ublk_queue { struct ublk_thread { struct ublk_dev *dev; - struct io_uring ring; - unsigned int cmd_inflight; - unsigned int io_inflight; - - pthread_t thread; unsigned idx; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) unsigned state; + unsigned int cmd_inflight; + unsigned int io_inflight; + struct io_uring ring; }; struct ublk_dev { struct ublk_tgt tgt; struct ublksrv_ctrl_dev_info dev_info; struct ublk_queue q[UBLK_MAX_QUEUES]; - struct ublk_thread threads[UBLK_MAX_THREADS]; unsigned nthreads; unsigned per_io_tasks; From eef09f742be2a89126742f9f6f6a0d5d7c83cba8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Nov 2025 05:16:44 -0500 Subject: [PATCH 030/162] block: blocking mempool_alloc doesn't fail So remove the error check for it in bio_integrity_prep. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 687952f63bbb..2f4a244749ac 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -158,8 +158,6 @@ bool bio_integrity_prep(struct bio *bio) if (!buf) goto err_end_io; bid = mempool_alloc(&bid_pool, GFP_NOIO); - if (!bid) - goto err_free_buf; bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); bid->bio = bio; @@ -187,8 +185,6 @@ bool bio_integrity_prep(struct bio *bio) bid->saved_bio_iter = bio->bi_iter; return true; -err_free_buf: - kfree(buf); err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); From ec7f31b2a2d3bf6b9e4d4b8cd156587f1d0607d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Nov 2025 05:16:45 -0500 Subject: [PATCH 031/162] block: make bio auto-integrity deadlock safe The current block layer automatic integrity protection allocates the actual integrity buffer, which has three problems: - because it happens at the bottom of the I/O stack and doesn't use a mempool it can deadlock under load - because the data size in a bio is almost unbounded when using large folios it can relatively easily exceed the maximum kmalloc size - even when it does not exceed the maximum kmalloc size, it could exceed the maximum segment size of the device Fix this by limiting the I/O size so that we can allocate at least a 2MiB integrity buffer, i.e. 128MiB for 8 byte PI and 512 byte integrity intervals, and create a mempool as a last resort for this maximum size, mirroring the scheme used for bvecs. As a nice upside none of this can fail now, so we remove the error handling and open code the trivial addition of the bip vec. The new allocation helpers sit outside of bio-integrity-auto.c because I plan to reuse them for file system based PI in the near future. Fixes: 7ba1ba12eeef ("block: Block layer data integrity support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K.
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 22 +++------------- block/bio-integrity.c | 48 +++++++++++++++++++++++++++++++++++ block/blk-settings.c | 21 +++++++++++++++ include/linux/bio-integrity.h | 6 +++++ include/linux/blk-integrity.h | 5 ++++ 5 files changed, 83 insertions(+), 19 deletions(-) diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 2f4a244749ac..9850c338548d 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -29,7 +29,7 @@ static void bio_integrity_finish(struct bio_integrity_data *bid) { bid->bio->bi_integrity = NULL; bid->bio->bi_opf &= ~REQ_INTEGRITY; - kfree(bvec_virt(bid->bip.bip_vec)); + bio_integrity_free_buf(&bid->bip); mempool_free(bid, &bid_pool); } @@ -110,8 +110,6 @@ bool bio_integrity_prep(struct bio *bio) struct bio_integrity_data *bid; bool set_flags = true; gfp_t gfp = GFP_NOIO; - unsigned int len; - void *buf; if (!bi) return true; @@ -152,17 +150,12 @@ bool bio_integrity_prep(struct bio *bio) if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return true; - /* Allocate kernel buffer for protection data */ - len = bio_integrity_bytes(bi, bio_sectors(bio)); - buf = kmalloc(len, gfp); - if (!buf) - goto err_end_io; bid = mempool_alloc(&bid_pool, GFP_NOIO); bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); - bid->bio = bio; - bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; + bio_integrity_alloc_buf(bio, gfp & __GFP_ZERO); + bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); if (set_flags) { @@ -174,21 +167,12 @@ bool bio_integrity_prep(struct bio *bio) bid->bip.bip_flags |= BIP_CHECK_REFTAG; } - if (bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)) < len) - goto err_end_io; - /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; return true; - -err_end_io: - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return false; } EXPORT_SYMBOL(bio_integrity_prep); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bed26f1ec869..09eeaf6e74b8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -14,6 +14,45 @@ struct bio_integrity_alloc { struct bio_vec bvecs[]; }; +static mempool_t integrity_buf_pool; + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + unsigned int len = bio_integrity_bytes(bi, bio_sectors(bio)); + gfp_t gfp = GFP_NOIO | (zero_buffer ? 
__GFP_ZERO : 0); + void *buf; + + buf = kmalloc(len, (gfp & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN); + if (unlikely(!buf)) { + struct page *page; + + page = mempool_alloc(&integrity_buf_pool, GFP_NOFS); + if (zero_buffer) + memset(page_address(page), 0, len); + bvec_set_page(&bip->bip_vec[0], page, len, 0); + bip->bip_flags |= BIP_MEMPOOL; + } else { + bvec_set_page(&bip->bip_vec[0], virt_to_page(buf), len, + offset_in_page(buf)); + } + + bip->bip_vcnt = 1; + bip->bip_iter.bi_size = len; +} + +void bio_integrity_free_buf(struct bio_integrity_payload *bip) +{ + struct bio_vec *bv = &bip->bip_vec[0]; + + if (bip->bip_flags & BIP_MEMPOOL) + mempool_free(bv->bv_page, &integrity_buf_pool); + else + kfree(bvec_virt(bv)); +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed @@ -438,3 +477,12 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, return 0; } + +static int __init bio_integrity_initfn(void) +{ + if (mempool_init_page_pool(&integrity_buf_pool, BIO_POOL_SIZE, + get_order(BLK_INTEGRITY_MAX_SIZE))) + panic("bio: can't create integrity buf pool\n"); + return 0; +} +subsys_initcall(bio_integrity_initfn); diff --git a/block/blk-settings.c b/block/blk-settings.c index 345b6a271cc3..e0d0b035f39d 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -123,6 +123,19 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } +/* + * Maximum size of I/O that needs a block layer integrity buffer. Limited + * by the number of intervals for which we can fit the integrity buffer into + * the buffer size. Because the buffer is a single segment it is also limited + * by the maximum segment size. + */ +static inline unsigned int max_integrity_io_size(struct queue_limits *lim) +{ + return min_t(unsigned int, lim->max_segment_size, + (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << + lim->integrity.interval_exp); +} + static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; @@ -184,6 +197,14 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) if (!bi->interval_exp) bi->interval_exp = ilog2(lim->logical_block_size); + /* + * The block layer automatically adds integrity data for bios that don't + * already have it. Limit the I/O size so that a single maximum size + * metadata segment can cover the integrity data for the entire I/O. 
+ */ + lim->max_sectors = min(lim->max_sectors, + max_integrity_io_size(lim) >> SECTOR_SHIFT); + return 0; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 851254f36eb3..3d05296a5afe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -14,6 +14,8 @@ enum bip_flags { BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_P2P_DMA = 1 << 8, /* using P2P address */ + + BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { @@ -140,4 +142,8 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); +void bio_integrity_free_buf(struct bio_integrity_payload *bip); + #endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index b659373788f6..c2030fd8ba0a 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,6 +8,11 @@ struct request; +/* + * Maximum contiguous integrity buffer allocation. + */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, From efae226c2ef19528ffd81d29ba0eecf1b0896ca2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:35 +0900 Subject: [PATCH 032/162] block: handle zone management operations completions The functions blk_zone_wplug_handle_reset_or_finish() and blk_zone_wplug_handle_reset_all() both modify the zone write pointer offset of zone write plugs that are the target of a reset, reset all or finish zone management operation. However, these functions do this modification before the BIO is executed. So if the zone operation fails, the modified zone write pointer offsets become invalid. Avoid this by modifying the zone write pointer offset of a zone write plug that is the target of a zone management operation when the operation completes. To do so, modify blk_zone_bio_endio() to call the new function blk_zone_mgmt_bio_endio() which in turn calls the functions blk_zone_reset_all_bio_endio(), blk_zone_reset_bio_endio() or blk_zone_finish_bio_endio() depending on the operation of the completed BIO, to modify a zone write plug write pointer offset accordingly. These functions are called only if the BIO execution was successful. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 139 ++++++++++++++++++++++++++++++---------------- block/blk.h | 14 +++++ 2 files changed, 104 insertions(+), 49 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 5e2a5788dc3b..1621e8f78338 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -71,6 +71,11 @@ struct blk_zone_wplug { struct gendisk *disk; }; +static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) +{ + return 1U << disk->zone_wplugs_hash_bits; +} + /* * Zone write plug flags bits: * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, @@ -698,71 +703,91 @@ static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) disk_report_zones_cb, &args); } -static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, - unsigned int wp_offset) +static void blk_zone_reset_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - sector_t sector = bio->bi_iter.bi_sector; struct blk_zone_wplug *zwplug; - unsigned long flags; - - /* Conventional zones cannot be reset nor finished. */ - if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { - bio_io_error(bio); - return true; - } /* - * No-wait reset or finish BIOs do not make much sense as the callers - * issue these as blocking operations in most cases. To avoid issues - * the BIO execution potentially failing with BLK_STS_AGAIN, warn about - * REQ_NOWAIT being set and ignore that flag. - */ - if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) - bio->bi_opf &= ~REQ_NOWAIT; - - /* - * If we have a zone write plug, set its write pointer offset to 0 - * (reset case) or to the zone size (finish case). This will abort all - * BIOs plugged for the target zone. It is fine as resetting or - * finishing zones while writes are still in-flight will result in the + * If we have a zone write plug, set its write pointer offset to 0. + * This will abort all BIOs plugged for the target zone. It is fine as + * resetting zones while writes are still in-flight will result in the * writes failing anyway. */ - zwplug = disk_get_zone_wplug(disk, sector); + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); if (zwplug) { + unsigned long flags; + spin_lock_irqsave(&zwplug->lock, flags); - disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); } - - return false; } -static bool blk_zone_wplug_handle_reset_all(struct bio *bio) +static void blk_zone_reset_all_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug; unsigned long flags; - sector_t sector; + unsigned int i; - /* - * Set the write pointer offset of all zone write plugs to 0. This will - * abort all plugged BIOs. It is fine as resetting zones while writes - * are still in-flight will result in the writes failing anyway. - */ - for (sector = 0; sector < get_capacity(disk); - sector += disk->queue->limits.chunk_sectors) { - zwplug = disk_get_zone_wplug(disk, sector); - if (zwplug) { + /* Update the condition of all zone write plugs. 
*/ + rcu_read_lock(); + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], + node) { spin_lock_irqsave(&zwplug->lock, flags); disk_zone_wplug_set_wp_offset(disk, zwplug, 0); spin_unlock_irqrestore(&zwplug->lock, flags); - disk_put_zone_wplug(zwplug); } } + rcu_read_unlock(); +} - return false; +static void blk_zone_finish_bio_endio(struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct gendisk *disk = bdev->bd_disk; + struct blk_zone_wplug *zwplug; + + /* + * If we have a zone write plug, set its write pointer offset to the + * zone size. This will abort all BIOs plugged for the target zone. It + * is fine as resetting zones while writes are still in-flight will + * result in the writes failing anyway. + */ + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + if (zwplug) { + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_wp_offset(disk, zwplug, + bdev_zone_sectors(bdev)); + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + } +} + +void blk_zone_mgmt_bio_endio(struct bio *bio) +{ + /* If the BIO failed, we have nothing to do. */ + if (bio->bi_status != BLK_STS_OK) + return; + + switch (bio_op(bio)) { + case REQ_OP_ZONE_RESET: + blk_zone_reset_bio_endio(bio); + return; + case REQ_OP_ZONE_RESET_ALL: + blk_zone_reset_all_bio_endio(bio); + return; + case REQ_OP_ZONE_FINISH: + blk_zone_finish_bio_endio(bio); + return; + default: + return; + } } static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, @@ -1106,6 +1131,30 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) disk_put_zone_wplug(zwplug); } +static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio) +{ + if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL && + !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + /* + * Zone reset and zone finish operations do not apply to + * conventional zones. + */ + bio_io_error(bio); + return true; + } + + /* + * No-wait zone management BIOs do not make much sense as the callers + * issue these as blocking operations in most cases. To avoid issues + * with the BIO execution potentially failing with BLK_STS_AGAIN, warn + * about REQ_NOWAIT being set and ignore that flag. 
+ */ + if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) + bio->bi_opf &= ~REQ_NOWAIT; + + return false; +} + /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted @@ -1153,12 +1202,9 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) case REQ_OP_WRITE_ZEROES: return blk_zone_wplug_handle_write(bio, nr_segs); case REQ_OP_ZONE_RESET: - return blk_zone_wplug_handle_reset_or_finish(bio, 0); case REQ_OP_ZONE_FINISH: - return blk_zone_wplug_handle_reset_or_finish(bio, - bdev_zone_sectors(bdev)); case REQ_OP_ZONE_RESET_ALL: - return blk_zone_wplug_handle_reset_all(bio); + return blk_zone_wplug_handle_zone_mgmt(bio); default: return false; } @@ -1332,11 +1378,6 @@ put_zwplug: disk_put_zone_wplug(zwplug); } -static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) -{ - return 1U << disk->zone_wplugs_hash_bits; -} - void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_lock);
diff --git a/block/blk.h b/block/blk.h index 32a10024efba..4d809588b771 100644 --- a/block/blk.h +++ b/block/blk.h @@ -489,9 +489,23 @@ static inline bool blk_req_bio_is_zone_append(struct request *rq, void blk_zone_write_plug_bio_merged(struct bio *bio); void blk_zone_write_plug_init_request(struct request *rq); void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio); +void blk_zone_mgmt_bio_endio(struct bio *bio); void blk_zone_write_plug_bio_endio(struct bio *bio); static inline void blk_zone_bio_endio(struct bio *bio) { + /* + * Zone management BIOs may impact zone write plugs (e.g. a zone reset + * changes a zone write plug zone write pointer offset), but these + * operations do not go through zone write plugging as they may operate + * on zones that do not have a zone write + * plug. blk_zone_mgmt_bio_endio() handles the potential changes to zone + * write plugs that are present. + */ + if (op_is_zone_mgmt(bio_op(bio))) { + blk_zone_mgmt_bio_endio(bio); + return; + } + /* * For write BIOs to zoned devices, signal the completion of the BIO so * that the next write BIO can be submitted by zone write plugging.
From bba4322e3f303b2d656e748be758320b567f046f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:36 +0900 Subject: [PATCH 033/162] block: freeze queue when updating zone resources
Modify disk_update_zone_resources() to freeze the device queue before updating the number of zones, zone capacity and other zone-related resources. The locking order resulting from the call to queue_limits_commit_update_frozen() is preserved, that is, the queue limits lock is first taken by calling queue_limits_start_update() before freezing the queue, and the queue is unfrozen after executing queue_limits_commit_update(), which replaces the call to queue_limits_commit_update_frozen(). This change ensures that there are no in-flight I/Os when the zone resources are updated due to a zone revalidation. In case of error when the limits are applied, directly call disk_free_zone_resources() from disk_update_zone_resources() while the disk queue is still frozen to avoid needing to freeze & unfreeze the queue again in blk_revalidate_disk_zones(), thus simplifying that function's code a little. Fixes: 0b83c86b444a ("block: Prevent potential deadlock in blk_revalidate_disk_zones()") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1621e8f78338..39381f2b2e94 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1557,8 +1557,13 @@ static int disk_update_zone_resources(struct gendisk *disk, { struct request_queue *q = disk->queue; unsigned int nr_seq_zones, nr_conv_zones; - unsigned int pool_size; + unsigned int pool_size, memflags; struct queue_limits lim; + int ret = 0; + + lim = queue_limits_start_update(q); + + memflags = blk_mq_freeze_queue(q); disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; @@ -1568,11 +1573,10 @@ static int disk_update_zone_resources(struct gendisk *disk, if (nr_conv_zones >= disk->nr_zones) { pr_warn("%s: Invalid number of conventional zones %u / %u\n", disk->disk_name, nr_conv_zones, disk->nr_zones); - return -ENODEV; + ret = -ENODEV; + goto unfreeze; } - lim = queue_limits_start_update(q); - /* * Some devices can advertize zone resource limits that are larger than * the number of sequential zones of the zoned block device, e.g. a @@ -1609,7 +1613,15 @@ static int disk_update_zone_resources(struct gendisk *disk, } commit: - return queue_limits_commit_update_frozen(q, &lim); + ret = queue_limits_commit_update(q, &lim); + +unfreeze: + if (ret) + disk_free_zone_resources(disk); + + blk_mq_unfreeze_queue(q, memflags); + + return ret; } static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, @@ -1774,7 +1786,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) sector_t zone_sectors = q->limits.chunk_sectors; sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; - unsigned int noio_flag; + unsigned int memflags, noio_flag; int ret = -ENOMEM; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) @@ -1824,20 +1836,14 @@ int blk_revalidate_disk_zones(struct gendisk *disk) ret = -ENODEV; } - /* - * Set the new disk zone parameters only once the queue is frozen and - * all I/Os are completed. - */ if (ret > 0) - ret = disk_update_zone_resources(disk, &args); - else - pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - if (ret) { - unsigned int memflags = blk_mq_freeze_queue(q); + return disk_update_zone_resources(disk, &args); - disk_free_zone_resources(disk); - blk_mq_unfreeze_queue(q, memflags); - } + pr_warn("%s: failed to revalidate zones\n", disk->disk_name); + + memflags = blk_mq_freeze_queue(q); + disk_free_zone_resources(disk); + blk_mq_unfreeze_queue(q, memflags); return ret; } From e8ecb21f081fe0cab33dc20cbe65ccbbfe615c15 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:37 +0900 Subject: [PATCH 034/162] block: cleanup blkdev_report_zones() The variable capacity is used only in one place and so can be removed and get_capacity(disk) used directly instead. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 39381f2b2e94..345a99c0b031 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -161,7 +161,6 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct gendisk *disk = bdev->bd_disk; - sector_t capacity = get_capacity(disk); struct disk_report_zones_cb_args args = { .disk = disk, .user_cb = cb, @@ -171,7 +170,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; - if (!nr_zones || sector >= capacity) + if (!nr_zones || sector >= get_capacity(disk)) return 0; return disk->fops->report_zones(disk, sector, nr_zones,
From fdb9aed869f34d776298b3a8197909eb820e4d0d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:38 +0900 Subject: [PATCH 035/162] block: introduce disk_report_zone()
Commit b76b840fd933 ("dm: Fix dm-zoned-reclaim zone write pointer alignment") introduced an indirect call for the callback function of a zone report executed with blkdev_report_zones(). This is necessary so that the function disk_zone_wplug_sync_wp_offset() can be called to refresh a zone write plug zone write pointer offset after a write error. However, this solution makes following the path of zone information harder to understand. Clean this up by introducing the new blk_report_zones_args structure to define a zone report callback and its private data and introduce the helper function disk_report_zone() which calls both disk_zone_wplug_sync_wp_offset() and the zone report user callback function for all zones of a zone report. This helper function must be called by all block device drivers that implement the report zones block operation in order to correctly report zone information. All block device drivers supporting the report_zones block operation are updated to use this new scheme. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 79 ++++++++++++++++--------------- drivers/block/null_blk/null_blk.h | 3 +- drivers/block/null_blk/zoned.c | 4 +- drivers/block/ublk_drv.c | 4 +- drivers/block/virtio_blk.c | 11 +++-- drivers/block/zloop.c | 4 +- drivers/md/dm-zone.c | 54 +++++++++++---------- drivers/md/dm.h | 3 +- drivers/nvme/host/core.c | 5 +- drivers/nvme/host/multipath.c | 4 +- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/zns.c | 10 ++-- drivers/scsi/sd.h | 2 +- drivers/scsi/sd_zbc.c | 20 +++----- include/linux/blkdev.h | 7 ++- include/linux/device-mapper.h | 10 +++- 16 files changed, 120 insertions(+), 102 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 345a99c0b031..de3524c17f67 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -114,30 +114,16 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -struct disk_report_zones_cb_args { - struct gendisk *disk; - report_zones_cb user_cb; - void *user_data; +/* + * Zone report arguments for block device drivers report_zones operation. + * @cb: report_zones_cb callback for each reported zone. + * @data: Private data passed to report_zones_cb.
+ */ +struct blk_report_zones_args { + report_zones_cb cb; + void *data; }; -static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, - struct blk_zone *zone); - -static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - struct disk_report_zones_cb_args *args = data; - struct gendisk *disk = args->disk; - - if (disk->zone_wplugs_hash) - disk_zone_wplug_sync_wp_offset(disk, zone); - - if (!args->user_cb) - return 0; - - return args->user_cb(zone, idx, args->user_data); -} - /** * blkdev_report_zones - Get zones information * @bdev: Target block device @@ -161,10 +147,9 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct gendisk *disk = bdev->bd_disk; - struct disk_report_zones_cb_args args = { - .disk = disk, - .user_cb = cb, - .user_data = data, + struct blk_report_zones_args args = { + .cb = cb, + .data = data, }; if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) @@ -173,8 +158,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, if (!nr_zones || sector >= get_capacity(disk)) return 0; - return disk->fops->report_zones(disk, sector, nr_zones, - disk_report_zones_cb, &args); + return disk->fops->report_zones(disk, sector, nr_zones, &args); } EXPORT_SYMBOL_GPL(blkdev_report_zones); @@ -692,15 +676,32 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, disk_put_zone_wplug(zwplug); } -static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector) +/** + * disk_report_zone - Report one zone + * @disk: Target disk + * @zone: The zone to report + * @idx: The index of the zone in the overall zone report + * @args: report zones callback and data + * + * Description: + * Helper function for block device drivers to report one zone of a zone + * report initiated with blkdev_report_zones(). The zone being reported is + * specified by @zone and used to update, if necessary, the zone write plug + * information for the zone. If @args specifies a user callback function, + * this callback is executed. 
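+ * + * Return: 0 if @args is NULL or does not define a callback, otherwise the + * value returned by the @args->cb callback.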
+ */ +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args) { - struct disk_report_zones_cb_args args = { - .disk = disk, - }; + if (disk->zone_wplugs_hash) + disk_zone_wplug_sync_wp_offset(disk, zone); - return disk->fops->report_zones(disk, sector, 1, - disk_report_zones_cb, &args); + if (args && args->cb) + return args->cb(zone, idx, args->data); + + return 0; } +EXPORT_SYMBOL_GPL(disk_report_zone); static void blk_zone_reset_bio_endio(struct bio *bio) { @@ -1786,6 +1787,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk) sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; unsigned int memflags, noio_flag; + struct blk_report_zones_args rep_args = { + .cb = blk_revalidate_zone_cb, + .data = &args, + }; int ret = -ENOMEM; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) @@ -1817,8 +1822,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return ret; } - ret = disk->fops->report_zones(disk, 0, UINT_MAX, - blk_revalidate_zone_cb, &args); + ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args); if (!ret) { pr_warn("%s: No zones reported\n", disk->disk_name); ret = -ENODEV; @@ -1863,6 +1867,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask) { + struct gendisk *disk = bdev->bd_disk; int ret; if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) @@ -1878,7 +1883,7 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, * pointer. Undo this using a report zone to update the zone write * pointer to the correct current value. */ - ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); + ret = disk->fops->report_zones(disk, sector, 1, NULL); if (ret != 1) return ret < 0 ? 
ret : -EIO; diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 7bb6128dbaaf..6c4c4bbe7dad 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -143,7 +143,8 @@ int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); int null_register_zoned_dev(struct nullb *nullb); void null_free_zoned_dev(struct nullb_device *dev); int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors); size_t null_zone_valid_read_len(struct nullb *nullb, diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 4e5728f45989..6a93b12a06ff 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -191,7 +191,7 @@ void null_free_zoned_dev(struct nullb_device *dev) } int null_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; @@ -225,7 +225,7 @@ int null_report_zones(struct gendisk *disk, sector_t sector, blkz.capacity = zone->capacity; null_unlock_zone(dev, zone); - error = cb(&blkz, i, data); + error = disk_report_zone(disk, &blkz, i, args); if (error) return error; } diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 96e07763cd28..97cc4bc0a6ce 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -367,7 +367,7 @@ static void *ublk_alloc_report_buffer(struct ublk_device *ublk, } static int ublk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct ublk_device *ub = disk->private_data; unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors; @@ -430,7 +430,7 @@ free_req: if (!zone->len) break; - ret = cb(zone, i, data); + ret = disk_report_zone(disk, zone, i, args); if (ret) goto out; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index f061420dfb10..a5e97f03dbf0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -584,7 +584,8 @@ out: static int virtblk_parse_zone(struct virtio_blk *vblk, struct virtio_blk_zone_descriptor *entry, - unsigned int idx, report_zones_cb cb, void *data) + unsigned int idx, + struct blk_report_zones_args *args) { struct blk_zone zone = { }; @@ -650,12 +651,12 @@ static int virtblk_parse_zone(struct virtio_blk *vblk, * The callback below checks the validity of the reported * entry data, no need to further validate it here. 
*/ - return cb(&zone, idx, data); + return disk_report_zone(vblk->disk, &zone, idx, args); } static int virtblk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, - void *data) + unsigned int nr_zones, + struct blk_report_zones_args *args) { struct virtio_blk *vblk = disk->private_data; struct virtio_blk_zone_report *report; @@ -693,7 +694,7 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector, for (i = 0; i < nz && zone_idx < nr_zones; i++) { ret = virtblk_parse_zone(vblk, &report->zones[i], - zone_idx, cb, data); + zone_idx, args); if (ret) goto fail_report; diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index a423228e201b..92be9f0af00a 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -647,7 +647,7 @@ static int zloop_open(struct gendisk *disk, blk_mode_t mode) } static int zloop_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct zloop_device *zlo = disk->private_data; struct blk_zone blkz = {}; @@ -687,7 +687,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector, mutex_unlock(&zone->lock); - ret = cb(&blkz, i, data); + ret = disk_report_zone(disk, &blkz, i, args); if (ret) return ret; } diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 78e17dd4d01b..984fb621b0e9 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -17,33 +17,26 @@ * For internal zone reports bypassing the top BIO submission path. */ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, - sector_t sector, unsigned int nr_zones, - report_zones_cb cb, void *data) + unsigned int nr_zones, + struct dm_report_zones_args *args) { - struct gendisk *disk = md->disk; - int ret; - struct dm_report_zones_args args = { - .next_sector = sector, - .orig_data = data, - .orig_cb = cb, - }; - do { struct dm_target *tgt; + int ret; - tgt = dm_table_find_target(t, args.next_sector); + tgt = dm_table_find_target(t, args->next_sector); if (WARN_ON_ONCE(!tgt->type->report_zones)) return -EIO; - args.tgt = tgt; - ret = tgt->type->report_zones(tgt, &args, - nr_zones - args.zone_idx); + args->tgt = tgt; + ret = tgt->type->report_zones(tgt, args, + nr_zones - args->zone_idx); if (ret < 0) return ret; - } while (args.zone_idx < nr_zones && - args.next_sector < get_capacity(disk)); + } while (args->zone_idx < nr_zones && + args->next_sector < get_capacity(md->disk)); - return args.zone_idx; + return args->zone_idx; } /* @@ -52,7 +45,8 @@ static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t, * generally implemented by targets using dm_report_zones(). 
*/ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, + struct blk_report_zones_args *args) { struct mapped_device *md = disk->private_data; struct dm_table *map; @@ -76,9 +70,14 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, map = zone_revalidate_map; } - if (map) - ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, - data); + if (map) { + struct dm_report_zones_args dm_args = { + .disk = md->disk, + .next_sector = sector, + .rep_args = args, + }; + ret = dm_blk_do_report_zones(md, map, nr_zones, &dm_args); + } if (put_table) dm_put_live_table(md, srcu_idx); @@ -113,7 +112,9 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, } args->next_sector = zone->start + zone->len; - return args->orig_cb(zone, args->zone_idx++, args->orig_data); + + return disk_report_zone(args->disk, zone, args->zone_idx++, + args->rep_args); } /* @@ -492,10 +493,15 @@ int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset) { + struct dm_report_zones_args args = { + .disk = md->disk, + .next_sector = sector, + .cb = dm_zone_need_reset_cb, + .data = need_reset, + }; int ret; - ret = dm_blk_do_report_zones(md, t, sector, nr_zones, - dm_zone_need_reset_cb, need_reset); + ret = dm_blk_do_report_zones(md, t, nr_zones, &args); if (ret != nr_zones) { DMERR("Get %s zone reset bitmap failed\n", md->disk->disk_name); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 245f52b59215..7a795979ec72 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -109,7 +109,8 @@ void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fa4181d7de73..c0fe50fb7b08 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2599,10 +2599,9 @@ static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended) #ifdef CONFIG_BLK_DEV_ZONED static int nvme_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { - return nvme_ns_report_zones(disk->private_data, sector, nr_zones, cb, - data); + return nvme_ns_report_zones(disk->private_data, sector, nr_zones, args); } #else #define nvme_report_zones NULL diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 543e17aead12..0b7ac0735bd0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -576,7 +576,7 @@ static int nvme_ns_head_get_unique_id(struct gendisk *disk, u8 id[16], #ifdef CONFIG_BLK_DEV_ZONED static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct nvme_ns_head *head = disk->private_data; struct nvme_ns *ns; @@ -585,7 +585,7 @@ static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t 
sector, srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); if (ns) - ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + ret = nvme_ns_report_zones(ns, sector, nr_zones, args); srcu_read_unlock(&head->srcu, srcu_idx); return ret; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 102fae6a231c..928c748ccbd1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1108,7 +1108,7 @@ struct nvme_zone_info { }; int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, struct blk_report_zones_args *args); int nvme_query_zone_info(struct nvme_ns *ns, unsigned lbaf, struct nvme_zone_info *zi); void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index cce4c5b55aa9..deea2dbef5b8 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -148,8 +148,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, static int nvme_zone_parse_entry(struct nvme_ns *ns, struct nvme_zone_descriptor *entry, - unsigned int idx, report_zones_cb cb, - void *data) + unsigned int idx, + struct blk_report_zones_args *args) { struct nvme_ns_head *head = ns->head; struct blk_zone zone = { }; @@ -169,11 +169,11 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns, else zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp)); - return cb(&zone, idx, data); + return disk_report_zone(ns->disk, &zone, idx, args); } int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, struct blk_report_zones_args *args) { struct nvme_zone_report *report; struct nvme_command c = { }; @@ -213,7 +213,7 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, for (i = 0; i < nz && zone_idx < nr_zones; i++) { ret = nvme_zone_parse_entry(ns, &report->entries[i], - zone_idx, cb, data); + zone_idx, args); if (ret) goto out_free; zone_idx++; diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 36382eca941c..574af8243016 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -240,7 +240,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, struct blk_report_zones_args *args); #else /* CONFIG_BLK_DEV_ZONED */ diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index a8db66428f80..56e455fb5add 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -35,8 +35,7 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64]) * @buf: SCSI zone descriptor. * @idx: Index of the zone relative to the first zone reported by the current * sd_zbc_report_zones() call. - * @cb: Callback function pointer. - * @data: Second argument passed to @cb. + * @args: report zones arguments (callback, etc) * * Return: Value returned by @cb. * @@ -44,12 +43,11 @@ static bool sd_zbc_is_gap_zone(const u8 buf[64]) * call @cb(blk_zone, @data). 
*/ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], - unsigned int idx, report_zones_cb cb, void *data) + unsigned int idx, struct blk_report_zones_args *args) { struct scsi_device *sdp = sdkp->device; struct blk_zone zone = { 0 }; sector_t start_lba, gran; - int ret; if (WARN_ON_ONCE(sd_zbc_is_gap_zone(buf))) return -EINVAL; @@ -87,11 +85,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], else zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); - ret = cb(&zone, idx, data); - if (ret) - return ret; - - return 0; + return disk_report_zone(sdkp->disk, &zone, idx, args); } /** @@ -217,14 +211,14 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp) * @disk: Disk to report zones for. * @sector: Start sector. * @nr_zones: Maximum number of zones to report. - * @cb: Callback function called to report zone information. - * @data: Second argument passed to @cb. + * @args: Callback arguments. * * Called by the block layer to iterate over zone information. See also the * disk->fops->report_zones() calls in block/blk-zoned.c. */ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data) + unsigned int nr_zones, + struct blk_report_zones_args *args) { struct scsi_disk *sdkp = scsi_disk(disk); sector_t lba = sectors_to_logical(sdkp->device, sector); @@ -283,7 +277,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, } ret = sd_zbc_parse_report(sdkp, buf + offset, zone_idx, - cb, data); + args); if (ret) goto out; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99be263b31ab..2f75fb15f55f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; +struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; @@ -432,6 +433,9 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -1662,7 +1666,8 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 84fdc3a6a19a..38f625af6ab4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -538,12 +538,18 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); #ifdef CONFIG_BLK_DEV_ZONED struct dm_report_zones_args { struct dm_target *tgt; + struct gendisk *disk; sector_t next_sector; - void *orig_data; - report_zones_cb orig_cb; unsigned int zone_idx; + /* for block layer ->report_zones */ + struct blk_report_zones_args *rep_args; + + /* for internal users */ + report_zones_cb cb; + void *data; + /* must be filled by 
->report_zones before calling dm_report_zones_cb */ sector_t start; }; From ca1a897fb266c4b23b5ecb99fe787ed18559057d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:39 +0900 Subject: [PATCH 036/162] block: reorganize struct blk_zone_wplug Reorganize the fields of struct blk_zone_wplug to remove a hole after the wp_offset field and avoid having the bio_work structure split between 2 cache lines. No functional changes. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index de3524c17f67..d4fc87b0be6b 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -39,6 +39,11 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. + * @bio_list: The list of BIOs that are currently plugged. + * @bio_work: Work struct to handle issuing of plugged BIOs + * @rcu_head: RCU head to free zone write plugs with an RCU grace period. + * @disk: The gendisk the plug belongs to. + * @lock: Spinlock to atomically manipulate the plug. * @ref: Zone write plug reference counter. A zone write plug reference is * always at least 1 when the plug is hashed in the disk plug hash table. * The reference is incremented whenever a new BIO needing plugging is @@ -48,27 +53,22 @@ static const char *const zone_cond_name[] = { * reference is dropped whenever the zone of the zone write plug is reset, * finished and when the zone becomes full (last write BIO to the zone * completes). - * @lock: Spinlock to atomically manipulate the plug. * @flags: Flags indicating the plug state. * @zone_no: The number of the zone the plug is managing. * @wp_offset: The zone write pointer location relative to the start of the zone * as a number of 512B sectors. - * @bio_list: The list of BIOs that are currently plugged. - * @bio_work: Work struct to handle issuing of plugged BIOs - * @rcu_head: RCU head to free zone write plugs with an RCU grace period. - * @disk: The gendisk the plug belongs to. */ struct blk_zone_wplug { struct hlist_node node; - refcount_t ref; - spinlock_t lock; - unsigned int flags; - unsigned int zone_no; - unsigned int wp_offset; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; struct gendisk *disk; + spinlock_t lock; + refcount_t ref; + unsigned int flags; + unsigned int zone_no; + unsigned int wp_offset; }; static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) From 6e945ffb6555705cf20b1fcdc21a139911562995 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:40 +0900 Subject: [PATCH 037/162] block: use zone condition to determine conventional zones The conv_zones_bitmap field of struct gendisk is used to define a bitmap to identify the conventional zones of a zoned block device. The bit for a zone is set in this bitmap if the zone is a conventional one, that is, if the zone type is BLK_ZONE_TYPE_CONVENTIONAL. For such zone, this always corresponds to the zone condition BLK_ZONE_COND_NOT_WP. In other words, conv_zones_bitmap tracks a single condition of the zones of a zoned block device. 
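As an illustration of that difference, the two representations answer the question "is this zone conventional?" as follows (a minimal sketch; the helper names are illustrative and not part of this series):

    /* Old representation: one bit per zone, set for conventional zones. */
    static bool zone_is_conv_bitmap(unsigned long *conv_zones_bitmap,
                                    unsigned int zno)
    {
        return conv_zones_bitmap && test_bit(zno, conv_zones_bitmap);
    }

    /* New representation: one condition byte per zone; a conventional zone
     * is permanently in the BLK_ZONE_COND_NOT_WP condition. */
    static bool zone_is_conv_array(u8 *zones_cond, unsigned int zno)
    {
        return zones_cond && zones_cond[zno] == BLK_ZONE_COND_NOT_WP;
    }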
In preparation for tracking more zone conditions, change conv_zones_bitmap into an array of zone conditions, using 1 byte per zone. This increases the memory usage from 1 bit per zone to 1 byte per zone, that is, from 16 KiB to about 100 KiB for a 30 TB SMR HDD with 256 MiB zones. This is a trade-off to allow fast cached report zones later on top of this change. Rename the conv_zones_bitmap field of struct gendisk to zones_cond. Add a blk_revalidate_zone_cond() function to initialize the zones_cond array of a disk during device scan and to update it on device revalidation. Move the allocation of the zones_cond array to disk_revalidate_zone_resources(), making sure that this array is always allocated, even for devices that do not need zone write plugs (zone resources), to ensure that bdev_zone_is_seq() can be re-implemented to use the zone condition array in place of the conv zones bitmap. Finally, the function bdev_zone_is_seq() is rewritten to use a test on the condition of the target zone. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 153 +++++++++++++++++++++++++++++------------ include/linux/blkdev.h | 37 +++------- 2 files changed, 117 insertions(+), 73 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d4fc87b0be6b..f62862274f9a 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -114,6 +114,33 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); +/** + * bdev_zone_is_seq - check if a sector belongs to a sequential write zone + * @bdev: block device to check + * @sector: sector number + * + * Check if @sector on @bdev is contained in a sequential write required zone. + */ +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + struct gendisk *disk = bdev->bd_disk; + unsigned int zno = disk_zone_no(disk, sector); + bool is_seq = false; + u8 *zones_cond; + + if (!bdev_is_zoned(bdev)) + return false; + + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (zones_cond && zno < disk->nr_zones) + is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP; + rcu_read_unlock(); + + return is_seq; +} +EXPORT_SYMBOL_GPL(bdev_zone_is_seq); + /* * Zone report arguments for block device drivers report_zones operation. * @cb: report_zones_cb callback for each reported zone. 
@@ -1458,22 +1485,16 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) disk->zone_wplugs_hash_bits = 0; } -static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk, - unsigned long *bitmap) +static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) { - unsigned int nr_conv_zones = 0; unsigned long flags; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); - if (bitmap) - nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); - bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, - lockdep_is_held(&disk->zone_wplugs_lock)); + zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, + lockdep_is_held(&disk->zone_wplugs_lock)); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); - kfree_rcu_mightsleep(bitmap); - - return nr_conv_zones; + kfree_rcu_mightsleep(zones_cond); } void disk_free_zone_resources(struct gendisk *disk) @@ -1497,7 +1518,7 @@ void disk_free_zone_resources(struct gendisk *disk) mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; - disk_set_conv_zones_bitmap(disk, NULL); + disk_set_zones_cond_array(disk, NULL); disk->zone_capacity = 0; disk->last_zone_capacity = 0; disk->nr_zones = 0; @@ -1516,12 +1537,31 @@ static inline bool disk_need_zone_resources(struct gendisk *disk) queue_emulates_zone_append(disk->queue); } +struct blk_revalidate_zone_args { + struct gendisk *disk; + u8 *zones_cond; + unsigned int nr_zones; + unsigned int nr_conv_zones; + unsigned int zone_capacity; + unsigned int last_zone_capacity; + sector_t sector; +}; + static int disk_revalidate_zone_resources(struct gendisk *disk, - unsigned int nr_zones) + struct blk_revalidate_zone_args *args) { struct queue_limits *lim = &disk->queue->limits; unsigned int pool_size; + args->disk = disk; + args->nr_zones = + DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors); + + /* Cached zone conditions: 1 byte per zone */ + args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO); + if (!args->zones_cond) + return -ENOMEM; + if (!disk_need_zone_resources(disk)) return 0; @@ -1531,7 +1571,8 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, */ pool_size = max(lim->max_open_zones, lim->max_active_zones); if (!pool_size) - pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); + pool_size = + min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); if (!disk->zone_wplugs_hash) return disk_alloc_zone_resources(disk, pool_size); @@ -1539,15 +1580,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, return 0; } -struct blk_revalidate_zone_args { - struct gendisk *disk; - unsigned long *conv_zones_bitmap; - unsigned int nr_zones; - unsigned int zone_capacity; - unsigned int last_zone_capacity; - sector_t sector; -}; - /* * Update the disk zone resources information and device queue limits. * The disk queue is frozen when this is executed. 
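A quick check of the memory cost quoted in the commit message (assuming the stated 30 TB capacity and 256 MiB zones): nr_zones = 30 x 10^12 / (256 x 2^20) gives roughly 112,000 zones, so the kzalloc() of args->zones_cond above requests roughly 110 KiB for the condition array, versus roughly 14 KiB for the old one-bit-per-zone bitmap.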
@@ -1556,7 +1588,7 @@ static int disk_update_zone_resources(struct gendisk *disk, struct blk_revalidate_zone_args *args) { struct request_queue *q = disk->queue; - unsigned int nr_seq_zones, nr_conv_zones; + unsigned int nr_seq_zones; unsigned int pool_size, memflags; struct queue_limits lim; int ret = 0; @@ -1566,24 +1598,24 @@ static int disk_update_zone_resources(struct gendisk *disk, memflags = blk_mq_freeze_queue(q); disk->nr_zones = args->nr_zones; - disk->zone_capacity = args->zone_capacity; - disk->last_zone_capacity = args->last_zone_capacity; - nr_conv_zones = - disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); - if (nr_conv_zones >= disk->nr_zones) { + if (args->nr_conv_zones >= disk->nr_zones) { pr_warn("%s: Invalid number of conventional zones %u / %u\n", - disk->disk_name, nr_conv_zones, disk->nr_zones); + disk->disk_name, args->nr_conv_zones, disk->nr_zones); ret = -ENODEV; goto unfreeze; } + disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; + disk_set_zones_cond_array(disk, args->zones_cond); + /* - * Some devices can advertize zone resource limits that are larger than + * Some devices can advertise zone resource limits that are larger than * the number of sequential zones of the zoned block device, e.g. a * small ZNS namespace. For such case, assume that the zoned device has * no zone resource limits. */ - nr_seq_zones = disk->nr_zones - nr_conv_zones; + nr_seq_zones = disk->nr_zones - args->nr_conv_zones; if (lim.max_open_zones >= nr_seq_zones) lim.max_open_zones = 0; if (lim.max_active_zones >= nr_seq_zones) @@ -1624,6 +1656,44 @@ unfreeze: return ret; } +static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + enum blk_zone_cond cond = zone->cond; + + /* Check that the zone condition is consistent with the zone type. 
*/ + switch (cond) { + case BLK_ZONE_COND_NOT_WP: + if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) + goto invalid_condition; + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) + goto invalid_condition; + break; + default: + pr_warn("%s: Invalid zone condition 0x%X\n", + args->disk->disk_name, cond); + return -ENODEV; + } + + args->zones_cond[idx] = cond; + + return 0; + +invalid_condition: + pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n", + args->disk->disk_name, cond, zone->type); + + return -ENODEV; +} + static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, struct blk_revalidate_zone_args *args) { @@ -1638,17 +1708,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, if (disk_zone_is_last(disk, zone)) args->last_zone_capacity = zone->capacity; - if (!disk_need_zone_resources(disk)) - return 0; - - if (!args->conv_zones_bitmap) { - args->conv_zones_bitmap = - bitmap_zalloc(args->nr_zones, GFP_NOIO); - if (!args->conv_zones_bitmap) - return -ENOMEM; - } - - set_bit(idx, args->conv_zones_bitmap); + args->nr_conv_zones++; return 0; } @@ -1746,6 +1806,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + /* Check zone condition */ + ret = blk_revalidate_zone_cond(zone, idx, args); + if (ret) + return ret; + /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: @@ -1813,10 +1878,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk) * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ - args.disk = disk; - args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); - ret = disk_revalidate_zone_resources(disk, args.nr_zones); + ret = disk_revalidate_zone_resources(disk, &args); if (ret) { memalloc_noio_restore(noio_flag); return ret; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f75fb15f55f..53bcfbc2f68f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,7 +196,7 @@ struct gendisk { unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; - unsigned long __rcu *conv_zones_bitmap; + u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; @@ -925,12 +925,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev, { return disk_zone_capacity(bdev->bd_disk, pos); } + +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector); + #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } +static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + return false; +} + static inline bool bio_needs_zone_write_plugging(struct bio *bio) { return false; @@ -1533,33 +1541,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev, return bdev_is_zone_start(bdev, sector); } -/** - * bdev_zone_is_seq - check if a sector belongs to a sequential write zone - * @bdev: block device to check - * @sector: sector number - * - * Check if @sector on @bdev is contained in a sequential write required zone. 
- */ -static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) -{ - bool is_seq = false; - -#if IS_ENABLED(CONFIG_BLK_DEV_ZONED) - if (bdev_is_zoned(bdev)) { - struct gendisk *disk = bdev->bd_disk; - unsigned long *bitmap; - - rcu_read_lock(); - bitmap = rcu_dereference(disk->conv_zones_bitmap); - is_seq = !bitmap || - !test_bit(disk_zone_no(disk, sector), bitmap); - rcu_read_unlock(); - } -#endif - - return is_seq; -} - int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask);
From 0bf0e2e4666822b62d7ad6473dc37fd6b377b5f1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:41 +0900 Subject: [PATCH 038/162] block: track zone conditions
The function blk_revalidate_zone_cond() already caches the condition of all zones of a zoned block device in the zones_cond array of a gendisk. However, the zone conditions are updated only when the device is scanned or revalidated. Implement tracking of the runtime changes to zone conditions using the new cond field in struct blk_zone_wplug. The size of this structure remains 112 bytes as the new field replaces the 4 bytes of padding at the end of the structure. Because zones that do not have a zone write plug can be in the empty, implicit open, explicit open or full condition, the zones_cond array of a disk is used to track the conditions of zones that do not have a zone write plug. The condition of such a zone is updated in the disk zones_cond array when a zone reset, reset all or finish operation is executed, and also when a zone write plug is removed from the disk hash table when the zone becomes full. Since a device may automatically close an implicitly open zone when writing to an empty or closed zone if the total number of open zones has reached the device limit, the BLK_ZONE_COND_IMP_OPEN and BLK_ZONE_COND_CLOSED zone conditions cannot be precisely tracked. To overcome this, the zone condition BLK_ZONE_COND_ACTIVE is introduced to represent a zone that has the condition BLK_ZONE_COND_IMP_OPEN, BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED. This follows the definition of an active zone as defined in the NVMe Zoned Namespace specifications. As such, for a zoned device that has a limit on the maximum number of open zones, we will never have more zones in the BLK_ZONE_COND_ACTIVE condition than the device limit. This is compatible with the SCSI ZBC and ATA ZAC specifications for SMR HDDs as these devices do not have a limit on the number of active zones. The function disk_zone_wplug_set_wp_offset() is modified to use the new helper disk_zone_wplug_update_cond() to update a zone write plug condition whenever a zone write plug write pointer offset is updated on submission or merging of write BIOs to a zone. The functions blk_zone_reset_bio_endio(), blk_zone_reset_all_bio_endio() and blk_zone_finish_bio_endio() are modified to update the condition of the zones targeted by reset, reset_all and finish operations, either through disk_zone_wplug_set_wp_offset() for zones that have a zone write plug, or through the disk_zone_set_cond() helper to update the zones_cond array of the disk for zones that do not have a zone write plug. When a zone write plug is removed from the disk hash table (when the zone becomes empty or full), the condition of struct blk_zone_wplug is used to update the disk zones_cond array. Conversely, when a zone write plug is added to the disk hash table, the zones_cond array is used to initialize the zone write plug condition.
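The tracking rule described above reduces to a small folding of conditions (a sketch only; the patch implements this in blk_zone_set_cond() in the diff below):

    switch (cond) {
    case BLK_ZONE_COND_IMP_OPEN:
    case BLK_ZONE_COND_EXP_OPEN:
    case BLK_ZONE_COND_CLOSED:
        /* All open and closed states fold into one tracked state. */
        zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
        break;
    default:
        /* NOT_WP, EMPTY, FULL, READONLY and OFFLINE are stored as-is. */
        zones_cond[zno] = cond;
        break;
    }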
Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 120 ++++++++++++++++++++++++++++++++-- include/uapi/linux/blkzoned.h | 11 ++++ 2 files changed, 125 insertions(+), 6 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c index f62862274f9a..c5fa303093a9 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -33,6 +33,7 @@ static const char *const zone_cond_name[] = { ZONE_COND_NAME(READONLY), ZONE_COND_NAME(FULL), ZONE_COND_NAME(OFFLINE), + ZONE_COND_NAME(ACTIVE), }; #undef ZONE_COND_NAME @@ -57,6 +58,7 @@ * @zone_no: The number of the zone the plug is managing. * @wp_offset: The zone write pointer location relative to the start of the zone * as a number of 512B sectors. + * @cond: Condition of the zone */ struct blk_zone_wplug { struct hlist_node node; @@ -69,6 +71,7 @@ struct blk_zone_wplug { unsigned int flags; unsigned int zone_no; unsigned int wp_offset; + enum blk_zone_cond cond; }; static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) @@ -114,6 +117,57 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); +static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno, + enum blk_zone_cond cond) +{ + if (!zones_cond) + return; + + switch (cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + zones_cond[zno] = BLK_ZONE_COND_ACTIVE; + return; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + zones_cond[zno] = cond; + return; + } +} + +static void disk_zone_set_cond(struct gendisk *disk, sector_t sector, + enum blk_zone_cond cond) +{ + u8 *zones_cond; + + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (zones_cond) { + unsigned int zno = disk_zone_no(disk, sector); + + /* + * The condition of conventional, readonly and offline zones + * never changes, so do nothing if the target zone is in one of + * these conditions. + */ + switch (zones_cond[zno]) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + break; + default: + blk_zone_set_cond(zones_cond, zno, cond); + break; + } + } + rcu_read_unlock(); +} + /** * bdev_zone_is_seq - check if a sector belongs to a sequential write zone * @bdev: block device to check @@ -416,6 +470,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, { struct blk_zone_wplug *zwplg; unsigned long flags; + u8 *zones_cond; unsigned int idx = hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); @@ -431,6 +486,20 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, return false; } } + + /* + * Set the zone condition: if we do not yet have a zones_cond array + * attached to the disk, then this is a zone write plug insert from the + * first call to blk_revalidate_disk_zones(), in which case the zone is + * necessarily in the active condition.
+ */ + zones_cond = rcu_dereference_check(disk->zones_cond, + lockdep_is_held(&disk->zone_wplugs_lock)); + if (zones_cond) + zwplug->cond = zones_cond[zwplug->zone_no]; + else + zwplug->cond = BLK_ZONE_COND_ACTIVE; + hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); @@ -530,10 +599,15 @@ static void disk_remove_zone_wplug(struct gendisk *disk, /* * Mark the zone write plug as unhashed and drop the extra reference we - * took when the plug was inserted in the hash table. + * took when the plug was inserted in the hash table. Also update the + * disk zone condition array with the current condition of the zone + * write plug. */ zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, + lockdep_is_held(&disk->zone_wplugs_lock)), + zwplug->zone_no, zwplug->cond); hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); @@ -635,6 +709,22 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) blk_zone_wplug_bio_io_error(zwplug, bio); } +/* + * Update a zone write plug condition based on the write pointer offset. + */ +static void disk_zone_wplug_update_cond(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + lockdep_assert_held(&zwplug->lock); + + if (disk_zone_wplug_is_full(disk, zwplug)) + zwplug->cond = BLK_ZONE_COND_FULL; + else if (!zwplug->wp_offset) + zwplug->cond = BLK_ZONE_COND_EMPTY; + else + zwplug->cond = BLK_ZONE_COND_ACTIVE; +} + /* * Set a zone write plug write pointer offset to the specified value. * This aborts all plugged BIOs, which is fine as this function is called for @@ -650,6 +740,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, /* Update the zone write pointer and abort all plugged BIOs. */ zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; zwplug->wp_offset = wp_offset; + disk_zone_wplug_update_cond(disk, zwplug); + disk_zone_wplug_abort(zwplug); /* @@ -733,6 +825,7 @@ EXPORT_SYMBOL_GPL(disk_report_zone); static void blk_zone_reset_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; struct blk_zone_wplug *zwplug; /* @@ -741,7 +834,7 @@ static void blk_zone_reset_bio_endio(struct bio *bio) * resetting zones while writes are still in-flight will result in the * writes failing anyway. */ - zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { unsigned long flags; @@ -749,14 +842,18 @@ static void blk_zone_reset_bio_endio(struct bio *bio) disk_zone_wplug_set_wp_offset(disk, zwplug, 0); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); + } else { + disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); } } static void blk_zone_reset_all_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t capacity = get_capacity(disk); struct blk_zone_wplug *zwplug; unsigned long flags; + sector_t sector; unsigned int i; /* Update the condition of all zone write plugs. */ @@ -770,12 +867,18 @@ static void blk_zone_reset_all_bio_endio(struct bio *bio) } } rcu_read_unlock(); + + /* Update the cached zone conditions. 
*/ + for (sector = 0; sector < capacity; + sector += bdev_zone_sectors(bio->bi_bdev)) + disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); } static void blk_zone_finish_bio_endio(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct gendisk *disk = bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; struct blk_zone_wplug *zwplug; /* @@ -784,7 +887,7 @@ static void blk_zone_finish_bio_endio(struct bio *bio) * is fine as resetting zones while writes are still in-flight will * result in the writes failing anyway. */ - zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { unsigned long flags; @@ -793,6 +896,8 @@ static void blk_zone_finish_bio_endio(struct bio *bio) bdev_zone_sectors(bdev)); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); + } else { + disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL); } } @@ -888,6 +993,7 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, */ void blk_zone_write_plug_bio_merged(struct bio *bio) { + struct gendisk *disk = bio->bi_bdev->bd_disk; struct blk_zone_wplug *zwplug; unsigned long flags; @@ -909,13 +1015,13 @@ void blk_zone_write_plug_bio_merged(struct bio *bio) * have at least one request and one BIO referencing the zone write * plug. So this should not fail. */ - zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, - bio->bi_iter.bi_sector); + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); if (WARN_ON_ONCE(!zwplug)) return; spin_lock_irqsave(&zwplug->lock, flags); zwplug->wp_offset += bio_sectors(bio); + disk_zone_wplug_update_cond(disk, zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -974,6 +1080,7 @@ void blk_zone_write_plug_init_request(struct request *req) /* Drop the reference taken by disk_zone_wplug_add_bio(). */ blk_queue_exit(q); zwplug->wp_offset += bio_sectors(bio); + disk_zone_wplug_update_cond(disk, zwplug); req_back_sector += bio_sectors(bio); } @@ -1037,6 +1144,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, /* Advance the zone write pointer offset. */ zwplug->wp_offset += bio_sectors(bio); + disk_zone_wplug_update_cond(disk, zwplug); return true; } @@ -1683,7 +1791,7 @@ static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx, return -ENODEV; } - args->zones_cond[idx] = cond; + blk_zone_set_cond(args->zones_cond, idx, cond); return 0; diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index f85743ef6e7d..5c7662971414 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -48,6 +48,8 @@ enum blk_zone_type { * FINISH ZONE command. * @BLK_ZONE_COND_READONLY: The zone is read-only. * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written). + * @BLK_ZONE_COND_ACTIVE: The zone is either implicitly open, explicitly open, + * or closed. * * The Zone Condition state machine in the ZBC/ZAC standards maps the above * deinitions as: @@ -61,6 +63,13 @@ enum blk_zone_type { * * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should * be considered invalid. + * + * The condition BLK_ZONE_COND_ACTIVE is used only with cached zone reports. + * It is used to report any of the BLK_ZONE_COND_IMP_OPEN, + * BLK_ZONE_COND_EXP_OPEN and BLK_ZONE_COND_CLOSED conditions. 
Conversely, a
+ * regular zone report will never report a zone condition using
+ * BLK_ZONE_COND_ACTIVE and instead use the conditions BLK_ZONE_COND_IMP_OPEN,
+ * BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED as reported by the device.
  */
 enum blk_zone_cond {
 	BLK_ZONE_COND_NOT_WP	= 0x0,
@@ -71,6 +80,8 @@ enum blk_zone_cond {
 	BLK_ZONE_COND_READONLY	= 0xD,
 	BLK_ZONE_COND_FULL	= 0xE,
 	BLK_ZONE_COND_OFFLINE	= 0xF,
+
+	BLK_ZONE_COND_ACTIVE	= 0xFF,
 };
 
 /**

From 1af3f4e0c42b377f3405df498440566e3468c314 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:42 +0900
Subject: [PATCH 039/162] block: refactor blkdev_report_zones() code

In preparation for implementing cached zone reports, split the main part
of the code of blkdev_report_zones() into the helper function
blkdev_do_report_zones(), with this new helper taking as argument a
struct blk_report_zones_args pointer instead of a report callback
function and its private argument.

No functional changes.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Hannes Reinecke
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index c5fa303093a9..9ce7570c91e1 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -205,6 +205,21 @@ struct blk_report_zones_args {
 	void *data;
 };
 
+static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
+				  unsigned int nr_zones,
+				  struct blk_report_zones_args *args)
+{
+	struct gendisk *disk = bdev->bd_disk;
+
+	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
+		return -EOPNOTSUPP;
+
+	if (!nr_zones || sector >= get_capacity(disk))
+		return 0;
+
+	return disk->fops->report_zones(disk, sector, nr_zones, args);
+}
+
 /**
  * blkdev_report_zones - Get zones information
  * @bdev:	Target block device
@@ -227,19 +242,12 @@ struct blk_report_zones_args {
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data)
 {
-	struct gendisk *disk = bdev->bd_disk;
 	struct blk_report_zones_args args = {
 		.cb = cb,
 		.data = data,
 	};
 
-	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
-		return -EOPNOTSUPP;
-
-	if (!nr_zones || sector >= get_capacity(disk))
-		return 0;
-
-	return disk->fops->report_zones(disk, sector, nr_zones, &args);
+	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
 }
 EXPORT_SYMBOL_GPL(blkdev_report_zones);
 
From f2284eec5053df271c78e687672247922bcee881 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:43 +0900
Subject: [PATCH 040/162] block: introduce blkdev_get_zone_info()

Introduce the function blkdev_get_zone_info() to obtain the information
for a single zone from cached zone data, that is, either from the zone
write plug for the target zone if it exists, or from the disk zones_cond
array otherwise.

Since sequential zones that do not have a zone write plug are either
full, empty or in a bad state (read-only or offline), the zone write
pointer can be inferred from the zone condition cached in the disk
zones_cond array. For sequential zones that have a zone write plug, the
zone condition and zone write pointer are obtained from the condition
and write pointer offset managed with the zone write plug. This allows
obtaining the information for a zone much more quickly than having to
execute a report zones command on the device.
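
As a minimal sketch (illustrative names only, not the code added by this
patch), the write pointer inference described above amounts to:

	/*
	 * Sketch: infer a zone write pointer from a cached zone condition.
	 * zone_start is the first sector of the zone; plug_wp_offset stands
	 * in for the write pointer offset tracked by a zone write plug.
	 */
	static sector_t infer_zone_wp(enum blk_zone_cond cond, sector_t zone_start,
				      unsigned int plug_wp_offset)
	{
		switch (cond) {
		case BLK_ZONE_COND_EMPTY:
			return zone_start;	/* nothing written yet */
		case BLK_ZONE_COND_FULL:
		case BLK_ZONE_COND_READONLY:
		case BLK_ZONE_COND_OFFLINE:
			return ULLONG_MAX;	/* no valid write pointer */
		default:
			/* active zone: position tracked by its write plug */
			return zone_start + plug_wp_offset;
		}
	}
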
blkdev_get_zone_info() falls back to using a regular zone report if the
target zone is flagged as needing an update with the
BLK_ZONE_WPLUG_NEED_WP_UPDATE flag, or if the target device does not use
zone write plugs (i.e. a device mapper device). In this case, the new
function blkdev_report_zone_fallback() is used and the zone condition is
reported consistently with the cached report, that is, the
BLK_ZONE_COND_ACTIVE condition is used in place of the implicit open,
explicit open and closed conditions. This is achieved by adding the
.report_active field to struct blk_report_zones_args and by having
disk_report_zone() set the correct zone condition if .report_active is
true.

In preparation for using blkdev_get_zone_info() in upcoming file system
changes, also export this function as a GPL symbol.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Hannes Reinecke
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c      | 141 +++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   3 +
 2 files changed, 144 insertions(+)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 9ce7570c91e1..d98babfe49df 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -203,6 +203,7 @@ EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
 struct blk_report_zones_args {
 	report_zones_cb cb;
 	void *data;
+	bool report_active;
 };
 
 static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
@@ -820,6 +821,23 @@ static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
 int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
 		     unsigned int idx, struct blk_report_zones_args *args)
 {
+	if (args->report_active) {
+		/*
+		 * If we come here, then this is a report zones as a fallback
+		 * for a cached report. So collapse the implicit open, explicit
+		 * open and closed conditions into the active zone condition.
+		 */
+		switch (zone->cond) {
+		case BLK_ZONE_COND_IMP_OPEN:
+		case BLK_ZONE_COND_EXP_OPEN:
+		case BLK_ZONE_COND_CLOSED:
+			zone->cond = BLK_ZONE_COND_ACTIVE;
+			break;
+		default:
+			break;
+		}
+	}
+
 	if (disk->zone_wplugs_hash)
 		disk_zone_wplug_sync_wp_offset(disk, zone);
 
@@ -830,6 +848,129 @@ int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
 }
 EXPORT_SYMBOL_GPL(disk_report_zone);
 
+static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
+				 void *data)
+{
+	memcpy(data, zone, sizeof(struct blk_zone));
+	return 0;
+}
+
+static int blkdev_report_zone_fallback(struct block_device *bdev,
+				       sector_t sector, struct blk_zone *zone)
+{
+	struct blk_report_zones_args args = {
+		.cb = blkdev_report_zone_cb,
+		.data = zone,
+		.report_active = true,
+	};
+
+	return blkdev_do_report_zones(bdev, sector, 1, &args);
+}
+
+/**
+ * blkdev_get_zone_info - Get a single zone's information from cached data
+ * @bdev:	Target block device
+ * @sector:	Sector contained by the target zone
+ * @zone:	zone structure to return the zone information
+ *
+ * Description:
+ *    Get the zone information for the zone containing @sector using the zone
+ *    write plug of the target zone, if one exists, or the disk zone condition
+ *    array otherwise. The zone condition may be reported as being
+ *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
+ *    open, explicit open or closed condition.
+ *
+ *    Returns 0 on success and a negative error code on failure.
+ */ +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t zone_sectors = bdev_zone_sectors(bdev); + struct blk_zone_wplug *zwplug; + unsigned long flags; + u8 *zones_cond; + + if (!bdev_is_zoned(bdev)) + return -EOPNOTSUPP; + + if (sector >= get_capacity(disk)) + return -EINVAL; + + memset(zone, 0, sizeof(*zone)); + sector = ALIGN_DOWN(sector, zone_sectors); + + rcu_read_lock(); + zones_cond = rcu_dereference(disk->zones_cond); + if (!disk->zone_wplugs_hash || !zones_cond) { + rcu_read_unlock(); + return blkdev_report_zone_fallback(bdev, sector, zone); + } + zone->cond = zones_cond[disk_zone_no(disk, sector)]; + rcu_read_unlock(); + + zone->start = sector; + zone->len = zone_sectors; + + /* + * If this is a conventional zone, we do not have a zone write plug and + * can report the zone immediately. + */ + if (zone->cond == BLK_ZONE_COND_NOT_WP) { + zone->type = BLK_ZONE_TYPE_CONVENTIONAL; + zone->capacity = zone_sectors; + zone->wp = ULLONG_MAX; + return 0; + } + + /* + * This is a sequential write required zone. If the zone is read-only or + * offline, only set the zone write pointer to an invalid value and + * report the zone. + */ + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + if (disk_zone_is_last(disk, zone)) + zone->capacity = disk->last_zone_capacity; + else + zone->capacity = disk->zone_capacity; + + if (zone->cond == BLK_ZONE_COND_READONLY || + zone->cond == BLK_ZONE_COND_OFFLINE) { + zone->wp = ULLONG_MAX; + return 0; + } + + /* + * If the zone does not have a zone write plug, it is either full or + * empty, as we otherwise would have a zone write plug for it. In this + * case, set the write pointer accordingly and report the zone. + * Otherwise, if we have a zone write plug, use it. 
+	 */
+	zwplug = disk_get_zone_wplug(disk, sector);
+	if (!zwplug) {
+		if (zone->cond == BLK_ZONE_COND_FULL)
+			zone->wp = ULLONG_MAX;
+		else
+			zone->wp = sector;
+		return 0;
+	}
+
+	spin_lock_irqsave(&zwplug->lock, flags);
+	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+		disk_put_zone_wplug(zwplug);
+		return blkdev_report_zone_fallback(bdev, sector, zone);
+	}
+	zone->cond = zwplug->cond;
+	zone->wp = sector + zwplug->wp_offset;
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+
+	disk_put_zone_wplug(zwplug);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
+
 static void blk_zone_reset_bio_endio(struct bio *bio)
 {
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 53bcfbc2f68f..03a594b4dfbc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -436,6 +436,9 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
 		     unsigned int idx, struct blk_report_zones_args *args);
+int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
+			 struct blk_zone *zone);
+
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data);

From 31f0656a4ab712edf2888eabcc0664197a4a938e Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:44 +0900
Subject: [PATCH 041/162] block: introduce blkdev_report_zones_cached()

Introduce the function blkdev_report_zones_cached() to provide a fast
zone report built using the blkdev_get_zone_info() function, which gets
zone information from a disk zones_cond array or zone write plugs. For a
large capacity SMR drive, such a fast zone report can complete in a few
milliseconds, compared to completion times of several seconds when the
zone report is obtained from the device.

The zone report is built in the same manner as with the regular
blkdev_report_zones() function, that is, the first zone reported is the
one containing the specified start sector and the report is limited to
the specified number of zones (nr_zones argument). The information for
each zone in the report is obtained using blkdev_get_zone_info().

For zoned devices that do not use zone write plug resources, using
blkdev_get_zone_info() is inefficient as the zone report would be very
slow, generated one zone at a time. To avoid this,
blkdev_report_zones_cached() falls back to calling
blkdev_do_report_zones() to execute a regular zone report. In this case,
the .report_active field of struct blk_report_zones_args is set to true
to report zone conditions using the BLK_ZONE_COND_ACTIVE condition in
place of the implicit open, explicit open and closed conditions.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Hannes Reinecke
Reviewed-by: Martin K.
Petersen Signed-off-by: Jens Axboe --- block/blk-zoned.c | 88 +++++++++++++++++++++++++++++++++++------- include/linux/blkdev.h | 2 + 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d98babfe49df..bbd105b11843 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -74,6 +74,19 @@ struct blk_zone_wplug { enum blk_zone_cond cond; }; +static inline bool disk_need_zone_resources(struct gendisk *disk) +{ + /* + * All request-based zoned devices need zone resources so that the + * block layer can automatically handle write BIO plugging. BIO-based + * device drivers (e.g. DM devices) are normally responsible for + * handling zone write ordering and do not need zone resources, unless + * the driver requires zone append emulation. + */ + return queue_is_mq(disk->queue) || + queue_emulates_zone_append(disk->queue); +} + static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) { return 1U << disk->zone_wplugs_hash_bits; @@ -971,6 +984,68 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_get_zone_info); +/** + * blkdev_report_zones_cached - Get cached zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @nr_zones: Maximum number of zones to report + * @cb: Callback function called for each reported zone + * @data: Private data for the callback function + * + * Description: + * Similar to blkdev_report_zones() but instead of calling into the low level + * device driver to get the zone report from the device, use + * blkdev_get_zone_info() to generate the report from the disk zone write + * plugs and zones condition array. Since calling this function without a + * callback does not make sense, @cb must be specified. + */ +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t capacity = get_capacity(disk); + sector_t zone_sectors = bdev_zone_sectors(bdev); + unsigned int idx = 0; + struct blk_zone zone; + int ret; + + if (!cb || !bdev_is_zoned(bdev) || + WARN_ON_ONCE(!disk->fops->report_zones)) + return -EOPNOTSUPP; + + if (!nr_zones || sector >= capacity) + return 0; + + /* + * If we do not have any zone write plug resources, fallback to using + * the regular zone report. + */ + if (!disk_need_zone_resources(disk)) { + struct blk_report_zones_args args = { + .cb = cb, + .data = data, + .report_active = true, + }; + + return blkdev_do_report_zones(bdev, sector, nr_zones, &args); + } + + for (sector = ALIGN_DOWN(sector, zone_sectors); + sector < capacity && idx < nr_zones; + sector += zone_sectors, idx++) { + ret = blkdev_get_zone_info(bdev, sector, &zone); + if (ret) + return ret; + + ret = cb(&zone, idx, data); + if (ret) + return ret; + } + + return idx; +} +EXPORT_SYMBOL_GPL(blkdev_report_zones_cached); + static void blk_zone_reset_bio_endio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; @@ -1781,19 +1856,6 @@ void disk_free_zone_resources(struct gendisk *disk) disk->nr_zones = 0; } -static inline bool disk_need_zone_resources(struct gendisk *disk) -{ - /* - * All mq zoned devices need zone resources so that the block layer - * can automatically handle write BIO plugging. BIO-based device drivers - * (e.g. DM devices) are normally responsible for handling zone write - * ordering and do not need zone resources, unless the driver requires - * zone append emulation. 
- */
-	return queue_is_mq(disk->queue) ||
-	       queue_emulates_zone_append(disk->queue);
-}
-
 struct blk_revalidate_zone_args {
 	struct gendisk *disk;
 	u8 *zones_cond;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 03a594b4dfbc..f0ab02e0a673 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -442,6 +442,8 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data);
+int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
+			unsigned int nr_zones, report_zones_cb cb, void *data);
 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 		     sector_t sectors, sector_t nr_sectors);
 int blk_revalidate_disk_zones(struct gendisk *disk);

From b30ffcdc0c15a88f8866529d3532454e02571221 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:45 +0900
Subject: [PATCH 042/162] block: introduce BLKREPORTZONEV2 ioctl

Introduce the new BLKREPORTZONEV2 ioctl command to allow user
applications access to the fast zone report implemented by
blkdev_report_zones_cached(). This new ioctl is defined as number 142,
with this number reserved in include/uapi/linux/fs.h.

Unlike the existing BLKREPORTZONE ioctl, this new ioctl uses the flags
field of struct blk_zone_report also as an input. If the user sets the
BLK_ZONE_REP_CACHED flag as an input, then blkdev_report_zones_cached()
is used to generate the zone report using cached zone information. If
this flag is not set, then BLKREPORTZONEV2 behaves in the same manner as
BLKREPORTZONE and the zone report is generated by accessing the zoned
device.

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c             | 25 ++++++++++++++++++++++---
 block/ioctl.c                 |  1 +
 include/uapi/linux/blkzoned.h | 35 ++++++++++++++++++++++++++++++-----
 include/uapi/linux/fs.h       |  2 +-
 4 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index bbd105b11843..7a7b0704f095 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -357,7 +357,12 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
 }
 
 /*
- * BLKREPORTZONE ioctl processing.
+ * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
+ */
+#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED
+
+/*
+ * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
  * Called from blkdev_ioctl.
 */
 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
@@ -381,8 +386,22 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
 		return -EINVAL;
 
 	args.zones = argp + sizeof(struct blk_zone_report);
-	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
-				  blkdev_copy_zone_to_user, &args);
+
+	switch (cmd) {
+	case BLKREPORTZONE:
+		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
+					  blkdev_copy_zone_to_user, &args);
+		break;
+	case BLKREPORTZONEV2:
+		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
+			return -EINVAL;
+		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
+						 blkdev_copy_zone_to_user, &args);
+		break;
+	default:
+		return -EINVAL;
+	}
+
 	if (ret < 0)
 		return ret;
diff --git a/block/ioctl.c b/block/ioctl.c
index 3927ca4707d0..698629e4c619 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -581,6 +581,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
 	case BLKGETDISKSEQ:
 		return put_u64(argp, bdev->bd_disk->diskseq);
 	case BLKREPORTZONE:
+	case BLKREPORTZONEV2:
 		return blkdev_report_zones_ioctl(bdev, cmd, arg);
 	case BLKRESETZONE:
 	case BLKOPENZONE:
diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h
index 5c7662971414..e33f02703350 100644
--- a/include/uapi/linux/blkzoned.h
+++ b/include/uapi/linux/blkzoned.h
@@ -87,10 +87,20 @@ enum blk_zone_cond {
 /**
  * enum blk_zone_report_flags - Feature flags of reported zone descriptors.
  *
- * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field.
+ * @BLK_ZONE_REP_CAPACITY: Output only. Indicates that zone descriptors in a
+ *		zone report have a valid capacity field.
+ * @BLK_ZONE_REP_CACHED: Input only. Indicates that the zone report should be
+ *		generated using cached zone information. In this case,
+ *		the implicit open, explicit open and closed zone
+ *		conditions are all reported with the
+ *		BLK_ZONE_COND_ACTIVE condition.
  */
 enum blk_zone_report_flags {
-	BLK_ZONE_REP_CAPACITY	= (1 << 0),
+	/* Output flags */
+	BLK_ZONE_REP_CAPACITY	= (1U << 0),
+
+	/* Input flags */
+	BLK_ZONE_REP_CACHED	= (1U << 31),
 };
 
 /**
@@ -133,7 +143,10 @@ struct blk_zone {
  * @sector: starting sector of report
  * @nr_zones: IN maximum / OUT actual
- * @flags: one or more flags as defined by enum blk_zone_report_flags.
+ * @flags: one or more flags as defined by enum blk_zone_report_flags.
+ *	With BLKREPORTZONE, this field is ignored as an input and is valid
+ *	only as an output. Using BLKREPORTZONEV2, this field is used as both
+ *	input and output.
  * @zones: Space to hold @nr_zones @zones entries on reply.
 *
 * The array of at most @nr_zones must follow this structure in memory.
@@ -159,9 +173,19 @@ struct blk_zone_range {
 /**
  * Zoned block device ioctl's:
  *
- * @BLKREPORTZONE: Get zone information. Takes a zone report as argument.
- *                 The zone report will start from the zone containing the
- *                 sector specified in the report request structure.
+ * @BLKREPORTZONE: Get zone information from a zoned device. Takes a zone report
+ *		as argument. The zone report will start from the zone
+ *		containing the sector specified in struct blk_zone_report.
+ *		The flags field of struct blk_zone_report is used as an
+ *		output only and ignored as an input.
+ *		DEPRECATED, use BLKREPORTZONEV2 instead.
+ * @BLKREPORTZONEV2: Same as @BLKREPORTZONE but uses the flags field of
+ *		struct blk_zone_report as an input, allowing one to get a zone
+ *		report using cached zone information if the flag
+ *		BLK_ZONE_REP_CACHED is set.
In such a case, the zone report
+ *		may include zones with the condition @BLK_ZONE_COND_ACTIVE
+ *		(c.f. the description of this condition above for more
+ *		details).
  * @BLKRESETZONE: Reset the write pointer of the zones in the specified
  *                sector range. The sector range must be zone aligned.
  * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors.
@@ -180,5 +204,6 @@ struct blk_zone_range {
 #define BLKOPENZONE	_IOW(0x12, 134, struct blk_zone_range)
 #define BLKCLOSEZONE	_IOW(0x12, 135, struct blk_zone_range)
 #define BLKFINISHZONE	_IOW(0x12, 136, struct blk_zone_range)
+#define BLKREPORTZONEV2	_IOWR(0x12, 142, struct blk_zone_report)
 
 #endif /* _UAPI_BLKZONED_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 957ce3343a4f..66ca526cf786 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -298,7 +298,7 @@ struct file_attr {
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
 #define BLKGETDISKSEQ _IOR(0x12,128,__u64)
-/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
+/* 130-136 and 142 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */
 /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */
 #define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2)

From 2b39d4a6c67d11ead8f39ec6376645d8e9d34554 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:46 +0900
Subject: [PATCH 043/162] block: improve zone_wplugs debugfs attribute output

Make the output of the zone_wplugs debugfs attribute file more easily
readable by adding the names of the zone write plug fields to the
output.

No functional changes.

Suggested-by: Bart Van Assche
Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Hannes Reinecke
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 7a7b0704f095..3104fda5809b 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -2313,8 +2313,10 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
 	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
 	spin_unlock_irqrestore(&zwplug->lock, flags);
 
-	seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
-		   zwp_wp_offset, zwp_bio_list_size);
+	seq_printf(m,
+		   "Zone no: %u, flags: 0x%x, ref: %u, wp ofst: %u, pending BIO: %u\n",
+		   zwp_zone_no, zwp_flags, zwp_ref,
+		   zwp_wp_offset, zwp_bio_list_size);
 }
 
 int queue_zone_wplugs_show(void *data, struct seq_file *m)

From 1efbbc641ef7d673059cded789b9c8a743c17c9a Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:47 +0900
Subject: [PATCH 044/162] block: add zone write plug condition to debugfs
 zone_wplugs

Modify queue_zone_wplug_show() to include the condition of a zone write
plug in the zone_wplugs debugfs attribute of a zoned block device. To
improve readability and ease of use, rather than the zone condition raw
value, the zone condition name is given using blk_zone_cond_str().

Suggested-by: Johannes Thumshirn
Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Hannes Reinecke
Reviewed-by: Johannes Thumshirn
Reviewed-by: Martin K.
Petersen
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 3104fda5809b..bba64b427082 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -2303,19 +2303,21 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
 	unsigned int zwp_wp_offset, zwp_flags;
 	unsigned int zwp_zone_no, zwp_ref;
 	unsigned int zwp_bio_list_size;
+	enum blk_zone_cond zwp_cond;
 	unsigned long flags;
 
 	spin_lock_irqsave(&zwplug->lock, flags);
 	zwp_zone_no = zwplug->zone_no;
 	zwp_flags = zwplug->flags;
 	zwp_ref = refcount_read(&zwplug->ref);
+	zwp_cond = zwplug->cond;
 	zwp_wp_offset = zwplug->wp_offset;
 	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
 	spin_unlock_irqrestore(&zwplug->lock, flags);
 
 	seq_printf(m,
-		   "Zone no: %u, flags: 0x%x, ref: %u, wp ofst: %u, pending BIO: %u\n",
-		   zwp_zone_no, zwp_flags, zwp_ref,
+		   "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
+		   zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
 		   zwp_wp_offset, zwp_bio_list_size);
 }
 
From ad3c1188b401cbc0533515ba2d45396b4fa320e1 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:48 +0900
Subject: [PATCH 045/162] btrfs: use blkdev_report_zones_cached()

Modify btrfs_get_dev_zones() and btrfs_sb_log_location_bdev() to replace
the call to blkdev_report_zones() with blkdev_report_zones_cached() to
speed up mount operations. btrfs_get_dev_zone_info() is also modified to
take into account the BLK_ZONE_COND_ACTIVE condition, which is
equivalent to either BLK_ZONE_COND_IMP_OPEN, BLK_ZONE_COND_EXP_OPEN or
BLK_ZONE_COND_CLOSED.

With this change, mounting a freshly formatted large capacity (30 TB)
SMR HDD completes in under 100ms compared to over 1.8s before.

Signed-off-by: Damien Le Moal
Acked-by: David Sterba
Reviewed-by: Johannes Thumshirn
Reviewed-by: Hannes Reinecke
Reviewed-by: Martin K.
Petersen
Signed-off-by: Jens Axboe
---
 fs/btrfs/zoned.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 0ea0df18a8e4..a16b1a896c78 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -264,8 +264,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 		}
 	}
 
-	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
-				  copy_zone_info_cb, zones);
+	ret = blkdev_report_zones_cached(device->bdev, pos >> SECTOR_SHIFT,
+					 *nr_zones, copy_zone_info_cb, zones);
 	if (ret < 0) {
 		btrfs_err(device->fs_info,
 			  "zoned: failed to read zone %llu on %s (devid %llu)",
@@ -494,6 +494,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		case BLK_ZONE_COND_IMP_OPEN:
 		case BLK_ZONE_COND_EXP_OPEN:
 		case BLK_ZONE_COND_CLOSED:
+		case BLK_ZONE_COND_ACTIVE:
 			__set_bit(nreported, zone_info->active_zones);
 			nactive++;
 			break;
@@ -896,9 +897,9 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 	if (sb_zone + 1 >= nr_zones)
 		return -ENOENT;
 
-	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
-				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
-				  zones);
+	ret = blkdev_report_zones_cached(bdev, zone_start_sector(sb_zone, bdev),
+					 BTRFS_NR_SB_LOG_ZONES,
+					 copy_zone_info_cb, zones);
 	if (ret < 0)
 		return ret;
 	if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))

From e04ccfc28252f181ea8d469d834b48e7dece65b2 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Wed, 5 Nov 2025 06:22:49 +0900
Subject: [PATCH 046/162] xfs: use blkdev_report_zones_cached()

Modify xfs_mount_zones() to replace the call to blkdev_report_zones()
with blkdev_report_zones_cached() to speed up mount operations. Since
this causes xfs_zone_validate_seq() to see zones with the
BLK_ZONE_COND_ACTIVE condition, this function is also modified to accept
this condition as valid.

With this change, mounting a freshly formatted large capacity (30 TB)
SMR HDD completes in under 2s compared to over 4.7s before.

Signed-off-by: Damien Le Moal
Reviewed-by: Martin K. Petersen
Signed-off-by: Jens Axboe
---
 fs/xfs/libxfs/xfs_zones.c | 1 +
 fs/xfs/xfs_zone_alloc.c   | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
index b0791a71931c..b40f71f878b5 100644
--- a/fs/xfs/libxfs/xfs_zones.c
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -95,6 +95,7 @@ xfs_zone_validate_seq(
 	case BLK_ZONE_COND_IMP_OPEN:
 	case BLK_ZONE_COND_EXP_OPEN:
 	case BLK_ZONE_COND_CLOSED:
+	case BLK_ZONE_COND_ACTIVE:
 		return xfs_zone_validate_wp(zone, rtg, write_pointer);
 	case BLK_ZONE_COND_FULL:
 		return xfs_zone_validate_full(zone, rtg, write_pointer);
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1147bacb2da8..d121768dbccb 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -1250,7 +1250,7 @@ xfs_mount_zones(
 	trace_xfs_zones_mount(mp);
 
 	if (bdev_is_zoned(bt->bt_bdev)) {
-		error = blkdev_report_zones(bt->bt_bdev,
+		error = blkdev_report_zones_cached(bt->bt_bdev,
 				XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
 				mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
 		if (error < 0)

From ba13710ddd1f47884701213a3b6a5e470f6bc81e Mon Sep 17 00:00:00 2001
From: Shankari Anand
Date: Sun, 12 Oct 2025 19:50:12 +0530
Subject: [PATCH 047/162] rust: block: update ARef and AlwaysRefCounted
 imports from sync::aref

Update call sites in the block subsystem to import `ARef` and
`AlwaysRefCounted` from `sync::aref` instead of `types`.
This aligns with the ongoing effort to move `ARef` and
`AlwaysRefCounted` to sync.

Suggested-by: Benno Lossin
Link: https://github.com/Rust-for-Linux/linux/issues/1173
Signed-off-by: Shankari Anand
Signed-off-by: Jens Axboe
---
 drivers/block/rnull/rnull.rs       | 3 +--
 rust/kernel/block/mq.rs            | 5 ++---
 rust/kernel/block/mq/operations.rs | 4 ++--
 rust/kernel/block/mq/request.rs    | 8 ++++++--
 4 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs
index 1ec694d7f1a6..a9d5e575a2c4 100644
--- a/drivers/block/rnull/rnull.rs
+++ b/drivers/block/rnull/rnull.rs
@@ -17,8 +17,7 @@ use kernel::{
     error::Result,
     pr_info,
     prelude::*,
-    sync::Arc,
-    types::ARef,
+    sync::{aref::ARef, Arc},
 };
 use pin_init::PinInit;
 
diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs
index 637018ead0ab..1fd0d54dd549 100644
--- a/rust/kernel/block/mq.rs
+++ b/rust/kernel/block/mq.rs
@@ -20,7 +20,7 @@
 //! The kernel will interface with the block device driver by calling the method
 //! implementations of the `Operations` trait.
 //!
-//! IO requests are passed to the driver as [`kernel::types::ARef`]
+//! IO requests are passed to the driver as [`kernel::sync::aref::ARef`]
 //! instances. The `Request` type is a wrapper around the C `struct request`.
 //! The driver must mark end of processing by calling one of the
 //! `Request::end`, methods. Failure to do so can lead to deadlock or timeout
@@ -61,8 +61,7 @@
 //!     block::mq::*,
 //!     new_mutex,
 //!     prelude::*,
-//!     sync::{Arc, Mutex},
-//!     types::{ARef, ForeignOwnable},
+//!     sync::{aref::ARef, Arc, Mutex},
 //! };
 //!
 //! struct MyBlkDevice;
diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
index f91a1719886c..8ad46129a52c 100644
--- a/rust/kernel/block/mq/operations.rs
+++ b/rust/kernel/block/mq/operations.rs
@@ -9,8 +9,8 @@ use crate::{
     block::mq::{request::RequestDataWrapper, Request},
     error::{from_result, Result},
     prelude::*,
-    sync::Refcount,
-    types::{ARef, ForeignOwnable},
+    sync::{aref::ARef, Refcount},
+    types::ForeignOwnable,
 };
 use core::marker::PhantomData;
 
diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
index c5f1f6b1ccfb..ce3e30c81cb5 100644
--- a/rust/kernel/block/mq/request.rs
+++ b/rust/kernel/block/mq/request.rs
@@ -8,8 +8,12 @@ use crate::{
     bindings,
     block::mq::Operations,
     error::Result,
-    sync::{atomic::Relaxed, Refcount},
-    types::{ARef, AlwaysRefCounted, Opaque},
+    sync::{
+        aref::{ARef, AlwaysRefCounted},
+        atomic::Relaxed,
+        Refcount,
+    },
+    types::Opaque,
 };
 use core::{marker::PhantomData, ptr::NonNull};
 
From c6886cf610f4bb0b8aa6b88ab013a042e317f898 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 5 Nov 2025 14:52:14 -0500
Subject: [PATCH 048/162] block: don't leak disk->zones_cond for
 !disk_need_zone_resources

disk->zones_cond is allocated for all zoned devices, but
disk_free_zone_resources skips it when the zone write plug hash is not
allocated, leaking the allocation for non-mq devices that don't emulate
zone append. This is reported by kmemleak-enabled xfstests for various
tests that use simple device mapper targets.

Fix this by moving all code that requires write plugs from
disk_free_zone_resources into disk_destroy_zone_wplugs_hash_table and
executing the rest of the code, including the freeing of
disk->zones_cond, unconditionally.
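
The resulting teardown split, as a simplified sketch (hypothetical
structure and helper names, not the exact kernel functions):

	/*
	 * Sketch: resources that only exist when zone write plugs are in use
	 * are torn down by the hash table destructor, while the zones_cond
	 * array, allocated for every zoned device, is freed unconditionally.
	 */
	static void destroy_wplugs_hash_table(struct my_disk *d)
	{
		if (!d->wplugs_hash)
			return;
		kfree(d->wplugs_hash);
		d->wplugs_hash = NULL;
		rcu_barrier();			/* wait for RCU-freed plugs */
		mempool_destroy(d->wplugs_pool);
		d->wplugs_pool = NULL;
	}

	static void free_zone_resources(struct my_disk *d)
	{
		destroy_wplugs_hash_table(d);	/* no-op without write plugs */
		kfree(d->zones_cond);		/* always allocated, always freed */
		d->zones_cond = NULL;
	}
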
Fixes: 6e945ffb6555 ("block: use zone condition to determine conventional zones")
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Bart Van Assche
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index bba64b427082..a0ce17e2143f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1834,6 +1834,14 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
 	kfree(disk->zone_wplugs_hash);
 	disk->zone_wplugs_hash = NULL;
 	disk->zone_wplugs_hash_bits = 0;
+
+	/*
+	 * Wait for the zone write plugs to be RCU-freed before destroying the
+	 * mempool.
+	 */
+	rcu_barrier();
+	mempool_destroy(disk->zone_wplugs_pool);
+	disk->zone_wplugs_pool = NULL;
 }
 
 static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
@@ -1850,9 +1858,6 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
 
 void disk_free_zone_resources(struct gendisk *disk)
 {
-	if (!disk->zone_wplugs_pool)
-		return;
-
 	if (disk->zone_wplugs_wq) {
 		destroy_workqueue(disk->zone_wplugs_wq);
 		disk->zone_wplugs_wq = NULL;
@@ -1860,15 +1865,6 @@ void disk_free_zone_resources(struct gendisk *disk)
 
 	disk_destroy_zone_wplugs_hash_table(disk);
 
-	/*
-	 * Wait for the zone write plugs to be RCU-freed before
-	 * destorying the mempool.
-	 */
-	rcu_barrier();
-
-	mempool_destroy(disk->zone_wplugs_pool);
-	disk->zone_wplugs_pool = NULL;
-
 	disk_set_zones_cond_array(disk, NULL);
 	disk->zone_capacity = 0;
 	disk->last_zone_capacity = 0;

From 15638d52cbcf6e969f4a5e2757b118355db583f3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 5 Nov 2025 14:52:15 -0500
Subject: [PATCH 049/162] block: fix cached zone reporting after zone append
 was used

No zone plugs are allocated when a zone is opened by calling Zone Append
on it. This makes the cached zone reporting incorrectly report zones as
empty if the file system is unmounted and report zones is called after
that, e.g. by xfstests test cases using the scratch device.

Fix this by recording if zone append was used on a device, and disabling
cached reporting for the device until a ZONE_RESET_ALL happens that
guarantees all zones are empty. We could probably do even better using a
per-zone flag, but the practical use cases for zone reporting after the
initial mount are rather limited, so let's keep things simple for now.

Fixes: 31f0656a4ab7 ("block: introduce blkdev_report_zones_cached()")
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Bart Van Assche
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c      | 26 +++++++++++++++++++++-----
 include/linux/blkdev.h |  1 +
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index a0ce17e2143f..c5226bcaaa94 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -899,6 +899,19 @@ static int blkdev_report_zone_fallback(struct block_device *bdev,
 	return blkdev_do_report_zones(bdev, sector, 1, &args);
 }
 
+/*
+ * For devices that natively support zone append operations, we do not use zone
+ * write plugging for zone append writes, which makes the zone condition
+ * tracking invalid once zone append was used. In that case fall back to a
+ * regular report zones to get correct information.
+ */ +static inline bool blkdev_has_cached_report_zones(struct block_device *bdev) +{ + return disk_need_zone_resources(bdev->bd_disk) && + (bdev_emulates_zone_append(bdev) || + !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state)); +} + /** * blkdev_get_zone_info - Get a single zone information from cached data * @bdev: Target block device @@ -932,6 +945,9 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, memset(zone, 0, sizeof(*zone)); sector = ALIGN_DOWN(sector, zone_sectors); + if (!blkdev_has_cached_report_zones(bdev)) + return blkdev_report_zone_fallback(bdev, sector, zone); + rcu_read_lock(); zones_cond = rcu_dereference(disk->zones_cond); if (!disk->zone_wplugs_hash || !zones_cond) { @@ -1035,11 +1051,7 @@ int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, if (!nr_zones || sector >= capacity) return 0; - /* - * If we do not have any zone write plug resources, fallback to using - * the regular zone report. - */ - if (!disk_need_zone_resources(disk)) { + if (!blkdev_has_cached_report_zones(bdev)) { struct blk_report_zones_args args = { .cb = cb, .data = data, @@ -1115,6 +1127,7 @@ static void blk_zone_reset_all_bio_endio(struct bio *bio) for (sector = 0; sector < capacity; sector += bdev_zone_sectors(bio->bi_bdev)) disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); + clear_bit(GD_ZONE_APPEND_USED, &disk->state); } static void blk_zone_finish_bio_endio(struct bio *bio) @@ -1474,6 +1487,9 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) struct blk_zone_wplug *zwplug; unsigned long flags; + if (!test_bit(GD_ZONE_APPEND_USED, &disk->state)) + set_bit(GD_ZONE_APPEND_USED, &disk->state); + /* * We have native support for zone append operations, so we are not * going to handle @bio through plugging. However, we may already have a diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f0ab02e0a673..6a498aa7f7e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -173,6 +173,7 @@ struct gendisk { #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 +#define GD_ZONE_APPEND_USED 7 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ From 2299ceec364eecdc0a5b4ec80c757551d130389c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 6 Nov 2025 10:16:46 -0700 Subject: [PATCH 050/162] ublk: use copy_{to,from}_iter() for user copy ublk_copy_user_pages()/ublk_copy_io_pages() currently uses iov_iter_get_pages2() to extract the pages from the iov_iter and memcpy()s between the bvec_iter and the iov_iter's pages one at a time. Switch to using copy_to_iter()/copy_from_iter() instead. This avoids the user page reference count increments and decrements and needing to split the memcpy() at user page boundaries. It also simplifies the code considerably. Ming reports a 40% throughput improvement when issuing I/O to the selftests null ublk server with zero-copy disabled. 
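
For reference, a minimal sketch of the copy-helper pattern adopted here
(an illustrative wrapper, not the driver code itself):

	#include <linux/uio.h>

	/*
	 * Copy one kernel buffer to or from a user iov_iter. The helpers
	 * return the number of bytes copied; a short count (less than len)
	 * means the iterator was exhausted or a page fault occurred.
	 */
	static size_t copy_one_buf(void *buf, size_t len,
				   struct iov_iter *uiter, int dir)
	{
		if (dir == ITER_DEST)
			return copy_to_iter(buf, len, uiter);
		return copy_from_iter(buf, len, uiter);
	}
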
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 62 +++++++++------------------------------- 1 file changed, 14 insertions(+), 48 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 97cc4bc0a6ce..40eee3e15a4c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -913,47 +913,35 @@ static const struct block_device_operations ub_fops = { .report_zones = ublk_report_zones, }; -#define UBLK_MAX_PIN_PAGES 32 - struct ublk_io_iter { - struct page *pages[UBLK_MAX_PIN_PAGES]; struct bio *bio; struct bvec_iter iter; }; -/* return how many pages are copied */ -static void ublk_copy_io_pages(struct ublk_io_iter *data, - size_t total, size_t pg_off, int dir) +/* return how many bytes are copied */ +static size_t ublk_copy_io_pages(struct ublk_io_iter *data, + struct iov_iter *uiter, int dir) { - unsigned done = 0; - unsigned pg_idx = 0; + size_t done = 0; - while (done < total) { + for (;;) { struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); - unsigned int bytes = min3(bv.bv_len, (unsigned)total - done, - (unsigned)(PAGE_SIZE - pg_off)); void *bv_buf = bvec_kmap_local(&bv); - void *pg_buf = kmap_local_page(data->pages[pg_idx]); + size_t copied; if (dir == ITER_DEST) - memcpy(pg_buf + pg_off, bv_buf, bytes); + copied = copy_to_iter(bv_buf, bv.bv_len, uiter); else - memcpy(bv_buf, pg_buf + pg_off, bytes); + copied = copy_from_iter(bv_buf, bv.bv_len, uiter); - kunmap_local(pg_buf); kunmap_local(bv_buf); - /* advance page array */ - pg_off += bytes; - if (pg_off == PAGE_SIZE) { - pg_idx += 1; - pg_off = 0; - } - - done += bytes; + done += copied; + if (copied < bv.bv_len) + break; /* advance bio */ - bio_advance_iter_single(data->bio, &data->iter, bytes); + bio_advance_iter_single(data->bio, &data->iter, copied); if (!data->iter.bi_size) { data->bio = data->bio->bi_next; if (data->bio == NULL) @@ -961,6 +949,7 @@ static void ublk_copy_io_pages(struct ublk_io_iter *data, data->iter = data->bio->bi_iter; } } + return done; } static bool ublk_advance_io_iter(const struct request *req, @@ -988,34 +977,11 @@ static size_t ublk_copy_user_pages(const struct request *req, unsigned offset, struct iov_iter *uiter, int dir) { struct ublk_io_iter iter; - size_t done = 0; if (!ublk_advance_io_iter(req, &iter, offset)) return 0; - while (iov_iter_count(uiter) && iter.bio) { - unsigned nr_pages; - ssize_t len; - size_t off; - int i; - - len = iov_iter_get_pages2(uiter, iter.pages, - iov_iter_count(uiter), - UBLK_MAX_PIN_PAGES, &off); - if (len <= 0) - return done; - - ublk_copy_io_pages(&iter, len, off, dir); - nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE); - for (i = 0; i < nr_pages; i++) { - if (dir == ITER_DEST) - set_page_dirty(iter.pages[i]); - put_page(iter.pages[i]); - } - done += len; - } - - return done; + return ublk_copy_io_pages(&iter, uiter, dir); } static inline bool ublk_need_map_req(const struct request *req) From e87d66ab27ac89494b75ddc3fed697b5aa8417f1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 6 Nov 2025 10:16:47 -0700 Subject: [PATCH 051/162] ublk: use rq_for_each_segment() for user copy ublk_advance_io_iter() and ublk_copy_io_pages() currently open-code the iteration over the request's bvecs. Switch to the rq_for_each_segment() macro provided by blk-mq to avoid reaching into the bio internals and simplify the code. 
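
The idiom, as a minimal sketch (the per-segment callback is
hypothetical):

	#include <linux/blk-mq.h>
	#include <linux/highmem.h>

	/*
	 * Visit every bvec segment of a request without touching bio
	 * internals; rq_for_each_segment() walks all chained bios for us.
	 */
	static void for_each_segment_buf(struct request *rq,
			void (*process)(void *buf, unsigned int len))
	{
		struct req_iterator iter;
		struct bio_vec bv;

		rq_for_each_segment(bv, rq, iter) {
			void *buf = bvec_kmap_local(&bv);	/* map segment */

			process(buf, bv.bv_len);
			kunmap_local(buf);
		}
	}
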
Suggested-by: Ming Lei
Signed-off-by: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 70 +++++++++++-----------------------------
 1 file changed, 19 insertions(+), 51 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 40eee3e15a4c..5cf288809226 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -913,22 +913,29 @@ static const struct block_device_operations ub_fops = {
 	.report_zones	= ublk_report_zones,
 };
 
-struct ublk_io_iter {
-	struct bio *bio;
-	struct bvec_iter iter;
-};
-
-/* return how many bytes are copied */
-static size_t ublk_copy_io_pages(struct ublk_io_iter *data,
-		struct iov_iter *uiter, int dir)
+/*
+ * Copy data between request pages and io_iter, and 'offset'
+ * is the start point of linear offset of request.
+ */
+static size_t ublk_copy_user_pages(const struct request *req,
+		unsigned offset, struct iov_iter *uiter, int dir)
 {
+	struct req_iterator iter;
+	struct bio_vec bv;
 	size_t done = 0;
 
-	for (;;) {
-		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
-		void *bv_buf = bvec_kmap_local(&bv);
+	rq_for_each_segment(bv, req, iter) {
+		void *bv_buf;
 		size_t copied;
 
+		if (offset >= bv.bv_len) {
+			offset -= bv.bv_len;
+			continue;
+		}
+
+		bv.bv_offset += offset;
+		bv.bv_len -= offset;
+		bv_buf = bvec_kmap_local(&bv);
 		if (dir == ITER_DEST)
 			copied = copy_to_iter(bv_buf, bv.bv_len, uiter);
 		else
@@ -940,50 +947,11 @@ static size_t ublk_copy_io_pages(struct ublk_io_iter *data,
 		if (copied < bv.bv_len)
 			break;
 
-		/* advance bio */
-		bio_advance_iter_single(data->bio, &data->iter, copied);
-		if (!data->iter.bi_size) {
-			data->bio = data->bio->bi_next;
-			if (data->bio == NULL)
-				break;
-			data->iter = data->bio->bi_iter;
-		}
+		offset = 0;
 	}
 	return done;
 }
 
-static bool ublk_advance_io_iter(const struct request *req,
-		struct ublk_io_iter *iter, unsigned int offset)
-{
-	struct bio *bio = req->bio;
-
-	for_each_bio(bio) {
-		if (bio->bi_iter.bi_size > offset) {
-			iter->bio = bio;
-			iter->iter = bio->bi_iter;
-			bio_advance_iter(iter->bio, &iter->iter, offset);
-			return true;
-		}
-		offset -= bio->bi_iter.bi_size;
-	}
-	return false;
-}
-
-/*
- * Copy data between request pages and io_iter, and 'offset'
- * is the start point of linear offset of request.
- */
-static size_t ublk_copy_user_pages(const struct request *req,
-		unsigned offset, struct iov_iter *uiter, int dir)
-{
-	struct ublk_io_iter iter;
-
-	if (!ublk_advance_io_iter(req, &iter, offset))
-		return 0;
-
-	return ublk_copy_io_pages(&iter, uiter, dir);
-}
-
 static inline bool ublk_need_map_req(const struct request *req)
 {
 	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;

From 1165d20f4d1abba59ff2f032df271605ad49c255 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 5 Nov 2025 17:54:44 -0800
Subject: [PATCH 052/162] null_blk: simplify copy_from_nullb

It always returns success, so the code that saves the error status but
proceeds without checking it looks a bit odd. Clean this up.
Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Tested-by: Hans Holmberg Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index f982027e8c85..e2597ea696bf 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1161,7 +1161,7 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, return 0; } -static int copy_from_nullb(struct nullb *nullb, struct page *dest, +static void copy_from_nullb(struct nullb *nullb, struct page *dest, unsigned int off, sector_t sector, size_t n) { size_t temp, count = 0; @@ -1184,7 +1184,6 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, count += temp; sector += temp >> SECTOR_SHIFT; } - return 0; } static void nullb_fill_pattern(struct nullb *nullb, struct page *page, @@ -1248,8 +1247,8 @@ static int null_transfer(struct nullb *nullb, struct page *page, sector, len); if (valid_len) { - err = copy_from_nullb(nullb, page, off, - sector, valid_len); + copy_from_nullb(nullb, page, off, sector, + valid_len); off += valid_len; len -= valid_len; } From 845928381963c61a537b932b6b3f494ce0ccea2d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 5 Nov 2025 17:54:45 -0800 Subject: [PATCH 053/162] null_blk: consistently use blk_status_t No need to mix errno and blk_status_t error types. Just use the standard block layer type. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Tested-by: Hans Holmberg Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index e2597ea696bf..2f64cd713584 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1129,7 +1129,7 @@ again: return 0; } -static int copy_to_nullb(struct nullb *nullb, struct page *source, +static blk_status_t copy_to_nullb(struct nullb *nullb, struct page *source, unsigned int off, sector_t sector, size_t n, bool is_fua) { size_t temp, count = 0; @@ -1146,7 +1146,7 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, t_page = null_insert_page(nullb, sector, !null_cache_active(nullb) || is_fua); if (!t_page) - return -ENOSPC; + return BLK_STS_NOSPC; memcpy_page(t_page->page, offset, source, off + count, temp); @@ -1158,7 +1158,7 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, count += temp; sector += temp >> SECTOR_SHIFT; } - return 0; + return BLK_STS_OK; } static void copy_from_nullb(struct nullb *nullb, struct page *dest, @@ -1233,13 +1233,13 @@ static blk_status_t null_handle_flush(struct nullb *nullb) return errno_to_blk_status(err); } -static int null_transfer(struct nullb *nullb, struct page *page, +static blk_status_t null_transfer(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off, bool is_write, sector_t sector, bool is_fua) { struct nullb_device *dev = nullb->dev; + blk_status_t err = BLK_STS_OK; unsigned int valid_len = len; - int err = 0; if (!is_write) { if (dev->zoned) @@ -1273,7 +1273,7 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; - int err = 0; + 
blk_status_t err = BLK_STS_OK;
 	unsigned int len;
 	sector_t sector = blk_rq_pos(rq);
 	unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
@@ -1298,7 +1298,7 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
 	}
 	spin_unlock_irq(&nullb->lock);
 
-	return errno_to_blk_status(err);
+	return err;
 }
 
 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)

From 262a3dd04e729386bececffeb095d31f7a9c43d5 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 5 Nov 2025 17:54:46 -0800
Subject: [PATCH 054/162] null_blk: single kmap per bio segment

Rather than kmap the request bio segment for each sector, do the
mapping just once.

Signed-off-by: Keith Busch
Reviewed-by: Christoph Hellwig
Reviewed-by: Johannes Thumshirn
Reviewed-by: Damien Le Moal
Reviewed-by: Chaitanya Kulkarni
Tested-by: Hans Holmberg
Signed-off-by: Jens Axboe
---
 drivers/block/null_blk/main.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 2f64cd713584..ea3fc4241f82 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1129,8 +1129,8 @@ again:
 	return 0;
 }
 
-static blk_status_t copy_to_nullb(struct nullb *nullb, struct page *source,
-	unsigned int off, sector_t sector, size_t n, bool is_fua)
+static blk_status_t copy_to_nullb(struct nullb *nullb, void *source,
+	sector_t sector, size_t n, bool is_fua)
 {
 	size_t temp, count = 0;
 	unsigned int offset;
@@ -1148,7 +1148,7 @@ static blk_status_t copy_to_nullb(struct nullb *nullb, struct page *source,
 		if (!t_page)
 			return BLK_STS_NOSPC;
 
-		memcpy_page(t_page->page, offset, source, off + count, temp);
+		memcpy_to_page(t_page->page, offset, source + count, temp);
 
 		__set_bit(sector & SECTOR_MASK, t_page->bitmap);
 
@@ -1161,8 +1161,8 @@ static blk_status_t copy_to_nullb(struct nullb *nullb, void *source,
 	return BLK_STS_OK;
 }
 
-static void copy_from_nullb(struct nullb *nullb, struct page *dest,
-	unsigned int off, sector_t sector, size_t n)
+static void copy_from_nullb(struct nullb *nullb, void *dest, sector_t sector,
+	size_t n)
 {
 	size_t temp, count = 0;
 	unsigned int offset;
@@ -1176,22 +1176,16 @@ static void copy_from_nullb(struct nullb *nullb, struct page *dest,
 			!null_cache_active(nullb));
 
 		if (t_page)
-			memcpy_page(dest, off + count, t_page->page, offset,
-				temp);
+			memcpy_from_page(dest + count, t_page->page, offset,
+				temp);
 		else
-			memzero_page(dest, off + count, temp);
+			memset(dest + count, 0, temp);
 
 		count += temp;
 		sector += temp >> SECTOR_SHIFT;
 	}
 }
 
-static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
-	unsigned int len, unsigned int off)
-{
-	memset_page(page, off, 0xff, len);
-}
-
 blk_status_t null_handle_discard(struct nullb_device *dev,
 	sector_t sector, sector_t nr_sectors)
 {
@@ -1240,27 +1234,29 @@ static blk_status_t null_transfer(struct nullb *nullb, struct page *page,
 	struct nullb_device *dev = nullb->dev;
 	blk_status_t err = BLK_STS_OK;
 	unsigned int valid_len = len;
+	void *p;
 
+	p = kmap_local_page(page) + off;
 	if (!is_write) {
 		if (dev->zoned)
 			valid_len = null_zone_valid_read_len(nullb,
 				sector, len);
 
 		if (valid_len) {
-			copy_from_nullb(nullb, page, off, sector,
-				valid_len);
+			copy_from_nullb(nullb, p, sector, valid_len);
 			off += valid_len;
 			len -= valid_len;
 		}
 
 		if (len)
-			nullb_fill_pattern(nullb, page, len, off);
+			memset(p + valid_len, 0xff, len);
 		flush_dcache_page(page);
 	} else {
 		flush_dcache_page(page);
-		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+		err =
copy_to_nullb(nullb, p, sector, len, is_fua); } + kunmap_local(p); return err; } From 3451cf34f51bb70c24413abb20b423e64486161b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 5 Nov 2025 17:54:47 -0800 Subject: [PATCH 055/162] null_blk: allow byte aligned memory offsets Allowing byte aligned memory provides a nice testing ground for direct-io. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Tested-by: Hans Holmberg Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 45 ++++++++++++++++++---------------- drivers/block/null_blk/zoned.c | 2 +- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index ea3fc4241f82..f1e67962ecae 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1130,25 +1130,27 @@ again: } static blk_status_t copy_to_nullb(struct nullb *nullb, void *source, - sector_t sector, size_t n, bool is_fua) + loff_t pos, size_t n, bool is_fua) { size_t temp, count = 0; - unsigned int offset; struct nullb_page *t_page; + sector_t sector; while (count < n) { - temp = min_t(size_t, nullb->dev->blocksize, n - count); + temp = min3(nullb->dev->blocksize, n - count, + PAGE_SIZE - offset_in_page(pos)); + sector = pos >> SECTOR_SHIFT; if (null_cache_active(nullb) && !is_fua) null_make_cache_space(nullb, PAGE_SIZE); - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; t_page = null_insert_page(nullb, sector, !null_cache_active(nullb) || is_fua); if (!t_page) return BLK_STS_NOSPC; - memcpy_to_page(t_page->page, offset, source + count, temp); + memcpy_to_page(t_page->page, offset_in_page(pos), + source + count, temp); __set_bit(sector & SECTOR_MASK, t_page->bitmap); @@ -1156,33 +1158,33 @@ static blk_status_t copy_to_nullb(struct nullb *nullb, void *source, null_free_sector(nullb, sector, true); count += temp; - sector += temp >> SECTOR_SHIFT; + pos += temp; } return BLK_STS_OK; } -static void copy_from_nullb(struct nullb *nullb, void *dest, sector_t sector, +static void copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n) { size_t temp, count = 0; - unsigned int offset; struct nullb_page *t_page; + sector_t sector; while (count < n) { - temp = min_t(size_t, nullb->dev->blocksize, n - count); + temp = min3(nullb->dev->blocksize, n - count, + PAGE_SIZE - offset_in_page(pos)); + sector = pos >> SECTOR_SHIFT; - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT; t_page = null_lookup_page(nullb, sector, false, !null_cache_active(nullb)); - if (t_page) - memcpy_from_page(dest + count, t_page->page, offset, - temp); + memcpy_from_page(dest + count, t_page->page, + offset_in_page(pos), temp); else memset(dest + count, 0, temp); count += temp; - sector += temp >> SECTOR_SHIFT; + pos += temp; } } @@ -1228,7 +1230,7 @@ static blk_status_t null_handle_flush(struct nullb *nullb) } static blk_status_t null_transfer(struct nullb *nullb, struct page *page, - unsigned int len, unsigned int off, bool is_write, sector_t sector, + unsigned int len, unsigned int off, bool is_write, loff_t pos, bool is_fua) { struct nullb_device *dev = nullb->dev; @@ -1240,10 +1242,10 @@ static blk_status_t null_transfer(struct nullb *nullb, struct page *page, if (!is_write) { if (dev->zoned) valid_len = null_zone_valid_read_len(nullb, - sector, len); + pos >> SECTOR_SHIFT, len); if (valid_len) { - copy_from_nullb(nullb, p, sector, valid_len); + copy_from_nullb(nullb, p, pos, valid_len); off += 
valid_len; len -= valid_len; } @@ -1253,7 +1255,7 @@ static blk_status_t null_transfer(struct nullb *nullb, struct page *page, flush_dcache_page(page); } else { flush_dcache_page(page); - err = copy_to_nullb(nullb, p, sector, len, is_fua); + err = copy_to_nullb(nullb, p, pos, len, is_fua); } kunmap_local(p); @@ -1271,7 +1273,7 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, struct nullb *nullb = cmd->nq->dev->nullb; blk_status_t err = BLK_STS_OK; unsigned int len; - sector_t sector = blk_rq_pos(rq); + loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT; unsigned int max_bytes = nr_sectors << SECTOR_SHIFT; unsigned int transferred_bytes = 0; struct req_iterator iter; @@ -1283,11 +1285,11 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd, if (transferred_bytes + len > max_bytes) len = max_bytes - transferred_bytes; err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(req_op(rq)), sector, + op_is_write(req_op(rq)), pos, rq->cmd_flags & REQ_FUA); if (err) break; - sector += len >> SECTOR_SHIFT; + pos += len; transferred_bytes += len; if (transferred_bytes >= max_bytes) break; @@ -1944,6 +1946,7 @@ static int null_add_dev(struct nullb_device *dev) .logical_block_size = dev->blocksize, .physical_block_size = dev->blocksize, .max_hw_sectors = dev->max_sectors, + .dma_alignment = 1, }; struct nullb *nullb; diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 6a93b12a06ff..dbf292a8eae9 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb, { struct nullb_device *dev = nullb->dev; struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; - unsigned int nr_sectors = len >> SECTOR_SHIFT; + unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SHIFT); /* Read must be below the write pointer position */ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || From 0739c2c6a015604a7c01506bea28200a2cc2e08c Mon Sep 17 00:00:00 2001 From: Cong Zhang Date: Tue, 21 Oct 2025 19:07:56 +0800 Subject: [PATCH 056/162] virtio_blk: NULL out vqs to avoid double free on failed resume The vblk->vqs releases during freeze. If resume fails before vblk->vqs is allocated, later freeze/remove may attempt to free vqs again. Set vblk->vqs to NULL after freeing to avoid double free. Signed-off-by: Cong Zhang Acked-by: Jason Wang Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a5e97f03dbf0..357434bdae99 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1027,8 +1027,13 @@ static int init_vq(struct virtio_blk *vblk) out: kfree(vqs); kfree(vqs_info); - if (err) + if (err) { kfree(vblk->vqs); + /* + * Set to NULL to prevent freeing vqs again during freezing. + */ + vblk->vqs = NULL; + } return err; } @@ -1599,6 +1604,12 @@ static int virtblk_freeze_priv(struct virtio_device *vdev) vdev->config->del_vqs(vdev); kfree(vblk->vqs); + /* + * Set to NULL to prevent freeing vqs again after a failed vqs + * allocation during resume. Note that kfree() already handles NULL + * pointers safely. 
+ */ + vblk->vqs = NULL; return 0; } From 2f6b2565d43cdb5087cac23d530cca84aa3d897e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 Oct 2025 08:04:55 -0700 Subject: [PATCH 057/162] block: accumulate memory segment gaps per bio The blk-mq dma iterator has an optimization for requests that align to the device's iommu merge boundary. This boundary may be larger than the device's virtual boundary, but the code had been depending on that queue limit to know ahead of time if the request is guaranteed to align to that optimization. Rather than rely on that queue limit, which many devices may not report, save the lowest set bit of any boundary gap between each segment in the bio while checking the segments. The request stores the value for merging and quickly checking per io if the request can use iova optimizations. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio.c | 1 + block/blk-map.c | 3 +++ block/blk-merge.c | 39 ++++++++++++++++++++++++++++++++++++--- block/blk-mq-dma.c | 3 +-- block/blk-mq.c | 6 ++++++ include/linux/bio.h | 2 ++ include/linux/blk-mq.h | 16 ++++++++++++++++ include/linux/blk_types.h | 12 ++++++++++++ 8 files changed, 77 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index b3a79285c278..7b13bdf72de0 100644 --- a/block/bio.c +++ b/block/bio.c @@ -253,6 +253,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_write_hint = 0; bio->bi_write_stream = 0; bio->bi_status = 0; + bio->bi_bvec_gap_bit = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; diff --git a/block/blk-map.c b/block/blk-map.c index 60faf036fb6e..17a1dc288678 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -459,6 +459,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) if (rq->bio) { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; + rq->phys_gap_bit = bio_seg_gap(rq->q, rq->biotail, bio, + rq->phys_gap_bit); rq->biotail->bi_next = bio; rq->biotail = bio; rq->__data_len += bio->bi_iter.bi_size; @@ -469,6 +471,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) rq->nr_phys_segments = nr_segs; rq->bio = rq->biotail = bio; rq->__data_len = bio->bi_iter.bi_size; + rq->phys_gap_bit = bio->bi_bvec_gap_bit; return 0; } EXPORT_SYMBOL(blk_rq_append_bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index c47d18587a0b..3ca6fbf8b787 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -302,6 +302,12 @@ static unsigned int bio_split_alignment(struct bio *bio, return lim->logical_block_size; } +static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv, + struct bio_vec *bv) +{ + return bv->bv_offset | (bvprv->bv_offset + bvprv->bv_len); +} + /** * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split @@ -319,8 +325,8 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; + unsigned nsegs = 0, bytes = 0, gaps = 0; struct bvec_iter iter; - unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { if (bv.bv_offset & lim->dma_alignment || @@ -331,8 +337,11 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. 
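+		 * While scanning, the boundary offsets are also OR'd into
+		 * 'gaps' below, so that the lowest gap bit can be recorded
+		 * in bio->bi_bvec_gap_bit once the split point is known.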
*/ - if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) - goto split; + if (bvprvp) { + if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) + goto split; + gaps |= bvec_seg_gap(bvprvp, &bv); + } if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && @@ -350,6 +359,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, } *segs = nsegs; + bio->bi_bvec_gap_bit = ffs(gaps); return 0; split: if (bio->bi_opf & REQ_ATOMIC) @@ -385,6 +395,7 @@ split: * big IO can be trival, disable iopoll when split needed. */ bio_clear_polled(bio); + bio->bi_bvec_gap_bit = ffs(gaps); return bytes >> SECTOR_SHIFT; } EXPORT_SYMBOL_GPL(bio_split_io_at); @@ -721,6 +732,21 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq, return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); } +u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, + u8 gaps_bit) +{ + struct bio_vec pb, nb; + + gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit); + gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit); + + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); + if (!biovec_phys_mergeable(q, &pb, &nb)) + gaps_bit = min_not_zero(gaps_bit, ffs(bvec_seg_gap(&pb, &nb))); + return gaps_bit; +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -785,6 +811,9 @@ static struct request *attempt_merge(struct request_queue *q, if (next->start_time_ns < req->start_time_ns) req->start_time_ns = next->start_time_ns; + req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, next->bio, + min_not_zero(next->phys_gap_bit, + req->phys_gap_bit)); req->biotail->bi_next = next->bio; req->biotail = next->biotail; @@ -908,6 +937,8 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req, if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) blk_zone_write_plug_bio_merged(bio); + req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, bio, + req->phys_gap_bit); req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -942,6 +973,8 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, blk_update_mixed_merge(req, bio, true); + req->phys_gap_bit = bio_seg_gap(req->q, bio, req->bio, + req->phys_gap_bit); bio->bi_next = req->bio; req->bio = bio; diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 449950029872..94d3461b5bc8 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -79,8 +79,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, static inline bool blk_can_dma_map_iova(struct request *req, struct device *dma_dev) { - return !((queue_virt_boundary(req->q) + 1) & - dma_get_merge_boundary(dma_dev)); + return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev)); } static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) diff --git a/block/blk-mq.c b/block/blk-mq.c index d626d32f6e57..b2fdeaac0efb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; + rq->phys_gap_bit = 0; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; @@ -668,6 +669,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, goto out_queue_exit; } rq->__data_len = 0; + rq->phys_gap_bit = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; 
@@ -748,6 +750,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
 	blk_mq_rq_time_init(rq, alloc_time_ns);
 	rq->__data_len = 0;
+	rq->phys_gap_bit = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	return rq;
@@ -2674,6 +2677,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 	rq->bio = rq->biotail = bio;
 	rq->__sector = bio->bi_iter.bi_sector;
 	rq->__data_len = bio->bi_iter.bi_size;
+	rq->phys_gap_bit = bio->bi_bvec_gap_bit;
+
 	rq->nr_phys_segments = nr_segs;
 
 	if (bio_integrity(bio))
 		rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -3380,6 +3385,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 	}
 	rq->nr_phys_segments = rq_src->nr_phys_segments;
 	rq->nr_integrity_segments = rq_src->nr_integrity_segments;
+	rq->phys_gap_bit = rq_src->phys_gap_bit;
 
 	if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
 		goto free_and_out;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 16c1c85613b7..ad2d57908c1c 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,8 @@ extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp,
 		struct bio_set *bs);
 int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
 		unsigned *segs, unsigned max_bytes, unsigned len_align);
+u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
+		u8 gaps_bit);
 
 /**
  * bio_next_split - get next @sectors from a bio, splitting if necessary
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b25d12545f46..b54506b3b76d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -152,6 +152,14 @@ struct request {
 	unsigned short nr_phys_segments;
 	unsigned short nr_integrity_segments;
 
+	/*
+	 * The lowest set bit for address gaps between physical segments. This
+	 * provides information necessary for DMA optimization opportunities,
+	 * such as testing if the segments can be coalesced against the
+	 * device's iommu granule.
+	 */
+	unsigned char phys_gap_bit;
+
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct bio_crypt_ctx *crypt_ctx;
 	struct blk_crypto_keyslot *crypt_keyslot;
@@ -208,6 +216,14 @@ struct request {
 	void *end_io_data;
 };
 
+/*
+ * Returns a mask with every bit at or above the lowest recorded segment gap
+ * bit set, or 0 if no gap was recorded (req->phys_gap_bit == 0).
+ */
+static inline unsigned long req_phys_gap_mask(const struct request *req)
+{
+	return ~(((1 << req->phys_gap_bit) >> 1) - 1);
+}
+
 static inline enum req_op req_op(const struct request *req)
 {
 	return req->cmd_flags & REQ_OP_MASK;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8e8d1cc8b06c..53501ebb0623 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -218,6 +218,18 @@ struct bio {
 	enum rw_hint bi_write_hint;
 	u8 bi_write_stream;
 	blk_status_t bi_status;
+
+	/*
+	 * The bvec gap bit indicates the lowest set bit in any address offset
+	 * between all bi_io_vecs. This field is initialized only after the bio
+	 * is split to the hardware limits (see bio_split_io_at()). The value
+	 * may be used to consider DMA optimization when performing that
+	 * mapping. The value is compared to a power of two mask where the
+	 * result depends on any bit set within the mask, so saving the lowest
+	 * bit is sufficient to know if any segment gap collides with the mask.
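+	 *
+	 * Example with illustrative numbers: if one bvec ends at offset
+	 * 0x600 and the next begins at offset 0x200, the accumulated gap
+	 * value is 0x600 | 0x200 = 0x600 and the saved bit is
+	 * ffs(0x600) = 10. Any boundary mask with bit 9 or above set,
+	 * e.g. a 4K IOMMU granule mask of 0xfff, then collides with a
+	 * segment gap.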
+ */ + u8 bi_bvec_gap_bit; + atomic_t __bi_remaining; struct bvec_iter bi_iter; From bc840b21a25a50f00e2b240329c09281506df387 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 Oct 2025 08:04:56 -0700 Subject: [PATCH 058/162] nvme: remove virtual boundary for sgl capable devices The nvme virtual boundary is only required for the PRP format. Devices that can use SGL for DMA don't need it for IO queues. Drop reporting it for such devices; rdma fabrics controllers will continue to use the limit as they currently don't report any boundary requirements, but tcp and fc never needed it in the first place so they get to report no virtual boundary. Applications may continue to align to the same virtual boundaries for optimization purposes if they want, and the driver will continue to decide whether to use the PRP format the same as before if the IO allows it. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/nvme/host/apple.c | 1 + drivers/nvme/host/core.c | 10 +++++----- drivers/nvme/host/fabrics.h | 6 ++++++ drivers/nvme/host/fc.c | 1 + drivers/nvme/host/nvme.h | 7 +++++++ drivers/nvme/host/pci.c | 28 +++++++++++++++++++++++++--- drivers/nvme/host/rdma.c | 1 + drivers/nvme/host/tcp.c | 1 + drivers/nvme/target/loop.c | 1 + 9 files changed, 48 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index f35d3f71d14f..15b3d07f8ccd 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1283,6 +1283,7 @@ static const struct nvme_ctrl_ops nvme_ctrl_ops = { .reg_read64 = apple_nvme_reg_read64, .free_ctrl = apple_nvme_free_ctrl, .get_address = apple_nvme_get_address, + .get_virt_boundary = nvme_get_virt_boundary, }; static void apple_nvme_async_probe(void *data, async_cookie_t cookie) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c0fe50fb7b08..4da937f96ae2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2069,13 +2069,13 @@ static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) } static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl, - struct queue_limits *lim) + struct queue_limits *lim, bool is_admin) { lim->max_hw_sectors = ctrl->max_hw_sectors; lim->max_segments = min_t(u32, USHRT_MAX, min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments)); lim->max_integrity_segments = ctrl->max_integrity_segments; - lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1; + lim->virt_boundary_mask = ctrl->ops->get_virt_boundary(ctrl, is_admin); lim->max_segment_size = UINT_MAX; lim->dma_alignment = 3; } @@ -2177,7 +2177,7 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, int ret; lim = queue_limits_start_update(ns->disk->queue); - nvme_set_ctrl_limits(ns->ctrl, &lim); + nvme_set_ctrl_limits(ns->ctrl, &lim, false); memflags = blk_mq_freeze_queue(ns->disk->queue); ret = queue_limits_commit_update(ns->disk->queue, &lim); @@ -2381,7 +2381,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->nuse = le64_to_cpu(id->nuse); capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); - nvme_set_ctrl_limits(ns->ctrl, &lim); + nvme_set_ctrl_limits(ns->ctrl, &lim, false); nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info); nvme_set_chunk_sectors(ns, id, &lim); if (!nvme_update_disk_info(ns, id, &lim)) @@ -3588,7 +3588,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); lim = 
queue_limits_start_update(ctrl->admin_q); - nvme_set_ctrl_limits(ctrl, &lim); + nvme_set_ctrl_limits(ctrl, &lim, true); ret = queue_limits_commit_update(ctrl->admin_q, &lim); if (ret) goto out_free; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 1b58ee7d0dce..caf5503d0833 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -217,6 +217,12 @@ static inline unsigned int nvmf_nr_io_queues(struct nvmf_ctrl_options *opts) min(opts->nr_poll_queues, num_online_cpus()); } +static inline unsigned long nvmf_get_virt_boundary(struct nvme_ctrl *ctrl, + bool is_admin) +{ + return 0; +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 03987f497a5b..70c066c2e2d4 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3360,6 +3360,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, + .get_virt_boundary = nvmf_get_virt_boundary, }; static void diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 928c748ccbd1..9a5f28c5103c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -558,6 +558,12 @@ static inline bool nvme_ns_has_pi(struct nvme_ns_head *head) return head->pi_type && head->ms == head->pi_size; } +static inline unsigned long nvme_get_virt_boundary(struct nvme_ctrl *ctrl, + bool is_admin) +{ + return NVME_CTRL_PAGE_SIZE - 1; +} + struct nvme_ctrl_ops { const char *name; struct module *module; @@ -578,6 +584,7 @@ struct nvme_ctrl_ops { int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); void (*print_device_info)(struct nvme_ctrl *ctrl); bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl); + unsigned long (*get_virt_boundary)(struct nvme_ctrl *ctrl, bool is_admin); }; /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c916176bd9f0..3c1727df1e36 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -613,9 +613,22 @@ static inline enum nvme_use_sgl nvme_pci_use_sgls(struct nvme_dev *dev, struct nvme_queue *nvmeq = req->mq_hctx->driver_data; if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)) { - if (nvme_req(req)->flags & NVME_REQ_USERCMD) - return SGL_FORCED; - if (req->nr_integrity_segments > 1) + /* + * When the controller is capable of using SGL, there are + * several conditions that we force to use it: + * + * 1. A request containing page gaps within the controller's + * mask can not use the PRP format. + * + * 2. User commands use SGL because that lets the device + * validate the requested transfer lengths. + * + * 3. Multiple integrity segments must use SGL as that's the + * only way to describe such a command in NVMe. 
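+	 *
+	 * (Condition 1 is what the req_phys_gap_mask() test below
+	 * implements: a recorded gap bit inside NVME_CTRL_PAGE_SIZE - 1
+	 * means the payload is not PRP-aligned, so only SGL can map it.)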
+ */ + if (req_phys_gap_mask(req) & (NVME_CTRL_PAGE_SIZE - 1) || + nvme_req(req)->flags & NVME_REQ_USERCMD || + req->nr_integrity_segments > 1) return SGL_FORCED; return SGL_SUPPORTED; } @@ -3243,6 +3256,14 @@ static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl) return dma_pci_p2pdma_supported(dev->dev); } +static unsigned long nvme_pci_get_virt_boundary(struct nvme_ctrl *ctrl, + bool is_admin) +{ + if (!nvme_ctrl_sgl_supported(ctrl) || is_admin) + return NVME_CTRL_PAGE_SIZE - 1; + return 0; +} + static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .name = "pcie", .module = THIS_MODULE, @@ -3257,6 +3278,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .get_address = nvme_pci_get_address, .print_device_info = nvme_pci_print_device_info, .supports_pci_p2pdma = nvme_pci_supports_pci_p2pdma, + .get_virt_boundary = nvme_pci_get_virt_boundary, }; static int nvme_dev_map(struct nvme_dev *dev) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 190a4cfa8a5e..35c0822edb2d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2202,6 +2202,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, .stop_ctrl = nvme_rdma_stop_ctrl, + .get_virt_boundary = nvme_get_virt_boundary, }; /* diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 9a96df1a511c..29ad4735fac6 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2865,6 +2865,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .delete_ctrl = nvme_tcp_delete_ctrl, .get_address = nvme_tcp_get_address, .stop_ctrl = nvme_tcp_stop_ctrl, + .get_virt_boundary = nvmf_get_virt_boundary, }; static bool diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f85a8441bcc6..fc8e7c9ad858 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -511,6 +511,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .submit_async_event = nvme_loop_submit_async_event, .delete_ctrl = nvme_loop_delete_ctrl_host, .get_address = nvmf_get_address, + .get_virt_boundary = nvme_get_virt_boundary, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) From 86a9ce21f5b781c56eba23cbbd2264ab74778ab0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Nov 2025 15:53:32 +0100 Subject: [PATCH 059/162] block: don't return 1 for the fallback case in blkdev_get_zone_info blkdev_do_report_zones returns the number of reported zones, but blkdev_get_zone_info returns 0 or an errno. Translate to the expected return value in blkdev_report_zone_fallback. 
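To see the two conventions side by side, here is a minimal userspace
sketch of the translation described above. The stub names are
hypothetical stand-ins for the kernel helpers, chosen only for
illustration:

```
#include <assert.h>
#include <errno.h>

/* Stand-in for a count-style reporter: <0 is -errno, >=0 is the
 * number of zones reported. */
static int report_zones_stub(int simulated)
{
	return simulated;
}

/* Adapt the count-style result to the 0-or-negative-errno convention
 * expected by callers of the fallback path. */
static int get_zone_info_stub(int simulated)
{
	int ret = report_zones_stub(simulated);

	if (ret < 0)
		return ret;	/* propagate the error */
	if (ret == 0)
		return -EIO;	/* asked for one zone, none reported */
	return 0;		/* success */
}

int main(void)
{
	assert(get_zone_info_stub(-ENOMEM) == -ENOMEM);
	assert(get_zone_info_stub(0) == -EIO);
	assert(get_zone_info_stub(1) == 0);
	return 0;
}
```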
Fixes: b037d41762fd ("block: introduce blkdev_get_zone_info()")
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index c5226bcaaa94..8204214e3b89 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -895,8 +895,14 @@ static int blkdev_report_zone_fallback(struct block_device *bdev,
 		.data = zone,
 		.report_active = true,
 	};
+	int error;
 
-	return blkdev_do_report_zones(bdev, sector, 1, &args);
+	error = blkdev_do_report_zones(bdev, sector, 1, &args);
+	if (error < 0)
+		return error;
+	if (error == 0)
+		return -EIO;
+	return 0;
 }
 
 /*

From bbac6e0fa57f6624123edf20ba8f9b7c0e092117 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Fri, 7 Nov 2025 15:38:42 +0900
Subject: [PATCH 060/162] block: improve blk_zone_wp_offset()

blk_zone_wp_offset() is always called with a struct blk_zone obtained
from the device, that is, it will never see the BLK_ZONE_COND_ACTIVE
condition. However, handling this condition makes this function more
robust and will also avoid issues when propagating cached report
requests to underlying stacked devices is implemented.

Add BLK_ZONE_COND_ACTIVE as a new case in the blk_zone_wp_offset()
switch. While at it, change the handling of the full condition to
return UINT_MAX for the zone write pointer, to reflect the fact that
the write pointer of a full zone is invalid.

Signed-off-by: Damien Le Moal
Reviewed-by: Bart Van Assche
Reviewed-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 8204214e3b89..7ce7b8ea5a4f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -800,18 +800,18 @@ static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
 	case BLK_ZONE_COND_IMP_OPEN:
 	case BLK_ZONE_COND_EXP_OPEN:
 	case BLK_ZONE_COND_CLOSED:
+	case BLK_ZONE_COND_ACTIVE:
 		return zone->wp - zone->start;
-	case BLK_ZONE_COND_FULL:
-		return zone->len;
 	case BLK_ZONE_COND_EMPTY:
 		return 0;
+	case BLK_ZONE_COND_FULL:
 	case BLK_ZONE_COND_NOT_WP:
 	case BLK_ZONE_COND_OFFLINE:
 	case BLK_ZONE_COND_READONLY:
 	default:
 		/*
-		 * Conventional, offline and read-only zones do not have a valid
-		 * write pointer.
+		 * Conventional, full, offline and read-only zones do not have
+		 * a valid write pointer.
 		 */
 		return UINT_MAX;
 	}

From e2b0ec776164c11569ee021fac89596a2642654c Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Fri, 7 Nov 2025 15:38:43 +0900
Subject: [PATCH 061/162] block: refactor disk_zone_wplug_sync_wp_offset()

The helper function blk_zone_wp_offset() is called from
disk_zone_wplug_sync_wp_offset(), and again from
blk_revalidate_seq_zone() right after the call to
disk_zone_wplug_sync_wp_offset(). Change
disk_zone_wplug_sync_wp_offset() to return the value obtained from
blk_zone_wp_offset() to avoid this double call, which slightly
simplifies blk_revalidate_seq_zone().
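As a compact cross-check of the write pointer rules above, here is a
self-contained userspace model of the switch; the condition names are
simplified stand-ins for the BLK_ZONE_COND_* values, not the kernel
enum:

```
#include <assert.h>
#include <limits.h>
#include <stdint.h>

enum cond { EMPTY, IMP_OPEN, EXP_OPEN, CLOSED, ACTIVE, FULL, NOT_WP };

struct zone {
	uint64_t start, len, wp;
	enum cond cond;
};

static unsigned int wp_offset(const struct zone *z)
{
	switch (z->cond) {
	case IMP_OPEN:
	case EXP_OPEN:
	case CLOSED:
	case ACTIVE:
		return z->wp - z->start;
	case EMPTY:
		return 0;
	default:
		break;
	}
	/* full, conventional, offline, read-only: no valid write pointer */
	return UINT_MAX;
}

int main(void)
{
	struct zone z = { .start = 1024, .len = 1024, .wp = 1536 };

	z.cond = EXP_OPEN;
	assert(wp_offset(&z) == 512);
	z.cond = FULL;		/* full zones no longer report len */
	assert(wp_offset(&z) == UINT_MAX);
	z.cond = EMPTY;
	assert(wp_offset(&z) == 0);
	return 0;
}
```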
Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-zoned.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7ce7b8ea5a4f..b580d59ce210 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -817,23 +817,24 @@ static unsigned int blk_zone_wp_offset(struct blk_zone *zone) } } -static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk, - struct blk_zone *zone) +static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk, + struct blk_zone *zone) { struct blk_zone_wplug *zwplug; - unsigned long flags; + unsigned int wp_offset = blk_zone_wp_offset(zone); zwplug = disk_get_zone_wplug(disk, zone->start); - if (!zwplug) - return; + if (zwplug) { + unsigned long flags; - spin_lock_irqsave(&zwplug->lock, flags); - if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) - disk_zone_wplug_set_wp_offset(disk, zwplug, - blk_zone_wp_offset(zone)); - spin_unlock_irqrestore(&zwplug->lock, flags); + spin_lock_irqsave(&zwplug->lock, flags); + if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) + disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + } - disk_put_zone_wplug(zwplug); + return wp_offset; } /** @@ -2101,9 +2102,7 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; - disk_zone_wplug_sync_wp_offset(disk, zone); - - wp_offset = blk_zone_wp_offset(zone); + wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone); if (!wp_offset || wp_offset >= zone->capacity) return 0; From 25976c314f6596254c9b1e2291d94393b7d5ae81 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 7 Nov 2025 15:38:44 +0900 Subject: [PATCH 062/162] block: introduce bdev_zone_start() Introduce the function bdev_zone_start() as a more explicit (and clear) replacement for ALIGN_DOWN() to get the start sector of a zone containing a particular sector of a zoned block device. Use this new helper in blkdev_get_zone_info() and blkdev_report_zones_cached(). 
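bdev_zone_start() relies on zone sizes being a power of two (which the
block layer enforces), making the bitwise AND equivalent to
ALIGN_DOWN(). A quick standalone check of that identity:

```
#include <assert.h>
#include <stdint.h>

static uint64_t zone_start(uint64_t sector, uint64_t zone_sectors)
{
	/* valid only when zone_sectors is a power of two */
	return sector & ~(zone_sectors - 1);
}

int main(void)
{
	uint64_t zs = 1 << 16;	/* 65536-sector zones */

	assert(zone_start(0, zs) == 0);
	assert(zone_start(zs - 1, zs) == 0);
	assert(zone_start(zs, zs) == zs);
	assert(zone_start(3 * zs + 123, zs) == 3 * zs);
	return 0;
}
```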
Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Reviewed-by: Hannes Reinecke
Signed-off-by: Jens Axboe
---
 block/blk-zoned.c      | 4 ++--
 include/linux/blkdev.h | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index b580d59ce210..3791755bc6ad 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -950,7 +950,7 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
 		return -EINVAL;
 
 	memset(zone, 0, sizeof(*zone));
-	sector = ALIGN_DOWN(sector, zone_sectors);
+	sector = bdev_zone_start(bdev, sector);
 
 	if (!blkdev_has_cached_report_zones(bdev))
 		return blkdev_report_zone_fallback(bdev, sector, zone);
@@ -1068,7 +1068,7 @@ int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
 		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
 	}
 
-	for (sector = ALIGN_DOWN(sector, zone_sectors);
+	for (sector = bdev_zone_start(bdev, sector);
 	     sector < capacity && idx < nr_zones;
 	     sector += zone_sectors, idx++) {
 		ret = blkdev_get_zone_info(bdev, sector, &zone);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6a498aa7f7e7..2fff8a80dbd2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1522,6 +1522,12 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 	return q->limits.chunk_sectors;
 }
 
+static inline sector_t bdev_zone_start(struct block_device *bdev,
+				       sector_t sector)
+{
+	return sector & ~(bdev_zone_sectors(bdev) - 1);
+}
+
 static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev,
 						   sector_t sector)
 {

From 3d7b1dbaa09877db78560d8020ea4d1c972720b9 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Tue, 21 Oct 2025 20:28:00 +0800
Subject: [PATCH 063/162] MAINTAINERS: Update Yu Kuai's E-mail address

Change to my new email address on fnnas.com.

Link: https://lore.kernel.org/linux-raid/20251021122800.3158836-1-yukuai@fnnas.com
Signed-off-by: Yu Kuai
Reviewed-by: Paul Menzel
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 545a4776795e..ce7f67e09910 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4300,7 +4300,7 @@ F:	Documentation/filesystems/befs.rst
 F:	fs/befs/
 
 BFQ I/O SCHEDULER
-M:	Yu Kuai <yukuai3@huawei.com>
+M:	Yu Kuai <yukuai@fnnas.com>
 L:	linux-block@vger.kernel.org
 S:	Odd Fixes
 F:	Documentation/block/bfq-iosched.rst
@@ -23842,7 +23842,7 @@ F:	include/linux/property.h
 
 SOFTWARE RAID (Multiple Disks) SUPPORT
 M:	Song Liu
-M:	Yu Kuai <yukuai3@huawei.com>
+M:	Yu Kuai <yukuai@fnnas.com>
 L:	linux-raid@vger.kernel.org
 S:	Supported
 Q:	https://patchwork.kernel.org/project/linux-raid/list/

From cc394b94dc40b661efc9895665abf03640ffff2d Mon Sep 17 00:00:00 2001
From: Xiao Ni
Date: Sun, 28 Sep 2025 09:24:24 +0800
Subject: [PATCH 064/162] md: delete mddev kobj before deleting gendisk kobj

In the synchronous del_gendisk path, the gendisk is deleted first and
the directory /sys/block/md is removed; the mddev kobj is then released
in a delayed work. With debug logging enabled in sysfs_remove_group(),
this shows up as 'sysfs group bitmap not found for kobject md': the
parent kobj has already been deleted, so the parent directory cannot be
found. The creation path allocates the gendisk first and adds the mddev
kobj afterwards, so teardown should delete the mddev kobj before
deleting the gendisk.

Before commit 9e59d609763f ("md: call del_gendisk in control path"),
the mddev kobj was released first: if the kobj had not been deleted
yet, the release did the cleanup and deleted the kobj, and del_gendisk
was called afterwards to release the gendisk kobj, so an explicit
kobject_del() of the mddev kobj was not needed. After this patch the
sequence in the synchronous del_gendisk path changes, so kobject_del()
must be called explicitly. The new sequence is:

1. kobject_del deletes the mddev kobj
2. del_gendisk deletes the gendisk kobj
3. mddev_delayed_delete releases the mddev kobj
4. md_kobj_release releases the gendisk kobj

Link: https://lore.kernel.org/linux-raid/20250928012424.61370-1-xni@redhat.com
Fixes: 9e59d609763f ("md: call del_gendisk in control path")
Signed-off-by: Xiao Ni
Reviewed-by: Li Nan
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 41c476b40c7a..8128c8839a08 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -941,8 +941,10 @@ void mddev_unlock(struct mddev *mddev)
 		 * do_md_stop. dm raid only uses md_stop to stop. So dm raid
 		 * doesn't need to check MD_DELETED when getting reconfig lock
 		 */
-		if (test_bit(MD_DELETED, &mddev->flags))
+		if (test_bit(MD_DELETED, &mddev->flags)) {
+			kobject_del(&mddev->kobj);
 			del_gendisk(mddev->gendisk);
+		}
 	}
 }
 EXPORT_SYMBOL_GPL(mddev_unlock);

From 0dc76205549b4c25705e54345f211b9f66e018a0 Mon Sep 17 00:00:00 2001
From: Yun Zhou
Date: Wed, 15 Oct 2025 16:32:27 +0800
Subject: [PATCH 065/162] md: fix rcu protection in md_wakeup_thread

We attempted to use RCU to protect the pointer 'thread', but callers
passed its value directly to md_wakeup_thread(). This means the
RCU-protected pointer is loaded before rcu_read_lock(), which renders
rcu_read_lock() ineffective and could lead to a use-after-free.

Link: https://lore.kernel.org/linux-raid/20251015083227.1079009-1-yun.zhou@windriver.com
Fixes: 446931543982 ("md: protect md_thread with rcu")
Signed-off-by: Yun Zhou
Reviewed-by: Li Nan
Reviewed-by: Yu Kuai
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 14 ++++++--------
 drivers/md/md.h |  8 +++++++-
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8128c8839a08..6062e0deb616 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -99,7 +99,7 @@ static int remove_and_add_spares(struct mddev *mddev,
 				 struct md_rdev *this);
 static void mddev_detach(struct mddev *mddev);
 static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
-static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
+static void md_wakeup_thread_directly(struct md_thread __rcu **thread);
 
 /*
  * Default number of read corrections we'll attempt on an rdev
@@ -5136,7 +5136,7 @@ static void stop_sync_thread(struct mddev *mddev, bool locked)
 	 * Thread might be blocked waiting for metadata update which will now
 	 * never happen
 	 */
-	md_wakeup_thread_directly(mddev->sync_thread);
+	md_wakeup_thread_directly(&mddev->sync_thread);
 	if (work_pending(&mddev->sync_work))
 		flush_work(&mddev->sync_work);
 
@@ -8375,22 +8375,21 @@ static int md_thread(void *arg)
 	return 0;
 }
 
-static void md_wakeup_thread_directly(struct md_thread __rcu *thread)
+static void md_wakeup_thread_directly(struct md_thread __rcu **thread)
 {
 	struct md_thread *t;
 
 	rcu_read_lock();
-	t = rcu_dereference(thread);
+	t = rcu_dereference(*thread);
 	if (t)
 		wake_up_process(t->tsk);
 	rcu_read_unlock();
}
 
-void md_wakeup_thread(struct md_thread __rcu *thread)
+void __md_wakeup_thread(struct md_thread __rcu *thread)
 {
 	struct md_thread *t;
 
-	rcu_read_lock();
 	t = rcu_dereference(thread);
 	if (t) {
 		pr_debug("md: waking up MD thread %s.\n", t->tsk->comm);
@@ -8398,9 +8397,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread)
 		if (wq_has_sleeper(&t->wqueue))
 			wake_up(&t->wqueue);
 	}
-	rcu_read_unlock();
 }
-EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(__md_wakeup_thread);
 
 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
 		struct mddev *mddev, const char *name)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1979c2d4fe89..5d5f780b8447 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -882,6 +882,12 @@ struct md_io_clone {
 
 #define THREAD_WAKEUP 0
 
+#define md_wakeup_thread(thread) do {	\
+	rcu_read_lock();		\
+	__md_wakeup_thread(thread);	\
+	rcu_read_unlock();		\
+} while (0)
+
 static inline void safe_put_page(struct page *p)
 {
 	if (p) put_page(p);
@@ -895,7 +901,7 @@ extern struct md_thread *md_register_thread(
 	struct mddev *mddev, const char *name);
 extern void md_unregister_thread(struct mddev *mddev,
 				 struct md_thread __rcu **threadp);
-extern void md_wakeup_thread(struct md_thread __rcu *thread);
+extern void __md_wakeup_thread(struct md_thread __rcu *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern enum sync_action md_sync_action(struct mddev *mddev);

From 082d680faf262cd52b51ca0b8dcbc7a9f311c220 Mon Sep 17 00:00:00 2001
From: Wu Guanghao
Date: Mon, 8 Sep 2025 16:20:37 +0800
Subject: [PATCH 066/162] md: factor out code into md_should_do_recovery()

In md_check_recovery(), use a new helper to make the code cleaner.

Link: https://lore.kernel.org/linux-raid/e62894c8-d916-94bc-ef48-3c60e6e1fc5d@huawei.com
Signed-off-by: Wu Guanghao
Reviewed-by: Yu Kuai
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 59 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6062e0deb616..7352e749e1b5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9978,6 +9978,52 @@ static void unregister_sync_thread(struct mddev *mddev)
 	md_reap_sync_thread(mddev);
 }
 
+static bool md_should_do_recovery(struct mddev *mddev)
+{
+	/*
+	 * As long as one of the following flags is set, recovery needs
+	 * to run or clean up.
+	 */
+	if (test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+		return true;
+
+	/*
+	 * If no flags are set and the array is in read-only status,
+	 * there is nothing to do.
+	 */
+	if (!md_is_rdwr(mddev))
+		return false;
+
+	/*
+	 * MD_SB_CHANGE_PENDING indicates that the array is switching from
+	 * clean to active, and no action is needed for now.
+	 * All other MD_SB_* flags require updating the superblock.
+	 */
+	if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING))
+		return true;
+
+	/*
+	 * With in-kernel metadata, safemode == 1 means the array has seen
+	 * no writes for a while and must be marked clean in the superblock.
+	 */
+	if (mddev->external == 0 && mddev->safemode == 1)
+		return true;
+
+	/*
+	 * When the system is about to restart or the process receives a
+	 * signal, the array needs to be synchronized as soon as possible.
+	 * Once the data synchronization is completed, the array status
+	 * needs to change to in_sync.
+	 */
+	if (mddev->safemode == 2 && !mddev->in_sync &&
+	    mddev->resync_offset == MaxSector)
+		return true;
+
+	return false;
+}
+
 /*
  * This routine is regularly called by all per-raid-array threads to
  * deal with generic issues like resync and super-block update.
 */
@@ -10014,18 +10060,7 @@ void md_check_recovery(struct mddev *mddev)
 		flush_signals(current);
 	}
 
-	if (!md_is_rdwr(mddev) &&
-	    !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) &&
-	    !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
-		return;
-	if ( ! (
-		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
-		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
-		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
-		(mddev->external == 0 && mddev->safemode == 1) ||
-		(mddev->safemode == 2
-		 && !mddev->in_sync && mddev->resync_offset == MaxSector)
-		))
+	if (!md_should_do_recovery(mddev))
 		return;
 
 	if (mddev_trylock(mddev)) {

From 7fc8f632e68cd4db3bfee11ff7aa2ac731c5147a Mon Sep 17 00:00:00 2001
From: John Garry
Date: Wed, 3 Sep 2025 16:10:52 +0000
Subject: [PATCH 067/162] md/md-linear: Enable atomic writes

All the infrastructure has already been plumbed to support this for
stacked devices, so just enable the request_queue limits features flag.

A note about chunk sectors for linear arrays: while it is possible to
set a chunk sectors parameter when building a linear array, it
specifies the granularity at which data sectors from the device are
used. It is not the same as a stripe size, as for RAID0. As such, it is
not appropriate to set the chunk_sectors request_queue limit to the
same value, as the chunk_sectors limit is a boundary that requests must
not straddle. However, the request_queue limit max_hw_sectors is set to
chunk sectors, which has almost the same effect as setting the
chunk_sectors limit.

Link: https://lore.kernel.org/linux-raid/20250903161052.3326176-1-john.g.garry@oracle.com
Signed-off-by: John Garry
Reviewed-by: Yu Kuai
Signed-off-by: Yu Kuai
---
 drivers/md/md-linear.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 7033d982d377..25a6ddedea65 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -75,6 +75,7 @@ static int linear_set_limits(struct mddev *mddev)
 	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
 	lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors;
 	lim.io_min = mddev->chunk_sectors << 9;
+	lim.features |= BLK_FEAT_ATOMIC_WRITES;
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err)
 		return err;

From 46caa40534fcab1b341455b156c2bcf3674a6bdc Mon Sep 17 00:00:00 2001
From: Chen Ni
Date: Wed, 10 Sep 2025 17:19:12 +0800
Subject: [PATCH 068/162] md/md-llbitmap: Remove unneeded semicolon

Remove unnecessary semicolons reported by Coccinelle/coccicheck and the
semantic patch at scripts/coccinelle/misc/semicolon.cocci.

Link: https://lore.kernel.org/linux-raid/20250910091912.25624-1-nichen@iscas.ac.cn
Signed-off-by: Chen Ni
Signed-off-by: Yu Kuai
---
 drivers/md/md-llbitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c
index 1eb434306162..9c1ade19b774 100644
--- a/drivers/md/md-llbitmap.c
+++ b/drivers/md/md-llbitmap.c
@@ -378,7 +378,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
 		case BitClean:
 			pctl->state[pos] = BitDirty;
 			break;
-		};
+		}
 	}
 }

From 90e3bb44c0a86e245d8e5c6520206fa113acb1ee Mon Sep 17 00:00:00 2001
From: Xiao Ni
Date: Wed, 29 Oct 2025 14:34:19 +0800
Subject: [PATCH 069/162] md: avoid repeated calls to del_gendisk

There is a use-after-free problem, found by test case 23rdev-lifetime:

Oops: general protection fault, probably for non-canonical address 0xdead000000000122
RIP: 0010:bdi_unregister+0x4b/0x170
Call Trace:
 __del_gendisk+0x356/0x3e0
 mddev_unlock+0x351/0x360
 rdev_attr_store+0x217/0x280
 kernfs_fop_write_iter+0x14a/0x210
 vfs_write+0x29e/0x550
 ksys_write+0x74/0xf0
 do_syscall_64+0xbb/0x380
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7ff5250a177e

The sequence is:
1. rdev remove path gets reconfig_mutex
2. rdev remove path releases reconfig_mutex in mddev_unlock
3.
md stop calls do_md_stop and sets MD_DELETED 4. rdev remove path calls del_gendisk because MD_DELETED is set 5. md stop path release reconfig_mutex and calls del_gendisk again So there is a race condition we should resolve. This patch adds a flag MD_DO_DELETE to avoid the race condition. Link: https://lore.kernel.org/linux-raid/20251029063419.21700-1-xni@redhat.com Fixes: 9e59d609763f ("md: call del_gendisk in control path") Signed-off-by: Xiao Ni Suggested-by: Yu Kuai Reviewed-by: Li Nan Signed-off-by: Yu Kuai --- drivers/md/md.c | 3 ++- drivers/md/md.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7352e749e1b5..66219cf16a5e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -941,7 +941,8 @@ void mddev_unlock(struct mddev *mddev) * do_md_stop. dm raid only uses md_stop to stop. So dm raid * doesn't need to check MD_DELETED when getting reconfig lock */ - if (test_bit(MD_DELETED, &mddev->flags)) { + if (test_bit(MD_DELETED, &mddev->flags) && + !test_and_set_bit(MD_DO_DELETE, &mddev->flags)) { kobject_del(&mddev->kobj); del_gendisk(mddev->gendisk); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 5d5f780b8447..fd6e001c1d38 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -354,6 +354,7 @@ enum mddev_flags { MD_HAS_MULTIPLE_PPLS, MD_NOT_READY, MD_BROKEN, + MD_DO_DELETE, MD_DELETED, }; From a811db39198817c91b60adb5e2a8f8bfe76012e4 Mon Sep 17 00:00:00 2001 From: Huiwen He Date: Sun, 2 Nov 2025 23:25:40 +0800 Subject: [PATCH 070/162] md/raid5: remove redundant __GFP_NOWARN The __GFP_NOWARN flag was included in GFP_NOWAIT since commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT"). So remove the redundant __GFP_NOWARN flag. Link: https://lore.kernel.org/linux-raid/20251102152540.871568-1-hehuiwen@kylinos.cn Signed-off-by: Huiwen He Reviewed-by: Li Nan Reviewed-by: Xiao Ni Signed-off-by: Yu Kuai --- drivers/md/raid5-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index ba768ca7f422..e29e69335c69 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3104,7 +3104,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) goto out_mempool; spin_lock_init(&log->tree_lock); - INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT); thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); From 9517b82d8d422d426a988b213fdd45c6b417b86d Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Sat, 8 Nov 2025 15:02:02 +0800 Subject: [PATCH 071/162] nbd: defer config put in recv_work There is one uaf issue in recv_work when running NBD_CLEAR_SOCK and NBD_CMD_RECONFIGURE: nbd_genl_connect // conf_ref=2 (connect and recv_work A) nbd_open // conf_ref=3 recv_work A done // conf_ref=2 NBD_CLEAR_SOCK // conf_ref=1 nbd_genl_reconfigure // conf_ref=2 (trigger recv_work B) close nbd // conf_ref=1 recv_work B config_put // conf_ref=0 atomic_dec(&config->recv_threads); -> UAF Or only running NBD_CLEAR_SOCK: nbd_genl_connect // conf_ref=2 nbd_open // conf_ref=3 NBD_CLEAR_SOCK // conf_ref=2 close nbd nbd_release config_put // conf_ref=1 recv_work config_put // conf_ref=0 atomic_dec(&config->recv_threads); -> UAF Commit 87aac3a80af5 ("nbd: call nbd_config_put() before notifying the waiter") moved nbd_config_put() to run before waking up the waiter in recv_work, in order to ensure that nbd_start_device_ioctl() would not be woken up while nbd->task_recv was still 
uncleared. However, in nbd_start_device_ioctl(), after being woken up it explicitly calls flush_workqueue() to make sure all current works are finished. Therefore, there is no need to move the config put ahead of the wakeup. Move nbd_config_put() to the end of recv_work, so that the reference is held for the whole lifetime of the worker thread. This makes sure the config cannot be freed while recv_work is still running, even if clear + reconfigure interleave. In addition, we don't need to worry about recv_work dropping the last nbd_put (which causes deadlock): path A (netlink with NBD_CFLAG_DESTROY_ON_DISCONNECT): connect // nbd_refs=1 (trigger recv_work) open nbd // nbd_refs=2 NBD_CLEAR_SOCK close nbd nbd_release nbd_disconnect_and_put flush_workqueue // recv_work done nbd_config_put nbd_put // nbd_refs=1 nbd_put // nbd_refs=0 queue_work path B (netlink without NBD_CFLAG_DESTROY_ON_DISCONNECT): connect // nbd_refs=2 (trigger recv_work) open nbd // nbd_refs=3 NBD_CLEAR_SOCK // conf_refs=2 close nbd nbd_release nbd_config_put // conf_refs=1 nbd_put // nbd_refs=2 recv_work done // conf_refs=0, nbd_refs=1 rmmod // nbd_refs=0 Reported-by: syzbot+56fbf4c7ddf65e95c7cc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6907edce.a70a0220.37351b.0014.GAE@google.com/T/ Fixes: 87aac3a80af5 ("nbd: make the config put is called before the notifying the waiter") Depends-on: e2daec488c57 ("nbd: Fix hungtask when nbd_config_put") Signed-off-by: Zheng Qixing Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 1188f32a5e5e..0df7b33af677 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1018,9 +1018,9 @@ static void recv_work(struct work_struct *work) nbd_mark_nsock_dead(nbd, nsock, 1); mutex_unlock(&nsock->tx_lock); - nbd_config_put(nbd); atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); + nbd_config_put(nbd); kfree(args); } From 4cda40dce95a5b4ec0620a84f322472730d01f7a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 7 Nov 2025 21:06:13 -0700 Subject: [PATCH 072/162] block: clean up indentation in blk_rq_map_iter_init() blk_rq_map_iter_init() has one line with 7 spaces of indentation and another that mixes 1 tab and 8 spaces. Convert both to tabs. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 94d3461b5bc8..a7ef25843280 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -142,7 +142,7 @@ static inline void blk_rq_map_iter_init(struct request *rq, .bi_size = rq->special_vec.bv_len, } }; - } else if (bio) { + } else if (bio) { *iter = (struct blk_map_iter) { .bio = bio, .bvecs = bio->bi_io_vec, @@ -150,7 +150,7 @@ static inline void blk_rq_map_iter_init(struct request *rq, }; } else { /* the internal flush request may not have bio attached */ - *iter = (struct blk_map_iter) {}; + *iter = (struct blk_map_iter) {}; } } From 6c6b66f65e5510eb5de6a2342d6b37ac5c7bd678 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Thu, 18 Sep 2025 19:57:58 +0800 Subject: [PATCH 073/162] md: prevent adding disks with larger logical_block_size to active arrays When adding a disk to a md array, avoid updating the array's logical_block_size to match the new disk. This prevents accidental partition table loss that renders the array unusable. The later patch will introduce a way to configure the array's logical_block_size. 
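For illustration, the acceptance rule this patch enforces amounts to
the following check; this is a hedged userspace model with made-up
names, not the md code itself:

```
#include <assert.h>
#include <stdbool.h>

static bool may_add_member(unsigned int member_lbs, unsigned int array_lbs)
{
	/* Growing the array LBS at runtime could invalidate existing
	 * partition tables, so larger-LBS members are rejected. */
	return member_lbs <= array_lbs;
}

int main(void)
{
	assert(may_add_member(512, 512));
	assert(may_add_member(512, 4096));	/* smaller is fine */
	assert(!may_add_member(4096, 512));	/* would grow array LBS */
	return 0;
}
```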
The issue was introduced before Linux 2.6.12-rc2.

Link: https://lore.kernel.org/linux-raid/20250918115759.334067-2-linan666@huaweicloud.com/
Fixes: d2e45eace8 ("[PATCH] Fix raid "bio too big" failures")
Signed-off-by: Li Nan
Reviewed-by: Martin K. Petersen
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 66219cf16a5e..9f9dd1f4496f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6067,6 +6067,13 @@ int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
 	if (mddev_is_dm(mddev))
 		return 0;
 
+	if (queue_logical_block_size(rdev->bdev->bd_disk->queue) >
+	    queue_logical_block_size(mddev->gendisk->queue)) {
+		pr_err("%s: incompatible logical_block_size, can not add\n",
+		       mdname(mddev));
+		return -EINVAL;
+	}
+
 	lim = queue_limits_start_update(mddev->gendisk->queue);
 	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
 				mddev->gendisk->disk_name);

From 0ce112d9171ad766d4c6716951e73f91a0bfc184 Mon Sep 17 00:00:00 2001
From: Li Nan
Date: Mon, 3 Nov 2025 20:57:53 +0800
Subject: [PATCH 074/162] md: delete md_redundancy_group when array is
 becoming inactive

'md_redundancy_group' is created in md_run() and deleted in
del_gendisk(), but the two operations are not paired. Writing
inactive/active to the sysfs attribute array_state can trigger md_run()
multiple times without del_gendisk(), leading to duplicate creation as
below:

sysfs: cannot create duplicate filename '/devices/virtual/block/md0/md/sync_action'
Call Trace:
 dump_stack_lvl+0x9f/0x120
 dump_stack+0x14/0x20
 sysfs_warn_dup+0x96/0xc0
 sysfs_add_file_mode_ns+0x19c/0x1b0
 internal_create_group+0x213/0x830
 sysfs_create_group+0x17/0x20
 md_run+0x856/0xe60
 ? __x64_sys_openat+0x23/0x30
 do_md_run+0x26/0x1d0
 array_state_store+0x559/0x760
 md_attr_store+0xc9/0x1e0
 sysfs_kf_write+0x6f/0xa0
 kernfs_fop_write_iter+0x141/0x2a0
 vfs_write+0x1fc/0x5a0
 ksys_write+0x79/0x180
 __x64_sys_write+0x1d/0x30
 x64_sys_call+0x2818/0x2880
 do_syscall_64+0xa9/0x580
 entry_SYSCALL_64_after_hwframe+0x4b/0x53
md: cannot register extra attributes for md0

The group's creation depends on 'pers', so its lifecycle cannot be
aligned with the gendisk. Fix this issue by triggering the
'md_redundancy_group' deletion when the array is becoming inactive.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-2-linan666@huaweicloud.com
Fixes: 790abe4d77af ("md: remove/add redundancy group only in level change")
Signed-off-by: Li Nan
Reviewed-by: Xiao Ni
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9f9dd1f4496f..9fa38e9829f4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6879,6 +6879,10 @@ static int do_md_stop(struct mddev *mddev, int mode)
 	if (!md_is_rdwr(mddev))
 		set_disk_ro(disk, 0);
 
+	if (mode == 2 && mddev->pers->sync_request &&
+	    mddev->to_remove == NULL)
+		mddev->to_remove = &md_redundancy_group;
+
 	__md_stop_writes(mddev);
 	__md_stop(mddev);

From 381a3ce1c0ffed647c9b913e142b099c7e9d5afc Mon Sep 17 00:00:00 2001
From: Li Nan
Date: Mon, 3 Nov 2025 20:57:54 +0800
Subject: [PATCH 075/162] md: init bioset in mddev_init

I/O operations may be needed before md_run(), such as updating metadata
after writing sysfs. Without the biosets initialized, this triggers a
NULL pointer dereference as below:

BUG: kernel NULL pointer dereference, address: 0000000000000020
Call Trace:
 md_update_sb+0x658/0xe00
 new_level_store+0xc5/0x120
 md_attr_store+0xc9/0x1e0
 sysfs_kf_write+0x6f/0xa0
 kernfs_fop_write_iter+0x141/0x2a0
 vfs_write+0x1fc/0x5a0
 ksys_write+0x79/0x180
 __x64_sys_write+0x1d/0x30
 x64_sys_call+0x2818/0x2880
 do_syscall_64+0xa9/0x580
 entry_SYSCALL_64_after_hwframe+0x4b/0x53

Reproducer
```
mdadm -CR /dev/md0 -l1 -n2 /dev/sd[cd]
echo inactive > /sys/block/md0/md/array_state
echo 10 > /sys/block/md0/md/new_level
```

mddev_init() can only be called once per mddev, so there is no need to
test whether the biosets have been initialized anymore.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-3-linan666@huaweicloud.com
Fixes: d981ed841930 ("md: Add new_level sysfs interface")
Signed-off-by: Li Nan
Reviewed-by: Xiao Ni
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 69 +++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9fa38e9829f4..6193f5b2017d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -730,6 +730,8 @@ static void mddev_clear_bitmap_ops(struct mddev *mddev)
 
 int mddev_init(struct mddev *mddev)
 {
+	int err = 0;
+
 	if (!IS_ENABLED(CONFIG_MD_BITMAP))
 		mddev->bitmap_id = ID_BITMAP_NONE;
 	else
@@ -741,10 +743,23 @@ int mddev_init(struct mddev *mddev)
 
 	if (percpu_ref_init(&mddev->writes_pending, no_op,
 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
-		percpu_ref_exit(&mddev->active_io);
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto exit_active_io;
 	}
 
+	err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	if (err)
+		goto exit_writes_pending;
+
+	err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+	if (err)
+		goto exit_bio_set;
+
+	err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+			  offsetof(struct md_io_clone, bio_clone), 0);
+	if (err)
+		goto exit_sync_set;
+
 	/* We want to start with the refcount at zero */
 	percpu_ref_put(&mddev->writes_pending);
 
@@ -773,11 +788,24 @@ int mddev_init(struct mddev *mddev)
 	INIT_WORK(&mddev->del_work, mddev_delayed_delete);
 
 	return 0;
+
+exit_sync_set:
+	bioset_exit(&mddev->sync_set);
+exit_bio_set:
+	bioset_exit(&mddev->bio_set);
+exit_writes_pending:
+	percpu_ref_exit(&mddev->writes_pending);
+exit_active_io:
+	percpu_ref_exit(&mddev->active_io);
+	return err;
 }
 EXPORT_SYMBOL_GPL(mddev_init);
 
 void mddev_destroy(struct mddev *mddev)
 {
+	bioset_exit(&mddev->bio_set);
+	bioset_exit(&mddev->sync_set);
+	bioset_exit(&mddev->io_clone_set);
 	percpu_ref_exit(&mddev->active_io);
 	percpu_ref_exit(&mddev->writes_pending);
 }
@@ -6394,29 +6422,9 @@ int md_run(struct mddev *mddev)
 		nowait = nowait && bdev_nowait(rdev->bdev);
 	}
 
-	if (!bioset_initialized(&mddev->bio_set)) {
-		err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-		if (err)
-			return err;
-	}
-	if (!bioset_initialized(&mddev->sync_set)) {
-		err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-		if (err)
-			goto exit_bio_set;
-	}
-
-	if (!bioset_initialized(&mddev->io_clone_set)) {
-		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
-				  offsetof(struct md_io_clone, bio_clone), 0);
-		if (err)
-			goto exit_sync_set;
-	}
-
 	pers = get_pers(mddev->level, mddev->clevel);
-	if (!pers) {
-		err = -EINVAL;
-		goto abort;
-	}
+	if (!pers)
+		return -EINVAL;
 	if (mddev->level != pers->head.id) {
 		mddev->level = pers->head.id;
 		mddev->new_level = pers->head.id;
@@ -6427,8 +6435,7 @@ int md_run(struct mddev *mddev)
 	    pers->start_reshape == NULL) {
 		/* This personality cannot handle reshaping... */
 		put_pers(pers);
-		err = -EINVAL;
-		goto abort;
+		return -EINVAL;
 	}
 
 	if (pers->sync_request) {
@@ -6555,12 +6562,6 @@ bitmap_abort:
 	mddev->private = NULL;
 	put_pers(pers);
 	md_bitmap_destroy(mddev);
-abort:
-	bioset_exit(&mddev->io_clone_set);
-exit_sync_set:
-	bioset_exit(&mddev->sync_set);
-exit_bio_set:
-	bioset_exit(&mddev->bio_set);
 	return err;
 }
 EXPORT_SYMBOL_GPL(md_run);
@@ -6785,10 +6786,6 @@ static void __md_stop(struct mddev *mddev)
 	mddev->private = NULL;
 	put_pers(pers);
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-
-	bioset_exit(&mddev->bio_set);
-	bioset_exit(&mddev->sync_set);
-	bioset_exit(&mddev->io_clone_set);
 }
 
 void md_stop(struct mddev *mddev)

From 2107457e31fa138b4baa0bccf309d6fdcf9f47dd Mon Sep 17 00:00:00 2001
From: Li Nan
Date: Mon, 3 Nov 2025 20:57:55 +0800
Subject: [PATCH 076/162] md/raid0: Move queue limit setup before r0conf
 initialization

Prepare for making the logical block size configurable. This change has
no impact until the logical block size actually becomes configurable.

Move raid0_set_limits() before create_strip_zones(). This is safe
because the fields modified in create_strip_zones() do not involve the
mddev configuration, and the rdev modifications made there are not used
in raid0_set_limits(). 'blksize' in create_strip_zones() now fetches
the mddev's logical block size, which is already the maximum across all
rdevs, so the later max() can be removed.

Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-4-linan666@huaweicloud.com
Signed-off-by: Li Nan
Reviewed-by: Xiao Ni
Signed-off-by: Yu Kuai
---
 drivers/md/raid0.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e443e478645a..fbf763401521 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -68,7 +68,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	struct strip_zone *zone;
 	int cnt;
 	struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-	unsigned blksize = 512;
+	unsigned int blksize = queue_logical_block_size(mddev->gendisk->queue);
 
 	*private_conf = ERR_PTR(-ENOMEM);
 	if (!conf)
@@ -84,9 +84,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		sector_div(sectors, mddev->chunk_sectors);
 		rdev1->sectors = sectors * mddev->chunk_sectors;
 
-		blksize = max(blksize, queue_logical_block_size(
-				      rdev1->bdev->bd_disk->queue));
-
 		rdev_for_each(rdev2, mddev) {
 			pr_debug("md/raid0:%s:   comparing %pg(%llu)"
 				 " with %pg(%llu)\n",
@@ -405,6 +402,12 @@ static int raid0_run(struct mddev *mddev)
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
 
+	if (!mddev_is_dm(mddev)) {
+		ret = raid0_set_limits(mddev);
+		if (ret)
+			return ret;
+	}
+
 	/* if private is not null, we are here after takeover */
 	if (mddev->private == NULL) {
 		ret = create_strip_zones(mddev, &conf);
@@ -413,11 +416,6 @@ static int raid0_run(struct mddev *mddev)
 		mddev->private = conf;
 	}
 	conf = mddev->private;
-	if (!mddev_is_dm(mddev)) {
-		ret = raid0_set_limits(mddev);
-		if (ret)
-			return ret;
-	}
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));

From 9c47127a807da3e36ce80f7c83a1134a291fc021 Mon Sep 17 00:00:00 2001
From: Li Nan
Date: Mon, 3 Nov 2025 20:57:56 +0800
Subject: [PATCH 077/162] md: add check_new_feature module parameter

Raid checks whether pad3 is zero when loading the superblock from disk.
Arrays created with new features that use pad3 may therefore fail to
assemble on old kernels.
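The padding check in question (visible in the diff that follows) uses a
standard 'all bytes zero' idiom: the array is all zeroes iff its first
byte is zero and a memcmp() of the array against itself shifted by one
byte finds no difference. A standalone illustration:

```
#include <assert.h>
#include <string.h>

static int all_zero(const unsigned char *p, size_t n)
{
	/* p[i] == p[i + 1] for all i, and p[0] == 0, implies all zero */
	return p[0] == 0 && memcmp(p, p + 1, n - 1) == 0;
}

int main(void)
{
	unsigned char clean[8] = { 0 };
	unsigned char used[8] = { 0, 0, 0, 1, 0, 0, 0, 0 };

	assert(all_zero(clean, sizeof(clean)));
	assert(!all_zero(used, sizeof(used)));
	return 0;
}
```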
Add module parameter check_new_feature to bypass this check. Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-5-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Xiao Ni Signed-off-by: Yu Kuai --- drivers/md/md.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6193f5b2017d..9676e2477df6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -339,6 +339,7 @@ static int start_readonly; */ static bool create_on_open = true; static bool legacy_async_del_gendisk = true; +static bool check_new_feature = true; /* * We have a system wide 'event count' that is incremented @@ -1851,9 +1852,13 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ } if (sb->pad0 || sb->pad3[0] || - memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) - /* Some padding is non-zero, might be a new feature */ - return -EINVAL; + memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) { + pr_warn("Some padding is non-zero on %pg, might be a new feature\n", + rdev->bdev); + if (check_new_feature) + return -EINVAL; + pr_warn("check_new_feature is disabled, data corruption possible\n"); + } rdev->preferred_minor = 0xffff; rdev->data_offset = le64_to_cpu(sb->data_offset); @@ -10741,6 +10746,7 @@ module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); module_param(create_on_open, bool, S_IRUSR|S_IWUSR); module_param(legacy_async_del_gendisk, bool, 0600); +module_param(check_new_feature, bool, 0600); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MD RAID framework"); From 62ed1b58224636185fa689db81224b8c8af46473 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 3 Nov 2025 20:57:57 +0800 Subject: [PATCH 078/162] md: allow configuring logical block size Previously, a RAID array used the maximum logical block size (LBS) of all member disks. Adding a larger LBS disk at runtime could unexpectedly increase RAID's LBS, risking corruption of existing partitions. This can be reproduced by: ``` # LBS of sd[de] is 512 bytes, sdf is 4096 bytes. mdadm -CRq /dev/md0 -l1 -n3 /dev/sd[de] missing --assume-clean # LBS is 512 cat /sys/block/md0/queue/logical_block_size # create partition md0p1 parted -s /dev/md0 mklabel gpt mkpart primary 1MiB 100% lsblk | grep md0p1 # LBS becomes 4096 after adding sdf mdadm --add -q /dev/md0 /dev/sdf cat /sys/block/md0/queue/logical_block_size # partition lost partprobe /dev/md0 lsblk | grep md0p1 ``` Simply restricting larger-LBS disks is inflexible. In some scenarios, only disks with a 512-byte LBS are available at creation time, but later, disks with a 4KB LBS may be added to the array. Making the LBS configurable is the best way to handle this scenario. After this patch, md will: - store the LBS in the on-disk metadata - add a read-write sysfs attribute 'mdX/logical_block_size' Future mdadm should support setting the LBS via the metadata field during RAID creation and via the new sysfs attribute. Though the kernel allows runtime LBS changes, users should avoid modifying it after creating partitions or filesystems to prevent compatibility issues. Only 1.x metadata supports configurable LBS. 0.90 metadata initializes all fields to default values at auto-detect. Supporting 0.90 would require more extensive changes and no such use case has been observed. Note that many RAID paths rely on PAGE_SIZE alignment, including for metadata I/O. An LBS larger than PAGE_SIZE would result in metadata read/write failures, so such a configuration must be rejected.
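As a purely illustrative usage sketch (the device name, the 4096 value, and the helper program are assumptions, not part of this patch), a userspace tool could set the attribute while the array is stopped:

```c
/* Hypothetical userspace sketch: write the new mdX/logical_block_size
 * sysfs attribute before the array is started. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/block/md0/md/logical_block_size", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* The kernel keeps the maximum of this value and the members' LBS. */
	if (write(fd, "4096", 4) != 4)
		perror("write");
	close(fd);
	return 0;
}
```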
Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-6-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Xiao Ni Signed-off-by: Yu Kuai --- Documentation/admin-guide/md.rst | 10 +++++ drivers/md/md-linear.c | 1 + drivers/md/md.c | 77 ++++++++++++++++++++++++++++++++ drivers/md/md.h | 1 + drivers/md/raid0.c | 1 + drivers/md/raid1.c | 1 + drivers/md/raid10.c | 1 + drivers/md/raid5.c | 1 + include/uapi/linux/raid/md_p.h | 3 +- 9 files changed, 95 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index deed823eab01..dc7eab191caa 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -238,6 +238,16 @@ All md devices contain: the number of devices in a raid4/5/6, or to support external metadata formats which mandate such clipping. + logical_block_size + Configure the array's logical block size in bytes. This attribute + is only supported for 1.x metadata. Write the value before starting + the array. The final array LBS uses the maximum of this + configuration and the LBS of all member devices. Note that the + LBS cannot exceed PAGE_SIZE until RAID supports folios. + WARNING: Arrays created on a new kernel cannot be assembled on an old + kernel due to the padding check. Set module parameter 'check_new_feature' + to false to bypass it, but data loss may occur. + reshape_position This is either ``none`` or a sector number within the devices of the array where ``reshape`` is up to. If this is set, the three diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 25a6ddedea65..8d7b82c4a723 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -72,6 +72,7 @@ static int linear_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_hw_sectors = mddev->chunk_sectors; + lim.logical_block_size = mddev->logical_block_size; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9676e2477df6..7b5c5967568f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1999,6 +1999,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->layout = le32_to_cpu(sb->layout); mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); + mddev->logical_block_size = le32_to_cpu(sb->logical_block_size); mddev->events = ev1; mddev->bitmap_info.offset = 0; mddev->bitmap_info.space = 0; @@ -2208,6 +2209,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->chunksize = cpu_to_le32(mddev->chunk_sectors); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); + sb->logical_block_size = cpu_to_le32(mddev->logical_block_size); if (test_bit(FailFast, &rdev->flags)) sb->devflags |= FailFast1; else @@ -5936,6 +5938,68 @@ static struct md_sysfs_entry md_serialize_policy = __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, serialize_policy_store); +static int mddev_set_logical_block_size(struct mddev *mddev, + unsigned int lbs) +{ + int err = 0; + struct queue_limits lim; + + if (queue_logical_block_size(mddev->gendisk->queue) >= lbs) { + pr_err("%s: Cannot set LBS smaller than mddev LBS %u\n", + mdname(mddev), lbs); + return -EINVAL; + } + + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.logical_block_size = lbs; + pr_info("%s: logical_block_size is changed, data may be lost\n", + mdname(mddev)); + err =
queue_limits_commit_update(mddev->gendisk->queue, &lim); + if (err) + return err; + + mddev->logical_block_size = lbs; + /* New lbs will be written to superblock after array is running */ + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + return 0; +} + +static ssize_t +lbs_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%u\n", mddev->logical_block_size); +} + +static ssize_t +lbs_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int lbs; + int err = -EBUSY; + + /* Only 1.x meta supports configurable LBS */ + if (mddev->major_version == 0) + return -EINVAL; + + if (mddev->pers) + return -EBUSY; + + err = kstrtouint(buf, 10, &lbs); + if (err < 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + goto unlock; + + err = mddev_set_logical_block_size(mddev, lbs); + +unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry md_logical_block_size = +__ATTR(logical_block_size, 0644, lbs_show, lbs_store); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -5958,6 +6022,7 @@ static struct attribute *md_default_attrs[] = { &md_consistency_policy.attr, &md_fail_last_dev.attr, &md_serialize_policy.attr, + &md_logical_block_size.attr, NULL, }; @@ -6088,6 +6153,17 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim, return -EINVAL; } + /* + * Before RAID adding folio support, the logical_block_size + * should be smaller than the page size. + */ + if (lim->logical_block_size > PAGE_SIZE) { + pr_err("%s: logical_block_size must not larger than PAGE_SIZE\n", + mdname(mddev)); + return -EINVAL; + } + mddev->logical_block_size = lim->logical_block_size; + return 0; } EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); @@ -6699,6 +6775,7 @@ static void md_clean(struct mddev *mddev) mddev->chunk_sectors = 0; mddev->ctime = mddev->utime = 0; mddev->layout = 0; + mddev->logical_block_size = 0; mddev->max_disks = 0; mddev->events = 0; mddev->can_decrease_events = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index fd6e001c1d38..6985f2829bbd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -433,6 +433,7 @@ struct mddev { sector_t array_sectors; /* exported array size */ int external_size; /* size managed * externally */ + unsigned int logical_block_size; __u64 events; /* If the last 'event' was simply a clean->dirty transition, and * we didn't write it to the spares, then it is safe and simple diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index fbf763401521..47aee1b1d4d1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -380,6 +380,7 @@ static int raid0_set_limits(struct mddev *mddev) lim.max_hw_sectors = mddev->chunk_sectors; lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.max_hw_wzeroes_unmap_sectors = mddev->chunk_sectors; + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; lim.chunk_sectors = mddev->chunk_sectors; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 592a40233004..57d50465eed1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3213,6 +3213,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.max_hw_wzeroes_unmap_sectors = 0; + lim.logical_block_size = mddev->logical_block_size; lim.features |= BLK_FEAT_ATOMIC_WRITES; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 14dcd5142eb4..84be4cc7e873 100644 
--- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4000,6 +4000,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; lim.max_hw_wzeroes_unmap_sectors = 0; + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.chunk_sectors = mddev->chunk_sectors; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 24b32a0c95b4..cdbc7eba5c54 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7745,6 +7745,7 @@ static int raid5_set_limits(struct mddev *mddev) stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); md_init_stacking_limits(&lim); + lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index ac74133a4768..310068bb2a1d 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -291,7 +291,8 @@ struct mdp_superblock_1 { __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ __le32 sb_csum; /* checksum up to devs[max_dev] */ __le32 max_dev; /* size of devs[] array to consider */ - __u8 pad3[64-32]; /* set to 0 when writing */ + __le32 logical_block_size; /* same as q->limits->logical_block_size */ + __u8 pad3[64-36]; /* set to 0 when writing */ /* device state information. Indexed by dev_number. * 2 bytes per device From 1649714b930f9ea6233ce0810ba885999da3b5d4 Mon Sep 17 00:00:00 2001 From: Zheng Qixing Date: Mon, 10 Nov 2025 20:49:20 +0800 Subject: [PATCH 079/162] nbd: defer config unlock in nbd_genl_connect There is one use-after-free warning when running NBD_CMD_CONNECT and NBD_CLEAR_SOCK: nbd_genl_connect nbd_alloc_and_init_config // config_refs=1 nbd_start_device // config_refs=2 set NBD_RT_HAS_CONFIG_REF open nbd // config_refs=3 recv_work done // config_refs=2 NBD_CLEAR_SOCK // config_refs=1 close nbd // config_refs=0 refcount_inc -> uaf ------------[ cut here ]------------ refcount_t: addition on 0; use-after-free. 
WARNING: CPU: 24 PID: 1014 at lib/refcount.c:25 refcount_warn_saturate+0x12e/0x290 nbd_genl_connect+0x16d0/0x1ab0 genl_family_rcv_msg_doit+0x1f3/0x310 genl_rcv_msg+0x44a/0x790 The issue can be easily reproduced by adding a small delay before refcount_inc(&nbd->config_refs) in nbd_genl_connect(): mutex_unlock(&nbd->config_lock); if (!ret) { set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); + printk("before sleep\n"); + mdelay(5 * 1000); + printk("after sleep\n"); refcount_inc(&nbd->config_refs); nbd_connect_reply(info, nbd->index); } Fixes: e46c7287b1c2 ("nbd: add a basic netlink interface") Signed-off-by: Zheng Qixing Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 0df7b33af677..bbbafb6e2b33 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2235,12 +2235,13 @@ again: ret = nbd_start_device(nbd); out: - mutex_unlock(&nbd->config_lock); if (!ret) { set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags); refcount_inc(&nbd->config_refs); nbd_connect_reply(info, nbd->index); } + mutex_unlock(&nbd->config_lock); + nbd_config_put(nbd); if (put_dev) nbd_put(nbd); From 86afb1cdc28f4332c6e0a1937244e0a80d4d63b1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sat, 8 Nov 2025 23:44:26 -0800 Subject: [PATCH 080/162] block: add lockdep to queue_limits_commit_update() queue_limits_commit_update() expects q->limits_lock to be held by the caller (via queue_limits_start_update()). The API pattern is: lim = queue_limits_start_update(q); /* acquires lock */ /* modify lim */ queue_limits_commit_update(q, &lim); /* releases lock */ OR queue_limits_commit_update_frozen(q, &lim); lim = queue_limits_start_update(q); /* acquires lock */ queue_limits_commit_update(q, &lim); /* releases lock */ Add lockdep_assert_held() to report incorrect API usage. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index e0d0b035f39d..b38e94c85402 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -546,6 +546,8 @@ int queue_limits_commit_update(struct request_queue *q, { int error; + lockdep_assert_held(&q->limits_lock); + error = blk_validate_limits(lim); if (error) goto out_unlock; From 6b0a29933f688a284e8869da0f63a93acf893757 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 8 Nov 2025 15:17:45 -0700 Subject: [PATCH 081/162] ublk: remove unnecessary checks in ublk_check_and_get_req() ub = iocb->ki_filp->private_data cannot be NULL, as it's set in ublk_ch_open() before it returns successfully. req->mq_hctx cannot be NULL as any inflight ublk request must belong to some queue. And req->mq_hctx->driver_data cannot be NULL as it's set to the ublk_queue pointer in ublk_init_hctx(). So drop the unnecessary checks.
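For context, the first invariant is the standard char-device open() pattern; a minimal sketch with hypothetical foo_* names, not the actual ublk code:

```c
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/uio.h>

struct foo_device {
	struct cdev cdev;
	/* ... */
};

static int foo_open(struct inode *inode, struct file *filp)
{
	struct foo_device *foo =
		container_of(inode->i_cdev, struct foo_device, cdev);

	/* Set before open() can return success ... */
	filp->private_data = foo;
	return 0;
}

static ssize_t foo_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	/* ... so later file operations may assume it is non-NULL. */
	struct foo_device *foo = iocb->ki_filp->private_data;

	(void)foo;
	return 0;
}
```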
Signed-off-by: Caleb Sander Mateos Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5cf288809226..30e798f062ef 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2508,9 +2508,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, size_t buf_off; u16 tag, q_id; - if (!ub) - return ERR_PTR(-EACCES); - if (!user_backed_iter(iter)) return ERR_PTR(-EACCES); @@ -2536,9 +2533,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, if (!req) return ERR_PTR(-EINVAL); - if (!req->mq_hctx || !req->mq_hctx->driver_data) - goto fail; - if (!ublk_check_ubuf_dir(req, dir)) goto fail; From 727a44027815aea13d34c28254d386562efc3bab Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 7 Nov 2025 21:23:16 -0700 Subject: [PATCH 082/162] ublk: return unsigned from ublk_{,un}map_io() ublk_map_io() and ublk_unmap_io() never return negative values, and their return values are stored in variables of type unsigned. Clarify that they can't fail by making their return types unsigned. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 30e798f062ef..75f210523e52 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -963,8 +963,9 @@ static inline bool ublk_need_unmap_req(const struct request *req) (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN); } -static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, - const struct ublk_io *io) +static unsigned int ublk_map_io(const struct ublk_queue *ubq, + const struct request *req, + const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); @@ -986,7 +987,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, return rq_bytes; } -static int ublk_unmap_io(bool need_map, +static unsigned int ublk_unmap_io(bool need_map, const struct request *req, const struct ublk_io *io) { From fd9ecd005252b595fd02ff7fcc4056251027404d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 11 Nov 2025 06:06:20 -0800 Subject: [PATCH 083/162] block: fix merging data-less bios The data segment gaps the block layer tracks don't apply to bios that don't have data. Skip calculating this to fix a NULL pointer access.
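For context, bios without data (such as an empty flush) own no bio_vecs at all, so there is no segment to compute a gap against; an illustrative sketch of building such a bio, assuming the current bio_alloc() signature:

```c
#include <linux/bio.h>
#include <linux/blkdev.h>

static void submit_empty_flush(struct block_device *bdev)
{
	/* Zero bio_vecs and bi_size == 0: bio_has_data() returns false,
	 * so bio_seg_gap() must return early instead of dereferencing a
	 * nonexistent last bvec. */
	struct bio *bio = bio_alloc(bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH,
				    GFP_KERNEL);

	submit_bio(bio);
}
```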
Fixes: 2f6b2565d43cdb5 ("block: accumulate memory segment gaps per bio") Reported-by: Matthew Wilcox Signed-off-by: Keith Busch Reviewed-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 3ca6fbf8b787..d3115d7469df 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -737,6 +737,9 @@ u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, { struct bio_vec pb, nb; + if (!bio_has_data(prev)) + return 0; + gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit); gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit); From 6d7e3870af11c2b5966b2769f9e8a0d4764f52cc Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Tue, 11 Nov 2025 11:58:10 +0000 Subject: [PATCH 084/162] blk-mq-dma: fix kernel-doc function name for integrity DMA iterator Documentation build reported: Warning: block/blk-mq-dma.c:373 expecting prototype for blk_rq_integrity_dma_map_iter_start(). Prototype was for blk_rq_integrity_dma_map_iter_next() instead The kernel-doc comment above `blk_rq_integrity_dma_map_iter_next()` used the wrong function name (`blk_rq_integrity_dma_map_iter_start`) in its header. This patch corrects the function name in the kernel-doc block to match the actual implementation, ensuring clean documentation builds. Fixes: fec9b16dc555 ("blk-mq-dma: add scatter-less integrity data DMA mapping") Signed-off-by: Kriish Sharma Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index a7ef25843280..b4f456472961 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -351,7 +351,7 @@ bool blk_rq_integrity_dma_map_iter_start(struct request *req, EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); /** - * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for + * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for * a request * @req: request to map * @dma_dev: device to map to From 3749ea4deeba6049ea97e653d964568a45543e37 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 12 Nov 2025 08:42:18 -0800 Subject: [PATCH 085/162] null_blk: fix zone read length beyond write pointer Fix up the divisor calculating the number of zone sectors being read and handle a read that straddles the zone write pointer. The length is rounded up to a sector boundary, so be sure to truncate any excess bytes to avoid copying past the data segment.
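To make the divisor mix-up concrete (the 4096-byte length is assumed for illustration): SECTOR_SHIFT is the shift amount 9, while SECTOR_SIZE is 512 bytes, so dividing by the former wildly inflates the sector count:

```c
#include <linux/blkdev.h>	/* SECTOR_SHIFT (9), SECTOR_SIZE (512) */
#include <linux/math.h>		/* DIV_ROUND_UP() */

/* For a 4096-byte read: */
unsigned int wrong = DIV_ROUND_UP(4096, SECTOR_SHIFT); /* 456 bogus "sectors" */
unsigned int right = DIV_ROUND_UP(4096, SECTOR_SIZE);  /* 8 sectors */
```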
Fixes: 3451cf34f51bb70 ("null_blk: allow byte aligned memory offsets") Signed-off-by: Keith Busch Tested-by: Bart van Assche Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 5 ++++- drivers/block/null_blk/zoned.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index f1e67962ecae..c7c0fb79a6bf 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1240,9 +1240,12 @@ static blk_status_t null_transfer(struct nullb *nullb, struct page *page, p = kmap_local_page(page) + off; if (!is_write) { - if (dev->zoned) + if (dev->zoned) { valid_len = null_zone_valid_read_len(nullb, pos >> SECTOR_SHIFT, len); + if (valid_len && valid_len != len) + valid_len -= pos & (SECTOR_SIZE - 1); + } if (valid_len) { copy_from_nullb(nullb, p, pos, valid_len); diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index dbf292a8eae9..0ada35dc0989 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb, { struct nullb_device *dev = nullb->dev; struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)]; - unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SHIFT); + unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE); /* Read must be below the write pointer position */ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL || From fd0ae4754c7b2add72338b14ddc8c8ff5ffda426 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 11 Nov 2025 15:29:00 -0800 Subject: [PATCH 086/162] blk-zoned: Fix a typo in a source code comment Remove a superfluous parenthesis that was introduced by commit fa8555630b32 ("blk-zoned: Improve the queue reference count strategy documentation"). Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 3791755bc6ad..57ab2b365c2d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -730,7 +730,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); bio_io_error(bio); disk_put_zone_wplug(zwplug); - /* Drop the reference taken by disk_zone_wplug_add_bio(() */ + /* Drop the reference taken by disk_zone_wplug_add_bio(). */ blk_queue_exit(q); } From faa3be1a61bfaace4c2bd57434de7b13347f9f31 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 11 Nov 2025 15:29:01 -0800 Subject: [PATCH 087/162] blk-zoned: Document disk_zone_wplug_schedule_bio_work() locking Document that all callers hold this lock because the code in disk_zone_wplug_schedule_bio_work() depends on this. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 57ab2b365c2d..5487d5eb2650 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1188,6 +1188,8 @@ void blk_zone_mgmt_bio_endio(struct bio *bio) static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, struct blk_zone_wplug *zwplug) { + lockdep_assert_held(&zwplug->lock); + /* * Take a reference on the zone write plug and schedule the submission * of the next plugged BIO. 
blk_zone_wplug_bio_work() will release the From f233339188cd9b1a985fd8410d5811e920fdd4ef Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 11 Nov 2025 15:29:02 -0800 Subject: [PATCH 088/162] blk-zoned: Move code from disk_zone_wplug_add_bio() into its caller Move the following code into the only caller of disk_zone_wplug_add_bio(): - The code for clearing the REQ_NOWAIT flag. - The code that sets the BLK_ZONE_WPLUG_PLUGGED flag. - The disk_zone_wplug_schedule_bio_work() call. This patch moves all code that is related to REQ_NOWAIT or to bio scheduling into a single function. Additionally, the 'schedule_bio_work' variable is removed. No functionality has been changed. Cc: Damien Le Moal Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 5487d5eb2650..85de45c3f6be 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1204,8 +1204,6 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, struct blk_zone_wplug *zwplug, struct bio *bio, unsigned int nr_segs) { - bool schedule_bio_work = false; - /* * Grab an extra reference on the BIO request queue usage counter. * This reference will be reused to submit a request for the BIO for @@ -1221,16 +1219,6 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, */ bio_clear_polled(bio); - /* - * REQ_NOWAIT BIOs are always handled using the zone write plug BIO - * work, which can block. So clear the REQ_NOWAIT flag and schedule the - * work if this is the first BIO we are plugging. - */ - if (bio->bi_opf & REQ_NOWAIT) { - schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); - bio->bi_opf &= ~REQ_NOWAIT; - } - /* * Reuse the poll cookie field to store the number of segments when * split to the hardware limits. @@ -1246,11 +1234,6 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); - - zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - - if (schedule_bio_work) - disk_zone_wplug_schedule_bio_work(disk, zwplug); } /* @@ -1461,14 +1444,17 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); /* - * If the zone is already plugged, add the BIO to the plug BIO list. - * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a - * BLK_STS_AGAIN failure if we let the BIO execute. - * Otherwise, plug and let the BIO execute. + * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a + * BLK_STS_AGAIN failure if we let the caller submit the BIO. */ - if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || - (bio->bi_opf & REQ_NOWAIT)) - goto plug; + if (bio->bi_opf & REQ_NOWAIT) { + bio->bi_opf &= ~REQ_NOWAIT; + goto queue_bio; + } + + /* If the zone is already plugged, add the BIO to the BIO plug list. */ + if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) + goto queue_bio; if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1476,15 +1462,21 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } + /* Otherwise, plug and let the caller submit the BIO. 
*/ zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); return false; -plug: +queue_bio: disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs); + if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + disk_zone_wplug_schedule_bio_work(disk, zwplug); + } + spin_unlock_irqrestore(&zwplug->lock, flags); return true; From c2b8d20628ca789640f64074a642f9440eefc623 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Nov 2025 22:40:26 +0900 Subject: [PATCH 089/162] block: fix NULL pointer dereference in blk_zone_reset_all_bio_endio() For zoned block devices that do not need zone write plugs (e.g. most device mapper devices that support zones), the disk hash table of zone write plugs is NULL. For such devices, blk_zone_reset_all_bio_endio() should not attempt to scan this hash table as that causes a NULL pointer dereference. Fix this by checking that the disk does have zone write plugs using the atomic counter. This is equivalent to checking for a non-NULL hash table but has the advantage of also speeding up the execution of blk_zone_reset_all_bio_endio() for devices that do use zone write plugs but do not have any plug in the hash table (e.g. a disk with only full zones). Fixes: efae226c2ef1 ("block: handle zone management operations completions") Reported-by: Shin'ichiro Kawasaki Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 85de45c3f6be..98c26af01e24 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1118,17 +1118,20 @@ static void blk_zone_reset_all_bio_endio(struct bio *bio) sector_t sector; unsigned int i; - /* Update the condition of all zone write plugs. */ - rcu_read_lock(); - for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { - hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], - node) { - spin_lock_irqsave(&zwplug->lock, flags); - disk_zone_wplug_set_wp_offset(disk, zwplug, 0); - spin_unlock_irqrestore(&zwplug->lock, flags); + if (atomic_read(&disk->nr_zone_wplugs)) { + /* Update the condition of all zone write plugs. */ + rcu_read_lock(); + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + hlist_for_each_entry_rcu(zwplug, + &disk->zone_wplugs_hash[i], + node) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + spin_unlock_irqrestore(&zwplug->lock, flags); + } } + rcu_read_unlock(); } - rcu_read_unlock(); /* Update the cached zone conditions. */ for (sector = 0; sector < capacity; From 881880b6f3079c79e4306630dcb8d05bc7de1793 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Nov 2025 22:40:27 +0900 Subject: [PATCH 090/162] block: fix NULL pointer dereference in disk_report_zones() Commit 2284eec5053d ("block: introduce blkdev_get_zone_info()") introduced the report_active field in struct blk_report_zones_args so that open and closed zones can be reported with the condition BLK_ZONE_COND_ACTIVE in the case of a cached report zone. However, the args pointer to a struct blk_report_zones_args that is passed to disk_report_zones() can be NULL, e.g. in the case of internal report zones operations for device mapper zoned targets. Fix disk_report_zones() to check that args is not NULL before updating a zone condition for cached zone reports.
Fixes: 2284eec5053d ("block: introduce blkdev_get_zone_info()") Reported-by: Shin'ichiro Kawasaki Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 98c26af01e24..dcc295721c2c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -854,7 +854,7 @@ static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk, int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, unsigned int idx, struct blk_report_zones_args *args) { - if (args->report_active) { + if (args && args->report_active) { /* * If we come here, then this is a report zones as a fallback * for a cached report. So collapse the implicit open, explicit From 7b2038b1b1d4322a851ce7ee378ebf85a03bb1a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Nov 2025 22:40:28 +0900 Subject: [PATCH 091/162] dm: fix zone reset all operation processing dm_zone_get_reset_bitmap() is used to generate a bitmap of the zones of a zoned device target when a REQ_OP_ZONE_RESET_ALL request is being processed. This bitmap is built by executing a zone report with a report callback set to the function dm_zone_need_reset_cb() in struct dm_report_zones_args. However, the cb callback pointer is no longer the same as the callback specified by callers of the blkdev_report_zones() function. Rather, this is a DM internal callback, and report zones callback functions from blkdev_report_zones() are passed using struct blk_report_zones_args, introduced with commit db9aed869f34 ("block: introduce disk_report_zone()"). This commit changed the DM main report zones callback handler function dm_report_zones_cb() to call the new disk_report_zone() so that callback functions from blkdev_report_zones() are executed, and this change resulted in the DM internal dm_zone_need_reset_cb() callback no longer being executed, turning any REQ_OP_ZONE_RESET_ALL request into a no-op. Fix this by calling in dm_report_zones_cb() the DM internal cb function specified in struct dm_report_zones_args. Fixes: db9aed869f34 ("block: introduce disk_report_zone()"). Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm-zone.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 984fb621b0e9..5a840c4ae316 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -113,6 +113,15 @@ static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, args->next_sector = zone->start + zone->len; + /* If we have an internal callback, call it first. */ + if (args->cb) { + int ret; + + ret = args->cb(zone, args->zone_idx, args->data); + if (ret) + return ret; + } + return disk_report_zone(args->disk, zone, args->zone_idx++, args->rep_args); } From 0c72e9fcc156caaf123a6291321bc9bd74cd1b61 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Nov 2025 13:36:22 +0800 Subject: [PATCH 092/162] bcache: get rid of discard code from journal The bcache journal has discard functionality, but it is almost useless in practice: the discard is issued after a journal bucket is reclaimed, and the reclaimed bucket is immediately allocated for new journaling, so the underlying SSD has no time to use the discard hint for internal data management. The discard code in the bcache journal brings no performance optimization and wastes CPU cycles issuing discard bios.
Therefore this patch gets rid of it in journal.c and journal.h. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/journal.c | 93 ++++--------------------------------- drivers/md/bcache/journal.h | 13 ------ 2 files changed, 8 insertions(+), 98 deletions(-) diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index d50eb82ccb4f..144693b7c46a 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -275,8 +275,7 @@ bsearch: * ja->cur_idx */ ja->cur_idx = i; - ja->last_idx = ja->discard_idx = (i + 1) % - ca->sb.njournal_buckets; + ja->last_idx = (i + 1) % ca->sb.njournal_buckets; } @@ -336,16 +335,6 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) } } -static bool is_discard_enabled(struct cache_set *s) -{ - struct cache *ca = s->cache; - - if (ca->discard) - return true; - - return false; -} - int bch_journal_replay(struct cache_set *s, struct list_head *list) { int ret = 0, keys = 0, entries = 0; @@ -360,15 +349,10 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list) BUG_ON(i->pin && atomic_read(i->pin) != 1); if (n != i->j.seq) { - if (n == start && is_discard_enabled(s)) - pr_info("journal entries %llu-%llu may be discarded! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - else { - pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", - n, i->j.seq - 1, start, end); - ret = -EIO; - goto err; - } + pr_err("journal entries %llu-%llu missing! (replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + ret = -EIO; + goto err; } for (k = i->j.start; @@ -568,65 +552,6 @@ out: #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) -static void journal_discard_endio(struct bio *bio) -{ - struct journal_device *ja = - container_of(bio, struct journal_device, discard_bio); - struct cache *ca = container_of(ja, struct cache, journal); - - atomic_set(&ja->discard_in_flight, DISCARD_DONE); - - closure_wake_up(&ca->set->journal.wait); - closure_put(&ca->set->cl); -} - -static void journal_discard_work(struct work_struct *work) -{ - struct journal_device *ja = - container_of(work, struct journal_device, discard_work); - - submit_bio(&ja->discard_bio); -} - -static void do_journal_discard(struct cache *ca) -{ - struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->discard_bio; - - if (!ca->discard) { - ja->discard_idx = ja->last_idx; - return; - } - - switch (atomic_read(&ja->discard_in_flight)) { - case DISCARD_IN_FLIGHT: - return; - - case DISCARD_DONE: - ja->discard_idx = (ja->discard_idx + 1) % - ca->sb.njournal_buckets; - - atomic_set(&ja->discard_in_flight, DISCARD_READY); - fallthrough; - - case DISCARD_READY: - if (ja->discard_idx == ja->last_idx) - return; - - atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - - bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); - bio->bi_iter.bi_sector = bucket_to_sector(ca->set, - ca->sb.d[ja->discard_idx]); - bio->bi_iter.bi_size = bucket_bytes(ca); - bio->bi_end_io = journal_discard_endio; - - closure_get(&ca->set->cl); - INIT_WORK(&ja->discard_work, journal_discard_work); - queue_work(bch_journal_wq, &ja->discard_work); - } -} - static unsigned int free_journal_buckets(struct cache_set *c) { struct journal *j = &c->journal; @@ -635,10 +560,10 @@ static unsigned int free_journal_buckets(struct cache_set *c) struct cache *ca = c->cache; struct journal_device *ja = &ca->journal; unsigned int n; /* In case njournal_buckets is not power of 2 */ - if (ja->cur_idx >= ja->discard_idx) - n = ca->sb.njournal_buckets + ja->discard_idx - ja->cur_idx; + if (ja->cur_idx >= ja->last_idx) + n =
ca->sb.njournal_buckets + ja->last_idx - ja->cur_idx; else - n = ja->discard_idx - ja->cur_idx; + n = ja->last_idx - ja->cur_idx; if (n > (1 + j->do_reserve)) return n - (1 + j->do_reserve); @@ -668,8 +593,6 @@ static void journal_reclaim(struct cache_set *c) ja->last_idx = (ja->last_idx + 1) % ca->sb.njournal_buckets; - do_journal_discard(ca); - if (c->journal.blocks_free) goto out; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index cd316b4a1e95..9e9d1b3016a5 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -139,19 +139,6 @@ struct journal_device { /* Last journal bucket that still contains an open journal entry */ unsigned int last_idx; - /* Next journal bucket to be discarded */ - unsigned int discard_idx; - -#define DISCARD_READY 0 -#define DISCARD_IN_FLIGHT 1 -#define DISCARD_DONE 2 - /* 1 - discard in flight, -1 - discard completed */ - atomic_t discard_in_flight; - - struct work_struct discard_work; - struct bio discard_bio; - struct bio_vec discard_bv; - /* Bio for journal reads/writes to this device */ struct bio bio; struct bio_vec bv[8]; From b4056afbd4b90f5bdbdc53cca2768f9b8872a2dd Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Nov 2025 13:36:23 +0800 Subject: [PATCH 093/162] bcache: remove discard code from alloc.c The bcache allocator initially has no free space to allocate. First it does a garbage collection, which is triggered by a cache device write, and fills the ca->free[] lists with free buckets. The discard happens after a free bucket has been handled by garbage collection and added into one of the ca->free[] lists. But normally this bucket is allocated to a requester and filled with data very soon afterwards. The discard hint on this bucket's LBA range doesn't help the SSD controller improve internal erase performance, and issuing the discard bios wastes extra CPU cycles. This patch removes the almost-useless discard code from alloc.c. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 48ce750bf70a..db3684819e38 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -24,21 +24,18 @@ * Since the gens and priorities are all stored contiguously on disk, we can * batch this up: We fill up the free_inc list with freshly invalidated buckets, * call prio_write(), and when prio_write() finishes we pull buckets off the - * free_inc list and optionally discard them. + * free_inc list. * * free_inc isn't the only freelist - if it was, we'd often to sleep while * priorities and gens were being written before we could allocate. c->free is a * smaller freelist, and buckets on that list are always ready to be used. * - * If we've got discards enabled, that happens when a bucket moves from the - * free_inc list to the free list. - * * There is another freelist, because sometimes we have buckets that we know * have nothing pointing into them - these we can reuse without waiting for * priorities to be rewritten. These come from freed btree nodes and buckets * that garbage collection discovered no longer had valid keys pointing into * them (because they were overwritten). That's the unused list - buckets on the - * unused list move to the free list, optionally being discarded in the process. + * unused list move to the free list.
* * It's also important to ensure that gens don't wrap around - with respect to @@ -118,8 +115,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) /* * Background allocation thread: scans for buckets to be invalidated, * invalidates them, rewrites prios/gens (marking them as invalidated on disk), - * then optionally issues discard commands to the newly free buckets, then puts - * them on the various freelists. + * then puts them on the various freelists. */ static inline bool can_inc_bucket_gen(struct bucket *b) @@ -321,8 +317,7 @@ static int bch_allocator_thread(void *arg) while (1) { /* * First, we pull buckets off of the unused and free_inc lists, - * possibly issue discards to them, then we add the bucket to - * the free list: + * then we add the bucket to the free list: */ while (1) { long bucket; @@ -330,14 +325,6 @@ if (!fifo_pop(&ca->free_inc, bucket)) break; - if (ca->discard) { - mutex_unlock(&ca->set->bucket_lock); - blkdev_issue_discard(ca->bdev, - bucket_to_sector(ca->set, bucket), - ca->sb.bucket_size, GFP_KERNEL); - mutex_lock(&ca->set->bucket_lock); - } - allocator_wait(ca, bch_allocator_push(ca, bucket)); wake_up(&ca->set->btree_cache_wait); wake_up(&ca->set->bucket_wait); From 73a004f83cf024e785b74243ba9817a329423379 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Nov 2025 13:36:24 +0800 Subject: [PATCH 094/162] bcache: drop discard sysfs interface Since the discard code is removed, the sysfs interface to enable discard is now useless. This patch removes the corresponding sysfs entry, and removes the bool variable 'discard' from struct cache as well. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 5 +---- drivers/md/bcache/super.c | 3 --- drivers/md/bcache/sysfs.c | 15 --------------- drivers/md/bcache/writeback.c | 3 +-- 4 files changed, 2 insertions(+), 24 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 1d33e40d26ea..b8bd6d4a4298 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -447,8 +447,7 @@ struct cache { * free_inc: Incoming buckets - these are buckets that currently have * cached data in them, and we can't reuse them until after we write * their new gen to disk. After prio_write() finishes writing the new - * gens/prios, they'll be moved to the free list (and possibly discarded - * in the process) + * gens/prios, they'll be moved to the free list. */ DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); @@ -467,8 +466,6 @@ struct cache { */ unsigned int invalidate_needs_gc; - bool discard; /* Get rid of?
*/ - struct journal_device journal; /* The rest of this all shows up in sysfs */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6d250e366412..91a98ebc3f80 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2382,9 +2382,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, ca->bdev = file_bdev(bdev_file); ca->sb_disk = sb_disk; - if (bdev_max_discard_sectors(file_bdev(bdev_file))) - ca->discard = CACHE_DISCARD(&ca->sb); - ret = cache_alloc(ca); if (ret != 0) { if (ret == -ENOMEM) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 826b14cae4e5..72f38e5b6f5c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -134,7 +134,6 @@ read_attribute(partial_stripes_expensive); rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(io_disable); -rw_attribute(discard); rw_attribute(running); rw_attribute(label); rw_attribute(errors); @@ -1036,7 +1035,6 @@ SHOW(__bch_cache) sysfs_hprint(bucket_size, bucket_bytes(ca)); sysfs_hprint(block_size, block_bytes(ca)); sysfs_print(nbuckets, ca->sb.nbuckets); - sysfs_print(discard, ca->discard); sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); sysfs_hprint(btree_written, atomic_long_read(&ca->btree_sectors_written) << 9); @@ -1142,18 +1140,6 @@ STORE(__bch_cache) if (bcache_is_reboot) return -EBUSY; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - if (bdev_max_discard_sectors(ca->bdev)) - ca->discard = v; - - if (v != CACHE_DISCARD(&ca->sb)) { - SET_CACHE_DISCARD(&ca->sb, v); - bcache_write_super(ca->set); - } - } - if (attr == &sysfs_cache_replacement_policy) { v = __sysfs_match_string(cache_replacement_policies, -1, buf); if (v < 0) @@ -1185,7 +1171,6 @@ static struct attribute *bch_cache_attrs[] = { &sysfs_block_size, &sysfs_nbuckets, &sysfs_priority_stats, - &sysfs_discard, &sysfs_written, &sysfs_btree_written, &sysfs_metadata_written, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 6ba73dc1a3df..cffef33b4acf 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -805,8 +805,7 @@ static int bch_writeback_thread(void *arg) * may set BCH_ENABLE_AUTO_GC via sysfs, then when * BCH_DO_AUTO_GC is set, garbage collection thread * will be wake up here. After moving gc, the shrunk - * btree and discarded free buckets SSD space may be - * helpful for following write requests. + * btree may be helpful for following write requests. */ if (c->gc_after_writeback == (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) { From 7bf90cd740bf87dd1692cf74d49bb1dc849dcd11 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Nov 2025 13:36:25 +0800 Subject: [PATCH 095/162] bcache: remove discard sysfs interface document This patch removes the documentation of the bcache discard sysfs interface; it drops the discard-related sections from: - Documentation/ABI/testing/sysfs-block-bcache - Documentation/admin-guide/bcache.rst Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block-bcache | 7 ------- Documentation/admin-guide/bcache.rst | 13 ++----------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-bcache b/Documentation/ABI/testing/sysfs-block-bcache index 9e4bbc5d51fd..9344a657ca70 100644 --- a/Documentation/ABI/testing/sysfs-block-bcache +++ b/Documentation/ABI/testing/sysfs-block-bcache @@ -106,13 +106,6 @@ Description: will be discarded from the cache.
Should not be turned off with writeback caching enabled. -What: /sys/block/<disk>/bcache/discard -Date: November 2010 -Contact: Kent Overstreet -Description: - For a cache, a boolean allowing discard/TRIM to be turned off - or back on if the device supports it. - What: /sys/block/<disk>/bcache/bucket_size Date: November 2010 Contact: Kent Overstreet diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index 6fdb495ac466..f71f349553e4 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -17,8 +17,7 @@ The latest bcache kernel code can be found from mainline Linux kernel: It's designed around the performance characteristics of SSDs - it only allocates in erase block sized buckets, and it uses a hybrid btree/log to track cached extents (which can be anywhere from a single sector to the bucket size). It's -designed to avoid random writes at all costs; it fills up an erase block -sequentially, then issues a discard before reusing it. +designed to avoid random writes at all costs. Both writethrough and writeback caching are supported. Writeback defaults to off, but can be switched on and off arbitrarily at runtime. Bcache goes to @@ -618,19 +617,11 @@ bucket_size cache_replacement_policy One of either lru, fifo or random. -discard - Boolean; if on a discard/TRIM will be issued to each bucket before it is - reused. Defaults to off, since SATA TRIM is an unqueued command (and thus - slow). - freelist_percent Size of the freelist as a percentage of nbuckets. Can be written to to increase the number of buckets kept on the freelist, which lets you artificially reduce the size of the cache at runtime. Mostly for testing - purposes (i.e. testing how different size caches affect your hit rate), but - since buckets are discarded when they move on to the freelist will also make - the SSD's garbage collection easier by effectively giving it more reserved - space. + purposes (i.e. testing how different size caches affect your hit rate). io_errors Number of errors that have occurred, decayed by io_error_halflife. From 70bc173ce06be90b026bb00ea175567c91f006e4 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Thu, 13 Nov 2025 13:36:26 +0800 Subject: [PATCH 096/162] bcache: reduce gc latency by processing less nodes and sleep less time When the bcache device is busy under high I/O load, there are two methods to reduce garbage collection latency, - Process fewer nodes in each loop of the incremental garbage collection in btree_gc_recurse(). - Sleep for less time between two incremental garbage collection loops in bch_btree_gc(). This patch introduces two helper routines to provide different garbage collection node counts and sleep intervals. - btree_gc_min_nodes() If there is no front-end I/O, return 1/128 of the total btree nodes (but at least 10) to process in each incremental loop, otherwise only 10 nodes are returned. Then front-end I/O is able to access the btree earlier. - btree_gc_sleep_ms() If no thread is waiting synchronously for bucket allocation, sleep 100 ms between two incremental GC loops. Otherwise only sleep 10 ms before the next incremental GC loop. Then a faster GC may provide available buckets earlier, to avoid most bcache worker threads being starved by bucket allocation. The idea is inspired by work from Mingzhe Zou and Robert Pang, but is much simpler and the expected behavior is more predictable.
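A rough worked example with an assumed cache set of 100000 btree nodes: when idle, each incremental loop now processes 100000 >> 7 = 781 nodes and GC sleeps 100 ms between loops; once front-end I/O arrives or a thread blocks on bucket allocation, the batch shrinks back to the 10-node minimum and the sleep drops to 10 ms, so waiting allocators get buckets back sooner.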
Signed-off-by: Coly Li Signed-off-by: Robert Pang Signed-off-by: Mingzhe Zou Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 4 ++++ drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 48 +++++++++++++++++++------------------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index db3684819e38..7708d92df23e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -399,7 +399,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) TASK_UNINTERRUPTIBLE); mutex_unlock(&ca->set->bucket_lock); + + atomic_inc(&ca->set->bucket_wait_cnt); schedule(); + atomic_dec(&ca->set->bucket_wait_cnt); + mutex_lock(&ca->set->bucket_lock); } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && !fifo_pop(&ca->free[reserve], r)); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index b8bd6d4a4298..8ccacba85547 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -604,6 +604,7 @@ struct cache_set { */ atomic_t prio_blocked; wait_queue_head_t bucket_wait; + atomic_t bucket_wait_cnt; /* * For any bio we don't skip we subtract the number of sectors from diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 210b59007d98..5d922d301ab6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -89,8 +89,9 @@ * Test module load/unload */ -#define MAX_GC_TIMES 100 -#define MIN_GC_NODES 100 +#define MAX_GC_TIMES_SHIFT 7 /* 128 loops */ +#define GC_NODES_MIN 10 +#define GC_SLEEP_MS_MIN 10 #define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) @@ -1578,29 +1579,29 @@ static unsigned int btree_gc_count_keys(struct btree *b) static size_t btree_gc_min_nodes(struct cache_set *c) { - size_t min_nodes; + size_t min_nodes = GC_NODES_MIN; - /* - * Since incremental GC would stop 100ms when front - * side I/O comes, so when there are many btree nodes, - * if GC only processes constant (100) nodes each time, - * GC would last a long time, and the front side I/Os - * would run out of the buckets (since no new bucket - * can be allocated during GC), and be blocked again. 
- So GC should not process constant nodes, but varied * nodes according to the number of btree nodes, which * realized by dividing GC into constant(100) times, * so when there are many btree nodes, GC can process * more nodes each time, otherwise, GC will process less * nodes each time (but no less than MIN_GC_NODES) - */ - min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; - if (min_nodes < MIN_GC_NODES) - min_nodes = MIN_GC_NODES; + if (atomic_read(&c->search_inflight) == 0) { + size_t n = c->gc_stats.nodes >> MAX_GC_TIMES_SHIFT; + + if (min_nodes < n) + min_nodes = n; + } return min_nodes; } +static uint64_t btree_gc_sleep_ms(struct cache_set *c) +{ + uint64_t sleep_ms; + + if (atomic_read(&c->bucket_wait_cnt) > 0) + sleep_ms = GC_SLEEP_MS_MIN; + else + sleep_ms = GC_SLEEP_MS; + + return sleep_ms; +} static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) @@ -1668,8 +1669,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; - if (atomic_read(&b->c->search_inflight) && - gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { + if (gc->nodes >= (gc->nodes_pre + btree_gc_min_nodes(b->c))) { gc->nodes_pre = gc->nodes; ret = -EAGAIN; break; } @@ -1846,8 +1846,8 @@ static void bch_btree_gc(struct cache_set *c) cond_resched(); if (ret == -EAGAIN) - schedule_timeout_interruptible(msecs_to_jiffies - (GC_SLEEP_MS)); + schedule_timeout_interruptible( + msecs_to_jiffies(btree_gc_sleep_ms(c))); else if (ret) pr_warn("gc failed!\n"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); From 21194c44b6bdf50a27a0e065683d94bae16f69cb Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Thu, 13 Nov 2025 13:36:27 +0800 Subject: [PATCH 097/162] bcache: remove redundant __GFP_NOWARN GFP_NOWAIT already includes __GFP_NOWARN, so let's remove the redundant __GFP_NOWARN. Signed-off-by: Qianfeng Rong Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5d922d301ab6..24ddc353cb30 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -372,7 +372,7 @@ static void do_btree_node_write(struct btree *b) SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); - if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { + if (!bch_bio_alloc_pages(b->bio, GFP_NOWAIT)) { struct bio_vec *bv; void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); struct bvec_iter_all iter_all; From fd82071814d06c7b760fe8d90b932d8a66cffc63 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Thu, 13 Nov 2025 13:36:28 +0800 Subject: [PATCH 098/162] bcache: replace use of system_wq with system_percpu_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API.
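To see the inconsistency concretely, a minimal sketch (the work item and handler are illustrative):

```c
#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work) { }
static DECLARE_WORK(demo_work, demo_fn);

static void demo(void)
{
	/* Both queue onto the same per-cpu system_wq, but only the second
	 * spelling names the queue, and internally it places the work with
	 * WORK_CPU_UNBOUND (no fixed CPU): */
	schedule_work(&demo_work);
	queue_work(system_wq, &demo_work);
}
```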
This patch continues the effort to refactor workqueue APIs, which began with the changes introducing new workqueues and a new alloc_workqueue flag: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") system_wq should be the per-cpu workqueue, yet in this name nothing makes that clear, so replace system_wq with system_percpu_wq. The old wq (system_wq) will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 91a98ebc3f80..92ced6b28cb2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1388,7 +1388,7 @@ static CLOSURE_CALLBACK(cached_dev_flush) bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); - continue_at(cl, cached_dev_free, system_wq); + continue_at(cl, cached_dev_free, system_percpu_wq); } static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) @@ -1400,7 +1400,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) __module_get(THIS_MODULE); INIT_LIST_HEAD(&dc->list); closure_init(&dc->disk.cl, NULL); - set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_percpu_wq); kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); INIT_WORK(&dc->detach, cached_dev_detach_finish); sema_init(&dc->sb_write_mutex, 1); @@ -1513,7 +1513,7 @@ static CLOSURE_CALLBACK(flash_dev_flush) bcache_device_unlink(d); mutex_unlock(&bch_register_lock); kobject_del(&d->kobj); - continue_at(cl, flash_dev_free, system_wq); + continue_at(cl, flash_dev_free, system_percpu_wq); } static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) @@ -1525,7 +1525,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err_ret; closure_init(&d->cl, NULL); - set_closure_fn(&d->cl, flash_dev_flush, system_wq); + set_closure_fn(&d->cl, flash_dev_flush, system_percpu_wq); kobject_init(&d->kobj, &bch_flash_dev_ktype); @@ -1833,7 +1833,7 @@ static CLOSURE_CALLBACK(__cache_set_unregister) mutex_unlock(&bch_register_lock); - continue_at(cl, cache_set_flush, system_wq); + continue_at(cl, cache_set_flush, system_percpu_wq); } void bch_cache_set_stop(struct cache_set *c) @@ -1863,10 +1863,10 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) __module_get(THIS_MODULE); closure_init(&c->cl, NULL); - set_closure_fn(&c->cl, cache_set_free, system_wq); + set_closure_fn(&c->cl, cache_set_free, system_percpu_wq); closure_init(&c->caching, &c->cl); - set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + set_closure_fn(&c->caching, __cache_set_unregister, system_percpu_wq); /* Maybe create continue_at_noreturn() and use it here? */ closure_set_stopped(&c->cl); @@ -2528,7 +2528,7 @@ static void register_device_async(struct async_reg_args *args) INIT_DELAYED_WORK(&args->reg_work, register_cache_worker); /* 10 jiffies is enough for a delay */ - queue_delayed_work(system_wq, &args->reg_work, 10); + queue_delayed_work(system_percpu_wq, &args->reg_work, 10); } static void *alloc_holder_object(struct cache_sb *sb) @@ -2909,11 +2909,11 @@ static int __init bcache_init(void) /* * Let's not make this `WQ_MEM_RECLAIM` for the following reasons: * - * 1. It used `system_wq` before which also does no memory reclaim. + * 1.
It used `system_percpu_wq` before which also does no memory reclaim. * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and * reduced throughput can be observed. * - * We still want to user our own queue to not congest the `system_wq`. + * We still want to user our own queue to not congest the `system_percpu_wq`. */ bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); if (!bch_flush_wq) From c0c808214249c32a8961999e0779b953095b0074 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Thu, 13 Nov 2025 13:36:29 +0800 Subject: [PATCH 099/162] bcache: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the workqueue used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and queue_work(), which again uses WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch continues the effort to refactor workqueue APIs, which began with the changes introducing new workqueues and a new alloc_workqueue flag: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") This change adds a new WQ_PERCPU flag to explicitly request alloc_workqueue() to be per-cpu when WQ_UNBOUND has not been specified. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default.
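As a sketch of what the migration means for a caller (hypothetical queue name, not taken from this patch):

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* hypothetical */

static int example_init(void)
{
	/*
	 * Per-cpu placement is now requested explicitly via WQ_PERCPU;
	 * before this series, alloc_workqueue("example", WQ_MEM_RECLAIM, 0)
	 * got per-cpu behaviour implicitly.
	 */
	example_wq = alloc_workqueue("example", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}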
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 3 ++- drivers/md/bcache/super.c | 10 ++++++---- drivers/md/bcache/writeback.c | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 24ddc353cb30..3ed39c823826 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2822,7 +2822,8 @@ void bch_btree_exit(void) int __init bch_btree_init(void) { - btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); + btree_io_wq = alloc_workqueue("bch_btree_io", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!btree_io_wq) return -ENOMEM; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 92ced6b28cb2..c17d4517af22 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1939,7 +1939,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (!c->uuids) goto err; - c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0); + c->moving_gc_wq = alloc_workqueue("bcache_gc", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!c->moving_gc_wq) goto err; @@ -2902,7 +2903,7 @@ static int __init bcache_init(void) if (bch_btree_init()) goto err; - bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0); + bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bcache_wq) goto err; @@ -2915,11 +2916,12 @@ static int __init bcache_init(void) * * We still want to user our own queue to not congest the `system_percpu_wq`. */ - bch_flush_wq = alloc_workqueue("bch_flush", 0, 0); + bch_flush_wq = alloc_workqueue("bch_flush", WQ_PERCPU, 0); if (!bch_flush_wq) goto err; - bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0); + bch_journal_wq = alloc_workqueue("bch_journal", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!bch_journal_wq) goto err; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index cffef33b4acf..4b237074f453 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1075,7 +1075,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!dc->writeback_write_wq) return -ENOMEM; From 699122b590ebbc450737eebde3ab8f5b871cc7f0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 13 Nov 2025 13:36:30 +0800 Subject: [PATCH 100/162] bcache: Avoid -Wflex-array-member-not-at-end warning -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the new TRAILING_OVERLAP() helper to fix the following warning: drivers/md/bcache/bset.h:330:27: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] This helper creates a union between a flexible-array member (FAM) and a set of MEMBERS that would otherwise follow it. This overlays the trailing MEMBER struct btree_iter_set stack_data[MAX_BSETS]; onto the FAM struct btree_iter::data[], while keeping the FAM and the start of MEMBER aligned. The static_assert() ensures this alignment remains, and it's intentionally placed immediately after the corresponding structures --no blank line in between. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bset.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 011f6062c4c0..6ee2c6a506a2 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -327,9 +327,13 @@ struct btree_iter { /* Fixed-size btree_iter that can be allocated on the stack */ struct btree_iter_stack { - struct btree_iter iter; - struct btree_iter_set stack_data[MAX_BSETS]; + /* Must be last as it ends in a flexible-array member. */ + TRAILING_OVERLAP(struct btree_iter, iter, data, + struct btree_iter_set stack_data[MAX_BSETS]; + ); }; +static_assert(offsetof(struct btree_iter_stack, iter.data) == + offsetof(struct btree_iter_stack, stack_data)); typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); From 232143b605387b372dee0ec7830f93b93df5f67d Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Nov 2025 14:28:18 +0530 Subject: [PATCH 101/162] block: unify elevator tags and type xarrays into struct elv_change_ctx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the nr_hw_queues update path manages two disjoint xarrays — one for elevator tags and another for elevator type — both used during elevator switching. Maintaining these two parallel structures for the same purpose adds unnecessary complexity and potential for mismatched state. This patch unifies both xarrays into a single structure, struct elv_change_ctx, which holds all per-queue elevator change context. A single xarray, named elv_tbl, now maps each queue (q->id) in a tagset to its corresponding elv_change_ctx entry, encapsulating the elevator tags, type and name references. This unification simplifies the code, improves maintainability, and clarifies ownership of per-queue elevator state. Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Signed-off-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 78 +++++++++++++++++++++++++++++++++----------- block/blk-mq-sched.h | 3 ++ block/blk-mq.c | 50 ++++++++++++++++------------ block/blk.h | 7 ++-- block/elevator.c | 31 ++++-------------- block/elevator.h | 15 +++++++++ 6 files changed, 116 insertions(+), 68 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 97b69fbe26f6..7ba285a7759b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -427,11 +427,11 @@ void blk_mq_free_sched_tags(struct elevator_tags *et, kfree(et); } -void blk_mq_free_sched_tags_batch(struct xarray *et_table, +void blk_mq_free_sched_tags_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set) { struct request_queue *q; - struct elevator_tags *et; + struct elv_change_ctx *ctx; lockdep_assert_held_write(&set->update_nr_hwq_lock); @@ -444,15 +444,49 @@ void blk_mq_free_sched_tags_batch(struct xarray *et_table, * concurrently. 
*/ if (q->elevator) { - et = xa_load(et_table, q->id); - if (unlikely(!et)) + ctx = xa_load(elv_tbl, q->id); + if (!ctx || !ctx->et) { WARN_ON_ONCE(1); - else - blk_mq_free_sched_tags(et, set); + continue; + } + blk_mq_free_sched_tags(ctx->et, set); + ctx->et = NULL; } } } +void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl) +{ + unsigned long i; + struct elv_change_ctx *ctx; + + xa_for_each(elv_tbl, i, ctx) { + xa_erase(elv_tbl, i); + kfree(ctx); + } +} + +int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, + struct blk_mq_tag_set *set) +{ + struct request_queue *q; + struct elv_change_ctx *ctx; + + lockdep_assert_held_write(&set->update_nr_hwq_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + ctx = kzalloc(sizeof(struct elv_change_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (xa_insert(elv_tbl, q->id, ctx, GFP_KERNEL)) { + kfree(ctx); + return -ENOMEM; + } + } + return 0; +} + struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests) { @@ -497,12 +531,13 @@ out: return NULL; } -int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, +int blk_mq_alloc_sched_tags_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues) { + struct elv_change_ctx *ctx; struct request_queue *q; struct elevator_tags *et; - gfp_t gfp = GFP_NOIO | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; + int ret = -ENOMEM; lockdep_assert_held_write(&set->update_nr_hwq_lock); @@ -515,26 +550,31 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, * concurrently. */ if (q->elevator) { - et = blk_mq_alloc_sched_tags(set, nr_hw_queues, - blk_mq_default_nr_requests(set)); - if (!et) + ctx = xa_load(elv_tbl, q->id); + if (WARN_ON_ONCE(!ctx)) { + ret = -ENOENT; goto out_unwind; - if (xa_insert(et_table, q->id, et, gfp)) - goto out_free_tags; + } + + ctx->et = blk_mq_alloc_sched_tags(set, nr_hw_queues, + blk_mq_default_nr_requests(set)); + if (!ctx->et) + goto out_unwind; + } } return 0; -out_free_tags: - blk_mq_free_sched_tags(et, set); out_unwind: list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { if (q->elevator) { - et = xa_load(et_table, q->id); - if (et) - blk_mq_free_sched_tags(et, set); + ctx = xa_load(elv_tbl, q->id); + if (ctx && ctx->et) { + blk_mq_free_sched_tags(ctx->et, set); + ctx->et = NULL; + } } } - return -ENOMEM; + return ret; } /* caller must have a reference to @e, will grab another one if successful */ diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 8e21a6b1415d..2fddbc91a235 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -27,6 +27,9 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); +int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, + struct blk_mq_tag_set *set); +void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); void blk_mq_free_sched_tags_batch(struct xarray *et_table, diff --git a/block/blk-mq.c b/block/blk-mq.c index b2fdeaac0efb..04c6356f0eb2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4989,27 +4989,28 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, * Switch back to the elevator type stored in the xarray. 
*/ static void blk_mq_elv_switch_back(struct request_queue *q, - struct xarray *elv_tbl, struct xarray *et_tbl) + struct xarray *elv_tbl) { - struct elevator_type *e = xa_load(elv_tbl, q->id); - struct elevator_tags *t = xa_load(et_tbl, q->id); + struct elv_change_ctx *ctx = xa_load(elv_tbl, q->id); + + if (WARN_ON_ONCE(!ctx)) + return; /* The elv_update_nr_hw_queues unfreezes the queue. */ - elv_update_nr_hw_queues(q, e, t); + elv_update_nr_hw_queues(q, ctx); /* Drop the reference acquired in blk_mq_elv_switch_none. */ - if (e) - elevator_put(e); + if (ctx->type) + elevator_put(ctx->type); } /* - * Stores elevator type in xarray and set current elevator to none. It uses - * q->id as an index to store the elevator type into the xarray. + * Stores elevator name and type in ctx and set current elevator to none. */ static int blk_mq_elv_switch_none(struct request_queue *q, struct xarray *elv_tbl) { - int ret = 0; + struct elv_change_ctx *ctx; lockdep_assert_held_write(&q->tag_set->update_nr_hwq_lock); @@ -5021,10 +5022,11 @@ static int blk_mq_elv_switch_none(struct request_queue *q, * can't run concurrently. */ if (q->elevator) { + ctx = xa_load(elv_tbl, q->id); + if (WARN_ON_ONCE(!ctx)) + return -ENOENT; - ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL); - if (WARN_ON_ONCE(ret)) - return ret; + ctx->name = q->elevator->type->elevator_name; /* * Before we switch elevator to 'none', take a reference to @@ -5035,9 +5037,14 @@ static int blk_mq_elv_switch_none(struct request_queue *q, */ __elevator_get(q->elevator->type); + /* + * Store elevator type so that we can release the reference + * taken above later. + */ + ctx->type = q->elevator->type; elevator_set_none(q); } - return ret; + return 0; } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, @@ -5047,7 +5054,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int prev_nr_hw_queues = set->nr_hw_queues; unsigned int memflags; int i; - struct xarray elv_tbl, et_tbl; + struct xarray elv_tbl; bool queues_frozen = false; lockdep_assert_held(&set->tag_list_lock); @@ -5061,11 +5068,12 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, memflags = memalloc_noio_save(); - xa_init(&et_tbl); - if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) - goto out_memalloc_restore; - xa_init(&elv_tbl); + if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0) + goto out_free_ctx; + + if (blk_mq_alloc_sched_tags_batch(&elv_tbl, set, nr_hw_queues) < 0) + goto out_free_ctx; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); @@ -5111,7 +5119,7 @@ switch_back: /* switch_back expects queue to be frozen */ if (!queues_frozen) blk_mq_freeze_queue_nomemsave(q); - blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl); + blk_mq_elv_switch_back(q, &elv_tbl); } list_for_each_entry(q, &set->tag_list, tag_set_list) { @@ -5122,9 +5130,9 @@ switch_back: blk_mq_add_hw_queues_cpuhp(q); } +out_free_ctx: + blk_mq_free_sched_ctx_batch(&elv_tbl); xa_destroy(&elv_tbl); - xa_destroy(&et_tbl); -out_memalloc_restore: memalloc_noio_restore(memflags); /* Free the excess tags when nr_hw_queues shrink. */ diff --git a/block/blk.h b/block/blk.h index 4d809588b771..e4c433f62dfc 100644 --- a/block/blk.h +++ b/block/blk.h @@ -11,8 +11,7 @@ #include #include "blk-crypto-internal.h" -struct elevator_type; -struct elevator_tags; +struct elv_change_ctx; /* * Default upper limit for the software max_sectors limit used for regular I/Os. 
@@ -333,8 +332,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_insert_flush(struct request *rq); -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *t); +void elv_update_nr_hw_queues(struct request_queue *q, + struct elv_change_ctx *ctx); void elevator_set_default(struct request_queue *q); void elevator_set_none(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index e2ebfbf107b3..cd7bdff205c8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -45,19 +45,6 @@ #include "blk-wbt.h" #include "blk-cgroup.h" -/* Holding context data for changing elevator */ -struct elv_change_ctx { - const char *name; - bool no_uevent; - - /* for unregistering old elevator */ - struct elevator_queue *old; - /* for registering new elevator */ - struct elevator_queue *new; - /* holds sched tags data */ - struct elevator_tags *et; -}; - static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -706,32 +693,28 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) * The I/O scheduler depends on the number of hardware queues, this forces a * reattachment when nr_hw_queues changes. */ -void elv_update_nr_hw_queues(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *t) +void elv_update_nr_hw_queues(struct request_queue *q, + struct elv_change_ctx *ctx) { struct blk_mq_tag_set *set = q->tag_set; - struct elv_change_ctx ctx = {}; int ret = -ENODEV; WARN_ON_ONCE(q->mq_freeze_depth == 0); - if (e && !blk_queue_dying(q) && blk_queue_registered(q)) { - ctx.name = e->elevator_name; - ctx.et = t; - + if (ctx->type && !blk_queue_dying(q) && blk_queue_registered(q)) { mutex_lock(&q->elevator_lock); /* force to reattach elevator after nr_hw_queue is updated */ - ret = elevator_switch(q, &ctx); + ret = elevator_switch(q, ctx); mutex_unlock(&q->elevator_lock); } blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) - WARN_ON_ONCE(elevator_change_done(q, &ctx)); + WARN_ON_ONCE(elevator_change_done(q, ctx)); /* * Free sched tags if it's allocated but we couldn't switch elevator. */ - if (t && !ctx.new) - blk_mq_free_sched_tags(t, set); + if (ctx->et && !ctx->new) + blk_mq_free_sched_tags(ctx->et, set); } /* diff --git a/block/elevator.h b/block/elevator.h index c4d20155065e..bad43182361e 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -32,6 +32,21 @@ struct elevator_tags { struct blk_mq_tags *tags[]; }; +/* Holding context data for changing elevator */ +struct elv_change_ctx { + const char *name; + bool no_uevent; + + /* for unregistering old elevator */ + struct elevator_queue *old; + /* for registering new elevator */ + struct elevator_queue *new; + /* store elevator type */ + struct elevator_type *type; + /* holds sched tags data */ + struct elevator_tags *et; +}; + struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_queue *); void (*exit_sched)(struct elevator_queue *); From 04728ce90966c54417fd8120a3820104d18ba68d Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Nov 2025 14:28:19 +0530 Subject: [PATCH 102/162] block: move elevator tags into struct elevator_resources This patch introduces a new structure, struct elevator_resources, to group together all elevator-related resources that share the same lifetime. As a first step, this change moves the elevator tag pointer from struct elv_change_ctx into the new struct elevator_resources. 
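Conceptually, the grouping gives allocation and teardown a single handle. A rough sketch of the intended calling pattern (hypothetical caller; the authoritative definitions are in the diff below):

static int example_prepare_switch(struct request_queue *q,
				  struct blk_mq_tag_set *set)
{
	struct elevator_resources res = {};
	int ret;

	/* allocate every per-elevator resource through one handle */
	ret = blk_mq_alloc_sched_res(q, &res, set->nr_hw_queues);
	if (ret)
		return ret;

	/* ... attempt the elevator switch here ... */

	/* on failure, the whole bundle is released as one unit */
	blk_mq_free_sched_res(&res, set);
	return 0;
}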
Additionally, rename blk_mq_alloc_sched_tags_batch() and blk_mq_free_sched_tags_batch() to blk_mq_alloc_sched_res_batch() and blk_mq_free_sched_res_batch(), respectively. Introduce two new wrapper helpers, blk_mq_alloc_sched_res() and blk_mq_free_sched_res(), around blk_mq_alloc_sched_tags() and blk_mq_free_sched_tags(). These changes pave the way for consolidating the allocation and freeing of elevator-specific resources into common helper functions. This refactoring improves encapsulation and prepares the code for future extensions, allowing additional elevator-specific data to be added to struct elevator_resources without cluttering struct elv_change_ctx. Subsequent patches will extend struct elevator_resources to include other elevator-related data. Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Signed-off-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 48 ++++++++++++++++++++++++++++++-------------- block/blk-mq-sched.h | 10 ++++++--- block/blk-mq.c | 2 +- block/elevator.c | 31 ++++++++++++++-------------- block/elevator.h | 9 +++++++-- 5 files changed, 64 insertions(+), 36 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 7ba285a7759b..32ae57518298 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -427,7 +427,16 @@ void blk_mq_free_sched_tags(struct elevator_tags *et, kfree(et); } -void blk_mq_free_sched_tags_batch(struct xarray *elv_tbl, +void blk_mq_free_sched_res(struct elevator_resources *res, + struct blk_mq_tag_set *set) +{ + if (res->et) { + blk_mq_free_sched_tags(res->et, set); + res->et = NULL; + } +} + +void blk_mq_free_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set) { struct request_queue *q; @@ -445,12 +454,11 @@ void blk_mq_free_sched_tags_batch(struct xarray *elv_tbl, */ if (q->elevator) { ctx = xa_load(elv_tbl, q->id); - if (!ctx || !ctx->et) { + if (!ctx) { WARN_ON_ONCE(1); continue; } - blk_mq_free_sched_tags(ctx->et, set); - ctx->et = NULL; + blk_mq_free_sched_res(&ctx->res, set); } } } @@ -531,12 +539,24 @@ out: return NULL; } -int blk_mq_alloc_sched_tags_batch(struct xarray *elv_tbl, +int blk_mq_alloc_sched_res(struct request_queue *q, + struct elevator_resources *res, unsigned int nr_hw_queues) +{ + struct blk_mq_tag_set *set = q->tag_set; + + res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues, + blk_mq_default_nr_requests(set)); + if (!res->et) + return -ENOMEM; + + return 0; +} + +int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues) { struct elv_change_ctx *ctx; struct request_queue *q; - struct elevator_tags *et; int ret = -ENOMEM; lockdep_assert_held_write(&set->update_nr_hwq_lock); @@ -556,11 +576,10 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *elv_tbl, goto out_unwind; } - ctx->et = blk_mq_alloc_sched_tags(set, nr_hw_queues, - blk_mq_default_nr_requests(set)); - if (!ctx->et) + ret = blk_mq_alloc_sched_res(q, &ctx->res, + nr_hw_queues); + if (ret) goto out_unwind; - } } return 0; @@ -568,10 +587,8 @@ out_unwind: list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { if (q->elevator) { ctx = xa_load(elv_tbl, q->id); - if (ctx && ctx->et) { - blk_mq_free_sched_tags(ctx->et, set); - ctx->et = NULL; - } + if (ctx) + blk_mq_free_sched_res(&ctx->res, set); } } return ret; @@ -579,9 +596,10 @@ out_unwind: /* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *et) + struct 
elevator_resources *res) { unsigned int flags = q->tag_set->flags; + struct elevator_tags *et = res->et; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; unsigned long i; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 2fddbc91a235..1f8e58dd4b49 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -19,20 +19,24 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, - struct elevator_tags *et); + struct elevator_resources *res); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests); -int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, +int blk_mq_alloc_sched_res(struct request_queue *q, + struct elevator_resources *res, unsigned int nr_hw_queues); +int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set); void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); -void blk_mq_free_sched_tags_batch(struct xarray *et_table, +void blk_mq_free_sched_res(struct elevator_resources *res, + struct blk_mq_tag_set *set); +void blk_mq_free_sched_res_batch(struct xarray *et_table, struct blk_mq_tag_set *set); static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq.c b/block/blk-mq.c index 04c6356f0eb2..0d38daaa4705 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -5072,7 +5072,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (blk_mq_alloc_sched_ctx_batch(&elv_tbl, set) < 0) goto out_free_ctx; - if (blk_mq_alloc_sched_tags_batch(&elv_tbl, set, nr_hw_queues) < 0) + if (blk_mq_alloc_sched_res_batch(&elv_tbl, set, nr_hw_queues) < 0) goto out_free_ctx; list_for_each_entry(q, &set->tag_list, tag_set_list) { diff --git a/block/elevator.c b/block/elevator.c index cd7bdff205c8..cbec292a4af5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -580,7 +580,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) } if (new_e) { - ret = blk_mq_init_sched(q, new_e, ctx->et); + ret = blk_mq_init_sched(q, new_e, &ctx->res); if (ret) goto out_unfreeze; ctx->new = q->elevator; @@ -604,7 +604,8 @@ out_unfreeze: return ret; } -static void elv_exit_and_release(struct request_queue *q) +static void elv_exit_and_release(struct elv_change_ctx *ctx, + struct request_queue *q) { struct elevator_queue *e; unsigned memflags; @@ -616,7 +617,7 @@ static void elv_exit_and_release(struct request_queue *q) mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); if (e) { - blk_mq_free_sched_tags(e->et, q->tag_set); + blk_mq_free_sched_res(&ctx->res, q->tag_set); kobject_put(&e->kobj); } } @@ -627,11 +628,12 @@ static int elevator_change_done(struct request_queue *q, int ret = 0; if (ctx->old) { + struct elevator_resources res = {.et = ctx->old->et}; bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &ctx->old->flags); elv_unregister_queue(q, ctx->old); - blk_mq_free_sched_tags(ctx->old->et, q->tag_set); + blk_mq_free_sched_res(&res, q->tag_set); kobject_put(&ctx->old->kobj); if (enable_wbt) wbt_enable_default(q->disk); @@ 
-639,7 +641,7 @@ static int elevator_change_done(struct request_queue *q, if (ctx->new) { ret = elv_register_queue(q, ctx->new, !ctx->no_uevent); if (ret) - elv_exit_and_release(q); + elv_exit_and_release(ctx, q); } return ret; } @@ -656,10 +658,9 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) lockdep_assert_held(&set->update_nr_hwq_lock); if (strncmp(ctx->name, "none", 4)) { - ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues, - blk_mq_default_nr_requests(set)); - if (!ctx->et) - return -ENOMEM; + ret = blk_mq_alloc_sched_res(q, &ctx->res, set->nr_hw_queues); + if (ret) + return ret; } memflags = blk_mq_freeze_queue(q); @@ -681,10 +682,10 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) if (!ret) ret = elevator_change_done(q, ctx); /* - * Free sched tags if it's allocated but we couldn't switch elevator. + * Free sched resource if it's allocated but we couldn't switch elevator. */ - if (ctx->et && !ctx->new) - blk_mq_free_sched_tags(ctx->et, set); + if (!ctx->new) + blk_mq_free_sched_res(&ctx->res, set); return ret; } @@ -711,10 +712,10 @@ void elv_update_nr_hw_queues(struct request_queue *q, if (!ret) WARN_ON_ONCE(elevator_change_done(q, ctx)); /* - * Free sched tags if it's allocated but we couldn't switch elevator. + * Free sched resource if it's allocated but we couldn't switch elevator. */ - if (ctx->et && !ctx->new) - blk_mq_free_sched_tags(ctx->et, set); + if (!ctx->new) + blk_mq_free_sched_res(&ctx->res, set); } /* diff --git a/block/elevator.h b/block/elevator.h index bad43182361e..621a63597249 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -32,6 +32,11 @@ struct elevator_tags { struct blk_mq_tags *tags[]; }; +struct elevator_resources { + /* holds elevator tags */ + struct elevator_tags *et; +}; + /* Holding context data for changing elevator */ struct elv_change_ctx { const char *name; @@ -43,8 +48,8 @@ struct elv_change_ctx { /* for unregistering old elevator */ struct elevator_queue *old; /* for registering new elevator */ struct elevator_queue *new; /* store elevator type */ struct elevator_type *type; - /* holds sched tags data */ - struct elevator_tags *et; + /* store elevator resources */ + struct elevator_resources res; }; struct elevator_mq_ops { From 61019afdf6ac17c8e8f9c42665aa1fa82f04a3e2 Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Nov 2025 14:28:20 +0530 Subject: [PATCH 103/162] block: introduce alloc_sched_data and free_sched_data elevator methods The recent lockdep splat [1] highlights a potential deadlock risk involving ->elevator_lock and ->freeze_lock dependencies on pcpu_alloc_mutex. The trace shows that the issue occurs when the Kyber scheduler allocates dynamic memory for its elevator data during initialization. To address this, introduce two new elevator operation callbacks: ->alloc_sched_data and ->free_sched_data. The subsequent patch builds upon these newly introduced methods to suppress the lockdep splat [1].
[1] https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/ Signed-off-by: Nilay Shroff Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.h | 24 ++++++++++++++++++++++++ block/elevator.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 1f8e58dd4b49..4e1b86e85a8a 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -38,6 +38,30 @@ void blk_mq_free_sched_res(struct elevator_resources *res, struct blk_mq_tag_set *set); void blk_mq_free_sched_res_batch(struct xarray *et_table, struct blk_mq_tag_set *set); +/* + * blk_mq_alloc_sched_data() - Allocates scheduler specific data + * Returns: + * - Pointer to allocated data on success + * - NULL if no allocation needed + * - ERR_PTR(-ENOMEM) in case of failure + */ +static inline void *blk_mq_alloc_sched_data(struct request_queue *q, + struct elevator_type *e) +{ + void *sched_data; + + if (!e || !e->ops.alloc_sched_data) + return NULL; + + sched_data = e->ops.alloc_sched_data(q); + return (sched_data) ?: ERR_PTR(-ENOMEM); +} + +static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data) +{ + if (e && e->ops.free_sched_data) + e->ops.free_sched_data(data); +} static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { diff --git a/block/elevator.h b/block/elevator.h index 621a63597249..e34043f6da26 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -58,6 +58,8 @@ struct elevator_mq_ops { int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*depth_updated)(struct request_queue *); + void *(*alloc_sched_data)(struct request_queue *); + void (*free_sched_data)(void *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); From 0315476e78c050048e80f66334a310e5581b46bb Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Nov 2025 14:28:21 +0530 Subject: [PATCH 104/162] block: use {alloc|free}_sched_data methods The previous patch introduced ->alloc_sched_data and ->free_sched_data methods. This patch builds upon that by using these methods during elevator switch and nr_hw_queues update. Scheduler-specific data is also allocated and freed through the new callbacks outside of the ->freeze_lock and ->elevator_lock locking contexts, thereby preventing any dependency on pcpu_alloc_mutex.
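A scheduler adopting the two hooks would wire them up roughly as below (hypothetical 'foo' scheduler and foo_data type; kyber's actual conversion follows in the next patch):

#include <linux/slab.h>

struct foo_data {
	int example_field;	/* scheduler-private state */
};

static void *foo_alloc_sched_data(struct request_queue *q)
{
	/*
	 * Runs before ->freeze_lock and ->elevator_lock are taken, so an
	 * allocation that depends on pcpu_alloc_mutex is safe here.
	 * Returning NULL is reported as -ENOMEM by the wrapper above.
	 */
	return kzalloc(sizeof(struct foo_data), GFP_KERNEL);
}

static void foo_free_sched_data(void *data)
{
	kfree(data);	/* likewise called outside the locks */
}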
Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Signed-off-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 27 +++++++++++++++++++++------ block/blk-mq-sched.h | 5 ++++- block/elevator.c | 34 ++++++++++++++++++++++------------ block/elevator.h | 4 +++- 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 32ae57518298..e26898128a7e 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -428,12 +428,17 @@ void blk_mq_free_sched_tags(struct elevator_tags *et, } void blk_mq_free_sched_res(struct elevator_resources *res, + struct elevator_type *type, struct blk_mq_tag_set *set) { if (res->et) { blk_mq_free_sched_tags(res->et, set); res->et = NULL; } + if (res->data) { + blk_mq_free_sched_data(type, res->data); + res->data = NULL; + } } void blk_mq_free_sched_res_batch(struct xarray *elv_tbl, @@ -458,7 +463,7 @@ void blk_mq_free_sched_res_batch(struct xarray *elv_tbl, WARN_ON_ONCE(1); continue; } - blk_mq_free_sched_res(&ctx->res, set); + blk_mq_free_sched_res(&ctx->res, ctx->type, set); } } } @@ -540,7 +545,9 @@ out: } int blk_mq_alloc_sched_res(struct request_queue *q, - struct elevator_resources *res, unsigned int nr_hw_queues) + struct elevator_type *type, + struct elevator_resources *res, + unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; @@ -549,6 +556,12 @@ int blk_mq_alloc_sched_res(struct request_queue *q, if (!res->et) return -ENOMEM; + res->data = blk_mq_alloc_sched_data(q, type); + if (IS_ERR(res->data)) { + blk_mq_free_sched_tags(res->et, set); + return -ENOMEM; + } + return 0; } @@ -576,19 +589,21 @@ int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, goto out_unwind; } - ret = blk_mq_alloc_sched_res(q, &ctx->res, - nr_hw_queues); + ret = blk_mq_alloc_sched_res(q, q->elevator->type, + &ctx->res, nr_hw_queues); if (ret) goto out_unwind; } } return 0; + out_unwind: list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) { if (q->elevator) { ctx = xa_load(elv_tbl, q->id); if (ctx) - blk_mq_free_sched_res(&ctx->res, set); + blk_mq_free_sched_res(&ctx->res, + ctx->type, set); } } return ret; @@ -605,7 +620,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e, unsigned long i; int ret; - eq = elevator_alloc(q, e, et); + eq = elevator_alloc(q, e, res); if (!eq) return -ENOMEM; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 4e1b86e85a8a..02c40a72e959 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -26,7 +26,9 @@ void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_res(struct request_queue *q, - struct elevator_resources *res, unsigned int nr_hw_queues); + struct elevator_type *type, + struct elevator_resources *res, + unsigned int nr_hw_queues); int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl, @@ -35,6 +37,7 @@ void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl); void blk_mq_free_sched_tags(struct elevator_tags *et, struct blk_mq_tag_set *set); void blk_mq_free_sched_res(struct elevator_resources *res, + struct elevator_type *type, struct blk_mq_tag_set *set); void blk_mq_free_sched_res_batch(struct xarray *et_table, struct blk_mq_tag_set *set); diff --git a/block/elevator.c b/block/elevator.c index cbec292a4af5..5b37ef44f52d 100644 --- 
a/block/elevator.c +++ b/block/elevator.c @@ -121,7 +121,7 @@ static struct elevator_type *elevator_find_get(const char *name) static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, - struct elevator_type *e, struct elevator_tags *et) + struct elevator_type *e, struct elevator_resources *res) { struct elevator_queue *eq; @@ -134,7 +134,8 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); - eq->et = et; + eq->et = res->et; + eq->elevator_data = res->data; return eq; } @@ -617,7 +618,7 @@ static void elv_exit_and_release(struct elv_change_ctx *ctx, mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); if (e) { - blk_mq_free_sched_res(&ctx->res, q->tag_set); + blk_mq_free_sched_res(&ctx->res, ctx->type, q->tag_set); kobject_put(&e->kobj); } } @@ -628,12 +629,15 @@ static int elevator_change_done(struct request_queue *q, int ret = 0; if (ctx->old) { - struct elevator_resources res = {.et = ctx->old->et}; + struct elevator_resources res = { + .et = ctx->old->et, + .data = ctx->old->elevator_data + }; bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT, &ctx->old->flags); elv_unregister_queue(q, ctx->old); - blk_mq_free_sched_res(&res, q->tag_set); + blk_mq_free_sched_res(&res, ctx->old->type, q->tag_set); kobject_put(&ctx->old->kobj); if (enable_wbt) wbt_enable_default(q->disk); @@ -658,7 +662,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) lockdep_assert_held(&set->update_nr_hwq_lock); if (strncmp(ctx->name, "none", 4)) { - ret = blk_mq_alloc_sched_res(q, &ctx->res, set->nr_hw_queues); + ret = blk_mq_alloc_sched_res(q, ctx->type, &ctx->res, + set->nr_hw_queues); if (ret) return ret; } @@ -681,11 +686,12 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) blk_mq_unfreeze_queue(q, memflags); if (!ret) ret = elevator_change_done(q, ctx); + /* * Free sched resource if it's allocated but we couldn't switch elevator. */ if (!ctx->new) - blk_mq_free_sched_res(&ctx->res, set); + blk_mq_free_sched_res(&ctx->res, ctx->type, set); return ret; } @@ -711,11 +717,12 @@ void elv_update_nr_hw_queues(struct request_queue *q, blk_mq_unfreeze_queue_nomemrestore(q); if (!ret) WARN_ON_ONCE(elevator_change_done(q, ctx)); + /* * Free sched resource if it's allocated but we couldn't switch elevator. */ if (!ctx->new) - blk_mq_free_sched_res(&ctx->res, set); + blk_mq_free_sched_res(&ctx->res, ctx->type, set); } /* @@ -729,7 +736,6 @@ void elevator_set_default(struct request_queue *q) .no_uevent = true, }; int err; - struct elevator_type *e; /* now we allow to switch elevator */ blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q); @@ -742,8 +748,8 @@ void elevator_set_default(struct request_queue *q) * have multiple queues or mq-deadline is not available, default * to "none". 
*/ - e = elevator_find_get(ctx.name); - if (!e) + ctx.type = elevator_find_get(ctx.name); + if (!ctx.type) return; if ((q->nr_hw_queues == 1 || @@ -753,7 +759,7 @@ void elevator_set_default(struct request_queue *q) pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", ctx.name, err); } - elevator_put(e); + elevator_put(ctx.type); } void elevator_set_none(struct request_queue *q) @@ -802,6 +808,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, ctx.name = strstrip(elevator_name); elv_iosched_load_module(ctx.name); + ctx.type = elevator_find_get(ctx.name); down_read(&set->update_nr_hwq_lock); if (!blk_queue_no_elv_switch(q)) { @@ -812,6 +819,9 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, ret = -ENOENT; } up_read(&set->update_nr_hwq_lock); + + if (ctx.type) + elevator_put(ctx.type); return ret; } diff --git a/block/elevator.h b/block/elevator.h index e34043f6da26..3ee1d494f48a 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -33,6 +33,8 @@ struct elevator_tags { }; struct elevator_resources { + /* holds elevator data */ + void *data; /* holds elevator tags */ struct elevator_tags *et; }; @@ -185,7 +187,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count); extern bool elv_bio_merge_ok(struct request *, struct bio *); struct elevator_queue *elevator_alloc(struct request_queue *, - struct elevator_type *, struct elevator_tags *); + struct elevator_type *, struct elevator_resources *); /* * Helper functions. From d4c3ef56a1618cb7d55a4be74a09cda09165745a Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Thu, 13 Nov 2025 14:28:22 +0530 Subject: [PATCH 105/162] block: define alloc_sched_data and free_sched_data methods for kyber Currently, the Kyber elevator allocates its private data dynamically in ->init_sched and frees it in ->exit_sched. However, since ->init_sched is invoked during elevator switch after acquiring both ->freeze_lock and ->elevator_lock, it may trigger the lockdep splat [1] due to dependency on pcpu_alloc_mutex. To resolve this, move the elevator data allocation and deallocation logic from ->init_sched and ->exit_sched into the newly introduced ->alloc_sched_data and ->free_sched_data methods. These callbacks are invoked before acquiring ->freeze_lock and ->elevator_lock, ensuring that memory allocation happens safely without introducing additional locking dependencies. This change breaks the dependency chain involving pcpu_alloc_mutex and prevents the reported lockdep warning. 
[1] https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/ Reported-by: Changhui Zhong Reported-by: Yi Zhang Closes: https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/ Tested-by: Yi Zhang Reviewed-by: Ming Lei Reviewed-by: Yu Kuai Signed-off-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 18efd6ef2a2b..c1b36ffd19ce 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -409,30 +409,42 @@ static void kyber_depth_updated(struct request_queue *q) static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { - struct kyber_queue_data *kqd; - - kqd = kyber_queue_data_alloc(q); - if (IS_ERR(kqd)) - return PTR_ERR(kqd); - blk_stat_enable_accounting(q); blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - eq->elevator_data = kqd; q->elevator = eq; kyber_depth_updated(q); return 0; } +static void *kyber_alloc_sched_data(struct request_queue *q) +{ + struct kyber_queue_data *kqd; + + kqd = kyber_queue_data_alloc(q); + if (IS_ERR(kqd)) + return NULL; + + return kqd; +} + static void kyber_exit_sched(struct elevator_queue *e) { struct kyber_queue_data *kqd = e->elevator_data; - int i; timer_shutdown_sync(&kqd->timer); blk_stat_disable_accounting(kqd->q); +} + +static void kyber_free_sched_data(void *elv_data) +{ + struct kyber_queue_data *kqd = elv_data; + int i; + + if (!kqd) + return; for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); @@ -1004,6 +1016,8 @@ static struct elevator_type kyber_sched = { .exit_sched = kyber_exit_sched, .init_hctx = kyber_init_hctx, .exit_hctx = kyber_exit_hctx, + .alloc_sched_data = kyber_alloc_sched_data, + .free_sched_data = kyber_free_sched_data, .limit_depth = kyber_limit_depth, .bio_merge = kyber_bio_merge, .prepare_request = kyber_prepare_request, From 61d43b1731e0bc122a0f78df42ce424db5b14a19 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 14 Nov 2025 11:07:03 +0200 Subject: [PATCH 106/162] nvme-pci: migrate to dma_map_phys instead of map_page After the introduction of dma_map_phys(), there is no need to convert from a physical address to a struct page in order to map it. So let's use dma_map_phys() directly.
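The conversion pattern, as a sketch (hypothetical helper; paddr, len and dir stand for values taken from the request being mapped):

#include <linux/dma-mapping.h>

static dma_addr_t example_map(struct device *dev, phys_addr_t paddr,
			      size_t len, enum dma_data_direction dir)
{
	/*
	 * Replaces the old round-trip through struct page:
	 *   dma_map_page(dev, phys_to_page(paddr),
	 *                offset_in_page(paddr), len, dir);
	 * attrs stays 0 here; the next patch passes real attributes.
	 */
	return dma_map_phys(dev, paddr, len, dir, 0);
}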
Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Leon Romanovsky Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++-- drivers/nvme/host/pci.c | 25 +++++++++++++------------ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index b4f456472961..cebfead826ee 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -92,8 +92,8 @@ static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter, struct phys_vec *vec) { - iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), - offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); + iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len, + rq_dma_dir(req), 0); if (dma_mapping_error(dma_dev, iter->addr)) { iter->status = BLK_STS_RESOURCE; return false; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 3c1727df1e36..d0dd836ccdb9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -698,20 +698,20 @@ static void nvme_free_descriptors(struct request *req) } } -static void nvme_free_prps(struct request *req) +static void nvme_free_prps(struct request *req, unsigned int attrs) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = req->mq_hctx->driver_data; unsigned int i; for (i = 0; i < iod->nr_dma_vecs; i++) - dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr, - iod->dma_vecs[i].len, rq_dma_dir(req)); + dma_unmap_phys(nvmeq->dev->dev, iod->dma_vecs[i].addr, + iod->dma_vecs[i].len, rq_dma_dir(req), attrs); mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); } static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, - struct nvme_sgl_desc *sg_list) + struct nvme_sgl_desc *sg_list, unsigned int attrs) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; enum dma_data_direction dir = rq_dma_dir(req); @@ -720,13 +720,14 @@ static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, unsigned int i; if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) { - dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir); + dma_unmap_phys(dma_dev, le64_to_cpu(sge->addr), len, dir, + attrs); return; } for (i = 0; i < len / sizeof(*sg_list); i++) - dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), - le32_to_cpu(sg_list[i].length), dir); + dma_unmap_phys(dma_dev, le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), dir, attrs); } static void nvme_unmap_metadata(struct request *req) @@ -747,10 +748,10 @@ static void nvme_unmap_metadata(struct request *req) if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state, iod->meta_total_len)) { if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) - nvme_free_sgls(req, sge, &sge[1]); + nvme_free_sgls(req, sge, &sge[1], 0); else - dma_unmap_page(dma_dev, iod->meta_dma, - iod->meta_total_len, dir); + dma_unmap_phys(dma_dev, iod->meta_dma, + iod->meta_total_len, dir, 0); } if (iod->meta_descriptor) @@ -775,9 +776,9 @@ static void nvme_unmap_data(struct request *req) if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req, iod->descriptors[0], - &iod->cmd.common.dptr.sgl); + &iod->cmd.common.dptr.sgl, 0); else - nvme_free_prps(req); + nvme_free_prps(req, 0); } if (iod->nr_descriptors) From 37f0c7a8df7ad719a68fa1c2dbf066cfebc391a7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 14 Nov 2025 11:07:04 +0200 
Subject: [PATCH 107/162] block-dma: properly take MMIO path In commit eadaa8b255f3 ("dma-mapping: introduce new DMA attribute to indicate MMIO memory"), the DMA_ATTR_MMIO attribute was added to describe MMIO addresses, which require avoiding any memory cache flushing, as an outcome of the discussion pointed to by the Link tag below. In the case of a PCI_P2PDMA_MAP_THRU_HOST_BRIDGE transfer, the blk-mq-dma logic treated this as a regular page and relied on the "struct page" DMA flow. That flow performs CPU cache flushing, which shouldn't be done here, and doesn't set the IOMMU_MMIO flag in the DMA-IOMMU case. As a solution, let's encode the peer-to-peer transaction type in the NVMe IOD flags variable and provide it to the blk-mq-dma API. Link: https://lore.kernel.org/all/f912c446-1ae9-4390-9c11-00dce7bf0fd3@arm.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Leon Romanovsky Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 18 ++++++--- drivers/nvme/host/pci.c | 73 +++++++++++++++++++++++++++++++---- include/linux/bio-integrity.h | 1 - include/linux/blk-integrity.h | 14 ------- include/linux/blk-mq-dma.h | 28 +++++++------- include/linux/blk_types.h | 2 - 6 files changed, 90 insertions(+), 46 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index cebfead826ee..e9108ccaf4b0 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -92,8 +92,13 @@ static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, struct blk_dma_iter *iter, struct phys_vec *vec) { + unsigned int attrs = 0; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len, - rq_dma_dir(req), 0); + rq_dma_dir(req), attrs); if (dma_mapping_error(dma_dev, iter->addr)) { iter->status = BLK_STS_RESOURCE; return false; @@ -108,14 +113,18 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, { enum dma_data_direction dir = rq_dma_dir(req); unsigned int mapped = 0; + unsigned int attrs = 0; int error; iter->addr = state->addr; iter->len = dma_iova_size(state); + if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + do { error = dma_iova_link(dma_dev, state, vec->paddr, mapped, - vec->len, dir, 0); + vec->len, dir, attrs); if (error) break; mapped += vec->len; @@ -162,6 +171,7 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; + iter->p2pdma.map = PCI_P2PDMA_MAP_NONE; /* * Grab the first segment ASAP because we'll need it to check for P2P */ switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: - if (iter->iter.is_integrity) - bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; - else - req->cmd_flags |= REQ_P2PDMA; return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d0dd836ccdb9..9085bed107fd 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -260,8 +260,20 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + /* Data payload contains p2p memory */ + IOD_DATA_P2P = 1U << 3, + + /* Metadata contains p2p memory */ + IOD_META_P2P = 1U << 4, + + /* Data payload contains MMIO
memory */ + IOD_DATA_MMIO = 1U << 5, + + /* Metadata contains MMIO memory */ + IOD_META_MMIO = 1U << 6, + /* Metadata using non-coalesced MPTR */ - IOD_SINGLE_META_SEGMENT = 1U << 5, + IOD_SINGLE_META_SEGMENT = 1U << 7, }; struct nvme_dma_vec { @@ -733,10 +745,12 @@ static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, static void nvme_unmap_metadata(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE; enum dma_data_direction dir = rq_dma_dir(req); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct device *dma_dev = nvmeq->dev->dev; struct nvme_sgl_desc *sge = iod->meta_descriptor; + unsigned int attrs = 0; if (iod->flags & IOD_SINGLE_META_SEGMENT) { dma_unmap_page(dma_dev, iod->meta_dma, @@ -745,13 +759,20 @@ static void nvme_unmap_metadata(struct request *req) return; } - if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state, - iod->meta_total_len)) { + if (iod->flags & IOD_META_P2P) + map = PCI_P2PDMA_MAP_BUS_ADDR; + else if (iod->flags & IOD_META_MMIO) { + map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; + attrs |= DMA_ATTR_MMIO; + } + + if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len, map)) { if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) - nvme_free_sgls(req, sge, &sge[1], 0); + nvme_free_sgls(req, sge, &sge[1], attrs); else dma_unmap_phys(dma_dev, iod->meta_dma, - iod->meta_total_len, dir, 0); + iod->meta_total_len, dir, attrs); } if (iod->meta_descriptor) @@ -761,9 +782,11 @@ static void nvme_unmap_metadata(struct request *req) static void nvme_unmap_data(struct request *req) { + enum pci_p2pdma_map_type map = PCI_P2PDMA_MAP_NONE; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct device *dma_dev = nvmeq->dev->dev; + unsigned int attrs = 0; if (iod->flags & IOD_SINGLE_SEGMENT) { static_assert(offsetof(union nvme_data_ptr, prp1) == @@ -773,12 +796,20 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (iod->flags & IOD_DATA_P2P) + map = PCI_P2PDMA_MAP_BUS_ADDR; + else if (iod->flags & IOD_DATA_MMIO) { + map = PCI_P2PDMA_MAP_THRU_HOST_BRIDGE; + attrs |= DMA_ATTR_MMIO; + } + + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, + map)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req, iod->descriptors[0], - &iod->cmd.common.dptr.sgl, 0); + &iod->cmd.common.dptr.sgl, attrs); else - nvme_free_prps(req, 0); + nvme_free_prps(req, attrs); } if (iod->nr_descriptors) @@ -1049,6 +1080,19 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; + switch (iter.p2pdma.map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + iod->flags |= IOD_DATA_P2P; + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + iod->flags |= IOD_DATA_MMIO; + break; + case PCI_P2PDMA_MAP_NONE: + break; + default: + return BLK_STS_RESOURCE; + } + if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) @@ -1071,6 +1115,19 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) &iod->meta_dma_state, &iter)) return iter.status; + switch (iter.p2pdma.map) { + case PCI_P2PDMA_MAP_BUS_ADDR: + iod->flags |= IOD_META_P2P; + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + iod->flags |= IOD_META_MMIO; + break; + case PCI_P2PDMA_MAP_NONE: + break; + default: + return BLK_STS_RESOURCE; 
+ } + if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) entries = 1; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 3d05296a5afe..21e4652dcfd2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,7 +13,6 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ - BIP_P2P_DMA = 1 << 8, /* using P2P address */ BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c2030fd8ba0a..a6b84206eb94 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -33,14 +33,6 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); -} - int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -129,12 +121,6 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return false; -} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 51829958d872..cb88fc791fbd 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -16,13 +16,13 @@ struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; u32 len; + struct pci_p2pdma_map_state p2pdma; /* Status code. Only valid when blk_rq_dma_map_iter_* returned false */ blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ struct blk_map_iter iter; - struct pci_p2pdma_map_state p2pdma; }; bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, @@ -43,36 +43,34 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_dma_unmap - try to DMA unmap a request + * blk_rq_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap - * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR + * @map: peer-to-peer mapping type * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. 
 */
-static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len, bool is_p2p)
+static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
+		struct dma_iova_state *state, size_t mapped_len,
+		enum pci_p2pdma_map_type map)
 {
-	if (is_p2p)
+	if (map == PCI_P2PDMA_MAP_BUS_ADDR)
 		return true;
 
 	if (dma_use_iova(state)) {
+		unsigned int attrs = 0;
+
+		if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
+			attrs |= DMA_ATTR_MMIO;
+
 		dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req),
-				 0);
+				 attrs);
 		return true;
 	}
 
 	return !dma_need_unmap(dma_dev);
 }
-
-static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev,
-		struct dma_iova_state *state, size_t mapped_len)
-{
-	return blk_dma_unmap(req, dma_dev, state, mapped_len,
-			req->cmd_flags & REQ_P2PDMA);
-}
-
 #endif /* BLK_MQ_DMA_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 53501ebb0623..d884cc1256ec 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -393,7 +393,6 @@ enum req_flag_bits {
 	__REQ_DRV,		/* for driver use */
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
-	__REQ_P2PDMA,		/* contains P2P DMA pages */
 	/*
 	 * Command specific flags, keep last:
 	 */
@@ -426,7 +425,6 @@ enum req_flag_bits {
 #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
-#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)

From 79bd8c9814a273fa7ba43399e1c07adec3fc95db Mon Sep 17 00:00:00 2001
From: Rene Rebe
Date: Fri, 14 Nov 2025 15:30:33 +0100
Subject: [PATCH 108/162] ps3disk: use memcpy_{from,to}_bvec index
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With 6e0a48552b8c (ps3disk: use memcpy_{from,to}_bvec) converting
ps3disk to the new bvec helpers, incrementing the offset was
accidentally lost, corrupting consecutive buffers. Restore the offset
increment so that consecutive data transfers are no longer corrupted.
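For illustration, a minimal userspace sketch (standalone, with invented
names; not the ps3disk code) shows why the per-segment offset increment
matters: without it, every segment lands at offset 0 of the bounce
buffer and overwrites the previous one.

```
/* Hypothetical stand-in for the bounce-buffer gather loop. */
#include <stdio.h>
#include <string.h>

struct seg { const char *data; size_t len; };

int main(void)
{
	const struct seg segs[] = { { "AAAA", 4 }, { "BBBB", 4 }, { "CC", 2 } };
	char bounce[16] = "";
	size_t offset = 0;

	for (size_t i = 0; i < sizeof(segs) / sizeof(segs[0]); i++) {
		memcpy(bounce + offset, segs[i].data, segs[i].len);
		offset += segs[i].len;	/* the increment this patch restores */
	}
	printf("%s\n", bounce);	/* prints AAAABBBBCC */
	return 0;
}
```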
Fixes: 6e0a48552b8c ("ps3disk: use memcpy_{from,to}_bvec")
Signed-off-by: René Rebe
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 drivers/block/ps3disk.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index dc9e4a14b885..8892f218a814 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -85,10 +85,14 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
 	struct bio_vec bvec;
 
 	rq_for_each_segment(bvec, req, iter) {
+		dev_dbg(&dev->sbd.core, "%s:%u: %u sectors from %llu\n",
+			__func__, __LINE__, bio_sectors(iter.bio),
+			iter.bio->bi_iter.bi_sector);
 		if (gather)
 			memcpy_from_bvec(dev->bounce_buf + offset, &bvec);
 		else
 			memcpy_to_bvec(&bvec, dev->bounce_buf + offset);
+		offset += bvec.bv_len;
 	}
 }

From 82d20481024cbae2ea87fe8b86d12961bfda7169 Mon Sep 17 00:00:00 2001
From: Rene Rebe
Date: Fri, 14 Nov 2025 14:41:27 +0100
Subject: [PATCH 109/162] floppy: fix for PAGE_SIZE != 4KB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For years I wondered why the floppy driver does not just work on
sparc64, e.g.:

root@SUNW_375_0066:# disktype /dev/fd0
disktype: Can't open /dev/fd0: No such device or address
[  525.341906] disktype: attempt to access beyond end of device
fd0: rw=0, sector=0, nr_sectors = 16 limit=8
[  525.341991] floppy: error 10 while reading block 0

It turns out that __floppy_read_block_0() in floppy.c tries to read one
page for the first test read to determine the disk size, and thus fails
if PAGE_SIZE is greater than 4k. Adjust the minimum MAX_DISK_SIZE to
PAGE_SIZE to fix the floppy driver on sparc64, and likely all other
PAGE_SIZE != 4KB configs.

Cc: stable@vger.kernel.org
Signed-off-by: René Rebe
Signed-off-by: Jens Axboe
---
 drivers/block/floppy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 5336c3c5ca36..c28786e0fe1c 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -329,7 +329,7 @@ static bool initialized;
  * This default is used whenever the current disk size is unknown.
  * [Now it is rather a minimum]
  */
-#define MAX_DISK_SIZE 4		/* 3984 */
+#define MAX_DISK_SIZE (PAGE_SIZE / 1024)
 
 /*
  * globals used by 'result()'

From 2516c246d01c23a5f5310e9ac78d9f8aad9b1d0e Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Fri, 14 Nov 2025 10:31:45 -0800
Subject: [PATCH 110/162] block: consider discard merge last

If the next discard range is contiguous with the current range being
considered, it's cheaper to expand the current range than to append an
additional bio.
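To make the new precedence concrete, here is a hedged standalone sketch
(simplified types and invented names, not the kernel's structs) of the
resulting check order:

```
#include <stdio.h>

enum merge { NO_MERGE, BACK_MERGE, DISCARD_MERGE };

/* Simplified stand-in for the reordered checks: contiguity first. */
static enum merge try_merge(unsigned long long pos, unsigned int sectors,
			    unsigned long long next_pos, int discard_mergeable)
{
	if (pos + sectors == next_pos)
		return BACK_MERGE;	/* contiguous: just grow the range */
	if (discard_mergeable)
		return DISCARD_MERGE;	/* append another bio instead */
	return NO_MERGE;
}

int main(void)
{
	/* Two contiguous discards now back-merge even though a discard
	 * merge would also have been possible. */
	printf("%d\n", try_merge(0, 8, 8, 1));	/* 1 == BACK_MERGE */
	return 0;
}
```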
Signed-off-by: Keith Busch
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Jens Axboe
---
 block/blk-merge.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index d3115d7469df..db08bc906091 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -712,10 +712,10 @@ static void blk_account_io_merge_request(struct request *req)
 static enum elv_merge blk_try_req_merge(struct request *req,
 					struct request *next)
 {
-	if (blk_discard_mergable(req))
-		return ELEVATOR_DISCARD_MERGE;
-	else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next))
+	if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next))
 		return ELEVATOR_BACK_MERGE;
+	else if (blk_discard_mergable(req))
+		return ELEVATOR_DISCARD_MERGE;
 
 	return ELEVATOR_NO_MERGE;
 }
@@ -903,12 +903,12 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 
 enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
 {
-	if (blk_discard_mergable(rq))
-		return ELEVATOR_DISCARD_MERGE;
-	else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
+	if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
 		return ELEVATOR_BACK_MERGE;
 	else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
 		return ELEVATOR_FRONT_MERGE;
+	else if (blk_discard_mergable(rq))
+		return ELEVATOR_DISCARD_MERGE;
 
 	return ELEVATOR_NO_MERGE;
 }

From f76581f9f1d29e32e120b0242974ba266e79de58 Mon Sep 17 00:00:00 2001
From: Guenter Roeck
Date: Fri, 14 Nov 2025 15:54:32 -0800
Subject: [PATCH 111/162] block/blk-throttle: Fix throttle slice time for SSDs

Commit d61fcfa4bb18 ("blk-throttle: choose a small throtl_slice for
SSD") introduced device type specific throttle slices if
BLK_DEV_THROTTLING_LOW was enabled. Commit bf20ab538c81 ("blk-throttle:
remove CONFIG_BLK_DEV_THROTTLING_LOW") removed support for
BLK_DEV_THROTTLING_LOW, but left the device type specific throttle
slices in place. This effectively changed the throttling behavior on
systems with SSDs, which now use a different and non-configurable slice
time compared to non-SSD devices.

The practical impact is that throughput tests with low configured
throttle values (65536 bps) see less throughput than expected on SSDs,
presumably due to rounding errors associated with the small throttle
slice time used for those devices. The same tests pass when setting the
throttle values to 65536 * 4 = 262144 bps.

The original code set the throttle slice time to DFL_THROTL_SLICE_HD if
CONFIG_BLK_DEV_THROTTLING_LOW was disabled. Restore that code to fix
the problem. With that, DFL_THROTL_SLICE_SSD is no longer necessary.
Revert to the original code and re-introduce DFL_THROTL_SLICE to
replace both DFL_THROTL_SLICE_HD and DFL_THROTL_SLICE_SSD. This
effectively reverts commit d61fcfa4bb18 ("blk-throttle: choose a small
throtl_slice for SSD").

While at it, also remove MAX_THROTL_SLICE since it is not used anymore.
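For reference, a limit like the one used in the failing test can be
configured through the cgroup v2 io.max interface. This is a sketch:
the 8:0 major:minor, the cgroup path, and the test device are
illustrative and must be adjusted for the system under test.

```
# create a cgroup, cap reads on the test disk at 65536 bytes/s,
# then run the reader inside that cgroup
mkdir /sys/fs/cgroup/throttle-test
echo "8:0 rbps=65536" > /sys/fs/cgroup/throttle-test/io.max
echo $$ > /sys/fs/cgroup/throttle-test/cgroup.procs
dd if=/dev/sda of=/dev/null bs=4k count=64 iflag=direct
```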
Fixes: bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") Cc: Yu Kuai Cc: Tejun Heo Signed-off-by: Guenter Roeck Signed-off-by: Khazhismel Kumykov Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-throttle.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2c5b64b1a724..c19d052a8f2f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -22,9 +22,7 @@ #define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ -#define DFL_THROTL_SLICE_HD (HZ / 10) -#define DFL_THROTL_SLICE_SSD (HZ / 50) -#define MAX_THROTL_SLICE (HZ) +#define DFL_THROTL_SLICE (HZ / 10) /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; @@ -1341,10 +1339,7 @@ static int blk_throtl_init(struct gendisk *disk) goto out; } - if (blk_queue_nonrot(q)) - td->throtl_slice = DFL_THROTL_SLICE_SSD; - else - td->throtl_slice = DFL_THROTL_SLICE_HD; + td->throtl_slice = DFL_THROTL_SLICE; td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); From 20d0b359c73d15b25abea04066ef4cdbc6a8738d Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 14 Nov 2025 15:54:33 -0800 Subject: [PATCH 112/162] block/blk-throttle: drop unneeded blk_stat_enable_accounting After the removal of CONFIG_BLK_DEV_THROTTLING_LOW, it is no longer necessary to enable block accounting, so remove the call to blk_stat_enable_accounting(). With that, the track_bio_latency variable is no longer used and can be deleted from struct throtl_data. Also, including blk-stat.h is no longer necessary. Fixes: bf20ab538c81 ("blk-throttle: remove CONFIG_BLK_DEV_THROTTLING_LOW") Cc: Yu Kuai Cc: Tejun Heo Signed-off-by: Guenter Roeck Signed-off-by: Khazhismel Kumykov Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-throttle.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c19d052a8f2f..041bcf7b2c7c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -12,7 +12,6 @@ #include #include "blk.h" #include "blk-cgroup-rwstat.h" -#include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ @@ -43,8 +42,6 @@ struct throtl_data /* Work for dispatching throttled bios */ struct work_struct dispatch_work; - - bool track_bio_latency; }; static void throtl_pending_timer_fn(struct timer_list *t); @@ -1340,9 +1337,6 @@ static int blk_throtl_init(struct gendisk *disk) } td->throtl_slice = DFL_THROTL_SLICE; - td->track_bio_latency = !queue_is_mq(q); - if (!td->track_bio_latency) - blk_stat_enable_accounting(q); out: blk_mq_unquiesce_queue(disk->queue); From 6483faa3938bfbd2c9f8ae090f647635f3bd2877 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 14 Nov 2025 15:54:34 -0800 Subject: [PATCH 113/162] block/blk-throttle: Remove throtl_slice from struct throtl_data throtl_slice is now a constant. Remove the variable and use the constant directly where needed. 
Cc: Yu Kuai Cc: Tejun Heo Signed-off-by: Guenter Roeck Signed-off-by: Khazhismel Kumykov Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-throttle.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 041bcf7b2c7c..97188a795848 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -38,8 +38,6 @@ struct throtl_data /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; - unsigned int throtl_slice; - /* Work for dispatching throttled bios */ struct work_struct dispatch_work; }; @@ -446,7 +444,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg) static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { - unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; + unsigned long max_expire = jiffies + 8 * DFL_THROTL_SLICE; /* * Since we are adjusting the throttle limit dynamically, the sleep @@ -514,7 +512,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; - tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -529,7 +527,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, tg->io_disp[rw] = 0; } tg->slice_start[rw] = jiffies; - tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE; throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", @@ -540,7 +538,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); + tg->slice_end[rw] = roundup(jiffy_end, DFL_THROTL_SLICE); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, @@ -671,12 +669,12 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ - throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, jiffies + DFL_THROTL_SLICE); time_elapsed = rounddown(jiffies - tg->slice_start[rw], - tg->td->throtl_slice); + DFL_THROTL_SLICE); /* Don't trim slice until at least 2 slices are used */ - if (time_elapsed < tg->td->throtl_slice * 2) + if (time_elapsed < DFL_THROTL_SLICE * 2) return; /* @@ -687,7 +685,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) * lower rate than expected. Therefore, other than the above rounddown, * one extra slice is preserved for deviation. */ - time_elapsed -= tg->td->throtl_slice; + time_elapsed -= DFL_THROTL_SLICE; bytes_trim = throtl_trim_bps(tg, rw, time_elapsed); io_trim = throtl_trim_iops(tg, rw, time_elapsed); if (!bytes_trim && !io_trim) @@ -697,7 +695,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, + rw == READ ? 
'R' : 'W', time_elapsed / DFL_THROTL_SLICE, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } @@ -768,7 +766,7 @@ static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ - jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, DFL_THROTL_SLICE); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; @@ -794,9 +792,9 @@ static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) - jiffy_elapsed_rnd = tg->td->throtl_slice; + jiffy_elapsed_rnd = DFL_THROTL_SLICE; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, DFL_THROTL_SLICE); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); /* Need to consider the case of bytes_allowed overflow. */ if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) @@ -848,7 +846,7 @@ static void tg_update_slice(struct throtl_grp *tg, bool rw) sq_queued(&tg->service_queue, rw) == 0) throtl_start_new_slice(tg, rw, true); else - throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); + throtl_extend_slice(tg, rw, jiffies + DFL_THROTL_SLICE); } static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio) @@ -1333,12 +1331,8 @@ static int blk_throtl_init(struct gendisk *disk) if (ret) { q->td = NULL; kfree(td); - goto out; } - td->throtl_slice = DFL_THROTL_SLICE; - -out: blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue, memflags); From 866d65745b635927c3d1343ab67e6fd4a99d116d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 15 Nov 2025 21:15:51 +0900 Subject: [PATCH 114/162] zloop: make the write pointer of full zones invalid The write pointer of zones that are in the full condition is always invalid. Reflect that fact by setting the write pointer of full zones to ULLONG_MAX. Fixes: eb0570c7df23 ("block: new zoned loop block device driver") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/zloop.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 92be9f0af00a..a975b1d07f1c 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -177,7 +177,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) zone->wp = zone->start; } else if (file_sectors == zlo->zone_capacity) { zone->cond = BLK_ZONE_COND_FULL; - zone->wp = zone->start + zlo->zone_size; + zone->wp = ULLONG_MAX; } else { zone->cond = BLK_ZONE_COND_CLOSED; zone->wp = zone->start + file_sectors; @@ -326,7 +326,7 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) } zone->cond = BLK_ZONE_COND_FULL; - zone->wp = zone->start + zlo->zone_size; + zone->wp = ULLONG_MAX; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); unlock: @@ -433,8 +433,10 @@ static void zloop_rw(struct zloop_cmd *cmd) * copmpletes. 
 */
 	zone->wp += nr_sectors;
-	if (zone->wp == zone_end)
+	if (zone->wp == zone_end) {
 		zone->cond = BLK_ZONE_COND_FULL;
+		zone->wp = ULLONG_MAX;
+	}
 	}
 
 	rq_for_each_bvec(tmp, rq, rq_iter)

From cf28f6f923cb1dd2765b5c3d7697bb4dcf2096a0 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Sat, 15 Nov 2025 21:15:52 +0900
Subject: [PATCH 115/162] zloop: fail zone append operations that are
 targeting full zones

zloop_rw() will fail any regular write operation that targets a full
sequential zone. The check for this is indirect and achieved by
checking the write pointer alignment of the write operation. But this
check is ineffective for zone append operations since these are always
automatically directed at the zone write pointer. Prevent zone append
operations from being executed in a full zone with an explicit check of
the zone condition.

Fixes: eb0570c7df23 ("block: new zoned loop block device driver")
Cc: stable@vger.kernel.org
Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 drivers/block/zloop.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index a975b1d07f1c..266d233776ad 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -407,6 +407,10 @@ static void zloop_rw(struct zloop_cmd *cmd)
 		mutex_lock(&zone->lock);
 
 		if (is_append) {
+			if (zone->cond == BLK_ZONE_COND_FULL) {
+				ret = -EIO;
+				goto unlock;
+			}
 			sector = zone->wp;
 			cmd->sector = sector;
 		}

From e3a96ca90462f80d9f58a1236514823334deef39 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Sat, 15 Nov 2025 21:15:53 +0900
Subject: [PATCH 116/162] zloop: simplify checks for writes to sequential zones

The function zloop_rw() already checks early that a request is fully
contained within the target zone, so this check does not need to be
done again for regular writes to sequential zones. Furthermore, since
zone append operations are always directed to the zone write pointer
location, we do not need to check their alignment to that value after
setting it. So turn the "if" checking the write pointer alignment into
an "else if".

While at it, improve the comment describing the write pointer
modification and how this value is corrected in case of error.

Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 drivers/block/zloop.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index 266d233776ad..0526277f6cd1 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -406,6 +406,11 @@ static void zloop_rw(struct zloop_cmd *cmd)
 	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
 		mutex_lock(&zone->lock);
 
+		/*
+		 * Zone append operations always go at the current write
+		 * pointer, but regular write operations must already be
+		 * aligned to the write pointer when submitted.
+		 */
 		if (is_append) {
 			if (zone->cond == BLK_ZONE_COND_FULL) {
 				ret = -EIO;
@@ -413,13 +418,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
 			}
 			sector = zone->wp;
 			cmd->sector = sector;
-		}
-
-		/*
-		 * Write operations must be aligned to the write pointer and
-		 * fully contained within the zone capacity.
-		 */
-		if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+		} else if (sector != zone->wp) {
 			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
 			       zone_no, sector, zone->wp);
 			ret = -EIO;
@@ -432,9 +431,9 @@ static void zloop_rw(struct zloop_cmd *cmd)
 			zone->cond = BLK_ZONE_COND_IMP_OPEN;
 
 		/*
-		 * Advance the write pointer of sequential zones. If the write
-		 * fails, the wp position will be corrected when the next I/O
-		 * copmpletes.
+		 * Advance the write pointer. If the write fails, the write
+		 * pointer position will be corrected when the next I/O starts
+		 * execution.
 		 */
 		zone->wp += nr_sectors;
 		if (zone->wp == zone_end) {

From 9236c5fdd5a8bec2445e834e7e1bbefb2eb62f67 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Sat, 15 Nov 2025 21:15:54 +0900
Subject: [PATCH 117/162] zloop: introduce the zone_append configuration
 parameter

A zloop zoned block device declares to the block layer that it supports
zone append operations. That is, a zloop device resembles an NVMe ZNS
device supporting zone append. This native support is fine but it does
not allow exercising the block layer zone write plugging emulation of
zone append, as is done with SCSI or ATA SMR HDDs.

Introduce the zone_append configuration parameter to allow creating a
zloop device without native support for zone append, thus relying on
the block layer zone append emulation. If not specified, zone append
support is enabled by default. Otherwise, a value of 0 disables native
zone append and a value of 1 enables it.

Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 drivers/block/zloop.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index 0526277f6cd1..cf9be42ca3e1 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -32,6 +32,7 @@ enum {
 	ZLOOP_OPT_NR_QUEUES	= (1 << 6),
 	ZLOOP_OPT_QUEUE_DEPTH	= (1 << 7),
 	ZLOOP_OPT_BUFFERED_IO	= (1 << 8),
+	ZLOOP_OPT_ZONE_APPEND	= (1 << 9),
 };
 
 static const match_table_t zloop_opt_tokens = {
@@ -44,6 +45,7 @@ static const match_table_t zloop_opt_tokens = {
 	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u" },
 	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u" },
 	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io" },
+	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u" },
 	{ ZLOOP_OPT_ERR,		NULL }
 };
 
@@ -56,6 +58,7 @@ static const match_table_t zloop_opt_tokens = {
 #define ZLOOP_DEF_NR_QUEUES	1
 #define ZLOOP_DEF_QUEUE_DEPTH	128
 #define ZLOOP_DEF_BUFFERED_IO	false
+#define ZLOOP_DEF_ZONE_APPEND	true
 
 /* Arbitrary limit on the zone size (16GB). */
 #define ZLOOP_MAX_ZONE_SIZE_MB	16384
@@ -71,6 +74,7 @@ struct zloop_options {
 	unsigned int nr_queues;
 	unsigned int queue_depth;
 	bool buffered_io;
+	bool zone_append;
 };
 
 /*
@@ -108,6 +112,7 @@ struct zloop_device {
 	struct workqueue_struct *workqueue;
 	bool buffered_io;
+	bool zone_append;
 
 	const char *base_dir;
 	struct file *data_dir;
@@ -378,6 +383,11 @@ static void zloop_rw(struct zloop_cmd *cmd)
 	cmd->nr_sectors = nr_sectors;
 	cmd->ret = 0;
 
+	if (WARN_ON_ONCE(is_append && !zlo->zone_append)) {
+		ret = -EIO;
+		goto out;
+	}
+
 	/* We should never get an I/O beyond the device capacity. */
 	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
 		ret = -EIO;
@@ -889,7 +899,6 @@ static int zloop_ctl_add(struct zloop_options *opts)
 {
 	struct queue_limits lim = {
 		.max_hw_sectors		= SZ_1M >> SECTOR_SHIFT,
-		.max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
 		.chunk_sectors		= opts->zone_size,
 		.features		= BLK_FEAT_ZONED,
 	};
@@ -941,6 +950,7 @@ static int zloop_ctl_add(struct zloop_options *opts)
 	zlo->nr_zones = nr_zones;
 	zlo->nr_conv_zones = opts->nr_conv_zones;
 	zlo->buffered_io = opts->buffered_io;
+	zlo->zone_append = opts->zone_append;
 
 	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
 				opts->nr_queues * opts->queue_depth, zlo->id);
@@ -981,6 +991,8 @@ static int zloop_ctl_add(struct zloop_options *opts)
 
 	lim.physical_block_size = zlo->block_size;
 	lim.logical_block_size = zlo->block_size;
+	if (zlo->zone_append)
+		lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
 
 	zlo->tag_set.ops = &zloop_mq_ops;
 	zlo->tag_set.nr_hw_queues = opts->nr_queues;
@@ -1021,10 +1033,13 @@ static int zloop_ctl_add(struct zloop_options *opts)
 	zlo->state = Zlo_live;
 	mutex_unlock(&zloop_ctl_mutex);
 
-	pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+	pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n",
 		zlo->id, zlo->nr_zones,
 		((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
 		zlo->block_size);
+	pr_info("zloop%d: using %s zone append\n",
+		zlo->id,
+		zlo->zone_append ? "native" : "emulated");
 
 	return 0;
 
@@ -1111,6 +1126,7 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
 	opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
 	opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
 	opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+	opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
 
 	if (!buf)
 		return 0;
@@ -1220,6 +1236,18 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
 		case ZLOOP_OPT_BUFFERED_IO:
 			opts->buffered_io = true;
 			break;
+		case ZLOOP_OPT_ZONE_APPEND:
+			if (match_uint(args, &token)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (token != 0 && token != 1) {
+				pr_err("Invalid zone_append value\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			opts->zone_append = token;
+			break;
 		case ZLOOP_OPT_ERR:
 		default:
 			pr_warn("unknown parameter or missing value '%s'\n", p);

From fcc6eaa3a03a0e94f6f1d0ac455209b520ef8024 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Sat, 15 Nov 2025 21:15:55 +0900
Subject: [PATCH 118/162] zloop: introduce the ordered_zone_append
 configuration parameter

The zone append operation processing for zloop devices is similar to
any other command, that is, the operation is processed as a command
work item, without any special serialization between the work items
(besides the zone mutex for mutually exclusive code sections). This
processing is fine and gives excellent performance. However, it has a
side effect: zone append operations are very often reordered and
processed in a sequence that is very different from their issuing order
by the user.

This effect is very visible using an XFS file system on top of a zloop
device. A simple file write leads to many file extents as the data
writes using zone append are reordered, so the physical order ends up
different from the file's logical order. E.g., executing:
$ dd if=/dev/zero of=/mnt/test bs=1M count=10 && sync
$ xfs_bmap /mnt/test
/mnt/test:
	0: [0..4095]: 2162688..2166783
	1: [4096..6143]: 2168832..2170879
	2: [6144..8191]: 2166784..2168831
	3: [8192..10239]: 2170880..2172927
	4: [10240..12287]: 2174976..2177023
	5: [12288..14335]: 2172928..2174975
	6: [14336..20479]: 2177024..2183167

For 10 IOs, 7 extents are created. This is fine and actually allows
exercising XFS zone garbage collection very well. However, this also
makes debugging/working on XFS data placement harder, as the underlying
device will most of the time reorder IOs, resulting in many file
extents.

Allow a user to mitigate this with the new ordered_zone_append
configuration parameter. For a zloop device created with this parameter
specified, the sector of a zone append command is set early, when the
command is submitted by the block layer through the zloop_queue_rq()
function, instead of in the zloop_rw() function, which is executed
later in the command work item context. This change ensures that, more
often than not, zone append data ends up being written in the same
order as the commands were submitted by the user. In the case of XFS,
this leads to far fewer file data extents. E.g., for the previous
example, we get a single file data extent for the written file.

$ dd if=/dev/zero of=/mnt/test bs=1M count=10 && sync
$ xfs_bmap /mnt/test
/mnt/test:
	0: [0..20479]: 2162688..2183167

Since we cannot use a mutex in the context of the zloop_queue_rq()
function to atomically set a zone append operation's sector to the
target zone write pointer location and increment the write pointer, a
new per-zone spinlock is introduced to protect zone write pointer
accesses and modifications. To check a zone write pointer location and
set a zone append operation's target sector to that value, the function
zloop_set_zone_append_sector() is introduced and called from
zloop_queue_rq().

Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 drivers/block/zloop.c | 108 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
index cf9be42ca3e1..c4da3116f7a9 100644
--- a/drivers/block/zloop.c
+++ b/drivers/block/zloop.c
@@ -33,6 +33,7 @@ enum {
 	ZLOOP_OPT_QUEUE_DEPTH	= (1 << 7),
 	ZLOOP_OPT_BUFFERED_IO	= (1 << 8),
 	ZLOOP_OPT_ZONE_APPEND	= (1 << 9),
+	ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10),
 };
 
 static const match_table_t zloop_opt_tokens = {
@@ -46,6 +47,7 @@ static const match_table_t zloop_opt_tokens = {
 	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u" },
 	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io" },
 	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u" },
+	{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" },
 	{ ZLOOP_OPT_ERR,		NULL }
 };
 
@@ -59,6 +61,7 @@ static const match_table_t zloop_opt_tokens = {
 #define ZLOOP_DEF_QUEUE_DEPTH	128
 #define ZLOOP_DEF_BUFFERED_IO	false
 #define ZLOOP_DEF_ZONE_APPEND	true
+#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false
 
 /* Arbitrary limit on the zone size (16GB).
*/ #define ZLOOP_MAX_ZONE_SIZE_MB 16384 @@ -75,6 +78,7 @@ struct zloop_options { unsigned int queue_depth; bool buffered_io; bool zone_append; + bool ordered_zone_append; }; /* @@ -96,6 +100,7 @@ struct zloop_zone { unsigned long flags; struct mutex lock; + spinlock_t wp_lock; enum blk_zone_cond cond; sector_t start; sector_t wp; @@ -113,6 +118,7 @@ struct zloop_device { struct workqueue_struct *workqueue; bool buffered_io; bool zone_append; + bool ordered_zone_append; const char *base_dir; struct file *data_dir; @@ -152,6 +158,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) struct zloop_zone *zone = &zlo->zones[zone_no]; struct kstat stat; sector_t file_sectors; + unsigned long flags; int ret; lockdep_assert_held(&zone->lock); @@ -177,6 +184,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) return -EINVAL; } + spin_lock_irqsave(&zone->wp_lock, flags); if (!file_sectors) { zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; @@ -187,6 +195,7 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) zone->cond = BLK_ZONE_COND_CLOSED; zone->wp = zone->start + file_sectors; } + spin_unlock_irqrestore(&zone->wp_lock, flags); return 0; } @@ -230,6 +239,7 @@ unlock: static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; + unsigned long flags; int ret = 0; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) @@ -248,10 +258,12 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) break; case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: + spin_lock_irqsave(&zone->wp_lock, flags); if (zone->wp == zone->start) zone->cond = BLK_ZONE_COND_EMPTY; else zone->cond = BLK_ZONE_COND_CLOSED; + spin_unlock_irqrestore(&zone->wp_lock, flags); break; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_FULL: @@ -269,6 +281,7 @@ unlock: static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; + unsigned long flags; int ret = 0; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) @@ -286,9 +299,11 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no) goto unlock; } + spin_lock_irqsave(&zone->wp_lock, flags); zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + spin_unlock_irqrestore(&zone->wp_lock, flags); unlock: mutex_unlock(&zone->lock); @@ -313,6 +328,7 @@ static int zloop_reset_all_zones(struct zloop_device *zlo) static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; + unsigned long flags; int ret = 0; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) @@ -330,9 +346,11 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) goto unlock; } + spin_lock_irqsave(&zone->wp_lock, flags); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); + spin_unlock_irqrestore(&zone->wp_lock, flags); unlock: mutex_unlock(&zone->lock); @@ -374,6 +392,7 @@ static void zloop_rw(struct zloop_cmd *cmd) struct zloop_zone *zone; struct iov_iter iter; struct bio_vec tmp; + unsigned long flags; sector_t zone_end; int nr_bvec = 0; int ret; @@ -416,19 +435,30 @@ static void zloop_rw(struct zloop_cmd *cmd) if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { mutex_lock(&zone->lock); + spin_lock_irqsave(&zone->wp_lock, flags); + /* * Zone append operations always go 
at the current write * pointer, but regular write operations must already be * aligned to the write pointer when submitted. */ if (is_append) { - if (zone->cond == BLK_ZONE_COND_FULL) { - ret = -EIO; - goto unlock; + /* + * If ordered zone append is in use, we already checked + * and set the target sector in zloop_queue_rq(). + */ + if (!zlo->ordered_zone_append) { + if (zone->cond == BLK_ZONE_COND_FULL) { + spin_unlock_irqrestore(&zone->wp_lock, + flags); + ret = -EIO; + goto unlock; + } + sector = zone->wp; } - sector = zone->wp; cmd->sector = sector; } else if (sector != zone->wp) { + spin_unlock_irqrestore(&zone->wp_lock, flags); pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", zone_no, sector, zone->wp); ret = -EIO; @@ -441,15 +471,19 @@ static void zloop_rw(struct zloop_cmd *cmd) zone->cond = BLK_ZONE_COND_IMP_OPEN; /* - * Advance the write pointer. If the write fails, the write - * pointer position will be corrected when the next I/O starts - * execution. + * Advance the write pointer, unless ordered zone append is in + * use. If the write fails, the write pointer position will be + * corrected when the next I/O starts execution. */ - zone->wp += nr_sectors; - if (zone->wp == zone_end) { - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = ULLONG_MAX; + if (!is_append || !zlo->ordered_zone_append) { + zone->wp += nr_sectors; + if (zone->wp == zone_end) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = ULLONG_MAX; + } } + + spin_unlock_irqrestore(&zone->wp_lock, flags); } rq_for_each_bvec(tmp, rq, rq_iter) @@ -623,6 +657,35 @@ static void zloop_complete_rq(struct request *rq) blk_mq_end_request(rq, sts); } +static bool zloop_set_zone_append_sector(struct request *rq) +{ + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + struct zloop_zone *zone = &zlo->zones[zone_no]; + sector_t zone_end = zone->start + zlo->zone_capacity; + sector_t nr_sectors = blk_rq_sectors(rq); + unsigned long flags; + + spin_lock_irqsave(&zone->wp_lock, flags); + + if (zone->cond == BLK_ZONE_COND_FULL || + zone->wp + nr_sectors > zone_end) { + spin_unlock_irqrestore(&zone->wp_lock, flags); + return false; + } + + rq->__sector = zone->wp; + zone->wp += blk_rq_sectors(rq); + if (zone->wp >= zone_end) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = ULLONG_MAX; + } + + spin_unlock_irqrestore(&zone->wp_lock, flags); + + return true; +} + static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -633,6 +696,16 @@ static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, if (zlo->state == Zlo_deleting) return BLK_STS_IOERR; + /* + * If we need to strongly order zone append operations, set the request + * sector to the zone write pointer location now instead of when the + * command work runs. 
+	 */
+	if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
+		if (!zloop_set_zone_append_sector(rq))
+			return BLK_STS_IOERR;
+	}
+
 	blk_mq_start_request(rq);
 
 	INIT_WORK(&cmd->work, zloop_cmd_workfn);
@@ -667,6 +740,7 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
 	struct zloop_device *zlo = disk->private_data;
 	struct blk_zone blkz = {};
 	unsigned int first, i;
+	unsigned long flags;
 	int ret;
 
 	first = disk_zone_no(disk, sector);
@@ -690,7 +764,9 @@ static int zloop_report_zones(struct gendisk *disk, sector_t sector,
 
 		blkz.start = zone->start;
 		blkz.len = zlo->zone_size;
+		spin_lock_irqsave(&zone->wp_lock, flags);
 		blkz.wp = zone->wp;
+		spin_unlock_irqrestore(&zone->wp_lock, flags);
 		blkz.cond = zone->cond;
 		if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
 			blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
@@ -798,6 +874,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
 	int ret;
 
 	mutex_init(&zone->lock);
+	spin_lock_init(&zone->wp_lock);
 	zone->start = (sector_t)zone_no << zlo->zone_shift;
 	if (!restore)
@@ -951,6 +1028,8 @@ static int zloop_ctl_add(struct zloop_options *opts)
 	zlo->nr_conv_zones = opts->nr_conv_zones;
 	zlo->buffered_io = opts->buffered_io;
 	zlo->zone_append = opts->zone_append;
+	if (zlo->zone_append)
+		zlo->ordered_zone_append = opts->ordered_zone_append;
 
 	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
 				opts->nr_queues * opts->queue_depth, zlo->id);
@@ -1037,8 +1116,9 @@ static int zloop_ctl_add(struct zloop_options *opts)
 		zlo->id, zlo->nr_zones,
 		((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
 		zlo->block_size);
-	pr_info("zloop%d: using %s zone append\n",
+	pr_info("zloop%d: using %s%s zone append\n",
 		zlo->id,
+		zlo->ordered_zone_append ? "ordered " : "",
 		zlo->zone_append ? "native" : "emulated");
 
 	return 0;
@@ -1127,6 +1207,7 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
 	opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
 	opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
 	opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
+	opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;
 
 	if (!buf)
 		return 0;
@@ -1248,6 +1329,9 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
 			}
 			opts->zone_append = token;
 			break;
+		case ZLOOP_OPT_ORDERED_ZONE_APPEND:
+			opts->ordered_zone_append = true;
+			break;
 		case ZLOOP_OPT_ERR:
 		default:
 			pr_warn("unknown parameter or missing value '%s'\n", p);

From ade260ca858627b21be87711b1e12a7bf80c0261 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Sat, 15 Nov 2025 21:15:56 +0900
Subject: [PATCH 119/162] Documentation: admin-guide: blockdev: update zloop
 parameters

In Documentation/admin-guide/blockdev/zoned_loop.rst, add the
description of the zone_append and ordered_zone_append configuration
arguments of the zloop "add" command (device creation).

Signed-off-by: Damien Le Moal
Signed-off-by: Jens Axboe
---
 .../admin-guide/blockdev/zoned_loop.rst       | 61 +++++++++++--------
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst
index 64dcfde7450a..806adde664db 100644
--- a/Documentation/admin-guide/blockdev/zoned_loop.rst
+++ b/Documentation/admin-guide/blockdev/zoned_loop.rst
@@ -68,30 +68,43 @@ The options available for the add command can be listed by reading the
 In more details, the options that can be used with the "add" command are as
 follows.
-================ ===========================================================
-id               Device number (the X in /dev/zloopX).
-                 Default: automatically assigned.
-capacity_mb      Device total capacity in MiB. This is always rounded up to
-                 the nearest higher multiple of the zone size.
-                 Default: 16384 MiB (16 GiB).
-zone_size_mb     Device zone size in MiB. Default: 256 MiB.
-zone_capacity_mb Device zone capacity (must always be equal to or lower than
-                 the zone size. Default: zone size.
-conv_zones       Total number of conventioanl zones starting from sector 0.
-                 Default: 8.
-base_dir         Path to the base directory where to create the directory
-                 containing the zone files of the device.
-                 Default=/var/local/zloop.
-                 The device directory containing the zone files is always
-                 named with the device ID. E.g. the default zone file
-                 directory for /dev/zloop0 is /var/local/zloop/0.
-nr_queues        Number of I/O queues of the zoned block device. This value is
-                 always capped by the number of online CPUs
-                 Default: 1
-queue_depth      Maximum I/O queue depth per I/O queue.
-                 Default: 64
-buffered_io      Do buffered IOs instead of direct IOs (default: false)
-================ ===========================================================
+=================== =========================================================
+id                  Device number (the X in /dev/zloopX).
+                    Default: automatically assigned.
+capacity_mb         Device total capacity in MiB. This is always rounded up
+                    to the nearest higher multiple of the zone size.
+                    Default: 16384 MiB (16 GiB).
+zone_size_mb        Device zone size in MiB. Default: 256 MiB.
+zone_capacity_mb    Device zone capacity (must always be equal to or lower
+                    than the zone size). Default: zone size.
+conv_zones          Total number of conventional zones starting from
+                    sector 0.
+                    Default: 8.
+base_dir            Path to the base directory where to create the directory
+                    containing the zone files of the device.
+                    Default=/var/local/zloop.
+                    The device directory containing the zone files is always
+                    named with the device ID. E.g. the default zone file
+                    directory for /dev/zloop0 is /var/local/zloop/0.
+nr_queues           Number of I/O queues of the zoned block device. This
+                    value is always capped by the number of online CPUs.
+                    Default: 1.
+queue_depth         Maximum I/O queue depth per I/O queue.
+                    Default: 64.
+buffered_io         Do buffered IOs instead of direct IOs (default: false).
+zone_append         Enable or disable a zloop device's native zone append
+                    support.
+                    Default: 1 (enabled).
+                    If native zone append support is disabled, the block
+                    layer will emulate this operation using regular write
+                    operations.
+ordered_zone_append Enable zloop mitigation of zone append reordering.
+                    Default: disabled.
+                    This is useful for testing file system file data mappings
+                    (extents), as when enabled, this can significantly reduce
+                    the number of data extents needed for a file data
+                    mapping.
+=================== =========================================================
 
 3) Deleting a Zoned Device
 --------------------------

From 3179a5f7f86bcc3acd5d6fb2a29f891ef5615852 Mon Sep 17 00:00:00 2001
From: Li Chen
Date: Mon, 17 Nov 2025 13:34:07 +0800
Subject: [PATCH 120/162] block: rate-limit capacity change info log

Loop devices under a heavy stress-ng loop stressor can trigger many
capacity change events in a short time. Each event prints an info
message from set_capacity_and_notify(), flooding the console and
contributing to soft lockups on slow consoles.
Switch the printk in set_capacity_and_notify() to pr_info_ratelimited() so frequent capacity changes do not spam the log while still reporting occasional changes. Cc: stable@vger.kernel.org Signed-off-by: Li Chen Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 9bbc38d12792..bd3a6841e5b5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -90,7 +90,7 @@ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) (disk->flags & GENHD_FL_HIDDEN)) return false; - pr_info("%s: detected capacity change from %lld to %lld\n", + pr_info_ratelimited("%s: detected capacity change from %lld to %lld\n", disk->disk_name, capacity, size); /* From 2c6d792d4b7676e2b340df05425330452fee1f40 Mon Sep 17 00:00:00 2001 From: Sukrut Heroorkar Date: Tue, 18 Nov 2025 14:37:53 +0530 Subject: [PATCH 121/162] drbd: turn bitmap I/O comments into regular block comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit W=1 build warns because the bitmap I/O comments use '/**', which marks them as kernel-doc comments even though these functions do not document an external API. Convert these comments to regular block comments so kernel-doc no longer parses them. Signed-off-by: Sukrut Heroorkar Acked-by: Christoph Böhmwalder Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_bitmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 85ca000a0564..d90fa3e7f4cf 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1210,7 +1210,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned return err; } -/** +/* * drbd_bm_read() - Read the whole bitmap from its on disk location. * @device: DRBD device. */ @@ -1221,7 +1221,7 @@ int drbd_bm_read(struct drbd_device *device, return bm_rw(device, BM_AIO_READ, 0); } -/** +/* * drbd_bm_write() - Write the whole bitmap to its on disk location. * @device: DRBD device. * @@ -1233,7 +1233,7 @@ int drbd_bm_write(struct drbd_device *device, return bm_rw(device, 0, 0); } -/** +/* * drbd_bm_write_all() - Write the whole bitmap to its on disk location. * @device: DRBD device. * @@ -1255,7 +1255,7 @@ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_ho return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx); } -/** +/* * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. * @device: DRBD device. * @@ -1272,7 +1272,7 @@ int drbd_bm_write_copy_pages(struct drbd_device *device, return bm_rw(device, BM_AIO_COPY_PAGES, 0); } -/** +/* * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. * @device: DRBD device. */ From c3e6c11147f6f05c15e9c2d74f5d234a6661013c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:26 +0800 Subject: [PATCH 122/162] loop: add helper lo_cmd_nr_bvec() Add lo_cmd_nr_bvec() and prepare for refactoring lo_rw_aio(). 
Reviewed-by: Christoph Hellwig
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 13ce229d450c..c6c37c9df193 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -337,6 +337,19 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
 	lo_rw_aio_do_completion(cmd);
 }
 
+static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd)
+{
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	struct req_iterator rq_iter;
+	struct bio_vec tmp;
+	int nr_bvec = 0;
+
+	rq_for_each_bvec(tmp, rq, rq_iter)
+		nr_bvec++;
+
+	return nr_bvec;
+}
+
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		     loff_t pos, int rw)
 {
@@ -348,12 +361,9 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	struct file *file = lo->lo_backing_file;
 	struct bio_vec tmp;
 	unsigned int offset;
-	int nr_bvec = 0;
+	int nr_bvec = lo_cmd_nr_bvec(cmd);
 	int ret;
 
-	rq_for_each_bvec(tmp, rq, rq_iter)
-		nr_bvec++;
-
 	if (rq->bio != rq->biotail) {
 
 		bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),

From fd858d1ca9694c88703a8a936d5c7596c86ada74 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 15 Oct 2025 19:07:27 +0800
Subject: [PATCH 123/162] loop: add helper lo_rw_aio_prep()

Add helper lo_rw_aio_prep() to separate the preparation phase (setting
up bio vectors and initializing the iocb structure) from the actual I/O
execution in the loop block driver.

Prepare for using NOWAIT to improve loop performance.

Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 63 ++++++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 23 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c6c37c9df193..c0d8d290cb78 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -350,21 +350,15 @@ static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd)
 	return nr_bvec;
 }
 
-static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
-		     loff_t pos, int rw)
+static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd,
+			  unsigned nr_bvec, loff_t pos)
 {
-	struct iov_iter iter;
-	struct req_iterator rq_iter;
-	struct bio_vec *bvec;
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
-	struct bio *bio = rq->bio;
-	struct file *file = lo->lo_backing_file;
-	struct bio_vec tmp;
-	unsigned int offset;
-	int nr_bvec = lo_cmd_nr_bvec(cmd);
-	int ret;
 
 	if (rq->bio != rq->biotail) {
+		struct req_iterator rq_iter;
+		struct bio_vec *bvec;
+		struct bio_vec tmp;
 
 		bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 				     GFP_NOIO);
@@ -382,8 +376,42 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 			*bvec = tmp;
 			bvec++;
 		}
-		bvec = cmd->bvec;
+	} else {
+		cmd->bvec = NULL;
+	}
+
+	cmd->iocb.ki_pos = pos;
+	cmd->iocb.ki_filp = lo->lo_backing_file;
+	cmd->iocb.ki_ioprio = req_get_ioprio(rq);
+	if (cmd->use_aio) {
+		cmd->iocb.ki_complete = lo_rw_aio_complete;
+		cmd->iocb.ki_flags = IOCB_DIRECT;
+	} else {
+		cmd->iocb.ki_complete = NULL;
+		cmd->iocb.ki_flags = 0;
+	}
+	return 0;
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+		     loff_t pos, int rw)
+{
+	struct iov_iter iter;
+	struct bio_vec *bvec;
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	struct bio *bio = rq->bio;
+	struct file *file = lo->lo_backing_file;
+	unsigned int offset;
+	int nr_bvec = lo_cmd_nr_bvec(cmd);
+	int ret;
+
+	ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos);
+	if (unlikely(ret))
+		return
ret; + + if (cmd->bvec) { offset = 0; + bvec = cmd->bvec; } else { /* * Same here, this bio may be started from the middle of the @@ -398,17 +426,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = offset; - cmd->iocb.ki_pos = pos; - cmd->iocb.ki_filp = file; - cmd->iocb.ki_ioprio = req_get_ioprio(rq); - if (cmd->use_aio) { - cmd->iocb.ki_complete = lo_rw_aio_complete; - cmd->iocb.ki_flags = IOCB_DIRECT; - } else { - cmd->iocb.ki_complete = NULL; - cmd->iocb.ki_flags = 0; - } - if (rw == ITER_SOURCE) { kiocb_start_write(&cmd->iocb); ret = file->f_op->write_iter(&cmd->iocb, &iter); From c66e9708f92760147a1ea7f66c7b60ec801f85e3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:28 +0800 Subject: [PATCH 124/162] loop: add lo_submit_rw_aio() Refactor lo_rw_aio() by extracting the I/O submission logic into a new helper function lo_submit_rw_aio(). This further improves code organization by separating the I/O preparation, submission, and completion handling into distinct phases. Prepare for using NOWAIT to improve loop performance. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c0d8d290cb78..a494a93ed2b1 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -393,38 +393,32 @@ static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd, return 0; } -static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - loff_t pos, int rw) +static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + int nr_bvec, int rw) { - struct iov_iter iter; - struct bio_vec *bvec; struct request *rq = blk_mq_rq_from_pdu(cmd); - struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; - unsigned int offset; - int nr_bvec = lo_cmd_nr_bvec(cmd); + struct iov_iter iter; int ret; - ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); - if (unlikely(ret)) - return ret; - if (cmd->bvec) { - offset = 0; - bvec = cmd->bvec; + iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq)); + iter.iov_offset = 0; } else { + struct bio *bio = rq->bio; + struct bio_vec *bvec = __bvec_iter_bvec(bio->bi_io_vec, + bio->bi_iter); + /* * Same here, this bio may be started from the middle of the * 'bvec' because of bio splitting, so offset from the bvec * must be passed to iov iterator */ - offset = bio->bi_iter.bi_bvec_done; - bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); + iter.iov_offset = bio->bi_iter.bi_bvec_done; } atomic_set(&cmd->ref, 2); - iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); - iter.iov_offset = offset; if (rw == ITER_SOURCE) { kiocb_start_write(&cmd->iocb); @@ -433,7 +427,20 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); + return ret; +} +static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + loff_t pos, int rw) +{ + int nr_bvec = lo_cmd_nr_bvec(cmd); + int ret; + + ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); + if (unlikely(ret)) + return ret; + + ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); return -EIOCBQUEUED; From f4788ae9d7bc01735cb6ada333b038c2e3fff260 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 
2025 19:07:29 +0800
Subject: [PATCH 125/162] loop: move command blkcg/memcg initialization into
 loop_queue_work

Move loop command blkcg/memcg initialization into loop_queue_work(),
and prepare for handling loop I/O commands via IOCB_NOWAIT.

Reviewed-by: Christoph Hellwig
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a494a93ed2b1..c45635e48164 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -831,11 +831,28 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
 
 static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
 {
+	struct request __maybe_unused *rq = blk_mq_rq_from_pdu(cmd);
 	struct rb_node **node, *parent = NULL;
 	struct loop_worker *cur_worker, *worker = NULL;
 	struct work_struct *work;
 	struct list_head *cmd_list;
 
+	/* always use the first bio's css */
+	cmd->blkcg_css = NULL;
+	cmd->memcg_css = NULL;
+#ifdef CONFIG_BLK_CGROUP
+	if (rq->bio) {
+		cmd->blkcg_css = bio_blkcg_css(rq->bio);
+#ifdef CONFIG_MEMCG
+		if (cmd->blkcg_css) {
+			cmd->memcg_css =
+				cgroup_get_e_css(cmd->blkcg_css->cgroup,
+						&memory_cgrp_subsys);
+		}
+#endif
+	}
+#endif
+
 	spin_lock_irq(&lo->lo_work_lock);
 
 	if (queue_on_root_worker(cmd->blkcg_css))
@@ -1907,21 +1924,6 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		break;
 	}
 
-	/* always use the first bio's css */
-	cmd->blkcg_css = NULL;
-	cmd->memcg_css = NULL;
-#ifdef CONFIG_BLK_CGROUP
-	if (rq->bio) {
-		cmd->blkcg_css = bio_blkcg_css(rq->bio);
-#ifdef CONFIG_MEMCG
-		if (cmd->blkcg_css) {
-			cmd->memcg_css =
-				cgroup_get_e_css(cmd->blkcg_css->cgroup,
-						&memory_cgrp_subsys);
-		}
-#endif
-	}
-#endif
 	loop_queue_work(lo, cmd);
 
 	return BLK_STS_OK;

From 0ba93a906dda7ede9e7669adefe005ee18f3ff42 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 15 Oct 2025 19:07:30 +0800
Subject: [PATCH 126/162] loop: try to handle loop aio command via NOWAIT IO
 first

Try to handle loop aio commands via NOWAIT IO first, so we can avoid
queueing the aio command to the workqueue. This is usually a big win
when the FS block mapping is stable: Mikulas verified [1] that this
approach improves IO perf by close to 5X in a 12-job sequential
read/write test, in which the FS block mapping is stable.

Fall back to the workqueue in case of -EAGAIN. This may add a small
cost for the first retry, but when running the following write test
over loop/sparse_file, the actual effect on randwrite is obvious:

```
truncate -s 4G 1.img    #1.img is created on XFS/virtio-scsi
losetup -f 1.img --direct-io=on
fio --direct=1 --bs=4k --runtime=40 --time_based --numjobs=1 --ioengine=libaio \
	--iodepth=16 --group_reporting=1 --filename=/dev/loop0 -name=job --rw=$RW
```

- RW=randwrite: obvious IOPS drop observed
- RW=write: a little drop (5% - 10%)

This perf drop on randwrite over a sparse file will be addressed in the
following patch.

BLK_MQ_F_BLOCKING has to be set for calling into .read_iter() or
.write_iter(), which might sleep even though IOCB_NOWAIT is set; the
only effect is that the RCU read lock is replaced with an SRCU read
lock.
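The same try-nowait-then-block pattern can be observed from userspace
with preadv2() and RWF_NOWAIT, the syscall-level counterpart of
IOCB_NOWAIT. A hedged standalone sketch (the file path is illustrative):

```
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);
	ssize_t ret;

	if (fd < 0)
		return 1;

	/* fast path: fail with EAGAIN instead of blocking */
	ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (ret < 0 && errno == EAGAIN)
		ret = preadv2(fd, &iov, 1, 0, 0);	/* blocking fallback */

	printf("read %zd bytes\n", ret);
	close(fd);
	return 0;
}
```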
Link: https://lore.kernel.org/linux-block/a8e5c76a-231f-07d1-a394-847de930f638@redhat.com/ [1] Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 68 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c45635e48164..94478c02fea6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -90,6 +90,8 @@ struct loop_cmd { #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH 128 +static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd); + static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); @@ -321,6 +323,15 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) if (!atomic_dec_and_test(&cmd->ref)) return; + + /* -EAGAIN could be returned from bdev's ->ki_complete */ + if (cmd->ret == -EAGAIN) { + struct loop_device *lo = rq->q->queuedata; + + loop_queue_work(lo, cmd); + return; + } + kfree(cmd->bvec); cmd->bvec = NULL; if (req_op(rq) == REQ_OP_WRITE) @@ -430,22 +441,51 @@ static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, return ret; } +static bool lo_backfile_support_nowait(const struct loop_device *lo) +{ + return lo->lo_backing_file->f_mode & FMODE_NOWAIT; +} + static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos, int rw) { int nr_bvec = lo_cmd_nr_bvec(cmd); int ret; - ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); - if (unlikely(ret)) - return ret; + /* prepared already if we have tried nowait */ + if (!cmd->use_aio || !lo_backfile_support_nowait(lo)) { + ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); + if (unlikely(ret)) + goto fail; + } + cmd->iocb.ki_flags &= ~IOCB_NOWAIT; ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); +fail: if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); return -EIOCBQUEUED; } +static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, + int rw) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; + int nr_bvec = lo_cmd_nr_bvec(cmd); + int ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); + + if (unlikely(ret)) + goto fail; + + cmd->iocb.ki_flags |= IOCB_NOWAIT; + ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); +fail: + if (ret != -EIOCBQUEUED && ret != -EAGAIN) + lo_rw_aio_complete(&cmd->iocb, ret); + return ret; +} + static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); @@ -1907,6 +1947,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct loop_device *lo = rq->q->queuedata; + int rw = 0; blk_mq_start_request(rq); @@ -1919,9 +1960,25 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, case REQ_OP_WRITE_ZEROES: cmd->use_aio = false; break; - default: + case REQ_OP_READ: + rw = ITER_DEST; cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; + case REQ_OP_WRITE: + rw = ITER_SOURCE; + cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; + break; + default: + return BLK_STS_IOERR; + } + + /* try NOWAIT if the backing file supports the mode */ + if (cmd->use_aio && lo_backfile_support_nowait(lo)) { + int res = lo_rw_aio_nowait(lo, cmd, rw); + + if (res != -EAGAIN && res != -EOPNOTSUPP) + return BLK_STS_OK; + /* fallback to workqueue for handling aio */ } loop_queue_work(lo, cmd); @@ -2073,7 +2130,8 @@ static int loop_add(int i) 
lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT | + BLK_MQ_F_BLOCKING; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); From 837ed303964673cf0c7e6a4624cd68d8cf254827 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Oct 2025 19:07:31 +0800 Subject: [PATCH 127/162] loop: add hint for handling aio via IOCB_NOWAIT Add a hint for using IOCB_NOWAIT to handle loop aio commands, to avoid causing a write (especially randwrite) perf regression on a sparse backing file. Try IOCB_NOWAIT in the following situations:

- the backing file is a block device, OR
- it is a READ aio command, OR
- there aren't any queued blocking async WRITEs, because NOWAIT won't contend with a blocking WRITE, which often implies an exclusive lock

With this simple policy, the randwrite/write perf regression on a sparse backing file is fixed. Link: https://lore.kernel.org/dm-devel/7d6ae2c9-df8e-50d0-7ad6-b787cb3cfab4@redhat.com/ Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 61 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 94478c02fea6..9b842d767381 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -68,6 +68,7 @@ struct loop_device { struct rb_root worker_tree; struct timer_list timer; bool sysfs_inited; + unsigned lo_nr_blocking_writes; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; @@ -467,6 +468,33 @@ fail: return -EIOCBQUEUED; } +static inline bool lo_aio_try_nowait(struct loop_device *lo, + struct loop_cmd *cmd) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct request *rq = blk_mq_rq_from_pdu(cmd); + + /* NOWAIT works fine for backing block device */ + if (S_ISBLK(inode->i_mode)) + return true; + + /* + * NOWAIT is supposed to be fine for READ without contending with + * blocking WRITE + */ + if (req_op(rq) == REQ_OP_READ) + return true; + + /* + * If there is any queued non-NOWAIT async WRITE , don't try new + * NOWAIT WRITE for avoiding contention + * + * Here we focus on handling stable FS block mapping via NOWAIT + */ + return READ_ONCE(lo->lo_nr_blocking_writes) == 0; +} + static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, int rw) { @@ -478,6 +506,9 @@ static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, if (unlikely(ret)) goto fail; + if (!lo_aio_try_nowait(lo, cmd)) + return -EAGAIN; + cmd->iocb.ki_flags |= IOCB_NOWAIT; ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); fail: @@ -780,12 +811,19 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) return sysfs_emit(buf, "%s\n", dio ?
"1" : "0"); } +static ssize_t loop_attr_nr_blocking_writes_show(struct loop_device *lo, + char *buf) +{ + return sysfs_emit(buf, "%u\n", lo->lo_nr_blocking_writes); +} + LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); LOOP_ATTR_RO(dio); +LOOP_ATTR_RO(nr_blocking_writes); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, @@ -794,6 +832,7 @@ static struct attribute *loop_attrs[] = { &loop_attr_autoclear.attr, &loop_attr_partscan.attr, &loop_attr_dio.attr, + &loop_attr_nr_blocking_writes.attr, NULL, }; @@ -869,6 +908,24 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) } #endif +static inline void loop_inc_blocking_writes(struct loop_device *lo, + struct loop_cmd *cmd) +{ + lockdep_assert_held(&lo->lo_work_lock); + + if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE) + lo->lo_nr_blocking_writes += 1; +} + +static inline void loop_dec_blocking_writes(struct loop_device *lo, + struct loop_cmd *cmd) +{ + lockdep_assert_held(&lo->lo_work_lock); + + if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE) + lo->lo_nr_blocking_writes -= 1; +} + static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { struct request __maybe_unused *rq = blk_mq_rq_from_pdu(cmd); @@ -951,6 +1008,8 @@ queue_work: work = &lo->rootcg_work; cmd_list = &lo->rootcg_cmd_list; } + if (cmd->use_aio) + loop_inc_blocking_writes(lo, cmd); list_add_tail(&cmd->list_entry, cmd_list); queue_work(lo->workqueue, work); spin_unlock_irq(&lo->lo_work_lock); @@ -2052,6 +2111,8 @@ static void loop_process_work(struct loop_worker *worker, cond_resched(); spin_lock_irq(&lo->lo_work_lock); + if (cmd->use_aio) + loop_dec_blocking_writes(lo, cmd); } /* From 152c331bcd805eddd520979c51cea46582e260fc Mon Sep 17 00:00:00 2001 From: Xue He Date: Tue, 18 Nov 2025 07:32:30 +0000 Subject: [PATCH 128/162] block: plug attempts to batch allocate tags multiple times This patch aims to enable batch allocation of sufficient tags after batch IO submission with plug mechanism, thereby avoiding the need for frequent individual requests when the initial allocation is insufficient. 
-----------------------------------------------------------
HW: 16 CPUs/16 poll queues
Disk: Samsung PM9A3 Gen4 3.84T
CMD:
[global]
ioengine=io_uring
group_reporting=1
time_based=1
runtime=1m
refill_buffers=1
norandommap=1
randrepeat=0
fixedbufs=1
registerfiles=1
rw=randread
iodepth=128
iodepth_batch_submit=32
iodepth_batch_complete_min=32
iodepth_batch_complete_max=128
iodepth_low=32
bs=4k
numjobs=1
direct=1
hipri=1
[job1]
filename=/dev/nvme0n1
name=batch_test
------------------------------------------------------------
Perf:
base code: __blk_mq_alloc_requests() 1.47%
patch: __blk_mq_alloc_requests() 0.75%
------------------------------------------------------------
Signed-off-by: hexue Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0d38daaa4705..f2650c97a75e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -468,21 +468,26 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) unsigned long tag_mask; int i, nr = 0; - tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); - if (unlikely(!tag_mask)) - return NULL; + do { + tag_mask = blk_mq_get_tags(data, data->nr_tags - nr, &tag_offset); + if (unlikely(!tag_mask)) { + if (nr == 0) + return NULL; + break; + } + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag); + rq_list_add_head(data->cached_rqs, rq); + nr++; + } + } while (data->nr_tags > nr); - tags = blk_mq_tags_from_data(data); - for (i = 0; tag_mask; i++) { - if (!(tag_mask & (1UL << i))) - continue; - tag = tag_offset + i; - prefetch(tags->static_rqs[tag]); - tag_mask &= ~(1UL << i); - rq = blk_mq_rq_ctx_init(data, tags, tag); - rq_list_add_head(data->cached_rqs, rq); - nr++; - } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ From 42adb2d4ef24d2834cbd3bb96a6660826ae763da Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 14 Nov 2025 13:04:06 -0800 Subject: [PATCH 129/162] fs: Add the __data_racy annotation to backing_dev_info.ra_pages Some but not all .ra_pages changes happen while block layer I/O is paused with blk_mq_freeze_queue(). Filesystems may read .ra_pages even while block layer I/O is paused, e.g. from inside their .fadvise callback. Annotating all .ra_pages reads with READ_ONCE() would be cumbersome. Hence, add the __data_racy annotation to the .ra_pages member variable.
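For context, READ_ONCE()/WRITE_ONCE() force a single untorn access but must be repeated at every read site, which is exactly the burden __data_racy avoids by annotating the field once. The sketch below approximates the two macros with volatile casts in plain C; it illustrates the concept only and is not the kernel's exact definition (which also handles access sizes and KCSAN instrumentation).

```
#include <stdio.h>

/* Simplified approximations of the kernel macros: a volatile access
 * keeps the compiler from tearing, fusing, or re-reading the value. */
#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile const __typeof__(x) *)&(x))

static unsigned long ra_pages;	/* racily read tunable, like bdi->ra_pages */

static unsigned long readahead_limit(void)
{
	/* Without a once-per-field annotation, every read site like this
	 * one would need READ_ONCE() to silence data-race reports. */
	return READ_ONCE(ra_pages);
}

int main(void)
{
	WRITE_ONCE(ra_pages, 32);	/* writer side */
	printf("limit=%lu\n", readahead_limit());
	return 0;
}
```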
Cc: Alexander Viro Cc: Christian Brauner Cc: Nilay Shroff Signed-off-by: Bart Van Assche Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..30f4bd9ff7c8 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -168,7 +168,9 @@ struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ + /* max readahead in PAGE_SIZE units */ + unsigned long __data_racy ra_pages; + unsigned long io_pages; /* max allowed IO size */ struct kref refcnt; /* Reference counter for the structure */ From 935a20d1bebf6236076785fac3ff81e3931834e9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 14 Nov 2025 13:04:07 -0800 Subject: [PATCH 130/162] block: Remove queue freezing from several sysfs store callbacks Freezing the request queue from inside sysfs store callbacks may cause a deadlock in combination with the dm-multipath driver and the queue_if_no_path option. Additionally, freezing the request queue slows down system boot on systems where sysfs attributes are set synchronously. Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue() calls from the store callbacks that do not strictly need these calls. Add the __data_racy annotation to request_queue.rq_timeout to suppress KCSAN data race reports about the rq_timeout reads. This patch may cause a small delay in applying the new settings. For all the attributes affected by this patch, I/O will complete correctly whether the old or the new value of the attribute is used.
This patch affects the following sysfs attributes: * io_poll_delay * io_timeout * nomerges * read_ahead_kb * rq_affinity Here is an example of a deadlock triggered by running test srp/002 if this patch is not applied: task:multipathd Call Trace: __schedule+0x8c1/0x1bf0 schedule+0xdd/0x270 schedule_preempt_disabled+0x1c/0x30 __mutex_lock+0xb89/0x1650 mutex_lock_nested+0x1f/0x30 dm_table_set_restrictions+0x823/0xdf0 __bind+0x166/0x590 dm_swap_table+0x2a7/0x490 do_resume+0x1b1/0x610 dev_suspend+0x55/0x1a0 ctl_ioctl+0x3a5/0x7e0 dm_ctl_ioctl+0x12/0x20 __x64_sys_ioctl+0x127/0x1a0 x64_sys_call+0xe2b/0x17d0 do_syscall_64+0x96/0x3a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 task:(udev-worker) Call Trace: __schedule+0x8c1/0x1bf0 schedule+0xdd/0x270 blk_mq_freeze_queue_wait+0xf2/0x140 blk_mq_freeze_queue_nomemsave+0x23/0x30 queue_ra_store+0x14e/0x290 queue_attr_store+0x23e/0x2c0 sysfs_kf_write+0xde/0x140 kernfs_fop_write_iter+0x3b2/0x630 vfs_write+0x4fd/0x1390 ksys_write+0xfd/0x230 __x64_sys_write+0x76/0xc0 x64_sys_call+0x276/0x17d0 do_syscall_64+0x96/0x3a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Cc: Christoph Hellwig Cc: Ming Lei Cc: Nilay Shroff Cc: Martin Wilck Cc: Benjamin Marzinski Cc: stable@vger.kernel.org Fixes: af2814149883 ("block: freeze the queue in queue_attr_store") Signed-off-by: Bart Van Assche Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 26 ++++++++------------------ include/linux/blkdev.h | 2 +- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 76c47fe9b8d6..8684c57498cc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -143,21 +143,22 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count) { unsigned long ra_kb; ssize_t ret; - unsigned int memflags; struct request_queue *q = disk->queue; ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; /* - * ->ra_pages is protected by ->limits_lock because it is usually - * calculated from the queue limits by queue_limits_commit_update. + * The ->ra_pages change below is protected by ->limits_lock because it + * is usually calculated from the queue limits by + * queue_limits_commit_update(). + * + * bdi->ra_pages reads are not serialized against bdi->ra_pages writes. + * Use WRITE_ONCE() to write bdi->ra_pages once. 
*/ mutex_lock(&q->limits_lock); - memflags = blk_mq_freeze_queue(q); - disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + WRITE_ONCE(disk->bdi->ra_pages, ra_kb >> (PAGE_SHIFT - 10)); mutex_unlock(&q->limits_lock); - blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -375,21 +376,18 @@ static ssize_t queue_nomerges_store(struct gendisk *disk, const char *page, size_t count) { unsigned long nm; - unsigned int memflags; struct request_queue *q = disk->queue; ssize_t ret = queue_var_store(&nm, page, count); if (ret < 0) return ret; - memflags = blk_mq_freeze_queue(q); blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -409,7 +407,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) #ifdef CONFIG_SMP struct request_queue *q = disk->queue; unsigned long val; - unsigned int memflags; ret = queue_var_store(&val, page, count); if (ret < 0) @@ -421,7 +418,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) * are accessed individually using atomic test_bit operation. So we * don't grab any lock while updating these flags. */ - memflags = blk_mq_freeze_queue(q); if (val == 2) { blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); @@ -432,7 +428,6 @@ queue_rq_affinity_store(struct gendisk *disk, const char *page, size_t count) blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } - blk_mq_unfreeze_queue(q, memflags); #endif return ret; } @@ -446,11 +441,9 @@ static ssize_t queue_poll_delay_store(struct gendisk *disk, const char *page, static ssize_t queue_poll_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int memflags; ssize_t ret = count; struct request_queue *q = disk->queue; - memflags = blk_mq_freeze_queue(q); if (!(q->limits.features & BLK_FEAT_POLL)) { ret = -EINVAL; goto out; @@ -459,7 +452,6 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page, pr_info_ratelimited("writes to the poll attribute are ignored.\n"); pr_info_ratelimited("please use driver specific parameters instead.\n"); out: - blk_mq_unfreeze_queue(q, memflags); return ret; } @@ -472,7 +464,7 @@ static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page) static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, size_t count) { - unsigned int val, memflags; + unsigned int val; int err; struct request_queue *q = disk->queue; @@ -480,9 +472,7 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page, if (err || val == 0) return -EINVAL; - memflags = blk_mq_freeze_queue(q); blk_queue_rq_timeout(q, msecs_to_jiffies(val)); - blk_mq_unfreeze_queue(q, memflags); return count; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2fff8a80dbd2..cb4ba09959ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -495,7 +495,7 @@ struct request_queue { */ unsigned long queue_flags; - unsigned int rq_timeout; + unsigned int __data_racy rq_timeout; unsigned int queue_depth; From caebce24f6a7f8315b1b87505b74066efb592d92 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Nov 2025 08:37:26 -0700 Subject: [PATCH 131/162] Revert "block: consider discard merge last" This reverts commit 2516c246d01c23a5f5310e9ac78d9f8aad9b1d0e. 
Suspected issues with discard merging post this patch, hence revert it for now. Link: https://lore.kernel.org/linux-block/26acdfdf-de13-430b-8c73-f890c7689a84@kernel.dk/ Signed-off-by: Jens Axboe --- block/blk-merge.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index db08bc906091..d3115d7469df 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -712,10 +712,10 @@ static void blk_account_io_merge_request(struct request *req) static enum elv_merge blk_try_req_merge(struct request *req, struct request *next) { - if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next)) - return ELEVATOR_BACK_MERGE; - else if (blk_discard_mergable(req)) + if (blk_discard_mergable(req)) return ELEVATOR_DISCARD_MERGE; + else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next)) + return ELEVATOR_BACK_MERGE; return ELEVATOR_NO_MERGE; } @@ -903,12 +903,12 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) { - if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) + if (blk_discard_mergable(rq)) + return ELEVATOR_DISCARD_MERGE; + else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) return ELEVATOR_BACK_MERGE; else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) return ELEVATOR_FRONT_MERGE; - else if (blk_discard_mergable(rq)) - return ELEVATOR_DISCARD_MERGE; return ELEVATOR_NO_MERGE; } From 8e1d91c2582d2b60d62616649546bb132fff566b Mon Sep 17 00:00:00 2001 From: Chengkaitao Date: Tue, 18 Nov 2025 09:26:44 +0800 Subject: [PATCH 132/162] block: remove the declaration of elevator_init_mq function In commit 1e44bedbc921 ("block: unifying elevator change"), the elevator_init_mq function was deleted, but its declaration in elevator.h was overlooked. This patch fixes it. Signed-off-by: Chengkaitao Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/elevator.h | 1 - 1 file changed, 1 deletion(-) diff --git a/block/elevator.h b/block/elevator.h index 3ee1d494f48a..a9d092c5a9e8 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -171,7 +171,6 @@ extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); -void elevator_init_mq(struct request_queue *q); /* * io scheduler registration From 00ed0350944dc33ac76cca2ddd2966e34f32a80e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 19 Nov 2025 12:02:19 +0900 Subject: [PATCH 133/162] MAINTAINERS: add missing block layer user API header files Add the missing user API header files related to the block layer to the list of matching file patterns for Jens's block layer entry. 
Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index ce7f67e09910..92d545e52ba0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4400,6 +4400,8 @@ F: block/ F: drivers/block/ F: include/linux/bio.h F: include/linux/blk* +F: include/uapi/linux/blk* +F: include/uapi/linux/ioprio.h F: kernel/trace/blktrace.c F: lib/sbitmap.c From ebcc028b4a3db7f4a76f97b05d746aa6ff1a56ab Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 19 Nov 2025 12:02:20 +0900 Subject: [PATCH 134/162] MAINTAINERS: add a maintainer for zoned block device support Add myself as the maintainer of the block layer support for the zoned block device code and user API. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 92d545e52ba0..ae9cf811bd2c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -28292,6 +28292,13 @@ L: linux-kernel@vger.kernel.org S: Maintained F: arch/x86/kernel/cpu/zhaoxin.c +ZONED BLOCK DEVICE (BLOCK LAYER) +M: Damien Le Moal +L: linux-block@vger.kernel.org +S: Maintained +F: block/blk-zoned.c +F: include/uapi/linux/blkzoned.h + ZONED LOOP DEVICE M: Damien Le Moal R: Christoph Hellwig From a9637ab93c6cfdf7a80a299b7de691dea6a7d7ba Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 19 Nov 2025 13:34:23 +0900 Subject: [PATCH 135/162] zloop: fix zone append check in zloop_rw() While commit cf28f6f923cb ("zloop: fail zone append operations that are targeting full zones") added a check in zloop_rw() that a zone append is not issued to a full zone, commit e3a96ca90462 ("zloop: simplify checks for writes to sequential zones") inadvertently removed the check to verify that there is enough unwritten space in a zone for an incoming zone append operation. Re-add this check in zloop_rw() to make sure we do not write beyond the end of a zone. Of note is that this same check is already present in the function zloop_set_zone_append_sector() when ordered zone append is in use. Reported-by: Hans Holmberg Fixes: e3a96ca90462 ("zloop: simplify checks for writes to sequential zones") Signed-off-by: Damien Le Moal Reviewed-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/zloop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index c4da3116f7a9..1273bbca7843 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -448,7 +448,8 @@ static void zloop_rw(struct zloop_cmd *cmd) * and set the target sector in zloop_queue_rq(). */ if (!zlo->ordered_zone_append) { - if (zone->cond == BLK_ZONE_COND_FULL) { + if (zone->cond == BLK_ZONE_COND_FULL || + zone->wp + nr_sectors > zone_end) { spin_unlock_irqrestore(&zone->wp_lock, flags); ret = -EIO; From b11e483a1cc32e7b557ff680e9bfb4ff11dea9c1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 19 Nov 2025 15:22:33 -0800 Subject: [PATCH 136/162] loop: clear nowait flag in workqueue context The loop driver advertises REQ_NOWAIT support through BLK_FEAT_NOWAIT (enabled by default for all blk-mq devices), and honors the nowait behavior throughout loop_queue_rq(). However, actual I/O to the backing file is performed in a workqueue, where blocking is allowed.
To avoid imposing unnecessary non-blocking constraints in this blocking context, clear the REQ_NOWAIT flag before processing the request in the workqueue context. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9b842d767381..dd049764bb0c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2060,6 +2060,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd) goto failed; } + /* We can block in this context, so ignore REQ_NOWAIT. */ + if (rq->cmd_flags & REQ_NOWAIT) + rq->cmd_flags &= ~REQ_NOWAIT; + if (cmd_blkcg_css) kthread_associate_blkcg(cmd_blkcg_css); if (cmd_memcg_css) From e8f0abdd49baacee3886d5827f113514fcd9fd05 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 19 Nov 2025 15:22:34 -0800 Subject: [PATCH 137/162] zloop: clear nowait flag in workqueue context The zloop driver advertises REQ_NOWAIT support through BLK_FEAT_NOWAIT (enabled by default for all blk-mq devices), and honors the nowait behavior throughout zloop_queue_rq(). However, actual I/O to the backing file is performed in a workqueue, where blocking is allowed. To avoid imposing unnecessary non-blocking constraints in this blocking context, clear the REQ_NOWAIT flag before processing the request in the workqueue context. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/zloop.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 1273bbca7843..3f50321aa4a7 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -548,6 +548,10 @@ static void zloop_handle_cmd(struct zloop_cmd *cmd) struct request *rq = blk_mq_rq_from_pdu(cmd); struct zloop_device *zlo = rq->q->queuedata; + /* We can block in this context, so ignore REQ_NOWAIT. */ + if (rq->cmd_flags & REQ_NOWAIT) + rq->cmd_flags &= ~REQ_NOWAIT; + switch (req_op(rq)) { case REQ_OP_READ: case REQ_OP_WRITE: From 9420e720ad192c53c8d2803c5a2313b2d586adbd Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 19 Nov 2025 22:41:08 +0000 Subject: [PATCH 138/162] block: use min() instead of min_t() min_t(unsigned int, a, b) casts an 'unsigned long' to 'unsigned int'. Use min(a, b) instead as it promotes any 'unsigned int' to 'unsigned long' and so cannot discard significant bits. In this case the 'unsigned long' value is small enough that the result is ok. (Similarly for max_t() and clamp_t().) Detected by an extra check added to min_t(). Signed-off-by: David Laight Signed-off-by: Jens Axboe --- block/blk-iocost.c | 6 ++---- block/partitions/efi.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5bfd70311359..a0416927d33d 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2334,10 +2334,8 @@ static void ioc_timer_fn(struct timer_list *timer) else usage_dur = max_t(u64, now.now - ioc->period_at, 1); - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); + usage = clamp(DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, usage_dur), + 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. 
diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7acba66eed48..638261e9f2fb 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -215,8 +215,7 @@ check_hybrid: sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", - sz, min_t(uint32_t, - total_sectors - 1, 0xFFFFFFFF)); + sz, (uint32_t)min(total_sectors - 1, 0xFFFFFFFF)); } done: return ret; From 96f03c8cb29f2ec5ffe8e24fbec2c64f3c3bf4e5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Nov 2025 20:53:19 -0700 Subject: [PATCH 139/162] Revert "Merge branch 'loop-aio-nowait' into for-6.19/block" This reverts commit f43fdeb9a368a5ff56b088b46edc245bd4b52cde, reversing changes made to 2c6d792d4b7676e2b340df05425330452fee1f40. There are concerns that doing inline submits can cause excessive stack usage, particularly when going back into the filesystem. Revert the loop dio nowait change for now. Link: https://lore.kernel.org/linux-block/aSP3SG_KaROJTBHx@infradead.org/ Signed-off-by: Jens Axboe --- drivers/block/loop.c | 235 ++++++++----------------------------------- 1 file changed, 40 insertions(+), 195 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index dd049764bb0c..ebe751f39742 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -68,7 +68,6 @@ struct loop_device { struct rb_root worker_tree; struct timer_list timer; bool sysfs_inited; - unsigned lo_nr_blocking_writes; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; @@ -91,8 +90,6 @@ struct loop_cmd { #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) #define LOOP_DEFAULT_HW_Q_DEPTH 128 -static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd); - static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); static DEFINE_MUTEX(loop_validate_mutex); @@ -324,15 +321,6 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) if (!atomic_dec_and_test(&cmd->ref)) return; - - /* -EAGAIN could be returned from bdev's ->ki_complete */ - if (cmd->ret == -EAGAIN) { - struct loop_device *lo = rq->q->queuedata; - - loop_queue_work(lo, cmd); - return; - } - kfree(cmd->bvec); cmd->bvec = NULL; if (req_op(rq) == REQ_OP_WRITE) @@ -349,28 +337,24 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret) lo_rw_aio_do_completion(cmd); } -static inline unsigned lo_cmd_nr_bvec(struct loop_cmd *cmd) +static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + loff_t pos, int rw) { - struct request *rq = blk_mq_rq_from_pdu(cmd); + struct iov_iter iter; struct req_iterator rq_iter; + struct bio_vec *bvec; + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct bio *bio = rq->bio; + struct file *file = lo->lo_backing_file; struct bio_vec tmp; + unsigned int offset; int nr_bvec = 0; + int ret; rq_for_each_bvec(tmp, rq, rq_iter) nr_bvec++; - return nr_bvec; -} - -static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd, - unsigned nr_bvec, loff_t pos) -{ - struct request *rq = blk_mq_rq_from_pdu(cmd); - if (rq->bio != rq->biotail) { - struct req_iterator rq_iter; - struct bio_vec *bvec; - struct bio_vec tmp; bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec), GFP_NOIO); @@ -388,12 +372,24 @@ static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd, *bvec = tmp; bvec++; } + bvec = cmd->bvec; + offset = 0; } else { - cmd->bvec = NULL; + /* + * Same here, this bio may be started from the middle of the + * 'bvec' because of bio 
splitting, so offset from the bvec + * must be passed to iov iterator + */ + offset = bio->bi_iter.bi_bvec_done; + bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); } + atomic_set(&cmd->ref, 2); + + iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); + iter.iov_offset = offset; cmd->iocb.ki_pos = pos; - cmd->iocb.ki_filp = lo->lo_backing_file; + cmd->iocb.ki_filp = file; cmd->iocb.ki_ioprio = req_get_ioprio(rq); if (cmd->use_aio) { cmd->iocb.ki_complete = lo_rw_aio_complete; @@ -402,35 +398,6 @@ static int lo_rw_aio_prep(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_complete = NULL; cmd->iocb.ki_flags = 0; } - return 0; -} - -static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - int nr_bvec, int rw) -{ - struct request *rq = blk_mq_rq_from_pdu(cmd); - struct file *file = lo->lo_backing_file; - struct iov_iter iter; - int ret; - - if (cmd->bvec) { - iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq)); - iter.iov_offset = 0; - } else { - struct bio *bio = rq->bio; - struct bio_vec *bvec = __bvec_iter_bvec(bio->bi_io_vec, - bio->bi_iter); - - /* - * Same here, this bio may be started from the middle of the - * 'bvec' because of bio splitting, so offset from the bvec - * must be passed to iov iterator - */ - iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq)); - iter.iov_offset = bio->bi_iter.bi_bvec_done; - } - atomic_set(&cmd->ref, 2); - if (rw == ITER_SOURCE) { kiocb_start_write(&cmd->iocb); @@ -439,84 +406,12 @@ static int lo_submit_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, ret = file->f_op->read_iter(&cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); - return ret; -} -static bool lo_backfile_support_nowait(const struct loop_device *lo) -{ - return lo->lo_backing_file->f_mode & FMODE_NOWAIT; -} - -static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, - loff_t pos, int rw) -{ - int nr_bvec = lo_cmd_nr_bvec(cmd); - int ret; - - /* prepared already if we have tried nowait */ - if (!cmd->use_aio || !lo_backfile_support_nowait(lo)) { - ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); - if (unlikely(ret)) - goto fail; - } - - cmd->iocb.ki_flags &= ~IOCB_NOWAIT; - ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); -fail: if (ret != -EIOCBQUEUED) lo_rw_aio_complete(&cmd->iocb, ret); return -EIOCBQUEUED; } -static inline bool lo_aio_try_nowait(struct loop_device *lo, - struct loop_cmd *cmd) -{ - struct file *file = lo->lo_backing_file; - struct inode *inode = file->f_mapping->host; - struct request *rq = blk_mq_rq_from_pdu(cmd); - - /* NOWAIT works fine for backing block device */ - if (S_ISBLK(inode->i_mode)) - return true; - - /* - * NOWAIT is supposed to be fine for READ without contending with - * blocking WRITE - */ - if (req_op(rq) == REQ_OP_READ) - return true; - - /* - * If there is any queued non-NOWAIT async WRITE , don't try new - * NOWAIT WRITE for avoiding contention - * - * Here we focus on handling stable FS block mapping via NOWAIT - */ - return READ_ONCE(lo->lo_nr_blocking_writes) == 0; -} - -static int lo_rw_aio_nowait(struct loop_device *lo, struct loop_cmd *cmd, - int rw) -{ - struct request *rq = blk_mq_rq_from_pdu(cmd); - loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - int nr_bvec = lo_cmd_nr_bvec(cmd); - int ret = lo_rw_aio_prep(lo, cmd, nr_bvec, pos); - - if (unlikely(ret)) - goto fail; - - if (!lo_aio_try_nowait(lo, cmd)) - return -EAGAIN; - - cmd->iocb.ki_flags |= IOCB_NOWAIT; - ret = lo_submit_rw_aio(lo, cmd, nr_bvec, rw); -fail: - if (ret != -EIOCBQUEUED && ret != -EAGAIN) - 
lo_rw_aio_complete(&cmd->iocb, ret); - return ret; -} - static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); @@ -811,19 +706,12 @@ static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } -static ssize_t loop_attr_nr_blocking_writes_show(struct loop_device *lo, - char *buf) -{ - return sysfs_emit(buf, "%u\n", lo->lo_nr_blocking_writes); -} - LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); LOOP_ATTR_RO(dio); -LOOP_ATTR_RO(nr_blocking_writes); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, @@ -832,7 +720,6 @@ static struct attribute *loop_attrs[] = { &loop_attr_autoclear.attr, &loop_attr_partscan.attr, &loop_attr_dio.attr, - &loop_attr_nr_blocking_writes.attr, NULL, }; @@ -908,48 +795,13 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) } #endif -static inline void loop_inc_blocking_writes(struct loop_device *lo, - struct loop_cmd *cmd) -{ - lockdep_assert_held(&lo->lo_work_lock); - - if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE) - lo->lo_nr_blocking_writes += 1; -} - -static inline void loop_dec_blocking_writes(struct loop_device *lo, - struct loop_cmd *cmd) -{ - lockdep_assert_held(&lo->lo_work_lock); - - if (req_op(blk_mq_rq_from_pdu(cmd)) == REQ_OP_WRITE) - lo->lo_nr_blocking_writes -= 1; -} - static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { - struct request __maybe_unused *rq = blk_mq_rq_from_pdu(cmd); struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; - /* always use the first bio's css */ - cmd->blkcg_css = NULL; - cmd->memcg_css = NULL; -#ifdef CONFIG_BLK_CGROUP - if (rq->bio) { - cmd->blkcg_css = bio_blkcg_css(rq->bio); -#ifdef CONFIG_MEMCG - if (cmd->blkcg_css) { - cmd->memcg_css = - cgroup_get_e_css(cmd->blkcg_css->cgroup, - &memory_cgrp_subsys); - } -#endif - } -#endif - spin_lock_irq(&lo->lo_work_lock); if (queue_on_root_worker(cmd->blkcg_css)) @@ -1008,8 +860,6 @@ queue_work: work = &lo->rootcg_work; cmd_list = &lo->rootcg_cmd_list; } - if (cmd->use_aio) - loop_inc_blocking_writes(lo, cmd); list_add_tail(&cmd->list_entry, cmd_list); queue_work(lo->workqueue, work); spin_unlock_irq(&lo->lo_work_lock); @@ -2006,7 +1856,6 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct loop_device *lo = rq->q->queuedata; - int rw = 0; blk_mq_start_request(rq); @@ -2019,27 +1868,26 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, case REQ_OP_WRITE_ZEROES: cmd->use_aio = false; break; - case REQ_OP_READ: - rw = ITER_DEST; - cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; - break; - case REQ_OP_WRITE: - rw = ITER_SOURCE; - cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; - break; default: - return BLK_STS_IOERR; + cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; + break; } - /* try NOWAIT if the backing file supports the mode */ - if (cmd->use_aio && lo_backfile_support_nowait(lo)) { - int res = lo_rw_aio_nowait(lo, cmd, rw); - - if (res != -EAGAIN && res != -EOPNOTSUPP) - return BLK_STS_OK; - /* fallback to workqueue for handling aio */ + /* always use the first bio's css */ + cmd->blkcg_css = NULL; + cmd->memcg_css = NULL; +#ifdef CONFIG_BLK_CGROUP + if (rq->bio) { + cmd->blkcg_css = bio_blkcg_css(rq->bio); 
+#ifdef CONFIG_MEMCG + if (cmd->blkcg_css) { + cmd->memcg_css = + cgroup_get_e_css(cmd->blkcg_css->cgroup, + &memory_cgrp_subsys); + } +#endif } - +#endif loop_queue_work(lo, cmd); return BLK_STS_OK; @@ -2115,8 +1963,6 @@ static void loop_process_work(struct loop_worker *worker, cond_resched(); spin_lock_irq(&lo->lo_work_lock); - if (cmd->use_aio) - loop_dec_blocking_writes(lo, cmd); } /* @@ -2195,8 +2041,7 @@ static int loop_add(int i) lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT | - BLK_MQ_F_BLOCKING; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); From a74de0c3663cf5cf568025e964524a5f875e4bfc Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 25 Nov 2025 16:11:13 +0000 Subject: [PATCH 140/162] block: Remove references to __device_add_disk() Since commit d1254a874971 ("block: remove support for delayed queue registrations"), function __device_add_disk() has been replaced with device_add_disk(), so fix up comments. Signed-off-by: John Garry Signed-off-by: Jens Axboe --- block/genhd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index bd3a6841e5b5..69c75117ba2c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -795,11 +795,11 @@ static void disable_elv_switch(struct request_queue *q) * partitions associated with the gendisk, and unregisters the associated * request_queue. * - * This is the counter to the respective __device_add_disk() call. + * This is the counter to the respective device_add_disk() call. * * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if - * __device_add_disk() was used. + * device_add_disk() was used. * * Drivers exist which depend on the release of the gendisk to be synchronous, * it should not be deferred. @@ -1265,7 +1265,7 @@ static const struct attribute_group *disk_attr_groups[] = { * * This function releases all allocated resources of the gendisk. * - * Drivers which used __device_add_disk() have a gendisk with a request_queue + * Drivers which used device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue From 3a64c46c40460386aeca6e8698076d1207aeaf44 Mon Sep 17 00:00:00 2001 From: shechenglong Date: Mon, 24 Nov 2025 10:02:58 +0800 Subject: [PATCH 141/162] block: fix typos in comments and strings in blk-core This patch fixes multiple spelling mistakes in comments and documentation in the file block/blk-core.c. No functional changes intended. Suggested-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: shechenglong Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 14ae73eebe0d..8387fe50ea15 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -662,13 +662,13 @@ static void __submit_bio(struct bio *bio) * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. 
If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. - * - In this case we really did just take the bio of the top of the list (no + * - In this case we really did just take the bio off the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current - * ->submit_bio, but that haven't been processed yet. + * ->submit_bio(), but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { @@ -743,8 +743,8 @@ void submit_bio_noacct_nocheck(struct bio *bio, bool split) /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list - * to collect a list of requests submited by a ->submit_bio method while - * it is active, and then process them after it returned. + * to collect a list of requests submitted by a ->submit_bio method + * while it is active, and then process them after it returned. */ if (current->bio_list) { if (split) @@ -901,7 +901,7 @@ static void bio_set_ioprio(struct bio *bio) * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_bdev field. + * bio will be sent to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback @@ -991,7 +991,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, * point to a freshly allocated bio at this point. If that happens * we have a few cases to consider: * - * 1) the bio is beeing initialized and bi_bdev is NULL. We can just + * 1) the bio is being initialized and bi_bdev is NULL. We can just * simply nothing in this case * 2) the bio points to a not poll enabled device. bio_poll will catch * this and return 0 From 7d09a8e25121a20214558d013b31e17ff84b004d Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 24 Nov 2025 15:48:01 -0800 Subject: [PATCH 142/162] block: ignore __blkdev_issue_discard() return value __blkdev_issue_discard() always returns 0, making the error check in blkdev_issue_discard() dead code. In function blkdev_issue_discard() initialize ret = 0, remove ret assignment from __blkdev_issue_discard(), rely on bio == NULL check to call submit_bio_wait(), preserve submit_bio_wait() error handling, and preserve -EOPNOTSUPP to 0 mapping. Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. 
Petersen Signed-off-by: Chaitanya Kulkarni Reviewed-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/blk-lib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 3030a772d3aa..19e0203cc18a 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -87,11 +87,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, { struct bio *bio = NULL; struct blk_plug plug; - int ret; + int ret = 0; blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); - if (!ret && bio) { + __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); + if (bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) ret = 0; From c943bfc6afb8d0e781b9b7406f36caa8bbf95cb9 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Wed, 26 Nov 2025 17:06:31 +0100 Subject: [PATCH 143/162] s390/dasd: Fix gendisk parent after copy pair swap After a copy pair swap the block device's "device" symlink points to the secondary CCW device, but the gendisk's parent remained the primary, leaving /sys/block/ under the wrong parent. Move the gendisk to the secondary's device with device_move(), keeping the sysfs topology consistent after the swap. Fixes: 413862caad6f ("s390/dasd: add copy pair swap capability") Cc: stable@vger.kernel.org #6.1 Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 88fa17aea2ec..ec0c62e5ef73 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -6150,6 +6150,7 @@ static int dasd_eckd_copy_pair_swap(struct dasd_device *device, char *prim_busid struct dasd_copy_relation *copy; struct dasd_block *block; struct gendisk *gdp; + int rc; copy = device->copy; if (!copy) @@ -6184,6 +6185,13 @@ static int dasd_eckd_copy_pair_swap(struct dasd_device *device, char *prim_busid /* swap blocklayer device link */ gdp = block->gdp; dasd_add_link_to_gendisk(gdp, secondary); + rc = device_move(disk_to_dev(gdp), &secondary->cdev->dev, DPM_ORDER_NONE); + if (rc) { + dev_err(&primary->cdev->dev, + "copy_pair_swap: moving blockdevice parent %s->%s failed (%d)\n", + dev_name(&primary->cdev->dev), + dev_name(&secondary->cdev->dev), rc); + } /* re-enable device */ dasd_device_remove_stop_bits(primary, DASD_STOPPED_PPRC); From 764def9e8eaf1b1ccdcd89b8c16db4194ade775f Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Wed, 26 Nov 2025 17:06:32 +0100 Subject: [PATCH 144/162] s390/dasd: Remove unnecessary debugfs_create() return checks The DASD driver only uses the dentry pointers when removing debugfs entries, and debugfs_remove() can safely handle both NULL and ERR_PTR. There is therefore no need to check debugfs_create() return values. This simplifies the debugfs setup code without changing functionality. 
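The simplification relies on a common kernel idiom: teardown helpers tolerate NULL and ERR_PTR-encoded pointers, so creation results never need to be checked. Below is a minimal userspace rendition of that idiom with an IS_ERR() lookalike; the debugfs-flavored names are deliberately fake stand-ins, not the kernel API.

```
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace lookalike of the kernel's ERR_PTR machinery: the top
 * 4095 addresses encode negative errno values. */
#define MAX_ERRNO 4095
#define ERR_PTR(e) ((void *)(long)(e))
#define IS_ERR(p)  ((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)

struct dentry { int unused; };

/* A creation helper that may fail with an error pointer... */
static struct dentry *fake_debugfs_create(int fail)
{
	if (fail)
		return ERR_PTR(-ENOMEM);
	return calloc(1, sizeof(struct dentry));
}

/* ...and a removal helper that, like debugfs_remove(), tolerates every
 * possible creation result, so callers can skip all return checks. */
static void fake_debugfs_remove(struct dentry *d)
{
	if (!d || IS_ERR(d))
		return;
	free(d);
}

int main(void)
{
	struct dentry *ok = fake_debugfs_create(0);
	struct dentry *bad = fake_debugfs_create(1);

	fake_debugfs_remove(ok);	/* frees the real entry */
	fake_debugfs_remove(bad);	/* safely ignores the ERR_PTR */
	fake_debugfs_remove(NULL);	/* safely ignores NULL */
	return 0;
}
```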
Suggested-by: Heiko Carstens Reviewed-by: Heiko Carstens Reviewed-by: Jan Hoeppner Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 64 +++++---------------------------------- 1 file changed, 8 insertions(+), 56 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 7765e40f7cea..496f95745ade 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -207,19 +207,6 @@ static int dasd_state_known_to_new(struct dasd_device *device) return 0; } -static struct dentry *dasd_debugfs_setup(const char *name, - struct dentry *base_dentry) -{ - struct dentry *pde; - - if (!base_dentry) - return NULL; - pde = debugfs_create_dir(name, base_dentry); - if (!pde || IS_ERR(pde)) - return NULL; - return pde; -} - /* * Request the irq line for the device. */ @@ -234,14 +221,14 @@ static int dasd_state_known_to_basic(struct dasd_device *device) if (rc) return rc; block->debugfs_dentry = - dasd_debugfs_setup(block->gdp->disk_name, + debugfs_create_dir(block->gdp->disk_name, dasd_debugfs_root_entry); dasd_profile_init(&block->profile, block->debugfs_dentry); if (dasd_global_profile_level == DASD_PROFILE_ON) dasd_profile_on(&device->block->profile); } device->debugfs_dentry = - dasd_debugfs_setup(dev_name(&device->cdev->dev), + debugfs_create_dir(dev_name(&device->cdev->dev), dasd_debugfs_root_entry); dasd_profile_init(&device->profile, device->debugfs_dentry); dasd_hosts_init(device->debugfs_dentry, device); @@ -1058,19 +1045,9 @@ static const struct file_operations dasd_stats_raw_fops = { static void dasd_profile_init(struct dasd_profile *profile, struct dentry *base_dentry) { - umode_t mode; - struct dentry *pde; - - if (!base_dentry) - return; - profile->dentry = NULL; profile->data = NULL; - mode = (S_IRUSR | S_IWUSR | S_IFREG); - pde = debugfs_create_file("statistics", mode, base_dentry, - profile, &dasd_stats_raw_fops); - if (pde && !IS_ERR(pde)) - profile->dentry = pde; - return; + profile->dentry = debugfs_create_file("statistics", 0600, base_dentry, + profile, &dasd_stats_raw_fops); } static void dasd_profile_exit(struct dasd_profile *profile) @@ -1090,25 +1067,9 @@ static void dasd_statistics_removeroot(void) static void dasd_statistics_createroot(void) { - struct dentry *pde; - - dasd_debugfs_root_entry = NULL; - pde = debugfs_create_dir("dasd", NULL); - if (!pde || IS_ERR(pde)) - goto error; - dasd_debugfs_root_entry = pde; - pde = debugfs_create_dir("global", dasd_debugfs_root_entry); - if (!pde || IS_ERR(pde)) - goto error; - dasd_debugfs_global_entry = pde; + dasd_debugfs_root_entry = debugfs_create_dir("dasd", NULL); + dasd_debugfs_global_entry = debugfs_create_dir("global", dasd_debugfs_root_entry); dasd_profile_init(&dasd_global_profile, dasd_debugfs_global_entry); - return; - -error: - DBF_EVENT(DBF_ERR, "%s", - "Creation of the dasd debugfs interface failed"); - dasd_statistics_removeroot(); - return; } #else @@ -1169,17 +1130,8 @@ static void dasd_hosts_exit(struct dasd_device *device) static void dasd_hosts_init(struct dentry *base_dentry, struct dasd_device *device) { - struct dentry *pde; - umode_t mode; - - if (!base_dentry) - return; - - mode = S_IRUSR | S_IFREG; - pde = debugfs_create_file("host_access_list", mode, base_dentry, - device, &dasd_hosts_fops); - if (pde && !IS_ERR(pde)) - device->hosts_dentry = pde; + device->hosts_dentry = debugfs_create_file("host_access_list", 0400, base_dentry, + device, &dasd_hosts_fops); } struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, int 
datasize, From 43198756ee8cade0acc17a89f959764cd17776bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Wed, 26 Nov 2025 17:06:33 +0100 Subject: [PATCH 145/162] s390/dasd: Move device name formatting into separate function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The device name formatting can be generalized and made more readable compared to the current state. SCSI already provides a generalized way to format many devices in the same naming scheme as DASD does, which was introduced with commit 3e1a7ff8a0a7 ("block: allow disk to have extended device number"). Use this much cleaner code from drivers/scsi/sd.c to handle the legacy naming scheme in DASD as a replacement for the current implementation. For easier error handling for the new function, move the gendisk free portion of dasd_gendisk_free() out into a new function dasd_gd_free(). Signed-off-by: Jan Höppner Reviewed-by: Stefan Haberland Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_genhd.c | 80 ++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 28e92fad0ca1..6ee3d952412e 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -22,6 +22,7 @@ static unsigned int queue_depth = 32; static unsigned int nr_hw_queues = 4; +static void dasd_gd_free(struct gendisk *gdp); module_param(queue_depth, uint, 0444); MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices"); @@ -29,6 +30,37 @@ MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices"); module_param(nr_hw_queues, uint, 0444); MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices"); +/* + * Set device name. + * dasda - dasdz : 26 devices + * dasdaa - dasdzz : 676 devices, added up = 702 + * dasdaaa - dasdzzz : 17576 devices, added up = 18278 + * dasdaaaa - dasdzzzz : 456976 devices, added up = 475252 + */ +static int dasd_name_format(char *prefix, int index, char *buf, int buflen) +{ + const int base = 'z' - 'a' + 1; + char *begin = buf + strlen(prefix); + char *end = buf + buflen; + char *p; + int unit; + + p = end - 1; + *p = '\0'; + unit = base; + do { + if (p == begin) + return -EINVAL; + *--p = 'a' + (index % unit); + index = (index / unit) - 1; + } while (index >= 0); + + memmove(begin, p, end - p); + memcpy(buf, prefix, strlen(prefix)); + + return 0; +} + /* * Allocate and register gendisk structure for device. */ @@ -45,11 +77,13 @@ int dasd_gendisk_alloc(struct dasd_block *block) }; struct gendisk *gdp; struct dasd_device *base; - int len, rc; + unsigned int devindex; + int rc; /* Make sure the minor for this device exists. */ base = block->base; - if (base->devindex >= DASD_PER_MAJOR) + devindex = base->devindex; + if (devindex >= DASD_PER_MAJOR) return -EBUSY; block->tag_set.ops = &dasd_mq_ops; @@ -69,31 +103,17 @@ int dasd_gendisk_alloc(struct dasd_block *block) /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; - gdp->first_minor = base->devindex << DASD_PARTN_BITS; + gdp->first_minor = devindex << DASD_PARTN_BITS; gdp->minors = 1 << DASD_PARTN_BITS; gdp->fops = &dasd_device_operations; - /* - * Set device name. 
- * dasda - dasdz : 26 devices - * dasdaa - dasdzz : 676 devices, added up = 702 - * dasdaaa - dasdzzz : 17576 devices, added up = 18278 - * dasdaaaa - dasdzzzz : 456976 devices, added up = 475252 - */ - len = sprintf(gdp->disk_name, "dasd"); - if (base->devindex > 25) { - if (base->devindex > 701) { - if (base->devindex > 18277) - len += sprintf(gdp->disk_name + len, "%c", - 'a'+(((base->devindex-18278) - /17576)%26)); - len += sprintf(gdp->disk_name + len, "%c", - 'a'+(((base->devindex-702)/676)%26)); - } - len += sprintf(gdp->disk_name + len, "%c", - 'a'+(((base->devindex-26)/26)%26)); + rc = dasd_name_format("dasd", devindex, gdp->disk_name, sizeof(gdp->disk_name)); + if (rc) { + DBF_DEV_EVENT(DBF_ERR, block->base, + "setting disk name failed, rc %d", rc); + dasd_gd_free(gdp); + return rc; } - len += sprintf(gdp->disk_name + len, "%c", 'a'+(base->devindex%26)); if (base->features & DASD_FEATURE_READONLY || test_bit(DASD_FLAG_DEVICE_RO, &base->flags)) @@ -111,15 +131,23 @@ int dasd_gendisk_alloc(struct dasd_block *block) return 0; } +/* + * Free gendisk structure + */ +static void dasd_gd_free(struct gendisk *gd) +{ + del_gendisk(gd); + gd->private_data = NULL; + put_disk(gd); +} + /* * Unregister and free gendisk structure for device. */ void dasd_gendisk_free(struct dasd_block *block) { if (block->gdp) { - del_gendisk(block->gdp); - block->gdp->private_data = NULL; - put_disk(block->gdp); + dasd_gd_free(block->gdp); block->gdp = NULL; blk_mq_free_tag_set(&block->tag_set); } From a857d99201cc4eb3cb78b9dcb6f1d027ef3ae699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Wed, 26 Nov 2025 17:06:34 +0100 Subject: [PATCH 146/162] s390/dasd: Use scnprintf() instead of sprintf() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use scnprintf() instead of sprintf() for those cases where the destination is an array and the size of the array is known at compile time. This prevents theoretical buffer overflows, but also avoids that people again and again spend time to figure out if the code is actually safe. Signed-off-by: Jan Höppner Reviewed-by: Stefan Haberland Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_devmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index ddbdf1f85d44..73972900fc55 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -355,7 +355,8 @@ static int __init dasd_parse_range(const char *range) /* each device in dasd= parameter should be set initially online */ features |= DASD_FEATURE_INITIAL_ONLINE; while (from <= to) { - sprintf(bus_id, "%01x.%01x.%04x", from_id0, from_id1, from++); + scnprintf(bus_id, sizeof(bus_id), + "%01x.%01x.%04x", from_id0, from_id1, from++); devmap = dasd_add_busid(bus_id, features); if (IS_ERR(devmap)) { rc = PTR_ERR(devmap); From c6a45ee7607de3a350008630f4369b1b5ac80884 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Wed, 26 Nov 2025 12:48:35 +0000 Subject: [PATCH 147/162] ublk: prevent invalid access with DEBUG ublk_ch_uring_cmd_local() may jump to the out label before initialising the io pointer. This will cause trouble if DEBUG is defined, because the pr_devel() call dereferences io. 
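The fix (described after the Clang diagnostic below) follows a standard pattern for shared exit labels: initialize the pointer that the exit path logs and guard its dereference. A compact, self-contained illustration with made-up names, the real function being ublk_ch_uring_cmd_local():

```
#include <errno.h>
#include <stdio.h>

struct io_ctx { unsigned int flags; };

static struct io_ctx table[4] = { { .flags = 0x5 } };

static int handle_cmd(unsigned int tag)
{
	struct io_ctx *io = NULL;	/* defined on every path to 'out' */
	int ret = -EINVAL;

	if (tag >= 4)
		goto out;	/* early exit: 'io' was never assigned */

	io = &table[tag];
	ret = 0;
out:
	/* Guard the dereference, mirroring 'io ? io->flags : 0'. */
	printf("tag %u ret %d io_flags %x\n", tag, ret, io ? io->flags : 0);
	return ret;
}

int main(void)
{
	handle_cmd(0);	/* valid tag */
	handle_cmd(9);	/* out of range: reaches 'out' safely */
	return 0;
}
```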
Clang reports: drivers/block/ublk_drv.c:2403:6: error: variable 'io' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] 2403 | if (tag >= ub->dev_info.queue_depth) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/block/ublk_drv.c:2492:32: note: uninitialized use occurs here 2492 | __func__, cmd_op, tag, ret, io->flags); | Fix this by initialising io to NULL and checking it before dereferencing it. Signed-off-by: Kevin Brodsky Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 75f210523e52..09718a68137c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2301,7 +2301,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; - struct ublk_io *io; + struct ublk_io *io = NULL; u32 cmd_op = cmd->cmd_op; u16 q_id = READ_ONCE(ub_src->q_id); u16 tag = READ_ONCE(ub_src->tag); @@ -2422,7 +2422,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, out: pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", - __func__, cmd_op, tag, ret, io->flags); + __func__, cmd_op, tag, ret, io ? io->flags : 0); return ret; } From d0c98769ee7d5db8d699a270690639cde1766cd4 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 28 Nov 2025 16:53:13 +0800 Subject: [PATCH 148/162] blk-mq: use array to manage hctx map instead of xarray After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use an xarray instead of an array to store hctx pointers, but in poll mode each blk_mq_poll() call needs xa_load() to find the corresponding hctx, which introduces some cost. In my test, xa_load() may cost 3.8% of CPU time. This patch reverts the previous change, eliminating the overhead of xa_load(), and can result in a 3% performance improvement. Signed-off-by: Fengnan Chang Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 58 +++++++++++++++++++++++++++--------------- block/blk-mq.h | 2 +- include/linux/blk-mq.h | 3 ++- include/linux/blkdev.h | 2 +- 5 files changed, 42 insertions(+), 25 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5b664dbdf655..33946cdb5716 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -499,7 +499,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, int srcu_idx; /* - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ diff --git a/block/blk-mq.c b/block/blk-mq.c index f2650c97a75e..1ef81110eb8a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -730,7 +730,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue.
*/ ret = -EXDEV; - data.hctx = xa_load(&q->hctx_table, hctx_idx); + data.hctx = q->queue_hw_ctx[hctx_idx]; if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -3946,8 +3946,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_free_flush_queue_callback); hctx->fq = NULL; - xa_erase(&q->hctx_table, hctx_idx); - spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3989,14 +3987,8 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->numa_node)) goto exit_hctx; - if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) - goto exit_flush_rq; - return 0; - exit_flush_rq: - if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -4385,7 +4377,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - xa_destroy(&q->hctx_table); + kfree(q->queue_hw_ctx); /* * release .mq_kobj and sw queue's kobject now because @@ -4529,26 +4521,44 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned long i, j; + int i, j, end; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + q->queue_hw_ctx = new_hctxs; + kfree(hctxs); + hctxs = new_hctxs; + } for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); + struct blk_mq_hw_ctx *old_hctx = hctxs[i]; if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } - if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (!hctxs[i]) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); - hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); - WARN_ON_ONCE(!hctx); + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, + old_node); + WARN_ON_ONCE(!hctxs[i]); } } /* @@ -4557,13 +4567,21 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; + end = i; } else { j = i; + end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - xa_for_each_start(&q->hctx_table, j, hctx, j) - blk_mq_exit_hctx(q, set, hctx, j); + for (; j < end; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + blk_mq_exit_hctx(q, set, hctx, j); + hctxs[j] = NULL; + } + } } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, @@ -4599,8 +4617,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); - xa_init(&q->hctx_table); - blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -5187,7 +5203,7 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, { if (!blk_mq_can_poll(q)) return 0; - return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); + return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags); } int blk_rq_poll(struct 
request *rq, struct io_comp_batch *iob, diff --git a/block/blk-mq.h b/block/blk-mq.h index c4fccdeb5441..80a3f0c2bce7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -84,7 +84,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); + return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b54506b3b76d..9208ff90ae16 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1016,7 +1016,8 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) } #define queue_for_each_hw_ctx(q, hctx, i) \ - xa_for_each(&(q)->hctx_table, (i), (hctx)) + for ((i) = 0; (i) < (q)->nr_hw_queues && \ + ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cb4ba09959ee..6195f89648db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -503,7 +503,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct xarray hctx_table; + struct blk_mq_hw_ctx **queue_hw_ctx; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; From 89e1fb7ceffd898505ad7fa57acec0585bfaa2cc Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 28 Nov 2025 16:53:14 +0800 Subject: [PATCH 149/162] blk-mq: fix potential uaf for 'queue_hw_ctx' This just applies Kuai's patch from [1] with minor changes. blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating submit_queues through configfs for null_blk), while it might still be used from another context (e.g. switching the elevator to none): t1 t2 elevator_switch blk_mq_unquiesce_queue blk_mq_run_hw_queues queue_for_each_hw_ctx // assembly code for hctx = (q)->queue_hw_ctx[i] mov 0x48(%rbp),%rdx -> read old queue_hw_ctx __blk_mq_update_nr_hw_queues blk_mq_realloc_hw_ctxs hctxs = q->queue_hw_ctx q->queue_hw_ctx = new_hctxs kfree(hctxs) movslq %ebx,%rax mov (%rdx,%rax,8),%rdi ->uaf This problem was found by code review, and I confirmed that the concurrent scenario does exist (specifically, 'q->queue_hw_ctx' can be changed during blk_mq_run_hw_queues()); however, the uaf problem hasn't been reproduced yet without hacking the kernel. Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the problem by protecting 'queue_hw_ctx' through RCU where it can be accessed without grabbing 'q_usage_counter'. [1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/ Signed-off-by: Yu Kuai Signed-off-by: Fengnan Chang Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ++++++- include/linux/blk-mq.h | 13 ++++++++++++- include/linux/blkdev.h | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1ef81110eb8a..4e96bb246247 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4535,7 +4535,12 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, if (hctxs) memcpy(new_hctxs, hctxs, q->nr_hw_queues * sizeof(*hctxs)); - q->queue_hw_ctx = new_hctxs; + rcu_assign_pointer(q->queue_hw_ctx, new_hctxs); + /* + * Make sure reading the old queue_hw_ctx from other + * context concurrently won't trigger uaf.
+ */ + synchronize_rcu_expedited(); kfree(hctxs); hctxs = new_hctxs; } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9208ff90ae16..eb7254b3dddd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1015,9 +1015,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) return rq + 1; } +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) +{ + struct blk_mq_hw_ctx *hctx; + + rcu_read_lock(); + hctx = rcu_dereference(q->queue_hw_ctx)[id]; + rcu_read_unlock(); + + return hctx; +} + #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ - ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) + ({ hctx = queue_hctx((q), i); 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6195f89648db..72e34acd439c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -503,7 +503,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx **queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; From 9574b21e952256d4fa3c8797c94482a240992d18 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:23 +0800 Subject: [PATCH 150/162] kfifo: add kfifo_alloc_node() helper for NUMA awareness Add __kfifo_alloc_node() by refactoring and reusing __kfifo_alloc(), and define kfifo_alloc_node() macro to support NUMA-aware memory allocation. The new __kfifo_alloc_node() function accepts a NUMA node parameter and uses kmalloc_array_node() instead of kmalloc_array() for node-specific allocation. The existing __kfifo_alloc() now calls __kfifo_alloc_node() with NUMA_NO_NODE to maintain backward compatibility. This enables users to allocate kfifo buffers on specific NUMA nodes, which is important for performance in NUMA systems where the kfifo will be primarily accessed by threads running on specific nodes. Cc: Stefani Seibold Cc: Andrew Morton Cc: linux-kernel@vger.kernel.org Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/kfifo.h | 34 ++++++++++++++++++++++++++++++++-- lib/kfifo.c | 8 ++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index fd743d4c4b4b..8b81ac74829c 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -369,6 +369,30 @@ __kfifo_int_must_check_helper( \ }) \ ) +/** + * kfifo_alloc_node - dynamically allocates a new fifo buffer on a NUMA node + * @fifo: pointer to the fifo + * @size: the number of elements in the fifo, this must be a power of 2 + * @gfp_mask: get_free_pages mask, passed to kmalloc() + * @node: NUMA node to allocate memory on + * + * This macro dynamically allocates a new fifo buffer with NUMA node awareness. + * + * The number of elements will be rounded-up to a power of 2. + * The fifo will be release with kfifo_free(). + * Return 0 if no error, otherwise an error code. + */ +#define kfifo_alloc_node(fifo, size, gfp_mask, node) \ +__kfifo_int_must_check_helper( \ +({ \ + typeof((fifo) + 1) __tmp = (fifo); \ + struct __kfifo *__kfifo = &__tmp->kfifo; \ + __is_kfifo_ptr(__tmp) ? 
\ + __kfifo_alloc_node(__kfifo, size, sizeof(*__tmp->type), gfp_mask, node) : \ + -EINVAL; \ +}) \ +) + /** * kfifo_free - frees the fifo * @fifo: the fifo to be freed @@ -899,8 +923,14 @@ __kfifo_uint_must_check_helper( \ ) -extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask); +extern int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask, int node); + +static inline int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) +{ + return __kfifo_alloc_node(fifo, size, esize, gfp_mask, NUMA_NO_NODE); +} extern void __kfifo_free(struct __kfifo *fifo); diff --git a/lib/kfifo.c b/lib/kfifo.c index a8b2eed90599..525e66f8294c 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -22,8 +22,8 @@ static inline unsigned int kfifo_unused(struct __kfifo *fifo) return (fifo->mask + 1) - (fifo->in - fifo->out); } -int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask) +int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask, int node) { /* * round up to the next power of 2, since our 'let the indices @@ -41,7 +41,7 @@ int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, return -EINVAL; } - fifo->data = kmalloc_array(esize, size, gfp_mask); + fifo->data = kmalloc_array_node(esize, size, gfp_mask, node); if (!fifo->data) { fifo->mask = 0; @@ -51,7 +51,7 @@ int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, return 0; } -EXPORT_SYMBOL(__kfifo_alloc); +EXPORT_SYMBOL(__kfifo_alloc_node); void __kfifo_free(struct __kfifo *fifo) { From 3035b9b46b0611898babc0b96ede65790d3566f7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:24 +0800 Subject: [PATCH 151/162] ublk: add parameter `struct io_uring_cmd *` to ublk_prep_auto_buf_reg() Add parameter `struct io_uring_cmd *` to ublk_prep_auto_buf_reg() and prepare for reusing this helper for the coming UBLK_BATCH_IO feature, which can fetch & commit one batch of io commands via single uring_cmd. 
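As a usage illustration for the NUMA-aware kfifo helper above, here is a minimal sketch of how a driver might place a fifo's backing buffer on a specific node; the fifo name, element type, and size are hypothetical:

	#include <linux/kfifo.h>
	#include <linux/slab.h>

	static DECLARE_KFIFO_PTR(tag_fifo, unsigned int);	/* hypothetical fifo */

	static int init_tag_fifo(int numa_node)
	{
		/*
		 * 128 elements, rounded up to a power of 2, with the buffer
		 * allocated on @numa_node rather than NUMA_NO_NODE.
		 */
		return kfifo_alloc_node(&tag_fifo, 128, GFP_KERNEL, numa_node);
	}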
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 09718a68137c..7e0f1f481a3a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1176,11 +1176,12 @@ ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io) } static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, - struct ublk_io *io, unsigned int issue_flags) + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) { int ret; - ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release, + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, io->buf.index, issue_flags); if (ret) { if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { @@ -1192,18 +1193,19 @@ static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, } io->task_registered_buffers = 1; - io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd); + io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; return true; } static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq, struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned int issue_flags) { ublk_init_req_ref(ubq, io); if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) - return ublk_auto_buf_reg(ubq, req, io, issue_flags); + return ublk_auto_buf_reg(ubq, req, io, cmd, issue_flags); return true; } @@ -1278,7 +1280,7 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, if (!ublk_start_io(ubq, req, io)) return; - if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags)) + if (ublk_prep_auto_buf_reg(ubq, req, io, io->cmd, issue_flags)) ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); } From 8d61ece156bd4f2b9e7d3b2a374a26d42c7a4a06 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:25 +0800 Subject: [PATCH 152/162] ublk: add `union ublk_io_buf` with improved naming Add `union ublk_io_buf` for naming the anonymous union of struct ublk_io's addr and buf fields, meantime apply it to `struct ublk_io` for storing either ublk auto buffer register data or ublk server io buffer address. The union uses clear field names: - `addr`: for regular ublk server io buffer addresses - `auto_reg`: for ublk auto buffer registration data This eliminates confusing access patterns and improves code readability. 
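To make the aliasing concrete, a condensed sketch of the two access patterns the union supports; the consumer functions here are placeholders, not driver API:

	static void use_io_buf(struct ublk_device *ub, struct ublk_io *io)
	{
		if (ublk_dev_support_auto_buf_reg(ub))
			/* auto registration: an index plus flags, not an address */
			register_io_buffer(io->buf.auto_reg.index);	/* placeholder */
		else
			/* classic path: userspace buffer address from the ublk server */
			map_io_buffer(io->buf.addr);			/* placeholder */
	}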
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 7e0f1f481a3a..7217146781fa 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -155,12 +155,13 @@ struct ublk_uring_cmd_pdu { */ #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) +union ublk_io_buf { + __u64 addr; + struct ublk_auto_buf_reg auto_reg; +}; + struct ublk_io { - /* userspace buffer address from io cmd */ - union { - __u64 addr; - struct ublk_auto_buf_reg buf; - }; + union ublk_io_buf buf; unsigned int flags; int res; @@ -498,7 +499,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, iod->op_flags = ublk_op | ublk_req_build_flags(req); iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); - iod->addr = io->addr; + iod->addr = io->buf.addr; return BLK_STS_OK; } @@ -981,7 +982,7 @@ static unsigned int ublk_map_io(const struct ublk_queue *ubq, struct iov_iter iter; const int dir = ITER_DEST; - import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter); + import_ubuf(dir, u64_to_user_ptr(io->buf.addr), rq_bytes, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; @@ -1002,7 +1003,7 @@ static unsigned int ublk_unmap_io(bool need_map, WARN_ON_ONCE(io->res > rq_bytes); - import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter); + import_ubuf(dir, u64_to_user_ptr(io->buf.addr), io->res, &iter); return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; @@ -1068,7 +1069,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) iod->op_flags = ublk_op | ublk_req_build_flags(req); iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); - iod->addr = io->addr; + iod->addr = io->buf.addr; return BLK_STS_OK; } @@ -1182,9 +1183,9 @@ static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, int ret; ret = io_buffer_register_bvec(cmd, req, ublk_io_release, - io->buf.index, issue_flags); + io->buf.auto_reg.index, issue_flags); if (ret) { - if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) { + if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { ublk_auto_buf_reg_fallback(ubq, io); return true; } @@ -1473,7 +1474,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) */ io->flags &= UBLK_IO_FLAG_CANCELED; io->cmd = NULL; - io->addr = 0; + io->buf.addr = 0; /* * old task is PF_EXITING, put it now @@ -2034,13 +2035,16 @@ static inline int ublk_check_cmd_op(u32 cmd_op) static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd) { - io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); + struct ublk_auto_buf_reg buf; - if (io->buf.reserved0 || io->buf.reserved1) + buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr)); + + if (buf.reserved0 || buf.reserved1) return -EINVAL; - if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) + if (buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK) return -EINVAL; + io->buf.auto_reg = buf; return 0; } @@ -2062,7 +2066,7 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io, * this ublk request gets stuck. 
*/ if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) - *buf_idx = io->buf.index; + *buf_idx = io->buf.auto_reg.index; } return ublk_set_auto_buf_reg(io, cmd); @@ -2090,7 +2094,7 @@ ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io, if (ublk_dev_support_auto_buf_reg(ub)) return ublk_handle_auto_buf_reg(io, cmd, buf_idx); - io->addr = buf_addr; + io->buf.addr = buf_addr; return 0; } @@ -2287,7 +2291,7 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, */ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA; /* update iod->addr because ublksrv may have passed a new io buffer */ - ublk_get_iod(ubq, req->tag)->addr = io->addr; + ublk_get_iod(ubq, req->tag)->addr = io->buf.addr; pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n", __func__, ubq->q_id, req->tag, io->flags, ublk_get_iod(ubq, req->tag)->addr); From 0a9beafa7c633e6ff66b05b81eea78231b7e6520 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:26 +0800 Subject: [PATCH 153/162] ublk: refactor auto buffer register in ublk_dispatch_req() Refactor auto buffer register code and prepare for supporting batch IO feature, and the main motivation is to put 'ublk_io' operation code together, so that per-io lock can be applied for the code block. The key changes are: - Rename ublk_auto_buf_reg() as ublk_do_auto_buf_reg() - Introduce an enum `auto_buf_reg_res` to represent the result of the buffer registration attempt (FAIL, FALLBACK, OK). - Split the existing `ublk_do_auto_buf_reg` function into two: - `__ublk_do_auto_buf_reg`: Performs the actual buffer registration and returns the `auto_buf_reg_res` status. - `ublk_do_auto_buf_reg`: A wrapper that calls the internal function and handles the I/O preparation based on the result. - Introduce `ublk_prep_auto_buf_reg_io` to encapsulate the logic for preparing the I/O for completion after buffer registration. - Pass the `tag` directly to `ublk_auto_buf_reg_fallback` to avoid recalculating it. This refactoring makes the control flow clearer and isolates the different stages of the auto buffer registration process. 
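Equivalently, the dispatch-side handling of the three results can be read as the switch below, condensed from the patch (the FAIL case has already ended the request with BLK_STS_IOERR inside __ublk_do_auto_buf_reg()):

	enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd,
							   issue_flags);

	switch (res) {
	case AUTO_BUF_REG_OK:		/* buffer registered with io_uring */
	case AUTO_BUF_REG_FALLBACK:	/* iod marked UBLK_IO_F_NEED_REG_BUF */
		ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res);
		io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags);
		break;
	case AUTO_BUF_REG_FAIL:		/* request already failed, nothing to do */
		break;
	}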
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 64 +++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 7217146781fa..4d2dc6556832 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1168,17 +1168,37 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, } static void -ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io) +ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, unsigned tag) { - unsigned tag = io - ubq->ios; struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag); iod->op_flags |= UBLK_IO_F_NEED_REG_BUF; } -static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, - struct ublk_io *io, struct io_uring_cmd *cmd, - unsigned int issue_flags) +enum auto_buf_reg_res { + AUTO_BUF_REG_FAIL, + AUTO_BUF_REG_FALLBACK, + AUTO_BUF_REG_OK, +}; + +static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, + enum auto_buf_reg_res res) +{ + if (res == AUTO_BUF_REG_OK) { + io->task_registered_buffers = 1; + io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); + io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; + } + ublk_init_req_ref(ubq, io); + __ublk_prep_compl_io_cmd(io, req); +} + +static enum auto_buf_reg_res +__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) { int ret; @@ -1186,29 +1206,27 @@ static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, io->buf.auto_reg.index, issue_flags); if (ret) { if (io->buf.auto_reg.flags & UBLK_AUTO_BUF_REG_FALLBACK) { - ublk_auto_buf_reg_fallback(ubq, io); - return true; + ublk_auto_buf_reg_fallback(ubq, req->tag); + return AUTO_BUF_REG_FALLBACK; } blk_mq_end_request(req, BLK_STS_IOERR); - return false; + return AUTO_BUF_REG_FAIL; } - io->task_registered_buffers = 1; - io->buf_ctx_handle = io_uring_cmd_ctx_handle(cmd); - io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG; - return true; + return AUTO_BUF_REG_OK; } -static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq, - struct request *req, struct ublk_io *io, - struct io_uring_cmd *cmd, - unsigned int issue_flags) +static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, + struct ublk_io *io, struct io_uring_cmd *cmd, + unsigned int issue_flags) { - ublk_init_req_ref(ubq, io); - if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) - return ublk_auto_buf_reg(ubq, req, io, cmd, issue_flags); + enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + issue_flags); - return true; + if (res != AUTO_BUF_REG_FAIL) { + ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags); + } } static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, @@ -1281,8 +1299,12 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, if (!ublk_start_io(ubq, req, io)) return; - if (ublk_prep_auto_buf_reg(ubq, req, io, io->cmd, issue_flags)) + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { + ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags); + } else { + ublk_init_req_ref(ubq, io); ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); + } } static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, From 3443bab2f8e44e00adaf76ba677d4219416376f2 Mon Sep 17 00:00:00 
2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:27 +0800 Subject: [PATCH 154/162] ublk: pass const pointer to ublk_queue_is_zoned() Pass const pointer to ublk_queue_is_zoned() because it is readonly. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4d2dc6556832..f1b2f170146b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -265,7 +265,7 @@ static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) return ub->dev_info.flags & UBLK_F_ZONED; } -static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq) +static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_ZONED; } From 28d7a371f021419cb6c3a243f5cf167f88eb51b9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:28 +0800 Subject: [PATCH 155/162] ublk: add helper of __ublk_fetch() Add helper __ublk_fetch() for refactoring ublk_fetch(). Meantime move ublk_config_io_buf() out of __ublk_fetch() to make the code structure cleaner. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 48 +++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f1b2f170146b..c2250172de4c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2234,10 +2234,31 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) return 0; } +static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, + struct ublk_io *io) +{ + /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ + if (ublk_dev_ready(ub)) + return -EBUSY; + + /* allow each command to be FETCHed at most once */ + if (io->flags & UBLK_IO_FLAG_ACTIVE) + return -EINVAL; + + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); + + ublk_fill_io_cmd(io, cmd); + + WRITE_ONCE(io->task, get_task_struct(current)); + ublk_mark_io_ready(ub); + + return 0; +} + static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, struct ublk_io *io, __u64 buf_addr) { - int ret = 0; + int ret; /* * When handling FETCH command for setting up ublk uring queue, @@ -2245,28 +2266,9 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, * FETCH, so it is fine even for IO_URING_F_NONBLOCK. 
*/ mutex_lock(&ub->mutex); - /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ - if (ublk_dev_ready(ub)) { - ret = -EBUSY; - goto out; - } - - /* allow each command to be FETCHed at most once */ - if (io->flags & UBLK_IO_FLAG_ACTIVE) { - ret = -EINVAL; - goto out; - } - - WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); - - ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); - if (ret) - goto out; - - WRITE_ONCE(io->task, get_task_struct(current)); - ublk_mark_io_ready(ub); -out: + ret = __ublk_fetch(cmd, ub, io); + if (!ret) + ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); mutex_unlock(&ub->mutex); return ret; } From 418de94e7593081c29066555bf9059f1f7dd9d79 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Nov 2025 22:57:54 -0800 Subject: [PATCH 156/162] sbitmap: fix all kernel-doc warnings Modify kernel-doc comments in sbitmap.h to prevent warnings: Warning: include/linux/sbitmap.h:84 struct member 'alloc_hint' not described in 'sbitmap' Warning: include/linux/sbitmap.h:151 struct member 'ws_active' not described in 'sbitmap_queue' Warning: include/linux/sbitmap.h:552 No description found for return value of 'sbq_wait_ptr' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index ffb9907c7070..cc7ad189caa5 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -75,7 +75,7 @@ struct sbitmap { */ struct sbitmap_word *map; - /* + /** * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different @@ -128,7 +128,7 @@ struct sbitmap_queue { */ struct sbq_wait_state *ws; - /* + /** * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; @@ -547,6 +547,8 @@ static inline void sbq_index_atomic_inc(atomic_t *index) * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. + * + * Return: Next wait queue to be used */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) From 46f21952c492243b138281dc4cb755ab63b637c4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sun, 16 Nov 2025 10:18:16 +0800 Subject: [PATCH 157/162] md/raid0: fix NULL pointer dereference in create_strip_zones() for dm-raid Commit 2107457e31fa ("md/raid0: Move queue limit setup before r0conf initialization") dereference mddev->gendisk unconditionally, which is NULL for dm-raid. Fix this problem by reverting to old codes for dm-raid. 
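The restored guard, condensed from the diff below, simply avoids touching the gendisk for dm-raid and derives the block size from the member rdevs instead:

	unsigned int blksize = 512;

	/* dm-raid stacks on md without a gendisk; only native md arrays have one */
	if (!mddev_is_dm(mddev))
		blksize = queue_logical_block_size(mddev->gendisk->queue);

	/* for dm-raid, blksize is raised per member device while iterating rdevs */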
Link: https://lore.kernel.org/linux-raid/20251116021816.107648-1-yukuai@fnnas.com Fixes: 2107457e31fa ("md/raid0: Move queue limit setup before r0conf initialization") Reported-and-tested-by: Changhui Zhong Closes: https://lore.kernel.org/all/CAGVVp+VqVnvGeneUoTbYvBv2cw6GwQRrR3B-iQ-_9rVfyumoKA@mail.gmail.com/ Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Reviewed-by: Li Nan Reviewed-by: Paul Menzel --- drivers/md/raid0.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 47aee1b1d4d1..985c377356eb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -68,7 +68,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) struct strip_zone *zone; int cnt; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - unsigned int blksize = queue_logical_block_size(mddev->gendisk->queue); + unsigned int blksize = 512; + + if (!mddev_is_dm(mddev)) + blksize = queue_logical_block_size(mddev->gendisk->queue); *private_conf = ERR_PTR(-ENOMEM); if (!conf) @@ -84,6 +87,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; + if (mddev_is_dm(mddev)) + blksize = max(blksize, queue_logical_block_size( + rdev1->bdev->bd_disk->queue)); + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %pg(%llu)" " with %pg(%llu)\n", From 8c9e376b9d1a222fa02b93b615d2e25be0a91fed Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 17 Nov 2025 16:55:56 +0800 Subject: [PATCH 158/162] md: warn about updating super block failure Many personalities handle IO errors from a daemon thread (like raid1d, raid10d, raid5d), and the sb is required to be clean before handling these failed IOs. However, updating the sb can fail, for example when the array is broken by IO failures, or when the user configures array_state via the sysfs API. This patch adds a warning when updating the sb fails, to help diagnose cases where this is related to an IO hang. Link: https://lore.kernel.org/linux-raid/20251117085557.770572-2-yukuai@fnnas.com Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7b5c5967568f..345b1e623aba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2788,6 +2788,7 @@ void md_update_sb(struct mddev *mddev, int force_change) if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); return; } From a913d1f6a7f607c110aeef8b58c8988f47a4b24e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 17 Nov 2025 16:55:57 +0800 Subject: [PATCH 159/162] md/raid5: fix IO hang when array is broken with IO inflight The following test can cause an IO hang: mdadm -CvR /dev/md0 -l10 -n4 /dev/sd[abcd] --assume-clean --chunk=64K --bitmap=none sleep 5 echo 1 > /sys/block/sda/device/delete echo 1 > /sys/block/sdb/device/delete echo 1 > /sys/block/sdc/device/delete echo 1 > /sys/block/sdd/device/delete dd if=/dev/md0 of=/dev/null bs=8k count=1 iflag=direct Root cause: 1) All disks are removed; however, all rdevs in the array are still in sync, so IO is issued normally. 2) IO to sda fails, and setting badblocks fails as well; sda becomes faulty and MD_SB_CHANGE_PENDING is set. 3) Error recovery tries to recover this IO from the other disks; IO is issued to sdb, sdc, and sdd. 4) IO to sdb fails, and setting badblocks fails again; now the array is broken and will become read-only.
5) IO to sdc and sdd fails as well; however, the stripe can't be handled anymore because MD_SB_CHANGE_PENDING is set: handle_stripe handle_stripe if (test_bit MD_SB_CHANGE_PENDING) set_bit STRIPE_HANDLE goto finish // skip handling failed stripe release_stripe if (test_bit STRIPE_HANDLE) list_add_tail conf->hand_list 6) Later, raid5d can't handle the failed stripe either: raid5d md_check_recovery md_update_sb if (!md_is_rdwr()) // can't clear pending bit return if (test_bit MD_SB_CHANGE_PENDING) break; // can't handle failed stripe Since MD_SB_CHANGE_PENDING can never be cleared for a read-only array, fix this problem by skipping this check for read-only arrays. Link: https://lore.kernel.org/linux-raid/20251117085557.770572-3-yukuai@fnnas.com Fixes: d87f064f5874 ("md: never update metadata when array is read-only.") Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/raid5.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index cdbc7eba5c54..e57ce3295292 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4956,7 +4956,8 @@ static void handle_stripe(struct stripe_head *sh) goto finish; if (s.handle_bad_blocks || - test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { + (md_is_rdwr(conf->mddev) && + test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags))) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; } @@ -6768,7 +6769,8 @@ static void raid5d(struct md_thread *thread) int batch_size, released; unsigned int offset; - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + if (md_is_rdwr(mddev) && + test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) break; released = release_stripe_list(conf, conf->temp_inactive_list); From fdd0c6a649d24107bbadd249c87feab67b9037c5 Mon Sep 17 00:00:00 2001 From: Tarun Sahu Date: Fri, 21 Nov 2025 19:14:22 +0000 Subject: [PATCH 160/162] md: remove legacy 1s delay in md_notify_reboot During system shutdown, the md driver's registered notifier function (md_notify_reboot) currently imposes a hardcoded one-second delay. This delay was introduced approximately 23 years ago and was likely necessary for the hardware generation of that time. This patch is proposed to make sure there are no known devices that still need this delay. Link: https://lore.kernel.org/linux-raid/20251121191422.2758555-1-tarunsahu@google.com Signed-off-by: Tarun Sahu Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/md.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 345b1e623aba..e5922a682953 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -10408,7 +10408,6 @@ static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { struct mddev *mddev; - int need_delay = 0; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev, &all_mddevs, all_mddevs) { @@ -10422,21 +10421,11 @@ static int md_notify_reboot(struct notifier_block *this, mddev->safemode = 2; mddev_unlock(mddev); } - need_delay = 1; spin_lock(&all_mddevs_lock); mddev_put_locked(mddev); } spin_unlock(&all_mddevs_lock); - /* - * certain more exotic SCSI devices are known to be - * volatile wrt too early system reboots. While the - * right place to handle this issue is the given - * driver, we do want to have a safe RAID driver ...
- */ - if (need_delay) - msleep(1000); - return NOTIFY_DONE; } From 4d0e1f2139ad452d0e209a16b3d016af2f8ef1f7 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Mon, 1 Dec 2025 20:25:04 +0800 Subject: [PATCH 161/162] blk-mq: use queue_hctx in blk_mq_map_queue_type Some callers of blk_mq_map_queue_type(), such as blk_mq_cpu_mapped_to_hctx(), do not grab 'q_usage_counter', so we need to protect 'queue_hw_ctx' through RCU here as well. All other functions have been checked; there are no more missed cases. Fixes: 89e1fb7ceffd ("blk-mq: fix potential uaf for 'queue_hw_ctx'") Reported-by: Jens Axboe Signed-off-by: Fengnan Chang Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.h b/block/blk-mq.h index 80a3f0c2bce7..aa15d31aaae9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -84,7 +84,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; + return queue_hctx((q), (q->tag_set->map[type].mq_map[cpu])); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) From d211a2803551c8ffdf0b97d129388f7d9cc129b5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 29 Nov 2025 14:35:42 -0800 Subject: [PATCH 162/162] block/rnbd: correct all kernel-doc complaints Fix all kernel-doc warnings in rnbd-proto.h: - use correct enum name in kdoc comment - mark several struct members as "/* private: */" so that no kdoc is required for them - don't use "/**" for a non-kernel-doc comment - use the correct struct member name for "dev_name" - use " *" for a blank kernel-doc line Fixes these warnings: Warning: drivers/block/rnbd/rnbd-proto.h:41 expecting prototype for enum rnbd_msg_types. Prototype was for enum rnbd_msg_type instead Warning: drivers/block/rnbd/rnbd-proto.h:50 struct member '__padding' not described in 'rnbd_msg_hdr' Warning: drivers/block/rnbd/rnbd-proto.h:53 This comment starts with '/**', but isn't a kernel-doc comment. * We allow to map RO many times and RW only once.
We allow to map yet another Warning: drivers/block/rnbd/rnbd-proto.h:81 struct member 'reserved' not described in 'rnbd_msg_sess_info' Warning: drivers/block/rnbd/rnbd-proto.h:92 struct member 'reserved' not described in 'rnbd_msg_sess_info_rsp' Warning: drivers/block/rnbd/rnbd-proto.h:107 struct member 'resv1' not described in 'rnbd_msg_open' Warning: drivers/block/rnbd/rnbd-proto.h:107 struct member 'dev_name' not described in 'rnbd_msg_open' Warning: drivers/block/rnbd/rnbd-proto.h:107 struct member 'reserved' not described in 'rnbd_msg_open' Warning: drivers/block/rnbd/rnbd-proto.h:158 struct member 'reserved' not described in 'rnbd_msg_open_rsp' Warning: drivers/block/rnbd/rnbd-proto.h:189 bad line: Signed-off-by: Randy Dunlap Acked-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-proto.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index f35be51d213c..77360c2a6069 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -24,7 +24,7 @@ #define RTRS_PORT 1234 /** - * enum rnbd_msg_types - RNBD message types + * enum rnbd_msg_type - RNBD message types * @RNBD_MSG_SESS_INFO: initial session info from client to server * @RNBD_MSG_SESS_INFO_RSP: initial session info from server to client * @RNBD_MSG_OPEN: open (map) device request @@ -47,10 +47,11 @@ enum rnbd_msg_type { */ struct rnbd_msg_hdr { __le16 type; + /* private: */ __le16 __padding; }; -/** +/* * We allow to map RO many times and RW only once. We allow to map yet another * time RW, if MIGRATION is provided (second RW export can be required for * example for VM migration) @@ -78,6 +79,7 @@ static const __maybe_unused struct { struct rnbd_msg_sess_info { struct rnbd_msg_hdr hdr; u8 ver; + /* private: */ u8 reserved[31]; }; @@ -89,6 +91,7 @@ struct rnbd_msg_sess_info { struct rnbd_msg_sess_info_rsp { struct rnbd_msg_hdr hdr; u8 ver; + /* private: */ u8 reserved[31]; }; @@ -97,13 +100,16 @@ struct rnbd_msg_sess_info_rsp { * @hdr: message header * @access_mode: the mode to open remote device, valid values see: * enum rnbd_access_mode - * @device_name: device path on remote side + * @dev_name: device path on remote side */ struct rnbd_msg_open { struct rnbd_msg_hdr hdr; u8 access_mode; + /* private: */ u8 resv1; + /* public: */ s8 dev_name[NAME_MAX]; + /* private: */ u8 reserved[3]; }; @@ -155,6 +161,7 @@ struct rnbd_msg_open_rsp { __le16 secure_discard; u8 obsolete_rotational; u8 cache_policy; + /* private: */ u8 reserved[10]; }; @@ -187,7 +194,7 @@ struct rnbd_msg_io { * @RNBD_OP_DISCARD: discard sectors * @RNBD_OP_SECURE_ERASE: securely erase sectors * @RNBD_OP_WRITE_ZEROES: write zeroes sectors - + * * @RNBD_F_SYNC: request is sync (sync write or read) * @RNBD_F_FUA: forced unit access */