From 1fd143c24fb621f063f913cb1e48cc688c7eca15 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 23:08:31 -0400 Subject: [PATCH 001/164] scsi: switch scsi_bios_ptable() and scsi_partsize() to gendisk Both helpers are reading the partition table of the disk specified by block_device of some partition on it; result depends only upon the disk in question, so we might as well pass the struct gendisk instead. Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/scsi/scsi_mid_low_api.rst | 4 ++-- drivers/scsi/BusLogic.c | 2 +- drivers/scsi/aacraid/linit.c | 2 +- drivers/scsi/aic7xxx/aic79xx_osm.c | 2 +- drivers/scsi/aic7xxx/aic7xxx_osm.c | 2 +- drivers/scsi/arcmsr/arcmsr_hba.c | 2 +- drivers/scsi/fdomain.c | 2 +- drivers/scsi/megaraid.c | 2 +- drivers/scsi/scsicam.c | 12 ++++++------ include/scsi/scsicam.h | 5 +++-- 10 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst index 3ac4c7fafb55..96c8230d638e 100644 --- a/Documentation/scsi/scsi_mid_low_api.rst +++ b/Documentation/scsi/scsi_mid_low_api.rst @@ -380,7 +380,7 @@ Details:: /** * scsi_bios_ptable - return copy of block device's partition table - * @dev: pointer to block device + * @dev: pointer to gendisk * * Returns pointer to partition table, or NULL for failure * @@ -390,7 +390,7 @@ Details:: * * Defined in: drivers/scsi/scsicam.c **/ - unsigned char *scsi_bios_ptable(struct block_device *dev) + unsigned char *scsi_bios_ptable(struct gendisk *dev) /** diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 1f100270cd38..743be2ef6d1a 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, diskparam->sectors = 32; } diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors); - buf = scsi_bios_ptable(dev); + buf = scsi_bios_ptable(dev->bd_disk); if (buf == NULL) return 0; /* diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 4b12e6dd8f07..2264a97d91a0 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, * entry whose end_head matches one of the standard geometry * translations ( 64/32, 128/32, 255/63 ). 
*/ - buf = scsi_bios_ptable(bdev); + buf = scsi_bios_ptable(bdev->bd_disk); if (!buf) return 0; if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 17dfc3c72110..2cff19e95fec 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahd = *((struct ahd_softc **)sdev->host->hostdata); - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index cebf8c5d0caf..05bdb73d1157 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahc = *((struct ahc_softc **)sdev->host->hostdata); channel = sdev_channel(sdev); - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index fb57343a97bd..6968da9fb67c 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -381,7 +381,7 @@ static int arcmsr_bios_param(struct scsi_device *sdev, { int heads, sectors, cylinders, total_capacity; - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; total_capacity = capacity; diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 504c4e0c5d17..4a3716dc644c 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -472,7 +472,7 @@ static int fdomain_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) { - unsigned char *p = scsi_bios_ptable(bdev); + unsigned char *p = scsi_bios_ptable(bdev->bd_disk); if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */ && p[4]) { /* Partition type */ diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 2006094af418..c7581c7829af 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, geom[2] = cylinders; } else { - if (scsi_partsize(bdev, capacity, geom)) + if (scsi_partsize(bdev->bd_disk, capacity, geom)) return 0; dev_info(&adapter->dev->dev, diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 19e6c3852d50..3ff2cf51d5b0 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -30,9 +30,9 @@ * starting at offset %0x1be. * Returns: partition table in kmalloc(GFP_KERNEL) memory, or NULL on error. */ -unsigned char *scsi_bios_ptable(struct block_device *dev) +unsigned char *scsi_bios_ptable(struct gendisk *dev) { - struct address_space *mapping = bdev_whole(dev)->bd_mapping; + struct address_space *mapping = dev->part0->bd_mapping; unsigned char *res = NULL; struct folio *folio; @@ -48,7 +48,7 @@ EXPORT_SYMBOL(scsi_bios_ptable); /** * scsi_partsize - Parse cylinders/heads/sectors from PC partition table - * @bdev: block device to parse + * @disk: gendisk of the disk to parse * @capacity: size of the disk in sectors * @geom: output in form of [hds, cylinders, sectors] * @@ -57,7 +57,7 @@ EXPORT_SYMBOL(scsi_bios_ptable); * * Returns: %false on failure, %true on success. 
*/ -bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) +bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]) { int cyl, ext_cyl, end_head, end_cyl, end_sector; unsigned int logical_end, physical_end, ext_physical_end; @@ -65,7 +65,7 @@ bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]) void *buf; int ret = false; - buf = scsi_bios_ptable(bdev); + buf = scsi_bios_ptable(disk); if (!buf) return false; @@ -221,7 +221,7 @@ int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) int ret = 0; /* try to infer mapping from partition table */ - if (scsi_partsize(bdev, capacity, ip)) + if (scsi_partsize(bdev->bd_disk, capacity, ip)) return 0; if (capacity64 < (1ULL << 32)) { diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h index 08edd603e521..67f4e8835bc8 100644 --- a/include/scsi/scsicam.h +++ b/include/scsi/scsicam.h @@ -13,7 +13,8 @@ #ifndef SCSICAM_H #define SCSICAM_H +struct gendisk; int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip); -bool scsi_partsize(struct block_device *bdev, sector_t capacity, int geom[3]); -unsigned char *scsi_bios_ptable(struct block_device *bdev); +bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]); +unsigned char *scsi_bios_ptable(struct gendisk *disk); #endif /* def SCSICAM_H */ From 3eb50369c09efb0f668a7f568a7e6f7cf4194cde Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 23:22:01 -0400 Subject: [PATCH 002/164] scsi: switch ->bios_param() to passing gendisk Instances are passed struct block_device *bdev argument; the only thing it is used for (if it's used in the first place) is bdev->bd_disk. Might as well pass that in the first place... Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/scsi/scsi_mid_low_api.rst | 4 ++-- drivers/ata/libata-scsi.c | 4 ++-- drivers/message/fusion/mptscsih.c | 2 +- drivers/message/fusion/mptscsih.h | 2 +- drivers/scsi/3w-9xxx.c | 2 +- drivers/scsi/3w-sas.c | 2 +- drivers/scsi/3w-xxxx.c | 2 +- drivers/scsi/BusLogic.c | 4 ++-- drivers/scsi/BusLogic.h | 2 +- drivers/scsi/aacraid/linit.c | 6 +++--- drivers/scsi/advansys.c | 2 +- drivers/scsi/aha152x.c | 4 ++-- drivers/scsi/aha1542.c | 2 +- drivers/scsi/aha1740.c | 2 +- drivers/scsi/aic7xxx/aic79xx_osm.c | 4 ++-- drivers/scsi/aic7xxx/aic7xxx_osm.c | 4 ++-- drivers/scsi/arcmsr/arcmsr_hba.c | 6 +++--- drivers/scsi/atp870u.c | 2 +- drivers/scsi/fdomain.c | 4 ++-- drivers/scsi/imm.c | 2 +- drivers/scsi/initio.c | 4 ++-- drivers/scsi/ipr.c | 8 ++++---- drivers/scsi/ips.c | 2 +- drivers/scsi/ips.h | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- drivers/scsi/megaraid.c | 4 ++-- drivers/scsi/megaraid.h | 2 +- drivers/scsi/megaraid/megaraid_sas_base.c | 4 ++-- drivers/scsi/mpi3mr/mpi3mr_os.c | 4 ++-- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 4 ++-- drivers/scsi/mvumi.c | 2 +- drivers/scsi/myrb.c | 2 +- drivers/scsi/pcmcia/sym53c500_cs.c | 2 +- drivers/scsi/ppa.c | 2 +- drivers/scsi/qla1280.c | 2 +- drivers/scsi/qlogicfas408.c | 2 +- drivers/scsi/qlogicfas408.h | 2 +- drivers/scsi/scsicam.c | 6 +++--- drivers/scsi/sd.c | 4 ++-- drivers/scsi/stex.c | 2 +- drivers/scsi/storvsc_drv.c | 2 +- drivers/scsi/wd719x.c | 2 +- include/linux/libata.h | 2 +- include/scsi/libsas.h | 2 +- include/scsi/scsi_host.h | 2 +- include/scsi/scsicam.h | 2 +- 46 files changed, 68 insertions(+), 68 deletions(-) diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst index 96c8230d638e..634f5c28a849 100644 --- a/Documentation/scsi/scsi_mid_low_api.rst +++ b/Documentation/scsi/scsi_mid_low_api.rst @@ -623,7 +623,7 @@ Details:: * bios_param - fetch head, sector, cylinder info for a disk * @sdev: pointer to scsi device context (defined in * include/scsi/scsi_device.h) - * @bdev: pointer to block device context (defined in fs.h) + * @disk: pointer to gendisk (defined in blkdev.h) * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -643,7 +643,7 @@ Details:: * * Optionally defined in: LLD **/ - int bios_param(struct scsi_device * sdev, struct block_device *bdev, + int bios_param(struct scsi_device * sdev, struct gendisk *disk, sector_t capacity, int params[3]) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57f674f51b0c..3603b430724b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -351,7 +351,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); /** * ata_std_bios_param - generic bios head/sector/cylinder calculator used by sd. * @sdev: SCSI device for which BIOS geometry is to be determined - * @bdev: block device associated with @sdev + * @unused: gendisk associated with @sdev * @capacity: capacity of SCSI device * @geom: location to which geometry will be output * @@ -366,7 +366,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); * RETURNS: * Zero. 
*/ -int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, +int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { geom[0] = 255; diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c index 3a64dc7a7e27..3304f8824cf7 100644 --- a/drivers/message/fusion/mptscsih.c +++ b/drivers/message/fusion/mptscsih.c @@ -2074,7 +2074,7 @@ mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, * This is anyones guess quite frankly. */ int -mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, +mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/message/fusion/mptscsih.h b/drivers/message/fusion/mptscsih.h index 8c2bb2331fc1..f9678d48100c 100644 --- a/drivers/message/fusion/mptscsih.h +++ b/drivers/message/fusion/mptscsih.h @@ -123,7 +123,7 @@ extern int mptscsih_abort(struct scsi_cmnd * SCpnt); extern int mptscsih_dev_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_bus_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_host_reset(struct scsi_cmnd *SCpnt); -extern int mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, sector_t capacity, int geom[]); +extern int mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]); extern int mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_scandv_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 883d4a12a172..a377a6f6900a 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -1695,7 +1695,7 @@ out: } /* End twa_reset_sequence() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twa_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twa_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 8d4174c7107e..e319be7d369c 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -1404,7 +1404,7 @@ out: } /* End twl_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twl_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twl_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 89bd56f78ef9..0306a228c702 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -1340,7 +1340,7 @@ static int tw_reset_device_extension(TW_Device_Extension *tw_dev) } /* End tw_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int tw_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int tw_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 743be2ef6d1a..8b17d8103e90 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3240,7 +3240,7 @@ static int 
blogic_resetadapter(struct blogic_adapter *adapter, bool hard_reset) the BIOS, and a warning may be displayed. */ -static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, +static int blogic_diskparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *params) { struct blogic_adapter *adapter = @@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, diskparam->sectors = 32; } diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors); - buf = scsi_bios_ptable(dev->bd_disk); + buf = scsi_bios_ptable(disk); if (buf == NULL) return 0; /* diff --git a/drivers/scsi/BusLogic.h b/drivers/scsi/BusLogic.h index 61bf26d4fc10..79de815e33b0 100644 --- a/drivers/scsi/BusLogic.h +++ b/drivers/scsi/BusLogic.h @@ -1273,7 +1273,7 @@ static inline void blogic_incszbucket(unsigned int *cmdsz_buckets, static const char *blogic_drvr_info(struct Scsi_Host *); static int blogic_qcmd(struct Scsi_Host *h, struct scsi_cmnd *); -static int blogic_diskparam(struct scsi_device *, struct block_device *, sector_t, int *); +static int blogic_diskparam(struct scsi_device *, struct gendisk *, sector_t, int *); static int blogic_sdev_configure(struct scsi_device *, struct queue_limits *lim); static void blogic_qcompleted_ccb(struct blogic_ccb *); diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 2264a97d91a0..ea66196ef7c7 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -273,7 +273,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) /** * aac_biosparm - return BIOS parameters for disk * @sdev: The scsi device corresponding to the disk - * @bdev: the block device corresponding to the disk + * @disk: the gendisk corresponding to the disk * @capacity: the sector capacity of the disk * @geom: geometry block to fill in * @@ -292,7 +292,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) * be displayed. */ -static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, +static int aac_biosparm(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *geom) { struct diskparm *param = (struct diskparm *)geom; @@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, * entry whose end_head matches one of the standard geometry * translations ( 64/32, 128/32, 255/63 ). 
*/ - buf = scsi_bios_ptable(bdev->bd_disk); + buf = scsi_bios_ptable(disk); if (!buf) return 0; if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c index 3a2c336307c0..063e1b5818d3 100644 --- a/drivers/scsi/advansys.c +++ b/drivers/scsi/advansys.c @@ -7096,7 +7096,7 @@ static int advansys_reset(struct scsi_cmnd *scp) * ip[2]: cylinders */ static int -advansys_biosparam(struct scsi_device *sdev, struct block_device *bdev, +advansys_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { struct asc_board *boardp = shost_priv(sdev->host); diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index e94c0a19c435..182aa80ec4c6 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -1246,7 +1246,7 @@ int aha152x_host_reset_host(struct Scsi_Host *shpnt) * Return the "logical geometry" * */ -static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int aha152x_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *info_array) { struct Scsi_Host *shpnt = sdev->host; @@ -1261,7 +1261,7 @@ static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev int info[3]; /* try to figure out the geometry from the partition table */ - if (scsicam_bios_param(bdev, capacity, info) < 0 || + if (scsicam_bios_param(disk, capacity, info) < 0 || !((info[0] == 64 && info[1] == 32) || (info[0] == 255 && info[1] == 63))) { if (EXT_TRANS) { printk(KERN_NOTICE diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c index 389499d3e00a..371e8300f029 100644 --- a/drivers/scsi/aha1542.c +++ b/drivers/scsi/aha1542.c @@ -992,7 +992,7 @@ static int aha1542_host_reset(struct scsi_cmnd *cmd) } static int aha1542_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { struct aha1542_hostdata *aha1542 = shost_priv(sdev->host); diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c index be7ebbbb9ba8..b234621f6b37 100644 --- a/drivers/scsi/aha1740.c +++ b/drivers/scsi/aha1740.c @@ -510,7 +510,7 @@ static void aha1740_getconfig(unsigned int base, unsigned int *irq_level, } static int aha1740_biosparam(struct scsi_device *sdev, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int* ip) { int size = capacity; diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 2cff19e95fec..c3d1b9dd24ae 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -720,7 +720,7 @@ ahd_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. 
*/ static int -ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahd_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahd = *((struct ahd_softc **)sdev->host->hostdata); - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index 05bdb73d1157..8b2b98666d61 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -683,7 +683,7 @@ ahc_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. */ static int -ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahc_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahc = *((struct ahc_softc **)sdev->host->hostdata); channel = sdev_channel(sdev); - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 6968da9fb67c..f0c5a30ce51b 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -112,7 +112,7 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb); static int arcmsr_abort(struct scsi_cmnd *); static int arcmsr_bus_reset(struct scsi_cmnd *); static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *info); + struct gendisk *disk, sector_t capacity, int *info); static int arcmsr_queue_command(struct Scsi_Host *h, struct scsi_cmnd *cmd); static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id); @@ -377,11 +377,11 @@ static irqreturn_t arcmsr_do_interrupt(int irq, void *dev_id) } static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *geom) + struct gendisk *disk, sector_t capacity, int *geom) { int heads, sectors, cylinders, total_capacity; - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; total_capacity = capacity; diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c index 401242912855..df6f40b51deb 100644 --- a/drivers/scsi/atp870u.c +++ b/drivers/scsi/atp870u.c @@ -1692,7 +1692,7 @@ static int atp870u_show_info(struct seq_file *m, struct Scsi_Host *HBAptr) } -static int atp870u_biosparam(struct scsi_device *disk, struct block_device *dev, +static int atp870u_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int *ip) { int heads, sectors, cylinders; diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 4a3716dc644c..c0b2a980db34 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -469,10 +469,10 @@ static int fdomain_host_reset(struct scsi_cmnd *cmd) } static int fdomain_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, + struct gendisk *disk, sector_t capacity, int geom[]) { - unsigned char *p = scsi_bios_ptable(bdev->bd_disk); + unsigned char *p = scsi_bios_ptable(disk); if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */ && p[4]) { /* Partition type */ diff --git 
a/drivers/scsi/imm.c b/drivers/scsi/imm.c index 0821cf994b98..5c602c057798 100644 --- a/drivers/scsi/imm.c +++ b/drivers/scsi/imm.c @@ -954,7 +954,7 @@ static DEF_SCSI_QCMD(imm_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int imm_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int imm_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index 8648bd965287..ed34ad92c807 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -2645,7 +2645,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) /** * i91u_biosparam - return the "logical geometry * @sdev: SCSI device - * @dev: Matching block device + * @unused: Matching gendisk * @capacity: Sector size of drive * @info_array: Return space for BIOS geometry * @@ -2655,7 +2655,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) * FIXME: limited to 2^32 sector devices. */ -static int i91u_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int i91u_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info_array) { struct initio_host *host; /* Point to Host adapter control block */ diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index d06b79f03538..dd6754db7e4c 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4644,10 +4644,10 @@ ATTRIBUTE_GROUPS(ipr_dev); /** * ipr_biosparam - Return the HSC mapping - * @sdev: scsi device struct - * @block_device: block device pointer + * @sdev: scsi device struct + * @unused: gendisk pointer * @capacity: capacity of the device - * @parm: Array containing returned HSC values. + * @parm: Array containing returned HSC values. * * This function generates the HSC parms that fdisk uses. 
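A minimal sketch, for a hypothetical driver, of what an LLD ->bios_param() instance looks like once it is handed a gendisk; the scsi_partsize() call and the geom[] layout (heads, sectors, cylinders) follow the in-tree pattern, while the 64/32 fallback values are only an example::

    static int foo_bios_param(struct scsi_device *sdev, struct gendisk *disk,
                              sector_t capacity, int geom[])
    {
            /* prefer whatever geometry the partition table implies */
            if (scsi_partsize(disk, capacity, geom))
                    return 0;

            /* otherwise fall back to a fixed 64 heads / 32 sectors mapping */
            geom[0] = 64;
            geom[1] = 32;
            geom[2] = (unsigned long)capacity >> 11;        /* cylinders */
            return 0;
    }

The hook is still wired up through the .bios_param member of struct scsi_host_template; only the type of the second argument changes.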
* We want to make sure we return something that places partitions @@ -4657,7 +4657,7 @@ ATTRIBUTE_GROUPS(ipr_dev); * 0 on success **/ static int ipr_biosparam(struct scsi_device *sdev, - struct block_device *block_device, + struct gendisk *unused, sector_t capacity, int *parm) { int heads, sectors; diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 94adb6ac02a4..3393a288fd23 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -1123,7 +1123,7 @@ static DEF_SCSI_QCMD(ips_queue) /* Set bios geometry for the controller */ /* */ /****************************************************************************/ -static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { ips_ha_t *ha = (ips_ha_t *) sdev->host->hostdata; diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 8ac932ec4444..30a4d4a580e9 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -398,7 +398,7 @@ /* * Scsi_Host Template */ - static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, + static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]); static int ips_sdev_configure(struct scsi_device *SDptr, struct queue_limits *lim); diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 928723c90b75..ffa5b49aaf08 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -845,7 +845,7 @@ int sas_change_queue_depth(struct scsi_device *sdev, int depth) EXPORT_SYMBOL_GPL(sas_change_queue_depth); int sas_bios_param(struct scsi_device *scsi_dev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int *hsc) { hsc[0] = 255; diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index c7581c7829af..a00622c0c526 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2780,7 +2780,7 @@ static inline void mega_create_proc_entry(int index, struct proc_dir_entry *pare * Return the disk geometry for a particular disk */ static int -megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, +megaraid_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { adapter_t *adapter; @@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, geom[2] = cylinders; } else { - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; dev_info(&adapter->dev->dev, diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h index 013fbfb911b9..d6bfd26a8843 100644 --- a/drivers/scsi/megaraid.h +++ b/drivers/scsi/megaraid.h @@ -975,7 +975,7 @@ static void mega_free_scb(adapter_t *, scb_t *); static int megaraid_abort(struct scsi_cmnd *); static int megaraid_reset(struct scsi_cmnd *); static int megaraid_abort_and_reset(adapter_t *, struct scsi_cmnd *, int); -static int megaraid_biosparam(struct scsi_device *, struct block_device *, +static int megaraid_biosparam(struct scsi_device *, struct gendisk *, sector_t, int []); static int mega_build_sglist (adapter_t *adapter, scb_t *scb, diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 615e06fd4ee8..abbbc4b36cd1 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3137,12 +3137,12 @@ static int megasas_reset_target(struct scsi_cmnd *scmd) /** * 
megasas_bios_param - Returns disk geometry for a disk * @sdev: device handle - * @bdev: block device + * @unused: gendisk * @capacity: drive capacity * @geom: geometry parameters */ static int -megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, +megasas_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index e467b56949e9..3df52a3b435b 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -4031,7 +4031,7 @@ out: /** * mpi3mr_bios_param - BIOS param callback * @sdev: SCSI device reference - * @bdev: Block device reference + * @unused: gendisk reference * @capacity: Capacity in logical sectors * @params: Parameter array * @@ -4040,7 +4040,7 @@ out: * Return: 0 always */ static int mpi3mr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int params[]) + struct gendisk *unused, sector_t capacity, int params[]) { int heads; int sectors; diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 967af259118e..7092d0debef3 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2754,7 +2754,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) /** * scsih_bios_param - fetch head, sector, cylinder info for a disk * @sdev: scsi device struct - * @bdev: pointer to block device context + * @unused: pointer to gendisk * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -2762,7 +2762,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * params[2] number of cylinders */ static int -scsih_bios_param(struct scsi_device *sdev, struct block_device *bdev, +scsih_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int params[]) { int heads; diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c index 96549e7f5705..bdc2f2f17753 100644 --- a/drivers/scsi/mvumi.c +++ b/drivers/scsi/mvumi.c @@ -2142,7 +2142,7 @@ static enum scsi_timeout_action mvumi_timed_out(struct scsi_cmnd *scmd) } static int -mvumi_bios_param(struct scsi_device *sdev, struct block_device *bdev, +mvumi_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index 486db5b2f05d..b8453c0333dc 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -1745,7 +1745,7 @@ static void myrb_sdev_destroy(struct scsi_device *sdev) kfree(sdev->hostdata); } -static int myrb_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int myrb_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { struct myrb_hba *cb = shost_priv(sdev->host); diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c index 278c78d066c4..a3b505240351 100644 --- a/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/drivers/scsi/pcmcia/sym53c500_cs.c @@ -597,7 +597,7 @@ SYM53C500_host_reset(struct scsi_cmnd *SCpnt) static int SYM53C500_biosparm(struct scsi_device *disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int *info_array) { int size; diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c index 1ed3171f1797..ea682f3044b6 100644 --- a/drivers/scsi/ppa.c +++ b/drivers/scsi/ppa.c @@ -845,7 +845,7 @@ static 
DEF_SCSI_QCMD(ppa_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int ppa_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int ppa_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 6af018f1ca22..ef841f643171 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -1023,7 +1023,7 @@ qla1280_eh_adapter_reset(struct scsi_cmnd *cmd) } static int -qla1280_biosparam(struct scsi_device *sdev, struct block_device *bdev, +qla1280_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/qlogicfas408.c b/drivers/scsi/qlogicfas408.c index 3e065d5fc80c..1ce469b7db99 100644 --- a/drivers/scsi/qlogicfas408.c +++ b/drivers/scsi/qlogicfas408.c @@ -492,7 +492,7 @@ DEF_SCSI_QCMD(qlogicfas408_queuecommand) * Return bios parameters */ -int qlogicfas408_biosparam(struct scsi_device *disk, struct block_device *dev, +int qlogicfas408_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int ip[]) { /* This should mimic the DOS Qlogic driver's behavior exactly */ diff --git a/drivers/scsi/qlogicfas408.h b/drivers/scsi/qlogicfas408.h index a971db11d293..83ef86c71f2f 100644 --- a/drivers/scsi/qlogicfas408.h +++ b/drivers/scsi/qlogicfas408.h @@ -106,7 +106,7 @@ struct qlogicfas408_priv { irqreturn_t qlogicfas408_ihandl(int irq, void *dev_id); int qlogicfas408_queuecommand(struct Scsi_Host *h, struct scsi_cmnd * cmd); int qlogicfas408_biosparam(struct scsi_device * disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int ip[]); int qlogicfas408_abort(struct scsi_cmnd * cmd); extern int qlogicfas408_host_reset(struct scsi_cmnd *cmd); diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 3ff2cf51d5b0..887de505bcf9 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -205,7 +205,7 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds /** * scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors. - * @bdev: which device + * @disk: which device * @capacity: size of the disk in sectors * @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders * @@ -215,13 +215,13 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds * * Returns : -1 on failure, 0 on success. 
*/ -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip) { u64 capacity64 = capacity; /* Suppress gcc warning */ int ret = 0; /* try to infer mapping from partition table */ - if (scsi_partsize(bdev->bd_disk, capacity, ip)) + if (scsi_partsize(disk, capacity, ip)) return 0; if (capacity64 < (1ULL << 32)) { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5b8668accf8e..3edcc43f180b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) - host->hostt->bios_param(sdp, bdev, capacity, diskinfo); + host->hostt->bios_param(sdp, bdev->bd_disk, capacity, diskinfo); else - scsicam_bios_param(bdev, capacity, diskinfo); + scsicam_bios_param(bdev->bd_disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index 63ed7f9aaa93..d8ad02c29320 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1457,7 +1457,7 @@ static void stex_reset_work(struct work_struct *work) } static int stex_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { int heads = 255, sectors = 63; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index d9e59204a9c3..dc51ea352198 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1615,7 +1615,7 @@ static int storvsc_sdev_configure(struct scsi_device *sdevice, return 0; } -static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * bdev, +static int storvsc_get_chs(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info) { sector_t nsect = capacity; diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c index 5a380eecfc75..0c9987828774 100644 --- a/drivers/scsi/wd719x.c +++ b/drivers/scsi/wd719x.c @@ -544,7 +544,7 @@ static int wd719x_host_reset(struct scsi_cmnd *cmd) return wd719x_chip_init(wd) == 0 ? 
SUCCESS : FAILED; } -static int wd719x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int wd719x_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { if (capacity >= 0x200000) { diff --git a/include/linux/libata.h b/include/linux/libata.h index 0620dd67369f..21de0935775d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1203,7 +1203,7 @@ extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); extern int ata_scsi_sdev_init(struct scsi_device *sdev); diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index ba460b6c0374..9c6e90829dbd 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -685,7 +685,7 @@ extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *); extern int sas_target_alloc(struct scsi_target *); int sas_sdev_configure(struct scsi_device *dev, struct queue_limits *lim); extern int sas_change_queue_depth(struct scsi_device *, int new_depth); -extern int sas_bios_param(struct scsi_device *, struct block_device *, +extern int sas_bios_param(struct scsi_device *, struct gendisk *, sector_t capacity, int *hsc); int sas_execute_internal_abort_single(struct domain_device *device, u16 tag, unsigned int qid, diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index c53812b9026f..f5a243261236 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -318,7 +318,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - int (* bios_param)(struct scsi_device *, struct block_device *, + int (* bios_param)(struct scsi_device *, struct gendisk *, sector_t, int []); /* diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h index 67f4e8835bc8..1131f51ed2c8 100644 --- a/include/scsi/scsicam.h +++ b/include/scsi/scsicam.h @@ -14,7 +14,7 @@ #ifndef SCSICAM_H #define SCSICAM_H struct gendisk; -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip); +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip); bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]); unsigned char *scsi_bios_ptable(struct gendisk *disk); #endif /* def SCSICAM_H */ From 4fc8728aa34f54835b72e4db0f3db76a72948b65 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 22:19:55 -0400 Subject: [PATCH 003/164] block: switch ->getgeo() to struct gendisk Instances are happier that way and it makes more sense anyway - the only part of the result that is related to partition we are given is the start sector, and that has been filled in by the caller. Everything else is a function of the disk. Only one instance (DASD) is ever looking at anything other than bdev->bd_disk and that one is trivial to adjust. Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/filesystems/locking.rst | 2 +- arch/m68k/emu/nfblock.c | 4 ++-- arch/um/drivers/ubd_kern.c | 6 +++--- block/ioctl.c | 4 ++-- block/partitions/ibm.c | 2 +- drivers/block/amiflop.c | 10 +++++----- drivers/block/aoe/aoeblk.c | 4 ++-- drivers/block/floppy.c | 4 ++-- drivers/block/mtip32xx/mtip32xx.c | 6 +++--- drivers/block/rnbd/rnbd-clt.c | 4 ++-- drivers/block/sunvdc.c | 3 +-- drivers/block/swim.c | 4 ++-- drivers/block/virtio_blk.c | 6 +++--- drivers/block/xen-blkfront.c | 4 ++-- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 4 ++-- drivers/memstick/core/ms_block.c | 4 ++-- drivers/memstick/core/mspro_block.c | 4 ++-- drivers/mmc/core/block.c | 4 ++-- drivers/mtd/mtd_blkdevs.c | 4 ++-- drivers/mtd/ubi/block.c | 4 ++-- drivers/nvdimm/btt.c | 4 ++-- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/nvme.h | 2 +- drivers/s390/block/dasd.c | 7 ++++--- drivers/scsi/sd.c | 8 ++++---- include/linux/blkdev.h | 2 +- 27 files changed, 59 insertions(+), 59 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index aa287ccdac2f..77704fde9845 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -443,7 +443,7 @@ prototypes:: int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 83410f8184ec..94a4fadc651a 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -77,9 +77,9 @@ static void nfhd_submit_bio(struct bio *bio) bio_endio(bio); } -static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int nfhd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct nfhd_device *dev = bdev->bd_disk->private_data; + struct nfhd_device *dev = disk->private_data; geo->cylinders = dev->blocks >> (6 - dev->bshift); geo->heads = 4; diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 4de6613e7468..f2b2feeeb455 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,7 +108,7 @@ static DEFINE_MUTEX(ubd_lock); static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo); #define MAX_DEV (16) @@ -1324,9 +1324,9 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, return res; } -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct ubd *ubd_dev = bdev->bd_disk->private_data; + struct ubd *ubd_dev = disk->private_data; geo->heads = 128; geo->sectors = 32; diff --git a/block/ioctl.c b/block/ioctl.c index f7b0006ca45d..a14304f737c5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -481,7 +481,7 @@ static int blkdev_getgeo(struct block_device *bdev, */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) @@ -515,7 +515,7 @@ static int 
compat_hdio_getgeo(struct block_device *bdev, * want to override it. */ geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 82d9c4c3fb41..631291fbb356 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -358,7 +358,7 @@ int ibm_partition(struct parsed_partitions *state) goto out_nolab; /* set start if not filled by getgeo function e.g. virtblk */ geo->start = get_start_sect(bdev); - if (disk->fops->getgeo(bdev, geo)) + if (disk->fops->getgeo(disk, geo)) goto out_freeall; if (!fn || fn(disk, info)) { kfree(info); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 6357d86eafdc..2932b6653b6f 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1523,13 +1523,13 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = MINOR(bdev->bd_dev) & 3; + struct amiga_floppy_struct *p = disk->private_data; - geo->heads = unit[drive].type->heads; - geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult; - geo->cylinders = unit[drive].type->tracks; + geo->heads = p->type->heads; + geo->sectors = p->dtype->sects * p->type->sect_mult; + geo->cylinders = p->type->tracks; return 0; } diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 00b74a845328..34ead75e7e02 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -269,9 +269,9 @@ static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx, } static int -aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +aoeblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct aoedev *d = bdev->bd_disk->private_data; + struct aoedev *d = disk->private_data; if ((d->flags & DEVFL_UP) == 0) { printk(KERN_ERR "aoe: disk not up\n"); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 24be0c2c4075..eb125a36ec24 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3363,9 +3363,9 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) return 0; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = (long)bdev->bd_disk->private_data; + int drive = (long)disk->private_data; int type = ITYPE(drive_state[drive].fd_device); struct floppy_struct *g; int ret; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 8fc7761397bd..567192e371a8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3148,17 +3148,17 @@ static int mtip_block_compat_ioctl(struct block_device *dev, * that each partition is also 4KB aligned. Non-aligned partitions adversely * affects performance. * - * @dev Pointer to the block_device strucutre. + * @disk Pointer to the gendisk strucutre. * @geo Pointer to a hd_geometry structure. * * return value * 0 Operation completed successfully. * -ENOTTY An error occurred while reading the drive capacity. 
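A minimal sketch of a ->getgeo() instance under the new prototype; foo_getgeo() and its private data are hypothetical, while the 64/32 defaults and the capacity >> 11 cylinder count mirror the virtio-blk, NVMe and BTT instances converted below::

    static int foo_getgeo(struct gendisk *disk, struct hd_geometry *geo)
    {
            /* geo->start has already been filled in by the ioctl caller */
            geo->heads     = 1 << 6;
            geo->sectors   = 1 << 5;
            geo->cylinders = get_capacity(disk) >> 11;
            return 0;
    }

It is registered exactly as before, via the .getgeo member of struct block_device_operations.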
*/ -static int mtip_block_getgeo(struct block_device *dev, +static int mtip_block_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct driver_data *dd = dev->bd_disk->private_data; + struct driver_data *dd = disk->private_data; sector_t capacity; if (!dd) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 15627417f12e..119b7e27023f 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -942,11 +942,11 @@ static void rnbd_client_release(struct gendisk *gen) rnbd_clt_put_dev(dev); } -static int rnbd_client_getgeo(struct block_device *block_device, +static int rnbd_client_getgeo(struct gendisk *disk, struct hd_geometry *geo) { u64 size; - struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + struct rnbd_clt_dev *dev = disk->private_data; struct queue_limits *limit = &dev->queue->limits; size = dev->size * (limit->logical_block_size / SECTOR_SIZE); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7af21fe67671..107751721178 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -119,9 +119,8 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr) return vio_dring_avail(dr, VDC_TX_RING_SIZE); } -static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int vdc_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct gendisk *disk = bdev->bd_disk; sector_t nsect = get_capacity(disk); sector_t cylinders = nsect; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index eda33c5eb5e2..416015947ae6 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -711,9 +711,9 @@ static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } -static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int floppy_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct floppy_state *fs = bdev->bd_disk->private_data; + struct floppy_state *fs = disk->private_data; struct floppy_struct *g; int ret; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e649fa67bac1..85efa7e535f8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -829,9 +829,9 @@ out: } /* We provide getgeo only to please some old bootloader/partitioning tools */ -static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int virtblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct virtio_blk *vblk = bd->bd_disk->private_data; + struct virtio_blk *vblk = disk->private_data; int ret = 0; mutex_lock(&vblk->vdev_mutex); @@ -853,7 +853,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) /* some standard values, similar to sd */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; } out: mutex_unlock(&vblk->vdev_mutex); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5babe575c288..04fc6b552c04 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -493,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg) schedule_work(&rinfo->work); } -static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg) { /* We don't have real geometry info, but let's at least return values consistent with the size of the device */ - sector_t nsect = get_capacity(bd->bd_disk); + sector_t nsect = 
get_capacity(disk); sector_t cylinders = nsect; hg->heads = 0xff; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a44e8c2dccee..7bd6fa05b00a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w) dm_deferred_remove(); } -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mapped_device *md = bdev->bd_disk->private_data; + struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } diff --git a/drivers/md/md.c b/drivers/md/md.c index ac85ec73a409..236bb04f3e54 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7678,9 +7678,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index d34892782f6e..1af157ce0a63 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1953,10 +1953,10 @@ static void msb_data_clear(struct msb_data *msb) msb->card = NULL; } -static int msb_bd_getgeo(struct block_device *bdev, +static int msb_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct msb_data *msb = bdev->bd_disk->private_data; + struct msb_data *msb = disk->private_data; *geo = msb->geometry; return 0; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c9853d887d28..075519caa547 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -189,10 +189,10 @@ static void mspro_block_bd_free_disk(struct gendisk *disk) kfree(msb); } -static int mspro_block_bd_getgeo(struct block_device *bdev, +static int mspro_block_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mspro_block_data *msb = bdev->bd_disk->private_data; + struct mspro_block_data *msb = disk->private_data; geo->heads = msb->heads; geo->sectors = msb->sectors_per_track; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 9cc47bf94804..d1f295af9713 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -435,9 +435,9 @@ static void mmc_blk_release(struct gendisk *disk) } static int -mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +mmc_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16); + geo->cylinders = get_capacity(disk) / (4 * 16); geo->heads = 4; geo->sectors = 16; return 0; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 847c11542f02..28e09d080440 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -246,9 +246,9 @@ unlock: blktrans_dev_put(dev); } -static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int blktrans_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data; + struct mtd_blktrans_dev *dev = disk->private_data; int ret = -ENXIO; mutex_lock(&dev->lock); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 39cc0a6a4d37..b53fd147fa65 100644 --- 
a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -282,12 +282,12 @@ static void ubiblock_release(struct gendisk *gd) mutex_unlock(&dev->dev_mutex); } -static int ubiblock_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubiblock_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* Some tools might require this information */ geo->heads = 1; geo->cylinders = 1; - geo->sectors = get_capacity(bdev->bd_disk); + geo->sectors = get_capacity(disk); geo->start = 0; return 0; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 2a1aa32e6693..a933db961ed7 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1478,12 +1478,12 @@ static void btt_submit_bio(struct bio *bio) bio_endio(bio); } -static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int btt_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 812c1565114f..f96f74bff6ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1803,12 +1803,12 @@ static void nvme_release(struct gendisk *disk) nvme_ns_release(disk->private_data); } -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cfd2b5b90b91..102fae6a231c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -936,7 +936,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_id_ns **id); -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo); int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_attr_groups[]; diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 506a947d00a5..582ac7e61b24 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3317,11 +3317,11 @@ static void dasd_release(struct gendisk *disk) /* * Return disk geometry. 
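For context, ->getgeo() ultimately services the HDIO_GETGEO ioctl; a small user-space sketch of the consumer side (the device path is only an example)::

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/hdreg.h>

    int main(void)
    {
            struct hd_geometry geo;
            int fd = open("/dev/sda1", O_RDONLY);   /* example device */

            if (fd < 0 || ioctl(fd, HDIO_GETGEO, &geo) < 0)
                    return 1;
            /* geo.start is the partition offset filled in by the block
             * layer; heads/sectors/cylinders come from ->getgeo() */
            printf("%u heads, %u sectors, %u cylinders, start %lu\n",
                   geo.heads, geo.sectors, geo.cylinders, geo.start);
            close(fd);
            return 0;
    }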
*/ -static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dasd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { struct dasd_device *base; - base = dasd_device_from_gendisk(bdev->bd_disk); + base = dasd_device_from_gendisk(disk); if (!base) return -ENODEV; @@ -3331,7 +3331,8 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return -EINVAL; } base->discipline->fill_geometry(base->block, geo); - geo->start = get_start_sect(bdev) >> base->block->s2b_shift; + // geo->start is left unchanged by the above + geo->start >>= base->block->s2b_shift; dasd_put_device(base); return 0; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3edcc43f180b..00ad574ce61c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1599,9 +1599,9 @@ static void sd_release(struct gendisk *disk) scsi_device_put(sdev); } -static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); + struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); @@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) - host->hostt->bios_param(sdp, bdev->bd_disk, capacity, diskinfo); + host->hostt->bios_param(sdp, disk, capacity, diskinfo); else - scsicam_bios_param(bdev->bd_disk, capacity, diskinfo); + scsicam_bios_param(disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95886b404b16..fbc45121cd4f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1659,7 +1659,7 @@ struct block_device_operations { unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ From 7a6fc1634cea6f220228a69b1c0210e6b8b1aaf0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:45 -0700 Subject: [PATCH 004/164] blk-mq-dma: create blk_map_iter type The req_iterator happens to have a similar fields to what the dma iterator needs, but we're not necessarily iterating a request's bi_io_vec. Create a new type that can be amended for additional future use. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++-- include/linux/blk-mq-dma.h | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index ad283017caef..51e7a0ff045f 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -10,7 +10,7 @@ struct phys_vec { u32 len; }; -static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, +static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { unsigned int max_size; @@ -246,7 +246,7 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct req_iterator iter = { + struct blk_map_iter iter = { .bio = rq->bio, }; struct phys_vec vec; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index c26a01aeae00..6a7e3828673d 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -5,6 +5,11 @@ #include #include +struct blk_map_iter { + struct bvec_iter iter; + struct bio *bio; +}; + struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; @@ -14,7 +19,7 @@ struct blk_dma_iter { blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ - struct req_iterator iter; + struct blk_map_iter iter; struct pci_p2pdma_map_state p2pdma; }; From dae75dead2359edd7c55e1964e0edf7d03535b31 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:46 -0700 Subject: [PATCH 005/164] blk-mq-dma: provide the bio_vec array being iterated This will make it easier to add different sources of the bvec array, like for upcoming integrity support, rather than assume to use the bio's bi_io_vec. It also makes iterating "special" payloads more in common with iterating normal payloads. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-3-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 56 ++++++++++++++++++++++---------------- include/linux/blk-mq-dma.h | 1 + 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 51e7a0ff045f..8f41fe740b46 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -16,23 +16,14 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, unsigned int max_size; struct bio_vec bv; - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - if (!iter->iter.bi_size) return false; - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter); vec->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next @@ -43,19 +34,20 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct bio_vec next; if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) + if (!iter->bio || !iter->bio->bi_next) break; iter->bio = iter->bio->bi_next; iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; } - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len); } vec->len = bv.bv_len; @@ -125,6 +117,30 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, return true; } +static inline void blk_rq_map_iter_init(struct request *rq, + struct blk_map_iter *iter) +{ + struct bio *bio = rq->bio; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + *iter = (struct blk_map_iter) { + .bvecs = &rq->special_vec, + .iter = { + .bi_size = rq->special_vec.bv_len, + } + }; + } else if (bio) { + *iter = (struct blk_map_iter) { + .bio = bio, + .bvecs = bio->bi_io_vec, + .iter = bio->bi_iter, + }; + } else { + /* the internal flush request may not have bio attached */ + *iter = (struct blk_map_iter) {}; + } +} + /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map @@ -153,8 +169,7 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, unsigned int total_len = blk_rq_payload_bytes(req); struct phys_vec vec; - iter->iter.bio = req->bio; - iter->iter.iter = req->bio->bi_iter; + blk_rq_map_iter_init(req, &iter->iter); memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; @@ -246,16 +261,11 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct blk_map_iter iter = { - .bio = rq->bio, - }; + struct blk_map_iter iter; struct phys_vec vec; int nsegs = 0; - /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; - + blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = 
blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 6a7e3828673d..e5cb5e46fc92 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -8,6 +8,7 @@ struct blk_map_iter { struct bvec_iter iter; struct bio *bio; + struct bio_vec *bvecs; }; struct blk_dma_iter { From 92fb75fd14b041038e30bc725ab4c1e625243573 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:47 -0700 Subject: [PATCH 006/164] blk-mq-dma: require unmap caller provide p2p map type In preparing for integrity dma mappings, we can't rely on the request flag because data and metadata may have different mapping types. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-4-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 9 ++++++++- include/linux/blk-mq-dma.h | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2c6d9506b172..111b6bc6c93e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -261,6 +261,9 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + + /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ + IOD_P2P_BUS_ADDR = 1U << 3, }; struct nvme_dma_vec { @@ -725,7 +728,8 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, + iod->flags & IOD_P2P_BUS_ADDR)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req); else @@ -1000,6 +1004,9 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; + if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + iod->flags |= IOD_P2P_BUS_ADDR; + if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index e5cb5e46fc92..881880095e0d 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -47,14 +47,15 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap + * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) + struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { - if (req->cmd_flags & REQ_P2PDMA) + if (is_p2p) return true; if (dma_use_iova(state)) { From 7092639031a1bd5320ab827e8f665350f332b7ce Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:48 -0700 Subject: [PATCH 007/164] blk-mq: remove REQ_P2PDMA flag It's not serving any particular purpose. pci_p2pdma_state() already has all the appropriate checks, so the config and flag checks are not guarding anything. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-5-kbusch@meta.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-mq-dma.c | 30 ++++++++++++++---------------- include/linux/blk_types.h | 2 -- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/block/bio.c b/block/bio.c index 3b371a5da159..44c43b970387 100644 --- a/block/bio.c +++ b/block/bio.c @@ -981,7 +981,7 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) - bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 8f41fe740b46..58defab21882 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -180,22 +180,20 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; - if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { - switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, - phys_to_page(vec.paddr))) { - case PCI_P2PDMA_MAP_BUS_ADDR: - return blk_dma_map_bus(iter, &vec); - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * P2P transfers through the host bridge are treated the - * same as non-P2P transfers below and during unmap. - */ - req->cmd_flags &= ~REQ_P2PDMA; - break; - default: - iter->status = BLK_STS_INVAL; - return false; - } + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. + */ + case PCI_P2PDMA_MAP_NONE: + break; + default: + iter->status = BLK_STS_INVAL; + return false; } if (blk_can_dma_map_iova(req, dma_dev) && diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09b99d52fd36..930daff207df 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -386,7 +386,6 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ - __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -419,7 +418,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) From e2be2ba6d27d5c74f3bd458b64d1ad34a8a75bd8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:49 -0700 Subject: [PATCH 008/164] blk-mq-dma: move common dma start code to a helper In preparing for dma mapping integrity metadata, move the common dma setup to a helper. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-6-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 59 ++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 58defab21882..31dd8f58f081 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -141,35 +141,12 @@ static inline void blk_rq_map_iter_init(struct request *rq, } } -/** - * blk_rq_dma_map_iter_start - map the first DMA segment for a request - * @req: request to map - * @dma_dev: device to map to - * @state: DMA IOVA state - * @iter: block layer DMA iterator - * - * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the - * caller and don't need to be initialized. @state needs to be stored for use - * at unmap time, @iter is only needed at map time. - * - * Returns %false if there is no segment to map, including due to an error, or - * %true ft it did map a segment. - * - * If a segment was mapped, the DMA address for it is returned in @iter.addr and - * the length in @iter.len. If no segment was mapped the status code is - * returned in @iter.status. - * - * The caller can call blk_rq_dma_map_coalesce() to check if further segments - * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() - * to try to map the following segments. - */ -bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter) +static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter, + unsigned int total_len) { - unsigned int total_len = blk_rq_payload_bytes(req); struct phys_vec vec; - blk_rq_map_iter_init(req, &iter->iter); memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; @@ -201,6 +178,36 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); return blk_dma_map_direct(req, dma_dev, iter, &vec); } + +/** + * blk_rq_dma_map_iter_start - map the first DMA segment for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the + * caller and don't need to be initialized. @state needs to be stored for use + * at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true ft it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. 
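As an illustration of how a driver consumes this interface, here is a minimal sketch rather than code from the series itself; the wrapper name example_start_mapping is hypothetical, while blk_rq_dma_map_iter_start(), blk_rq_dma_map_coalesce() and the iter.addr/iter.len/iter.status fields are the ones documented above (the same pattern nvme-pci follows):

        #include <linux/blk-mq-dma.h>

        /* Minimal sketch: start a scatterlist-free DMA mapping for a request. */
        static blk_status_t example_start_mapping(struct request *req,
                        struct device *dma_dev, struct dma_iova_state *state)
        {
                struct blk_dma_iter iter;

                if (!blk_rq_dma_map_iter_start(req, dma_dev, state, &iter))
                        return iter.status;     /* nothing mapped, or an error code */

                if (blk_rq_dma_map_coalesce(state)) {
                        /* IOMMU coalescing: iter.addr/iter.len already cover the payload */
                } else {
                        /* otherwise further segments come from blk_rq_dma_map_iter_next() */
                }
                return BLK_STS_OK;
        }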
+ */ +bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, struct blk_dma_iter *iter) +{ + blk_rq_map_iter_init(req, &iter->iter); + return blk_dma_map_iter_start(req, dma_dev, state, iter, + blk_rq_payload_bytes(req)); +} EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); /** From fec9b16dc5550191fd85af118271ea00e8dcc5f8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:50 -0700 Subject: [PATCH 009/164] blk-mq-dma: add scatter-less integrity data DMA mapping Similar to regular data, introduce more efficient integrity mapping helpers that does away with the scatterlist structure. This uses the block mapping iterator to add IOVA segments if IOMMU is enabled, or maps directly if not. This also supports P2P segements if integrity data ever wants to allocate that type of memory. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-7-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 104 +++++++++++++++++++++++++++++++--- include/linux/blk-integrity.h | 17 ++++++ include/linux/blk-mq-dma.h | 1 + 3 files changed, 115 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 31dd8f58f081..60a244a129c3 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include #include #include "blk.h" @@ -10,6 +11,24 @@ struct phys_vec { u32 len; }; +static bool __blk_map_iter_next(struct blk_map_iter *iter) +{ + if (iter->iter.bi_size) + return true; + if (!iter->bio || !iter->bio->bi_next) + return false; + + iter->bio = iter->bio->bi_next; + if (iter->is_integrity) { + iter->iter = bio_integrity(iter->bio)->bip_iter; + iter->bvecs = bio_integrity(iter->bio)->bip_vec; + } else { + iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; + } + return true; +} + static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { @@ -33,13 +52,8 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; - if (!iter->iter.bi_size) { - if (!iter->bio || !iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - iter->bvecs = iter->bio->bi_io_vec; - } + if (!__blk_map_iter_next(iter)) + break; next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || @@ -290,3 +304,79 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, return nsegs; } EXPORT_SYMBOL(__blk_rq_map_sg); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +/** + * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment + * for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are + * provided by the caller and don't need to be initialized. @state needs to be + * stored for use at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr + * and the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. 
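A corresponding consumer sketch for the integrity path, paired with blk_rq_integrity_dma_map_iter_next() declared later in this patch; this is an illustration rather than code from the series, the wrapper name example_map_metadata and the segment counting are hypothetical, and the two helper signatures match the declarations added to blk-integrity.h below:

        #include <linux/blk-integrity.h>

        /* Minimal sketch: walk every DMA-mapped metadata segment of a request. */
        static blk_status_t example_map_metadata(struct request *req,
                        struct device *dma_dev, struct dma_iova_state *state)
        {
                struct blk_dma_iter iter;
                unsigned int nsegs = 0;

                if (!blk_rq_integrity_dma_map_iter_start(req, dma_dev, state, &iter))
                        return iter.status;

                do {
                        /* iter.addr and iter.len describe one mapped metadata segment */
                        nsegs++;
                } while (blk_rq_integrity_dma_map_iter_next(req, dma_dev, &iter));

                /* status stays BLK_STS_OK when the loop simply ran out of segments */
                return iter.status;
        }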
+ * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. + */ +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + unsigned len = bio_integrity_bytes(&req->q->limits.integrity, + blk_rq_sectors(req)); + struct bio *bio = req->bio; + + iter->iter = (struct blk_map_iter) { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + return blk_dma_map_iter_start(req, dma_dev, state, iter, len); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); + +/** + * blk_rq_integrity_dma_map_iter_start - map the next integrity DMA segment for + * a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Iterate to the next integrity mapping after a previous call to + * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description + * of the arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + */ +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); +#endif diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e67a2b6e8f11..78fe2459e661 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -4,6 +4,7 @@ #include #include +#include struct request; @@ -31,6 +32,11 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp); +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter); +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -115,6 +121,17 @@ static inline int blk_rq_integrity_map_user(struct request *rq, { return -EINVAL; } +static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + return false; +} +static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + return false; +} static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 881880095e0d..0f45ea110ca1 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -9,6 +9,7 @@ struct blk_map_iter { struct bvec_iter iter; struct bio *bio; struct bio_vec *bvecs; + bool is_integrity; }; struct 
blk_dma_iter { From c16b52a0a095888efdd5d76f7194caf2a4752256 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:51 -0700 Subject: [PATCH 010/164] blk-integrity: use iterator for mapping sg Modify blk_rq_map_integrity_sg to use the blk-mq mapping iterator. This produces more efficient code and converges the integrity mapping implementations to reduce future maintenance burdens. The function implementation moves from blk-integrity.c to blk-mq-dma.c in order to use the types and functions private to that file. Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250813153153.3260897-8-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-integrity.c | 58 ------------------------------------------- block/blk-mq-dma.c | 45 +++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 58 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 056b8948369d..dd97b27366e0 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -122,64 +122,6 @@ out: NULL); } -/** - * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist - * @rq: request to map - * @sglist: target scatterlist - * - * Description: Map the integrity vectors in request into a - * scatterlist. The scatterlist must be big enough to hold all - * elements. I.e. sized using blk_rq_count_integrity_sg() or - * rq->nr_integrity_segments. - */ -int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) -{ - struct bio_vec iv, ivprv = { NULL }; - struct request_queue *q = rq->q; - struct scatterlist *sg = NULL; - struct bio *bio = rq->bio; - unsigned int segments = 0; - struct bvec_iter iter; - int prev = 0; - - bio_for_each_integrity_vec(iv, bio, iter) { - if (prev) { - if (!biovec_phys_mergeable(q, &ivprv, &iv)) - goto new_segment; - if (sg->length + iv.bv_len > queue_max_segment_size(q)) - goto new_segment; - - sg->length += iv.bv_len; - } else { -new_segment: - if (!sg) - sg = sglist; - else { - sg_unmark_end(sg); - sg = sg_next(sg); - } - - sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset); - segments++; - } - - prev = 1; - ivprv = iv; - } - - if (sg) - sg_mark_end(sg); - - /* - * Something must have been wrong if the figured number of segment - * is bigger than number of req's physical integrity segments - */ - BUG_ON(segments > rq->nr_integrity_segments); - BUG_ON(segments > queue_max_integrity_segments(q)); - return segments; -} -EXPORT_SYMBOL(blk_rq_map_integrity_sg); - int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 60a244a129c3..660b5e200ccf 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -379,4 +379,49 @@ bool blk_rq_integrity_dma_map_iter_next(struct request *req, return blk_dma_map_direct(req, dma_dev, iter, &vec); } EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request to map + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg() or + * rq->nr_integrity_segments. 
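For the scatterlist path kept here, a usage sketch (not code from the series; the wrapper name and the -EIO return value are illustrative) of the existing sg_init_table() / blk_rq_map_integrity_sg() / dma_map_sgtable() sequence that callers such as the pre-conversion nvme-pci code follow:

        #include <linux/blk-integrity.h>
        #include <linux/blk-mq.h>
        #include <linux/dma-mapping.h>
        #include <linux/scatterlist.h>

        /* Sketch: map a request's integrity segments through a caller-provided sgl. */
        static int example_map_integrity_sg(struct request *rq, struct device *dev,
                        struct sg_table *sgt, struct scatterlist *sgl)
        {
                sg_init_table(sgl, rq->nr_integrity_segments);
                sgt->sgl = sgl;
                sgt->orig_nents = blk_rq_map_integrity_sg(rq, sgl);
                if (!sgt->orig_nents)
                        return -EIO;
                return dma_map_sgtable(dev, sgt, rq_dma_dir(rq), DMA_ATTR_NO_WARN);
        }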
+ */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct request_queue *q = rq->q; + struct scatterlist *sg = NULL; + struct bio *bio = rq->bio; + unsigned int segments = 0; + struct phys_vec vec; + + struct blk_map_iter iter = { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + + while (blk_map_iter_next(rq, &iter, &vec)) { + sg = blk_next_sg(&sg, sglist); + sg_set_page(sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + segments++; + } + + if (sg) + sg_mark_end(sg); + + /* + * Something must have been wrong if the figured number of segment + * is bigger than number of req's physical integrity segments + */ + BUG_ON(segments > rq->nr_integrity_segments); + BUG_ON(segments > queue_max_integrity_segments(q)); + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); #endif From f0887e2a52d4ef3f246e508bd1d947dc13988643 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:52 -0700 Subject: [PATCH 011/164] nvme-pci: create common sgl unmapping helper This can be reused by metadata sgls once that starts using the blk-mq dma api. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-9-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 111b6bc6c93e..e12d47fecc58 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -693,25 +693,23 @@ static void nvme_free_prps(struct request *req) mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool); } -static void nvme_free_sgls(struct request *req) +static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, + struct nvme_sgl_desc *sg_list) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct device *dma_dev = nvmeq->dev->dev; - dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr); - unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length); - struct nvme_sgl_desc *sg_list = iod->descriptors[0]; enum dma_data_direction dir = rq_dma_dir(req); + unsigned int len = le32_to_cpu(sge->length); + struct device *dma_dev = nvmeq->dev->dev; + unsigned int i; - if (iod->nr_descriptors) { - unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i; - - for (i = 0; i < nr_entries; i++) - dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), - le32_to_cpu(sg_list[i].length), dir); - } else { - dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir); + if (sge->type == (NVME_SGL_FMT_DATA_DESC << 4)) { + dma_unmap_page(dma_dev, le64_to_cpu(sge->addr), len, dir); + return; } + + for (i = 0; i < len / sizeof(*sg_list); i++) + dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr), + le32_to_cpu(sg_list[i].length), dir); } static void nvme_unmap_data(struct request *req) @@ -731,7 +729,8 @@ static void nvme_unmap_data(struct request *req) if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, iod->flags & IOD_P2P_BUS_ADDR)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) - nvme_free_sgls(req); + nvme_free_sgls(req, iod->descriptors[0], + &iod->cmd.common.dptr.sgl); else nvme_free_prps(req); } From 94ce55046c3625380afd2a612237619c11dae6ad Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:53 -0700 Subject: [PATCH 012/164] 
nvme-pci: convert metadata mapping to dma iter Aligns data and metadata to the similar dma mapping scheme and removes one more user of the scatter-gather dma mapping. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-10-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 163 +++++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 76 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e12d47fecc58..d8a9dee55de3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -172,9 +172,7 @@ struct nvme_dev { u32 last_ps; bool hmb; struct sg_table *hmb_sgt; - mempool_t *dmavec_mempool; - mempool_t *iod_meta_mempool; /* shadow doorbell buffer support: */ __le32 *dbbuf_dbs; @@ -264,6 +262,12 @@ enum nvme_iod_flags { /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ IOD_P2P_BUS_ADDR = 1U << 3, + + /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ + IOD_META_P2P_BUS_ADDR = 1U << 4, + + /* Metadata using non-coalesced MPTR */ + IOD_SINGLE_META_SEGMENT = 1U << 5, }; struct nvme_dma_vec { @@ -287,7 +291,8 @@ struct nvme_iod { unsigned int nr_dma_vecs; dma_addr_t meta_dma; - struct sg_table meta_sgt; + unsigned int meta_total_len; + struct dma_iova_state meta_dma_state; struct nvme_sgl_desc *meta_descriptor; }; @@ -644,6 +649,11 @@ static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq, return nvmeq->descriptor_pools.large; } +static inline bool nvme_pci_cmd_use_meta_sgl(struct nvme_command *cmd) +{ + return (cmd->common.flags & NVME_CMD_SGL_ALL) == NVME_CMD_SGL_METASEG; +} + static inline bool nvme_pci_cmd_use_sgl(struct nvme_command *cmd) { return cmd->common.flags & @@ -712,6 +722,36 @@ static void nvme_free_sgls(struct request *req, struct nvme_sgl_desc *sge, le32_to_cpu(sg_list[i].length), dir); } +static void nvme_unmap_metadata(struct request *req) +{ + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + enum dma_data_direction dir = rq_dma_dir(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct device *dma_dev = nvmeq->dev->dev; + struct nvme_sgl_desc *sge = iod->meta_descriptor; + + if (iod->flags & IOD_SINGLE_META_SEGMENT) { + dma_unmap_page(dma_dev, iod->meta_dma, + rq_integrity_vec(req).bv_len, + rq_dma_dir(req)); + return; + } + + if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len, + iod->flags & IOD_META_P2P_BUS_ADDR)) { + if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) + nvme_free_sgls(req, sge, &sge[1]); + else + dma_unmap_page(dma_dev, iod->meta_dma, + iod->meta_total_len, dir); + } + + if (iod->meta_descriptor) + dma_pool_free(nvmeq->descriptor_pools.small, + iod->meta_descriptor, iod->meta_dma); +} + static void nvme_unmap_data(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -1013,70 +1053,72 @@ static blk_status_t nvme_map_data(struct request *req) return nvme_pci_setup_data_prp(req, &iter); } -static void nvme_pci_sgl_set_data_sg(struct nvme_sgl_desc *sge, - struct scatterlist *sg) -{ - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); - sge->type = NVME_SGL_FMT_DATA_DESC << 4; -} - static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; + unsigned int entries = req->nr_integrity_segments; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = nvmeq->dev; struct nvme_sgl_desc 
*sg_list; - struct scatterlist *sgl, *sg; - unsigned int entries; + struct blk_dma_iter iter; dma_addr_t sgl_dma; - int rc, i; + int i = 0; - iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC); - if (!iod->meta_sgt.sgl) - return BLK_STS_RESOURCE; + if (!blk_rq_integrity_dma_map_iter_start(req, dev->dev, + &iod->meta_dma_state, &iter)) + return iter.status; - sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments); - iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req, - iod->meta_sgt.sgl); - if (!iod->meta_sgt.orig_nents) - goto out_free_sg; + if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + iod->flags |= IOD_META_P2P_BUS_ADDR; + else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + entries = 1; - rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) - goto out_free_sg; + /* + * The NVMe MPTR descriptor has an implicit length that the host and + * device must agree on to avoid data/memory corruption. We trust the + * kernel allocated correctly based on the format's parameters, so use + * the more efficient MPTR to avoid extra dma pool allocations for the + * SGL indirection. + * + * But for user commands, we don't necessarily know what they do, so + * the driver can't validate the metadata buffer size. The SGL + * descriptor provides an explicit length, so we're relying on that + * mechanism to catch any misunderstandings between the application and + * device. + */ + if (entries == 1 && !(nvme_req(req)->flags & NVME_REQ_USERCMD)) { + iod->cmd.common.metadata = cpu_to_le64(iter.addr); + iod->meta_total_len = iter.len; + iod->meta_dma = iter.addr; + iod->meta_descriptor = NULL; + return BLK_STS_OK; + } sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC, &sgl_dma); if (!sg_list) - goto out_unmap_sg; + return BLK_STS_RESOURCE; - entries = iod->meta_sgt.nents; iod->meta_descriptor = sg_list; iod->meta_dma = sgl_dma; - iod->cmd.common.flags = NVME_CMD_SGL_METASEG; iod->cmd.common.metadata = cpu_to_le64(sgl_dma); - - sgl = iod->meta_sgt.sgl; if (entries == 1) { - nvme_pci_sgl_set_data_sg(sg_list, sgl); + iod->meta_total_len = iter.len; + nvme_pci_sgl_set_data(sg_list, &iter); return BLK_STS_OK; } sgl_dma += sizeof(*sg_list); - nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); - for_each_sg(sgl, sg, entries, i) - nvme_pci_sgl_set_data_sg(&sg_list[i + 1], sg); + do { + nvme_pci_sgl_set_data(&sg_list[++i], &iter); + iod->meta_total_len += iter.len; + } while (blk_rq_integrity_dma_map_iter_next(req, dev->dev, &iter)); - return BLK_STS_OK; - -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); - return BLK_STS_RESOURCE; + nvme_pci_sgl_set_seg(sg_list, sgl_dma, i); + if (unlikely(iter.status)) + nvme_unmap_metadata(req); + return iter.status; } static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) @@ -1089,6 +1131,7 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct request *req) if (dma_mapping_error(nvmeq->dev->dev, iod->meta_dma)) return BLK_STS_IOERR; iod->cmd.common.metadata = cpu_to_le64(iod->meta_dma); + iod->flags |= IOD_SINGLE_META_SEGMENT; return BLK_STS_OK; } @@ -1110,7 +1153,7 @@ static blk_status_t nvme_prep_rq(struct request *req) iod->flags = 0; iod->nr_descriptors = 0; iod->total_len = 0; - iod->meta_sgt.nents = 0; + iod->meta_total_len = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret) @@ -1221,25 +1264,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist) *rqlist = 
requeue_list; } -static __always_inline void nvme_unmap_metadata(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - struct nvme_dev *dev = nvmeq->dev; - - if (!iod->meta_sgt.nents) { - dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req).bv_len, - rq_dma_dir(req)); - return; - } - - dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor, - iod->meta_dma); - dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); - mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); -} - static __always_inline void nvme_pci_unmap_rq(struct request *req) { if (blk_integrity_rq(req)) @@ -3045,7 +3069,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { - size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS; dev->dmavec_mempool = mempool_create_node(1, @@ -3054,17 +3077,7 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) dev_to_node(dev->dev)); if (!dev->dmavec_mempool) return -ENOMEM; - - dev->iod_meta_mempool = mempool_create_node(1, - mempool_kmalloc, mempool_kfree, - (void *)meta_size, GFP_KERNEL, - dev_to_node(dev->dev)); - if (!dev->iod_meta_mempool) - goto free; return 0; -free: - mempool_destroy(dev->dmavec_mempool); - return -ENOMEM; } static void nvme_free_tagset(struct nvme_dev *dev) @@ -3514,7 +3527,6 @@ out_disable: nvme_free_queues(dev, 0); out_release_iod_mempool: mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: @@ -3578,7 +3590,6 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); mempool_destroy(dev->dmavec_mempool); - mempool_destroy(dev->iod_meta_mempool); nvme_release_descriptor_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); From f5d10e6915d80f7f4c4ed445a8b8ba4a5ee79318 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 Aug 2025 08:14:24 -0700 Subject: [PATCH 013/164] block: Move a misplaced comment in queue_wb_lat_store() blk_mq_quiesce_queue() does not wait for pending I/O to finish. Freezing a queue waits for pending I/O to finish. Hence move the comment that refers to waiting for pending I/O above the call that freezes the request queue. This patch moves this comment back to the position where it was when this comment was introduced. See also commit c125311d96b1 ("blk-wbt: don't maintain inflight counts if disabled"). Cc: Christoph Hellwig Cc: Nilay Shroff Signed-off-by: Bart Van Assche Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250825151424.1653910-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4a7f1a349998..c94b8b6ab024 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -620,6 +620,11 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. 
+ */ memflags = blk_mq_freeze_queue(q); rqos = wbt_rq_qos(q); @@ -638,11 +643,6 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (wbt_get_min_lat(q) == val) goto out; - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. - */ blk_mq_quiesce_queue(q); mutex_lock(&disk->rqos_state_mutex); From d74968780bf287958e2815be5f088dd6c7b7aa19 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 25 Aug 2025 18:32:55 +0200 Subject: [PATCH 014/164] floppy: Remove unused CROSS_64KB() macro from arch/ code Since the commit 3d86739c6343 ("floppy: always use the track buffer") the CROSS_64KB() is not used by the driver, remove the leftovers. Acked-by: Helge Deller #parisc Acked-by: Geert Uytterhoeven # m68k Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250825163545.39303-2-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- arch/alpha/include/asm/floppy.h | 19 ------------------- arch/arm/include/asm/floppy.h | 2 -- arch/m68k/include/asm/floppy.h | 4 ---- arch/mips/include/asm/floppy.h | 15 --------------- arch/parisc/include/asm/floppy.h | 6 +----- arch/powerpc/include/asm/floppy.h | 5 ----- arch/sparc/include/asm/floppy_32.h | 3 --- arch/sparc/include/asm/floppy_64.h | 3 --- arch/x86/include/asm/floppy.h | 5 +---- 9 files changed, 2 insertions(+), 60 deletions(-) diff --git a/arch/alpha/include/asm/floppy.h b/arch/alpha/include/asm/floppy.h index 64b42d9591fc..5a6239e65097 100644 --- a/arch/alpha/include/asm/floppy.h +++ b/arch/alpha/include/asm/floppy.h @@ -90,25 +90,6 @@ static int FDC2 = -1; #define N_FDC 2 #define N_DRIVE 8 -/* - * Most Alphas have no problems with floppy DMA crossing 64k borders, - * except for certain ones, like XL and RUFFIAN. - * - * However, the test is simple and fast, and this *is* floppy, after all, - * so we do it for all platforms, just to make sure. - * - * This is advantageous in other circumstances as well, as in moving - * about the PCI DMA windows and forcing the floppy to start doing - * scatter-gather when it never had before, and there *is* a problem - * on that platform... 
;-} - */ - -static inline unsigned long CROSS_64KB(void *a, unsigned long s) -{ - unsigned long p = (unsigned long)a; - return ((p + s - 1) ^ p) & ~0xffffUL; -} - #define EXTRA_FLOPPY_PARAMS #endif /* __ASM_ALPHA_FLOPPY_H */ diff --git a/arch/arm/include/asm/floppy.h b/arch/arm/include/asm/floppy.h index e1cb04ed5008..e579f77162e9 100644 --- a/arch/arm/include/asm/floppy.h +++ b/arch/arm/include/asm/floppy.h @@ -65,8 +65,6 @@ static unsigned char floppy_selects[4] = { 0x10, 0x21, 0x23, 0x33 }; #define N_FDC 1 #define N_DRIVE 4 -#define CROSS_64KB(a,s) (0) - /* * This allows people to reverse the order of * fd0 and fd1, in case their hardware is diff --git a/arch/m68k/include/asm/floppy.h b/arch/m68k/include/asm/floppy.h index a4d0fea47c6b..dea98bbc0932 100644 --- a/arch/m68k/include/asm/floppy.h +++ b/arch/m68k/include/asm/floppy.h @@ -107,13 +107,9 @@ static void fd_free_irq(void) #define fd_free_dma() /* nothing */ -/* No 64k boundary crossing problems on Q40 - no DMA at all */ -#define CROSS_64KB(a,s) (0) - #define DMA_MODE_READ 0x44 /* i386 look-alike */ #define DMA_MODE_WRITE 0x48 - static int m68k_floppy_init(void) { use_virtual_dma =1; diff --git a/arch/mips/include/asm/floppy.h b/arch/mips/include/asm/floppy.h index 021d09ae5670..44da2ff91f65 100644 --- a/arch/mips/include/asm/floppy.h +++ b/arch/mips/include/asm/floppy.h @@ -34,21 +34,6 @@ static inline void fd_cacheflush(char * addr, long size) #define N_FDC 1 /* do you *really* want a second controller? */ #define N_DRIVE 8 -/* - * The DMA channel used by the floppy controller cannot access data at - * addresses >= 16MB - * - * Went back to the 1MB limit, as some people had problems with the floppy - * driver otherwise. It doesn't matter much for performance anyway, as most - * floppy accesses go through the track buffer. - * - * On MIPSes using vdma, this actually means that *all* transfers go thru - * the * track buffer since 0x1000000 is always smaller than KSEG0/1. - * Actually this needs to be a bit more complicated since the so much different - * hardware available with MIPS CPUs ... - */ -#define CROSS_64KB(a, s) ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64) - #define EXTRA_FLOPPY_PARAMS #include diff --git a/arch/parisc/include/asm/floppy.h b/arch/parisc/include/asm/floppy.h index b318a7df52f6..df20dbef3ada 100644 --- a/arch/parisc/include/asm/floppy.h +++ b/arch/parisc/include/asm/floppy.h @@ -22,13 +22,9 @@ #define _CROSS_64KB(a,s,vdma) \ (!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) -#define CROSS_64KB(a,s) _CROSS_64KB(a,s,use_virtual_dma & 1) - - #define SW fd_routine[use_virtual_dma&1] #define CSW fd_routine[can_use_virtual_dma & 1] - #define fd_inb(base, reg) readb((base) + (reg)) #define fd_outb(value, base, reg) writeb(value, (base) + (reg)) @@ -206,7 +202,7 @@ static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) { #ifdef FLOPPY_SANITY_CHECK - if (CROSS_64KB(addr, size)) { + if (_CROSS_64KB(addr, size, use_virtual_dma & 1)) { printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size); return -1; } diff --git a/arch/powerpc/include/asm/floppy.h b/arch/powerpc/include/asm/floppy.h index 34abf8bea2cc..f4dc657638b3 100644 --- a/arch/powerpc/include/asm/floppy.h +++ b/arch/powerpc/include/asm/floppy.h @@ -206,11 +206,6 @@ static int FDC2 = -1; #define N_FDC 2 /* Don't change this! 
*/ #define N_DRIVE 8 -/* - * The PowerPC has no problems with floppy DMA crossing 64k borders. - */ -#define CROSS_64KB(a,s) (0) - #define EXTRA_FLOPPY_PARAMS #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/floppy_32.h b/arch/sparc/include/asm/floppy_32.h index 836f6575aa1d..7251d1fed7a4 100644 --- a/arch/sparc/include/asm/floppy_32.h +++ b/arch/sparc/include/asm/floppy_32.h @@ -96,9 +96,6 @@ static struct sun_floppy_ops sun_fdops; #define N_FDC 1 #define N_DRIVE 8 -/* No 64k boundary crossing problems on the Sparc. */ -#define CROSS_64KB(a,s) (0) - /* Routines unique to each controller type on a Sun. */ static void sun_set_dor(unsigned char value, int fdc_82077) { diff --git a/arch/sparc/include/asm/floppy_64.h b/arch/sparc/include/asm/floppy_64.h index b0f633ce3518..135f9a49b6ba 100644 --- a/arch/sparc/include/asm/floppy_64.h +++ b/arch/sparc/include/asm/floppy_64.h @@ -95,9 +95,6 @@ static int sun_floppy_types[2] = { 0, 0 }; #define N_FDC 1 #define N_DRIVE 8 -/* No 64k boundary crossing problems on the Sparc. */ -#define CROSS_64KB(a,s) (0) - static unsigned char sun_82077_fd_inb(unsigned long base, unsigned int reg) { udelay(5); diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 6ec3fc969ad5..e76cb74bbed2 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -24,9 +24,6 @@ (!(vdma) && \ ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) -#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1) - - #define SW fd_routine[use_virtual_dma & 1] #define CSW fd_routine[can_use_virtual_dma & 1] @@ -206,7 +203,7 @@ static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) static int hard_dma_setup(char *addr, unsigned long size, int mode, int io) { #ifdef FLOPPY_SANITY_CHECK - if (CROSS_64KB(addr, size)) { + if (_CROSS_64KB(addr, size, use_virtual_dma & 1)) { printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size); return -1; } From 8e7ee0f6fa33934373c1c37e8cfb71cff2acea09 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 25 Aug 2025 18:32:56 +0200 Subject: [PATCH 015/164] floppy: Replace custom SZ_64K constant There are only two headers using the K_64 custom constant. Moreover, its usage tangles a code because the constant is defined in the C file, while users are in the headers. Replace it with well defined SZ_64K from sizes.h. Acked-by: Helge Deller Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250825163545.39303-3-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- arch/parisc/include/asm/floppy.h | 5 +++-- arch/x86/include/asm/floppy.h | 3 ++- drivers/block/floppy.c | 2 -- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/parisc/include/asm/floppy.h b/arch/parisc/include/asm/floppy.h index df20dbef3ada..f15b69fea901 100644 --- a/arch/parisc/include/asm/floppy.h +++ b/arch/parisc/include/asm/floppy.h @@ -8,9 +8,9 @@ #ifndef __ASM_PARISC_FLOPPY_H #define __ASM_PARISC_FLOPPY_H +#include #include - /* * The DMA channel used by the floppy controller cannot access data at * addresses >= 16MB @@ -20,7 +20,8 @@ * floppy accesses go through the track buffer. 
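For readers following the 64 KiB-crossing checks these floppy patches touch, a small worked illustration (the helper name and sample values are made up; SZ_64K is the <linux/sizes.h> constant the patch switches to):

        #include <linux/sizes.h>
        #include <linux/types.h>

        /*
         * The buffer [a, a + s) crosses a 64 KiB boundary when its first and last
         * bytes fall into different 64 KiB blocks. With a = 0xfff0 and s = 0x40 the
         * last byte sits at 0x1002f, so the quotients are 0 and 1 and ISA-style
         * floppy DMA cannot handle the transfer in one go.
         */
        static inline bool example_crosses_64k(unsigned long a, unsigned long s)
        {
                return (a / SZ_64K) != ((a + s - 1) / SZ_64K);
        }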
*/ #define _CROSS_64KB(a,s,vdma) \ -(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) + (!(vdma) && \ + ((unsigned long)(a) / SZ_64K != ((unsigned long)(a) + (s) - 1) / SZ_64K)) #define SW fd_routine[use_virtual_dma&1] #define CSW fd_routine[can_use_virtual_dma & 1] diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index e76cb74bbed2..e7a244051c62 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -10,6 +10,7 @@ #ifndef _ASM_X86_FLOPPY_H #define _ASM_X86_FLOPPY_H +#include #include /* @@ -22,7 +23,7 @@ */ #define _CROSS_64KB(a, s, vdma) \ (!(vdma) && \ - ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) + ((unsigned long)(a) / SZ_64K != ((unsigned long)(a) + (s) - 1) / SZ_64K)) #define SW fd_routine[use_virtual_dma & 1] #define CSW fd_routine[can_use_virtual_dma & 1] diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 24be0c2c4075..d769a223fcc8 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -233,8 +233,6 @@ static unsigned short virtual_dma_port = 0x3f0; irqreturn_t floppy_interrupt(int irq, void *dev_id); static int set_dor(int fdc, char mask, char data); -#define K_64 0x10000 /* 64KB */ - /* the following is the mask of allowed drives. By default units 2 and * 3 of both floppy controllers are disabled, because switching on the * motor of these drives causes system hangs on some PCI computers. drive From d4399e6eb27a803b73d17fe984448a823b4d3a30 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 25 Aug 2025 18:32:57 +0200 Subject: [PATCH 016/164] floppy: Sort headers alphabetically Sorting headers alphabetically helps locating duplicates, and makes it easier to figure out where to insert new headers. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250825163545.39303-4-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 54 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index d769a223fcc8..61d62019a38d 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -163,35 +163,35 @@ /* do print messages for unexpected interrupts */ static int print_unex = 1; -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* CMOS defines */ -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* CMOS defines */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* * PS/2 floppies have much slower step rates than regular floppies. From 4c7ef92f6d4d08a27d676e4c348f4e2922cab3ed Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 26 Aug 2025 16:48:54 +0800 Subject: [PATCH 017/164] blk-mq: check kobject state_in_sysfs before deleting in blk_mq_unregister_hctx In __blk_mq_update_nr_hw_queues() the return value of blk_mq_sysfs_register_hctxs() is not checked. 
If sysfs creation for hctx fails, later changing the number of hw_queues or removing disk will trigger the following warning: kernfs: can not remove 'nr_tags', no directory WARNING: CPU: 2 PID: 637 at fs/kernfs/dir.c:1707 kernfs_remove_by_name_ns+0x13f/0x160 Call Trace: remove_files.isra.1+0x38/0xb0 sysfs_remove_group+0x4d/0x100 sysfs_remove_groups+0x31/0x60 __kobject_del+0x23/0xf0 kobject_del+0x17/0x40 blk_mq_unregister_hctx+0x5d/0x80 blk_mq_sysfs_unregister_hctxs+0x94/0xd0 blk_mq_update_nr_hw_queues+0x124/0x760 nullb_update_nr_hw_queues+0x71/0xf0 [null_blk] nullb_device_submit_queues_store+0x92/0x120 [null_blk] kobjct_del() was called unconditionally even if sysfs creation failed. Fix it by checkig the kobject creation statusbefore deleting it. Fixes: 477e19dedc9d ("blk-mq: adjust debugfs and sysfs register when updating nr_hw_queues") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20250826084854.1030545-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 24656980f443..5c399ac562ea 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -150,9 +150,11 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) return; hctx_for_each_ctx(hctx, ctx, i) - kobject_del(&ctx->kobj); + if (ctx->kobj.state_in_sysfs) + kobject_del(&ctx->kobj); - kobject_del(&hctx->kobj); + if (hctx->kobj.state_in_sysfs) + kobject_del(&hctx->kobj); } static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) From 2a0614522885b136e9e650791c794dd49abb5b31 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 11 Aug 2025 14:56:28 +0800 Subject: [PATCH 018/164] brd: use page reference to protect page lifetime As discussed [1], hold rcu for copying data from/to page is too heavy, it's better to protect page with rcu around for page lookup and then grab a reference to prevent page to be freed by discard. [1] https://lore.kernel.org/all/eb41cab3-5946-4fe3-a1be-843dd6fca159@kernel.dk/ Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20250811065628.1829339-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/brd.c | 75 +++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 0c2eabe14af3..9778259b30d4 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -44,45 +44,74 @@ struct brd_device { }; /* - * Look up and return a brd's page for a given sector. + * Look up and return a brd's page with reference grabbed for a given sector. */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { - return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); + struct page *page; + XA_STATE(xas, &brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); + + rcu_read_lock(); +repeat: + page = xas_load(&xas); + if (xas_retry(&xas, page)) { + xas_reset(&xas); + goto repeat; + } + + if (!page) + goto out; + + if (!get_page_unless_zero(page)) { + xas_reset(&xas); + goto repeat; + } + + if (unlikely(page != xas_reload(&xas))) { + put_page(page); + xas_reset(&xas); + goto repeat; + } +out: + rcu_read_unlock(); + + return page; } /* * Insert a new page for a given sector, if one does not already exist. + * The returned page will grab reference. */ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector, blk_opf_t opf) - __releases(rcu) - __acquires(rcu) { gfp_t gfp = (opf & REQ_NOWAIT) ? 
GFP_NOWAIT : GFP_NOIO; struct page *page, *ret; - rcu_read_unlock(); page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); - if (!page) { - rcu_read_lock(); + if (!page) return ERR_PTR(-ENOMEM); - } xa_lock(&brd->brd_pages); ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL, page, gfp); - rcu_read_lock(); - if (ret) { + if (!ret) { + brd->brd_nr_pages++; + get_page(page); xa_unlock(&brd->brd_pages); - __free_page(page); - if (xa_is_err(ret)) - return ERR_PTR(xa_err(ret)); + return page; + } + + if (!xa_is_err(ret)) { + get_page(ret); + xa_unlock(&brd->brd_pages); + put_page(page); return ret; } - brd->brd_nr_pages++; + xa_unlock(&brd->brd_pages); - return page; + put_page(page); + return ERR_PTR(xa_err(ret)); } /* @@ -95,7 +124,7 @@ static void brd_free_pages(struct brd_device *brd) pgoff_t idx; xa_for_each(&brd->brd_pages, idx, page) { - __free_page(page); + put_page(page); cond_resched(); } @@ -117,7 +146,6 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio) bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset); - rcu_read_lock(); page = brd_lookup_page(brd, sector); if (!page && op_is_write(opf)) { page = brd_insert_page(brd, sector, opf); @@ -135,13 +163,13 @@ static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio) memset(kaddr, 0, bv.bv_len); } kunmap_local(kaddr); - rcu_read_unlock(); bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len); + if (page) + put_page(page); return true; out_error: - rcu_read_unlock(); if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT)) bio_wouldblock_error(bio); else @@ -149,13 +177,6 @@ out_error: return false; } -static void brd_free_one_page(struct rcu_head *head) -{ - struct page *page = container_of(head, struct page, rcu_head); - - __free_page(page); -} - static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) { sector_t aligned_sector = round_up(sector, PAGE_SECTORS); @@ -170,7 +191,7 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) { page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); if (page) { - call_rcu(&page->rcu_head, brd_free_one_page); + put_page(page); brd->brd_nr_pages--; } aligned_sector += PAGE_SECTORS; From d5d060d624e34c6ce1748a157cf2391e49af2a54 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:54:55 +0200 Subject: [PATCH 019/164] rust: str: normalize imports in `str.rs` Clean up imports in `str.rs`. This makes future code manipulation more manageable. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-1-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/str.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index 6c892550c0ba..082790b7a621 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -2,12 +2,13 @@ //! String representations. -use crate::alloc::{flags::*, AllocError, KVec}; -use crate::fmt::{self, Write}; +use crate::{ + alloc::{flags::*, AllocError, KVec}, + fmt::{self, Write}, + prelude::*, +}; use core::ops::{self, Deref, DerefMut, Index}; -use crate::prelude::*; - /// Byte string without UTF-8 validity guarantee. 
#[repr(transparent)] pub struct BStr([u8]); From 87482d6d9104d087935592ebbf52b70f8a777c47 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:54:56 +0200 Subject: [PATCH 020/164] rust: str: allow `str::Formatter` to format into `&mut [u8]`. Improve `Formatter` so that it can write to an array or slice buffer. Reviewed-by: Daniel Almeida Reviewed-by: Alice Ryhl Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-2-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/str.rs | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index 082790b7a621..76632da357a6 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -7,7 +7,10 @@ use crate::{ fmt::{self, Write}, prelude::*, }; -use core::ops::{self, Deref, DerefMut, Index}; +use core::{ + marker::PhantomData, + ops::{self, Deref, DerefMut, Index}, +}; /// Byte string without UTF-8 validity guarantee. #[repr(transparent)] @@ -825,9 +828,9 @@ impl fmt::Write for RawFormatter { /// Allows formatting of [`fmt::Arguments`] into a raw buffer. /// /// Fails if callers attempt to write more than will fit in the buffer. -pub(crate) struct Formatter(RawFormatter); +pub(crate) struct Formatter<'a>(RawFormatter, PhantomData<&'a mut ()>); -impl Formatter { +impl Formatter<'_> { /// Creates a new instance of [`Formatter`] with the given buffer. /// /// # Safety @@ -836,11 +839,19 @@ impl Formatter { /// for the lifetime of the returned [`Formatter`]. pub(crate) unsafe fn from_buffer(buf: *mut u8, len: usize) -> Self { // SAFETY: The safety requirements of this function satisfy those of the callee. - Self(unsafe { RawFormatter::from_buffer(buf, len) }) + Self(unsafe { RawFormatter::from_buffer(buf, len) }, PhantomData) + } + + /// Create a new [`Self`] instance. + #[expect(dead_code)] + pub(crate) fn new(buffer: &mut [u8]) -> Self { + // SAFETY: `buffer` is valid for writes for the entire length for + // the lifetime of `Self`. + unsafe { Formatter::from_buffer(buffer.as_mut_ptr(), buffer.len()) } } } -impl Deref for Formatter { +impl Deref for Formatter<'_> { type Target = RawFormatter; fn deref(&self) -> &Self::Target { @@ -848,7 +859,7 @@ impl Deref for Formatter { } } -impl fmt::Write for Formatter { +impl fmt::Write for Formatter<'_> { fn write_str(&mut self, s: &str) -> fmt::Result { self.0.write_str(s)?; From 8c5ac71cf19bc8a2ad5bc905d1fd3191d887d469 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:54:57 +0200 Subject: [PATCH 021/164] rust: str: expose `str::{Formatter, RawFormatter}` publicly. rnull is going to make use of `str::Formatter` and `str::RawFormatter`, so expose them with public visibility. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-3-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/str.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index 76632da357a6..46cdc85dad63 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -736,7 +736,7 @@ mod tests { /// /// The memory region between `pos` (inclusive) and `end` (exclusive) is valid for writes if `pos` /// is less than `end`. -pub(crate) struct RawFormatter { +pub struct RawFormatter { // Use `usize` to use `saturating_*` functions. 
beg: usize, pos: usize, @@ -794,7 +794,7 @@ impl RawFormatter { } /// Returns the number of bytes written to the formatter. - pub(crate) fn bytes_written(&self) -> usize { + pub fn bytes_written(&self) -> usize { self.pos - self.beg } } @@ -828,7 +828,7 @@ impl fmt::Write for RawFormatter { /// Allows formatting of [`fmt::Arguments`] into a raw buffer. /// /// Fails if callers attempt to write more than will fit in the buffer. -pub(crate) struct Formatter<'a>(RawFormatter, PhantomData<&'a mut ()>); +pub struct Formatter<'a>(RawFormatter, PhantomData<&'a mut ()>); impl Formatter<'_> { /// Creates a new instance of [`Formatter`] with the given buffer. @@ -843,8 +843,7 @@ impl Formatter<'_> { } /// Create a new [`Self`] instance. - #[expect(dead_code)] - pub(crate) fn new(buffer: &mut [u8]) -> Self { + pub fn new(buffer: &mut [u8]) -> Self { // SAFETY: `buffer` is valid for writes for the entire length for // the lifetime of `Self`. unsafe { Formatter::from_buffer(buffer.as_mut_ptr(), buffer.len()) } From cdde7a1951ff0600adc45718ba251559e4d3fd7c Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:54:58 +0200 Subject: [PATCH 022/164] rust: str: introduce `NullTerminatedFormatter` Add `NullTerminatedFormatter`, a formatter that writes a null terminated string to an array or slice buffer. Because this type needs to manage the trailing null marker, the existing formatters cannot be used to implement this type. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-4-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/str.rs | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index 46cdc85dad63..d8326f7bc9c1 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -871,6 +871,55 @@ impl fmt::Write for Formatter<'_> { } } +/// A mutable reference to a byte buffer where a string can be written into. +/// +/// The buffer will be automatically null terminated after the last written character. +/// +/// # Invariants +/// +/// * The first byte of `buffer` is always zero. +/// * The length of `buffer` is at least 1. +pub(crate) struct NullTerminatedFormatter<'a> { + buffer: &'a mut [u8], +} + +impl<'a> NullTerminatedFormatter<'a> { + /// Create a new [`Self`] instance. + #[expect(dead_code)] + pub(crate) fn new(buffer: &'a mut [u8]) -> Option> { + *(buffer.first_mut()?) = 0; + + // INVARIANT: + // - We wrote zero to the first byte above. + // - If buffer was not at least length 1, `buffer.first_mut()` would return None. + Some(Self { buffer }) + } +} + +impl Write for NullTerminatedFormatter<'_> { + fn write_str(&mut self, s: &str) -> fmt::Result { + let bytes = s.as_bytes(); + let len = bytes.len(); + + // We want space for a zero. By type invariant, buffer length is always at least 1, so no + // underflow. + if len > self.buffer.len() - 1 { + return Err(fmt::Error); + } + + let buffer = core::mem::take(&mut self.buffer); + // We break the zero start invariant for a short while. + buffer[..len].copy_from_slice(bytes); + // INVARIANT: We checked above that buffer will have size at least 1 after this assignment. + self.buffer = &mut buffer[len..]; + + // INVARIANT: We write zero to the first byte of the buffer. + self.buffer[0] = 0; + + Ok(()) + } +} + /// An owned string that is guaranteed to have exactly one `NUL` byte, which is at the end. 
/// /// Used for interoperability with kernel APIs that take C strings. From b1dae0be89278348e2c99ddca820d91292856b10 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:54:59 +0200 Subject: [PATCH 023/164] rust: str: introduce `kstrtobool` function Add a Rust wrapper for the kernel's `kstrtobool` function that converts common user inputs into boolean values. Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-5-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/str.rs | 79 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index d8326f7bc9c1..d2f9ebc94b75 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -4,6 +4,7 @@ use crate::{ alloc::{flags::*, AllocError, KVec}, + error::{to_result, Result}, fmt::{self, Write}, prelude::*, }; @@ -920,6 +921,84 @@ impl Write for NullTerminatedFormatter<'_> { } } +/// # Safety +/// +/// - `string` must point to a null terminated string that is valid for read. +unsafe fn kstrtobool_raw(string: *const u8) -> Result { + let mut result: bool = false; + + // SAFETY: + // - By function safety requirement, `string` is a valid null-terminated string. + // - `result` is a valid `bool` that we own. + to_result(unsafe { bindings::kstrtobool(string, &mut result) })?; + Ok(result) +} + +/// Convert common user inputs into boolean values using the kernel's `kstrtobool` function. +/// +/// This routine returns `Ok(bool)` if the first character is one of 'YyTt1NnFf0', or +/// \[oO\]\[NnFf\] for "on" and "off". Otherwise it will return `Err(EINVAL)`. +/// +/// # Examples +/// +/// ``` +/// # use kernel::{c_str, str::kstrtobool}; +/// +/// // Lowercase +/// assert_eq!(kstrtobool(c_str!("true")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("tr")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("t")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("twrong")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("false")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("f")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("yes")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("no")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("on")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("off")), Ok(false)); +/// +/// // Camel case +/// assert_eq!(kstrtobool(c_str!("True")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("False")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("Yes")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("No")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("On")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("Off")), Ok(false)); +/// +/// // All caps +/// assert_eq!(kstrtobool(c_str!("TRUE")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("FALSE")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("YES")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("NO")), Ok(false)); +/// assert_eq!(kstrtobool(c_str!("ON")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("OFF")), Ok(false)); +/// +/// // Numeric +/// assert_eq!(kstrtobool(c_str!("1")), Ok(true)); +/// assert_eq!(kstrtobool(c_str!("0")), Ok(false)); +/// +/// // Invalid input +/// assert_eq!(kstrtobool(c_str!("invalid")), Err(EINVAL)); +/// assert_eq!(kstrtobool(c_str!("2")), Err(EINVAL)); +/// ``` +pub fn kstrtobool(string: &CStr) -> Result { + // SAFETY: + // - The pointer returned by `CStr::as_char_ptr` is guaranteed to be + // null terminated. + // - `string` is live and thus the string is valid for read. 
+ unsafe { kstrtobool_raw(string.as_char_ptr()) } +} + +/// Convert `&[u8]` to `bool` by deferring to [`kernel::str::kstrtobool`]. +/// +/// Only considers at most the first two bytes of `bytes`. +pub fn kstrtobool_bytes(bytes: &[u8]) -> Result { + // `ktostrbool` only considers the first two bytes of the input. + let stack_string = [*bytes.first().unwrap_or(&0), *bytes.get(1).unwrap_or(&0), 0]; + // SAFETY: `stack_string` is null terminated and it is live on the stack so + // it is valid for read. + unsafe { kstrtobool_raw(stack_string.as_ptr()) } +} + /// An owned string that is guaranteed to have exactly one `NUL` byte, which is at the end. /// /// Used for interoperability with kernel APIs that take C strings. From 60e1eeed8b53f65ef7919fd3f79cc9b7f20795c5 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:00 +0200 Subject: [PATCH 024/164] rust: configfs: re-export `configfs_attrs` from `configfs` module Re-export `configfs_attrs` from `configfs` module, so that users can import the macro from the `configfs` module rather than the root of the `kernel` crate. Also update users to import from the new path. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-6-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/configfs.rs | 2 ++ samples/rust/rust_configfs.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/rust/kernel/configfs.rs b/rust/kernel/configfs.rs index 2736b798cdc6..318a2f073d1c 100644 --- a/rust/kernel/configfs.rs +++ b/rust/kernel/configfs.rs @@ -1039,3 +1039,5 @@ macro_rules! configfs_attrs { }; } + +pub use crate::configfs_attrs; diff --git a/samples/rust/rust_configfs.rs b/samples/rust/rust_configfs.rs index af04bfa35cb2..ad364fb93e53 100644 --- a/samples/rust/rust_configfs.rs +++ b/samples/rust/rust_configfs.rs @@ -5,7 +5,7 @@ use kernel::alloc::flags; use kernel::c_str; use kernel::configfs; -use kernel::configfs_attrs; +use kernel::configfs::configfs_attrs; use kernel::new_mutex; use kernel::page::PAGE_SIZE; use kernel::prelude::*; From f4b72f1558be1e2b173b6b1f93c09dc668592a26 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:01 +0200 Subject: [PATCH 025/164] rust: block: normalize imports for `gen_disk.rs` Clean up the import statements in `gen_disk.rs` to make the code easier to maintain. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-7-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/block/mq/gen_disk.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index cd54cd64ea88..679ee1bb2195 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -5,9 +5,13 @@ //! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h) //! C header: [`include/linux/blk_mq.h`](srctree/include/linux/blk_mq.h) -use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; -use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc}; -use crate::{error, static_lock_class}; +use crate::{ + bindings, + block::mq::{raw_writer::RawWriter, Operations, TagSet}, + error::{self, from_err_ptr, Result}, + static_lock_class, + sync::Arc, +}; use core::fmt::{self, Write}; /// A builder for [`GenDisk`]. 
From c3a54220b54a1bda0662f0e7ab90ffabf5036d50 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:02 +0200 Subject: [PATCH 026/164] rust: block: use `NullTerminatedFormatter` Use the new `NullTerminatedFormatter` to write the name of a `GenDisk` to the name buffer. This new formatter automatically adds a trailing null marker after the written characters, so we don't need to append that at the call site any longer. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-8-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/block/mq/gen_disk.rs | 12 +++++++----- rust/kernel/block/mq/raw_writer.rs | 1 + rust/kernel/str.rs | 1 - 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 679ee1bb2195..20f1d46c774d 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -7,9 +7,11 @@ use crate::{ bindings, - block::mq::{raw_writer::RawWriter, Operations, TagSet}, + block::mq::{Operations, TagSet}, error::{self, from_err_ptr, Result}, + prelude::*, static_lock_class, + str::NullTerminatedFormatter, sync::Arc, }; use core::fmt::{self, Write}; @@ -143,14 +145,14 @@ impl GenDiskBuilder { // SAFETY: `gendisk` is a valid pointer as we initialized it above unsafe { (*gendisk).fops = &TABLE }; - let mut raw_writer = RawWriter::from_array( + let mut writer = NullTerminatedFormatter::new( // SAFETY: `gendisk` points to a valid and initialized instance. We // have exclusive access, since the disk is not added to the VFS // yet. unsafe { &mut (*gendisk).disk_name }, - )?; - raw_writer.write_fmt(name)?; - raw_writer.write_char('\0')?; + ) + .ok_or(EINVAL)?; + writer.write_fmt(name)?; // SAFETY: `gendisk` points to a valid and initialized instance of // `struct gendisk`. `set_capacity` takes a lock to synchronize this diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs index 7e2159e4f6a6..0aef55703e71 100644 --- a/rust/kernel/block/mq/raw_writer.rs +++ b/rust/kernel/block/mq/raw_writer.rs @@ -24,6 +24,7 @@ impl<'a> RawWriter<'a> { Ok(Self { buffer, pos: 0 }) } + #[expect(dead_code)] pub(crate) fn from_array( a: &'a mut [crate::ffi::c_char; N], ) -> Result> { diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs index d2f9ebc94b75..5c74e5f77601 100644 --- a/rust/kernel/str.rs +++ b/rust/kernel/str.rs @@ -886,7 +886,6 @@ pub(crate) struct NullTerminatedFormatter<'a> { impl<'a> NullTerminatedFormatter<'a> { /// Create a new [`Self`] instance. - #[expect(dead_code)] pub(crate) fn new(buffer: &'a mut [u8]) -> Option> { *(buffer.first_mut()?) = 0; From f52689fcd8a2f78436b57d10ba742455431885a0 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:03 +0200 Subject: [PATCH 027/164] rust: block: remove `RawWriter` `RawWriter` is now dead code, so remove it. 
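For reference, a minimal usage sketch of the `NullTerminatedFormatter` that replaces it. This is illustrative only and not part of the series: it assumes kernel-crate-internal code (the type is `pub(crate)` in `kernel::str`), that the kernel `Write` trait is in scope, that it runs inside a function returning `Result`, and the buffer size and name are arbitrary.

    // Sketch: format a device name into a fixed buffer; the formatter keeps
    // the buffer NUL terminated after every write.
    let mut buf = [0u8; 16];
    let mut writer = NullTerminatedFormatter::new(&mut buf).ok_or(EINVAL)?;
    writer.write_fmt(fmt!("rnullb{}", 0))?;
    // `buf` now holds "rnullb0" followed by a trailing NUL byte.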
Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-9-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/block/mq.rs | 1 - rust/kernel/block/mq/raw_writer.rs | 56 ------------------------------ 2 files changed, 57 deletions(-) delete mode 100644 rust/kernel/block/mq/raw_writer.rs diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs index 831445d37181..98fa0d6bc8f7 100644 --- a/rust/kernel/block/mq.rs +++ b/rust/kernel/block/mq.rs @@ -89,7 +89,6 @@ pub mod gen_disk; mod operations; -mod raw_writer; mod request; mod tag_set; diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs deleted file mode 100644 index 0aef55703e71..000000000000 --- a/rust/kernel/block/mq/raw_writer.rs +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -use core::fmt::{self, Write}; - -use crate::error::Result; -use crate::prelude::EINVAL; - -/// A mutable reference to a byte buffer where a string can be written into. -/// -/// # Invariants -/// -/// `buffer` is always null terminated. -pub(crate) struct RawWriter<'a> { - buffer: &'a mut [u8], - pos: usize, -} - -impl<'a> RawWriter<'a> { - /// Create a new `RawWriter` instance. - fn new(buffer: &'a mut [u8]) -> Result> { - *(buffer.last_mut().ok_or(EINVAL)?) = 0; - - // INVARIANT: We null terminated the buffer above. - Ok(Self { buffer, pos: 0 }) - } - - #[expect(dead_code)] - pub(crate) fn from_array( - a: &'a mut [crate::ffi::c_char; N], - ) -> Result> { - Self::new( - // SAFETY: the buffer of `a` is valid for read and write as `u8` for - // at least `N` bytes. - unsafe { core::slice::from_raw_parts_mut(a.as_mut_ptr().cast::(), N) }, - ) - } -} - -impl Write for RawWriter<'_> { - fn write_str(&mut self, s: &str) -> fmt::Result { - let bytes = s.as_bytes(); - let len = bytes.len(); - - // We do not want to overwrite our null terminator - if self.pos + len > self.buffer.len() - 1 { - return Err(fmt::Error); - } - - // INVARIANT: We are not overwriting the last byte - self.buffer[self.pos..self.pos + len].copy_from_slice(bytes); - - self.pos += len; - - Ok(()) - } -} From 8c32697c4edd4180bc90d367e54fb64490c230c6 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:04 +0200 Subject: [PATCH 028/164] rust: block: remove trait bound from `mq::Request` definition Remove the trait bound `T:Operations` from `mq::Request`. The bound is not required, so remove it to reduce complexity. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-10-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/block/mq/request.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs index fefd394f064a..f723d74091c1 100644 --- a/rust/kernel/block/mq/request.rs +++ b/rust/kernel/block/mq/request.rs @@ -53,7 +53,7 @@ use core::{ /// [`struct request`]: srctree/include/linux/blk-mq.h /// #[repr(transparent)] -pub struct Request(Opaque, PhantomData); +pub struct Request(Opaque, PhantomData); impl Request { /// Create an [`ARef`] from a [`struct request`] pointer. From 19c37c91b4a0ff7abfc3dcfe165d1377939469ac Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:05 +0200 Subject: [PATCH 029/164] rust: block: add block related constants Add a few block subsystem constants to the rust `kernel::block` name space. 
This makes it easier to access the constants from rust code. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-11-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/block.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/rust/kernel/block.rs b/rust/kernel/block.rs index 150f710efe5b..32c8d865afb6 100644 --- a/rust/kernel/block.rs +++ b/rust/kernel/block.rs @@ -3,3 +3,16 @@ //! Types for working with the block layer. pub mod mq; + +/// Bit mask for masking out [`SECTOR_SIZE`]. +pub const SECTOR_MASK: u32 = bindings::SECTOR_MASK; + +/// Sectors are size `1 << SECTOR_SHIFT`. +pub const SECTOR_SHIFT: u32 = bindings::SECTOR_SHIFT; + +/// Size of a sector. +pub const SECTOR_SIZE: u32 = bindings::SECTOR_SIZE; + +/// The difference between the size of a page and the size of a sector, +/// expressed as a power of two. +pub const PAGE_SECTORS_SHIFT: u32 = bindings::PAGE_SECTORS_SHIFT; From edd8650691c374bdce133af1b25ff4f3496f489f Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:06 +0200 Subject: [PATCH 030/164] rnull: move driver to separate directory The rust null block driver is about to gain some additional modules. Rather than pollute the current directory, move the driver to a subdirectory. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-12-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- drivers/block/Kconfig | 10 +--------- drivers/block/Makefile | 4 +--- drivers/block/rnull/Kconfig | 13 +++++++++++++ drivers/block/rnull/Makefile | 3 +++ drivers/block/{ => rnull}/rnull.rs | 0 6 files changed, 19 insertions(+), 13 deletions(-) create mode 100644 drivers/block/rnull/Kconfig create mode 100644 drivers/block/rnull/Makefile rename drivers/block/{ => rnull}/rnull.rs (100%) diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d79..7f3121d0a784 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4343,7 +4343,7 @@ W: https://rust-for-linux.com B: https://github.com/Rust-for-Linux/linux/issues C: https://rust-for-linux.zulipchat.com/#narrow/stream/Block T: git https://github.com/Rust-for-Linux/linux.git rust-block-next -F: drivers/block/rnull.rs +F: drivers/block/rnull/ F: rust/kernel/block.rs F: rust/kernel/block/ diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index df38fb364904..77d694448990 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -17,6 +17,7 @@ menuconfig BLK_DEV if BLK_DEV source "drivers/block/null_blk/Kconfig" +source "drivers/block/rnull/Kconfig" config BLK_DEV_FD tristate "Normal floppy disk support" @@ -311,15 +312,6 @@ config VIRTIO_BLK This is the virtual block driver for virtio. It can be used with QEMU based VMMs (like KVM or Xen). Say Y or M. -config BLK_DEV_RUST_NULL - tristate "Rust null block driver (Experimental)" - depends on RUST - help - This is the Rust implementation of the null block driver. For now it - is only a minimal stub. - - If unsure, say N. 
- config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK diff --git a/drivers/block/Makefile b/drivers/block/Makefile index a695ce74ef22..2d8096eb8cdf 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -9,9 +9,6 @@ # needed for trace events ccflags-y += -I$(src) -obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o -rnull_mod-y := rnull.o - obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o @@ -38,6 +35,7 @@ obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull/ obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o diff --git a/drivers/block/rnull/Kconfig b/drivers/block/rnull/Kconfig new file mode 100644 index 000000000000..6dc5aff96bf4 --- /dev/null +++ b/drivers/block/rnull/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Rust null block device driver configuration + +config BLK_DEV_RUST_NULL + tristate "Rust null block driver (Experimental)" + depends on RUST + help + This is the Rust implementation of the null block driver. Like + the C version, the driver allows the user to create virutal block + devices that can be configured via various configuration options. + + If unsure, say N. diff --git a/drivers/block/rnull/Makefile b/drivers/block/rnull/Makefile new file mode 100644 index 000000000000..11cfa5e615dc --- /dev/null +++ b/drivers/block/rnull/Makefile @@ -0,0 +1,3 @@ + +obj-$(CONFIG_BLK_DEV_RUST_NULL) += rnull_mod.o +rnull_mod-y := rnull.o diff --git a/drivers/block/rnull.rs b/drivers/block/rnull/rnull.rs similarity index 100% rename from drivers/block/rnull.rs rename to drivers/block/rnull/rnull.rs From d969d504bc13b2f0c1f208e009e73f2625b421c0 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:07 +0200 Subject: [PATCH 031/164] rnull: enable configuration via `configfs` Allow rust null block devices to be configured and instantiated via `configfs`. Reviewed-by: Daniel Almeida Reviewed-by: Alice Ryhl Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-13-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- drivers/block/rnull/Kconfig | 2 +- drivers/block/rnull/configfs.rs | 207 +++++++++++++++++++++++++++++++ drivers/block/rnull/rnull.rs | 59 ++++----- rust/kernel/block/mq/gen_disk.rs | 2 +- 4 files changed, 240 insertions(+), 30 deletions(-) create mode 100644 drivers/block/rnull/configfs.rs diff --git a/drivers/block/rnull/Kconfig b/drivers/block/rnull/Kconfig index 6dc5aff96bf4..7bc5b376c128 100644 --- a/drivers/block/rnull/Kconfig +++ b/drivers/block/rnull/Kconfig @@ -4,7 +4,7 @@ config BLK_DEV_RUST_NULL tristate "Rust null block driver (Experimental)" - depends on RUST + depends on RUST && CONFIGFS_FS help This is the Rust implementation of the null block driver. 
Like the C version, the driver allows the user to create virutal block diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs new file mode 100644 index 000000000000..46710a1e1af4 --- /dev/null +++ b/drivers/block/rnull/configfs.rs @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 + +use super::{NullBlkDevice, THIS_MODULE}; +use core::fmt::Write; +use kernel::{ + block::mq::gen_disk::{GenDisk, GenDiskBuilder}, + c_str, + configfs::{self, AttributeOperations}, + configfs_attrs, new_mutex, + page::PAGE_SIZE, + prelude::*, + str::{kstrtobool_bytes, CString}, + sync::Mutex, +}; +use pin_init::PinInit; + +pub(crate) fn subsystem() -> impl PinInit, Error> { + let item_type = configfs_attrs! { + container: configfs::Subsystem, + data: Config, + child: DeviceConfig, + attributes: [ + features: 0, + ], + }; + + kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {})) +} + +#[pin_data] +pub(crate) struct Config {} + +#[vtable] +impl AttributeOperations<0> for Config { + type Data = Config; + + fn show(_this: &Config, page: &mut [u8; PAGE_SIZE]) -> Result { + let mut writer = kernel::str::Formatter::new(page); + writer.write_str("blocksize,size,rotational\n")?; + Ok(writer.bytes_written()) + } +} + +#[vtable] +impl configfs::GroupOperations for Config { + type Child = DeviceConfig; + + fn make_group( + &self, + name: &CStr, + ) -> Result, Error>> { + let item_type = configfs_attrs! { + container: configfs::Group, + data: DeviceConfig, + attributes: [ + // Named for compatibility with C null_blk + power: 0, + blocksize: 1, + rotational: 2, + size: 3, + ], + }; + + Ok(configfs::Group::new( + name.try_into()?, + item_type, + // TODO: cannot coerce new_mutex!() to impl PinInit<_, Error>, so put mutex inside + try_pin_init!( DeviceConfig { + data <- new_mutex!(DeviceConfigInner { + powered: false, + block_size: 4096, + rotational: false, + disk: None, + capacity_mib: 4096, + name: name.try_into()?, + }), + }), + )) + } +} + +#[pin_data] +pub(crate) struct DeviceConfig { + #[pin] + data: Mutex, +} + +#[pin_data] +struct DeviceConfigInner { + powered: bool, + name: CString, + block_size: u32, + rotational: bool, + capacity_mib: u64, + disk: Option>, +} + +#[vtable] +impl configfs::AttributeOperations<0> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result { + let mut writer = kernel::str::Formatter::new(page); + + if this.data.lock().powered { + writer.write_str("1\n")?; + } else { + writer.write_str("0\n")?; + } + + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + let power_op = kstrtobool_bytes(page)?; + let mut guard = this.data.lock(); + + if !guard.powered && power_op { + guard.disk = Some(NullBlkDevice::new( + &guard.name, + guard.block_size, + guard.rotational, + guard.capacity_mib, + )?); + guard.powered = true; + } else if guard.powered && !power_op { + drop(guard.disk.take()); + guard.powered = false; + } + + Ok(()) + } +} + +#[vtable] +impl configfs::AttributeOperations<1> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result { + let mut writer = kernel::str::Formatter::new(page); + writer.write_fmt(fmt!("{}\n", this.data.lock().block_size))?; + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + let text = core::str::from_utf8(page)?.trim(); + let value = 
text.parse::().map_err(|_| EINVAL)?; + + GenDiskBuilder::validate_block_size(value)?; + this.data.lock().block_size = value; + Ok(()) + } +} + +#[vtable] +impl configfs::AttributeOperations<2> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result { + let mut writer = kernel::str::Formatter::new(page); + + if this.data.lock().rotational { + writer.write_str("1\n")?; + } else { + writer.write_str("0\n")?; + } + + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + this.data.lock().rotational = kstrtobool_bytes(page)?; + + Ok(()) + } +} + +#[vtable] +impl configfs::AttributeOperations<3> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result { + let mut writer = kernel::str::Formatter::new(page); + writer.write_fmt(fmt!("{}\n", this.data.lock().capacity_mib))?; + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + let text = core::str::from_utf8(page)?.trim(); + let value = text.parse::().map_err(|_| EINVAL)?; + + this.data.lock().capacity_mib = value; + Ok(()) + } +} diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs index d07e76ae2c13..8690ff5f974f 100644 --- a/drivers/block/rnull/rnull.rs +++ b/drivers/block/rnull/rnull.rs @@ -1,28 +1,25 @@ // SPDX-License-Identifier: GPL-2.0 //! This is a Rust implementation of the C null block driver. -//! -//! Supported features: -//! -//! - blk-mq interface -//! - direct completion -//! - block size 4k -//! -//! The driver is not configurable. + +mod configfs; use kernel::{ - alloc::flags, - block::mq::{ + block::{ self, - gen_disk::{self, GenDisk}, - Operations, TagSet, + mq::{ + self, + gen_disk::{self, GenDisk}, + Operations, TagSet, + }, }, error::Result, - new_mutex, pr_info, + pr_info, prelude::*, - sync::{Arc, Mutex}, + sync::Arc, types::ARef, }; +use pin_init::PinInit; module! { type: NullBlkModule, @@ -35,33 +32,39 @@ module! { #[pin_data] struct NullBlkModule { #[pin] - _disk: Mutex>, + configfs_subsystem: kernel::configfs::Subsystem, } impl kernel::InPlaceModule for NullBlkModule { fn init(_module: &'static ThisModule) -> impl PinInit { pr_info!("Rust null_blk loaded\n"); - // Use a immediately-called closure as a stable `try` block - let disk = /* try */ (|| { - let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; - - gen_disk::GenDiskBuilder::new() - .capacity_sectors(4096 << 11) - .logical_block_size(4096)? - .physical_block_size(4096)? - .rotational(false) - .build(format_args!("rnullb{}", 0), tagset) - })(); - try_pin_init!(Self { - _disk <- new_mutex!(disk?, "nullb:disk"), + configfs_subsystem <- configfs::subsystem(), }) } } struct NullBlkDevice; +impl NullBlkDevice { + fn new( + name: &CStr, + block_size: u32, + rotational: bool, + capacity_mib: u64, + ) -> Result> { + let tagset = Arc::pin_init(TagSet::new(1, 256, 1), GFP_KERNEL)?; + + gen_disk::GenDiskBuilder::new() + .capacity_sectors(capacity_mib << (20 - block::SECTOR_SHIFT)) + .logical_block_size(block_size)? + .physical_block_size(block_size)? 
+ .rotational(rotational) + .build(fmt!("{}", name.to_str()?), tagset) + } +} + #[vtable] impl Operations for NullBlkDevice { #[inline(always)] diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 20f1d46c774d..6b1b846874db 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -51,7 +51,7 @@ impl GenDiskBuilder { /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, /// and that it is a power of two. - fn validate_block_size(size: u32) -> Result { + pub fn validate_block_size(size: u32) -> Result { if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { Err(error::code::EINVAL) } else { From 90d952fac8ac1aa6cb21aad7010d33af4d309f4a Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:08 +0200 Subject: [PATCH 032/164] rust: block: add `GenDisk` private data support Allow users of the rust block device driver API to install private data in the `GenDisk` structure. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-14-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- drivers/block/rnull/rnull.rs | 8 ++++-- rust/kernel/block/mq.rs | 7 +++-- rust/kernel/block/mq/gen_disk.rs | 32 ++++++++++++++++++--- rust/kernel/block/mq/operations.rs | 46 ++++++++++++++++++++++++------ 4 files changed, 74 insertions(+), 19 deletions(-) diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs index 8690ff5f974f..8255236bc550 100644 --- a/drivers/block/rnull/rnull.rs +++ b/drivers/block/rnull/rnull.rs @@ -61,14 +61,16 @@ impl NullBlkDevice { .logical_block_size(block_size)? .physical_block_size(block_size)? .rotational(rotational) - .build(fmt!("{}", name.to_str()?), tagset) + .build(fmt!("{}", name.to_str()?), tagset, ()) } } #[vtable] impl Operations for NullBlkDevice { + type QueueData = (); + #[inline(always)] - fn queue_rq(rq: ARef>, _is_last: bool) -> Result { + fn queue_rq(_queue_data: (), rq: ARef>, _is_last: bool) -> Result { mq::Request::end_ok(rq) .map_err(|_e| kernel::error::code::EIO) // We take no refcounts on the request, so we expect to be able to @@ -79,5 +81,5 @@ impl Operations for NullBlkDevice { Ok(()) } - fn commit_rqs() {} + fn commit_rqs(_queue_data: ()) {} } diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs index 98fa0d6bc8f7..6e546f4f3d1c 100644 --- a/rust/kernel/block/mq.rs +++ b/rust/kernel/block/mq.rs @@ -69,20 +69,21 @@ //! //! #[vtable] //! impl Operations for MyBlkDevice { +//! type QueueData = (); //! -//! fn queue_rq(rq: ARef>, _is_last: bool) -> Result { +//! fn queue_rq(_queue_data: (), rq: ARef>, _is_last: bool) -> Result { //! Request::end_ok(rq); //! Ok(()) //! } //! -//! fn commit_rqs() {} +//! fn commit_rqs(_queue_data: ()) {} //! } //! //! let tagset: Arc> = //! Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; //! let mut disk = gen_disk::GenDiskBuilder::new() //! .capacity_sectors(4096) -//! .build(format_args!("myblk"), tagset)?; +//! .build(format_args!("myblk"), tagset, ())?; //! //! # Ok::<(), kernel::error::Error>(()) //! 
``` diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 6b1b846874db..46ec80269970 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -13,6 +13,7 @@ use crate::{ static_lock_class, str::NullTerminatedFormatter, sync::Arc, + types::{ForeignOwnable, ScopeGuard}, }; use core::fmt::{self, Write}; @@ -98,7 +99,14 @@ impl GenDiskBuilder { self, name: fmt::Arguments<'_>, tagset: Arc>, + queue_data: T::QueueData, ) -> Result> { + let data = queue_data.into_foreign(); + let recover_data = ScopeGuard::new(|| { + // SAFETY: T::QueueData was created by the call to `into_foreign()` above + drop(unsafe { T::QueueData::from_foreign(data) }); + }); + // SAFETY: `bindings::queue_limits` contain only fields that are valid when zeroed. let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() }; @@ -113,7 +121,7 @@ impl GenDiskBuilder { bindings::__blk_mq_alloc_disk( tagset.raw_tag_set(), &mut lim, - core::ptr::null_mut(), + data, static_lock_class!().as_ptr(), ) })?; @@ -167,8 +175,12 @@ impl GenDiskBuilder { }, )?; + recover_data.dismiss(); + // INVARIANT: `gendisk` was initialized above. // INVARIANT: `gendisk` was added to the VFS via `device_add_disk` above. + // INVARIANT: `gendisk.queue.queue_data` is set to `data` in the call to + // `__blk_mq_alloc_disk` above. Ok(GenDisk { _tagset: tagset, gendisk, @@ -180,9 +192,10 @@ impl GenDiskBuilder { /// /// # Invariants /// -/// - `gendisk` must always point to an initialized and valid `struct gendisk`. -/// - `gendisk` was added to the VFS through a call to -/// `bindings::device_add_disk`. +/// - `gendisk` must always point to an initialized and valid `struct gendisk`. +/// - `gendisk` was added to the VFS through a call to +/// `bindings::device_add_disk`. +/// - `self.gendisk.queue.queuedata` is initialized by a call to `ForeignOwnable::into_foreign`. pub struct GenDisk { _tagset: Arc>, gendisk: *mut bindings::gendisk, @@ -194,9 +207,20 @@ unsafe impl Send for GenDisk {} impl Drop for GenDisk { fn drop(&mut self) { + // SAFETY: By type invariant of `Self`, `self.gendisk` points to a valid + // and initialized instance of `struct gendisk`, and, `queuedata` was + // initialized with the result of a call to + // `ForeignOwnable::into_foreign`. + let queue_data = unsafe { (*(*self.gendisk).queue).queuedata }; + // SAFETY: By type invariant, `self.gendisk` points to a valid and // initialized instance of `struct gendisk`, and it was previously added // to the VFS. unsafe { bindings::del_gendisk(self.gendisk) }; + + // SAFETY: `queue.queuedata` was created by `GenDiskBuilder::build` with + // a call to `ForeignOwnable::into_foreign` to create `queuedata`. + // `ForeignOwnable::from_foreign` is only called here. + drop(unsafe { T::QueueData::from_foreign(queue_data) }); } } diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs index c2b98f507bcb..9e31d24e1164 100644 --- a/rust/kernel/block/mq/operations.rs +++ b/rust/kernel/block/mq/operations.rs @@ -6,14 +6,15 @@ use crate::{ bindings, - block::mq::request::RequestDataWrapper, - block::mq::Request, + block::mq::{request::RequestDataWrapper, Request}, error::{from_result, Result}, prelude::*, - types::ARef, + types::{ARef, ForeignOwnable}, }; use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering}; +type ForeignBorrowed<'a, T> = ::Borrowed<'a>; + /// Implement this trait to interface blk-mq as block devices. 
/// /// To implement a block device driver, implement this trait as described in the @@ -26,12 +27,20 @@ use core::{marker::PhantomData, sync::atomic::AtomicU64, sync::atomic::Ordering} /// [module level documentation]: kernel::block::mq #[macros::vtable] pub trait Operations: Sized { + /// Data associated with the `struct request_queue` that is allocated for + /// the `GenDisk` associated with this `Operations` implementation. + type QueueData: ForeignOwnable; + /// Called by the kernel to queue a request with the driver. If `is_last` is /// `false`, the driver is allowed to defer committing the request. - fn queue_rq(rq: ARef>, is_last: bool) -> Result; + fn queue_rq( + queue_data: ForeignBorrowed<'_, Self::QueueData>, + rq: ARef>, + is_last: bool, + ) -> Result; /// Called by the kernel to indicate that queued requests should be submitted. - fn commit_rqs(); + fn commit_rqs(queue_data: ForeignBorrowed<'_, Self::QueueData>); /// Called by the kernel to poll the device for completed requests. Only /// used for poll queues. @@ -70,7 +79,7 @@ impl OperationsVTable { /// promise to not access the request until the driver calls /// `bindings::blk_mq_end_request` for the request. unsafe extern "C" fn queue_rq_callback( - _hctx: *mut bindings::blk_mq_hw_ctx, + hctx: *mut bindings::blk_mq_hw_ctx, bd: *const bindings::blk_mq_queue_data, ) -> bindings::blk_status_t { // SAFETY: `bd.rq` is valid as required by the safety requirement for @@ -88,10 +97,20 @@ impl OperationsVTable { // reference counted by `ARef` until then. let rq = unsafe { Request::aref_from_raw((*bd).rq) }; + // SAFETY: `hctx` is valid as required by this function. + let queue_data = unsafe { (*(*hctx).queue).queuedata }; + + // SAFETY: `queue.queuedata` was created by `GenDiskBuilder::build` with + // a call to `ForeignOwnable::into_foreign` to create `queuedata`. + // `ForeignOwnable::from_foreign` is only called when the tagset is + // dropped, which happens after we are dropped. + let queue_data = unsafe { T::QueueData::borrow(queue_data) }; + // SAFETY: We have exclusive access and we just set the refcount above. unsafe { Request::start_unchecked(&rq) }; let ret = T::queue_rq( + queue_data, rq, // SAFETY: `bd` is valid as required by the safety requirement for // this function. @@ -110,9 +129,18 @@ impl OperationsVTable { /// /// # Safety /// - /// This function may only be called by blk-mq C infrastructure. - unsafe extern "C" fn commit_rqs_callback(_hctx: *mut bindings::blk_mq_hw_ctx) { - T::commit_rqs() + /// This function may only be called by blk-mq C infrastructure. The caller + /// must ensure that `hctx` is valid. + unsafe extern "C" fn commit_rqs_callback(hctx: *mut bindings::blk_mq_hw_ctx) { + // SAFETY: `hctx` is valid as required by this function. + let queue_data = unsafe { (*(*hctx).queue).queuedata }; + + // SAFETY: `queue.queuedata` was created by `GenDisk::try_new()` with a + // call to `ForeignOwnable::into_foreign()` to create `queuedata`. + // `ForeignOwnable::from_foreign()` is only called when the tagset is + // dropped, which happens after we are dropped. + let queue_data = unsafe { T::QueueData::borrow(queue_data) }; + T::commit_rqs(queue_data) } /// This function is called by the C kernel. It is not currently From bde50e28f7c5fe874112fe9d98e84873548fa8de Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:09 +0200 Subject: [PATCH 033/164] rust: block: mq: fix spelling in a safety comment Add code block quotes to a safety comment. 
Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-15-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/block/mq/request.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs index f723d74091c1..3848cfe63f77 100644 --- a/rust/kernel/block/mq/request.rs +++ b/rust/kernel/block/mq/request.rs @@ -148,7 +148,7 @@ impl Request { // valid allocation. let wrapper_ptr = unsafe { bindings::blk_mq_rq_to_pdu(request_ptr).cast::() }; - // SAFETY: By C API contract, wrapper_ptr points to a valid allocation + // SAFETY: By C API contract, `wrapper_ptr` points to a valid allocation // and is not null. unsafe { NonNull::new_unchecked(wrapper_ptr) } } From 4ec052841a545297a276e833817990f0e13b1b32 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:10 +0200 Subject: [PATCH 034/164] rust: block: add remote completion to `Request` Allow users of rust block device driver API to schedule completion of requests via `blk_mq_complete_request_remote`. Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-16-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- drivers/block/rnull/rnull.rs | 9 +++++++++ rust/kernel/block/mq.rs | 6 ++++++ rust/kernel/block/mq/operations.rs | 19 +++++++++++++++---- rust/kernel/block/mq/request.rs | 17 +++++++++++++++++ 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs index 8255236bc550..a19c55717c4f 100644 --- a/drivers/block/rnull/rnull.rs +++ b/drivers/block/rnull/rnull.rs @@ -82,4 +82,13 @@ impl Operations for NullBlkDevice { } fn commit_rqs(_queue_data: ()) {} + + fn complete(rq: ARef>) { + mq::Request::end_ok(rq) + .map_err(|_e| kernel::error::code::EIO) + // We take no refcounts on the request, so we expect to be able to + // end the request. The request reference must be unique at this + // point, and so `end_ok` cannot fail. + .expect("Fatal error - expected to be able to end request"); + } } diff --git a/rust/kernel/block/mq.rs b/rust/kernel/block/mq.rs index 6e546f4f3d1c..c0ec06b84355 100644 --- a/rust/kernel/block/mq.rs +++ b/rust/kernel/block/mq.rs @@ -77,6 +77,12 @@ //! } //! //! fn commit_rqs(_queue_data: ()) {} +//! +//! fn complete(rq: ARef>) { +//! Request::end_ok(rq) +//! .map_err(|_e| kernel::error::code::EIO) +//! .expect("Fatal error - expected to be able to end request"); +//! } //! } //! //! let tagset: Arc> = diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs index 9e31d24e1164..d098a8a3e434 100644 --- a/rust/kernel/block/mq/operations.rs +++ b/rust/kernel/block/mq/operations.rs @@ -42,6 +42,9 @@ pub trait Operations: Sized { /// Called by the kernel to indicate that queued requests should be submitted. fn commit_rqs(queue_data: ForeignBorrowed<'_, Self::QueueData>); + /// Called by the kernel when the request is completed. + fn complete(rq: ARef>); + /// Called by the kernel to poll the device for completed requests. Only /// used for poll queues. fn poll() -> bool { @@ -143,13 +146,21 @@ impl OperationsVTable { T::commit_rqs(queue_data) } - /// This function is called by the C kernel. It is not currently - /// implemented, and there is no way to exercise this code path. + /// This function is called by the C kernel. 
A pointer to this function is + /// installed in the `blk_mq_ops` vtable for the driver. /// /// # Safety /// - /// This function may only be called by blk-mq C infrastructure. - unsafe extern "C" fn complete_callback(_rq: *mut bindings::request) {} + /// This function may only be called by blk-mq C infrastructure. `rq` must + /// point to a valid request that has been marked as completed. The pointee + /// of `rq` must be valid for write for the duration of this function. + unsafe extern "C" fn complete_callback(rq: *mut bindings::request) { + // SAFETY: This function can only be dispatched through + // `Request::complete`. We leaked a refcount then which we pick back up + // now. + let aref = unsafe { Request::aref_from_raw(rq) }; + T::complete(aref); + } /// This function is called by the C kernel. A pointer to this function is /// installed in the `blk_mq_ops` vtable for the driver. diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs index 3848cfe63f77..f7f757f7459f 100644 --- a/rust/kernel/block/mq/request.rs +++ b/rust/kernel/block/mq/request.rs @@ -135,6 +135,23 @@ impl Request { Ok(()) } + /// Complete the request by scheduling `Operations::complete` for + /// execution. + /// + /// The function may be scheduled locally, via SoftIRQ or remotely via IPI. + /// See `blk_mq_complete_request_remote` in [`blk-mq.c`] for details. + /// + /// [`blk-mq.c`]: srctree/block/blk-mq.c + pub fn complete(this: ARef) { + let ptr = ARef::into_raw(this).cast::().as_ptr(); + // SAFETY: By type invariant, `self.0` is a valid `struct request` + if !unsafe { bindings::blk_mq_complete_request_remote(ptr) } { + // SAFETY: We released a refcount above that we can reclaim here. + let this = unsafe { Request::aref_from_raw(ptr) }; + T::complete(this); + } + } + /// Return a pointer to the [`RequestDataWrapper`] stored in the private area /// of the request structure. /// From 34585dc649fb255b40075dab56af063c1bfc9933 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Tue, 2 Sep 2025 11:55:11 +0200 Subject: [PATCH 035/164] rnull: add soft-irq completion support rnull currently only supports direct completion. Add an option for completing requests across CPU nodes via soft IRQ or IPI.
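For orientation, a condensed sketch of the completion flow this enables. It is not part of the patch (the actual rnull change follows below) and assumes an `Operations` implementation whose `QueueData` is a boxed `QueueData` struct, as in the driver code later in this series:

    // Sketch: queue_rq() defers the request, Request::complete() schedules
    // Operations::complete() (possibly on another CPU via soft IRQ or IPI),
    // and complete() finally ends the request.
    fn queue_rq(_queue_data: &QueueData, rq: ARef<mq::Request<Self>>, _is_last: bool) -> Result {
        mq::Request::complete(rq);
        Ok(())
    }

    fn complete(rq: ARef<mq::Request<Self>>) {
        mq::Request::end_ok(rq)
            .map_err(|_e| kernel::error::code::EIO)
            .expect("request reference must be unique at this point");
    }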
Reviewed-by: Alice Ryhl Reviewed-by: Daniel Almeida Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20250902-rnull-up-v6-16-v7-17-b5212cc89b98@kernel.org Signed-off-by: Jens Axboe --- drivers/block/rnull/configfs.rs | 59 +++++++++++++++++++++++++++++++-- drivers/block/rnull/rnull.rs | 32 ++++++++++++------ 2 files changed, 78 insertions(+), 13 deletions(-) diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs index 46710a1e1af4..8498e9bae6fd 100644 --- a/drivers/block/rnull/configfs.rs +++ b/drivers/block/rnull/configfs.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 use super::{NullBlkDevice, THIS_MODULE}; -use core::fmt::Write; +use core::fmt::{Display, Write}; use kernel::{ block::mq::gen_disk::{GenDisk, GenDiskBuilder}, c_str, @@ -36,7 +36,7 @@ impl AttributeOperations<0> for Config { fn show(_this: &Config, page: &mut [u8; PAGE_SIZE]) -> Result { let mut writer = kernel::str::Formatter::new(page); - writer.write_str("blocksize,size,rotational\n")?; + writer.write_str("blocksize,size,rotational,irqmode\n")?; Ok(writer.bytes_written()) } } @@ -58,6 +58,7 @@ impl configfs::GroupOperations for Config { blocksize: 1, rotational: 2, size: 3, + irqmode: 4, ], }; @@ -72,6 +73,7 @@ impl configfs::GroupOperations for Config { rotational: false, disk: None, capacity_mib: 4096, + irq_mode: IRQMode::None, name: name.try_into()?, }), }), @@ -79,6 +81,34 @@ impl configfs::GroupOperations for Config { } } +#[derive(Debug, Clone, Copy)] +pub(crate) enum IRQMode { + None, + Soft, +} + +impl TryFrom for IRQMode { + type Error = kernel::error::Error; + + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(Self::None), + 1 => Ok(Self::Soft), + _ => Err(EINVAL), + } + } +} + +impl Display for IRQMode { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + Self::None => f.write_str("0")?, + Self::Soft => f.write_str("1")?, + } + Ok(()) + } +} + #[pin_data] pub(crate) struct DeviceConfig { #[pin] @@ -92,6 +122,7 @@ struct DeviceConfigInner { block_size: u32, rotational: bool, capacity_mib: u64, + irq_mode: IRQMode, disk: Option>, } @@ -121,6 +152,7 @@ impl configfs::AttributeOperations<0> for DeviceConfig { guard.block_size, guard.rotational, guard.capacity_mib, + guard.irq_mode, )?); guard.powered = true; } else if guard.powered && !power_op { @@ -205,3 +237,26 @@ impl configfs::AttributeOperations<3> for DeviceConfig { Ok(()) } } + +#[vtable] +impl configfs::AttributeOperations<4> for DeviceConfig { + type Data = DeviceConfig; + + fn show(this: &DeviceConfig, page: &mut [u8; PAGE_SIZE]) -> Result { + let mut writer = kernel::str::Formatter::new(page); + writer.write_fmt(fmt!("{}\n", this.data.lock().irq_mode))?; + Ok(writer.bytes_written()) + } + + fn store(this: &DeviceConfig, page: &[u8]) -> Result { + if this.data.lock().powered { + return Err(EBUSY); + } + + let text = core::str::from_utf8(page)?.trim(); + let value = text.parse::().map_err(|_| EINVAL)?; + + this.data.lock().irq_mode = IRQMode::try_from(value)?; + Ok(()) + } +} diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs index a19c55717c4f..1ec694d7f1a6 100644 --- a/drivers/block/rnull/rnull.rs +++ b/drivers/block/rnull/rnull.rs @@ -4,6 +4,7 @@ mod configfs; +use configfs::IRQMode; use kernel::{ block::{ self, @@ -53,35 +54,44 @@ impl NullBlkDevice { block_size: u32, rotational: bool, capacity_mib: u64, + irq_mode: IRQMode, ) -> Result> { let tagset = Arc::pin_init(TagSet::new(1, 256, 1), GFP_KERNEL)?; + let queue_data = 
Box::new(QueueData { irq_mode }, GFP_KERNEL)?; + gen_disk::GenDiskBuilder::new() .capacity_sectors(capacity_mib << (20 - block::SECTOR_SHIFT)) .logical_block_size(block_size)? .physical_block_size(block_size)? .rotational(rotational) - .build(fmt!("{}", name.to_str()?), tagset, ()) + .build(fmt!("{}", name.to_str()?), tagset, queue_data) } } +struct QueueData { + irq_mode: IRQMode, +} + #[vtable] impl Operations for NullBlkDevice { - type QueueData = (); + type QueueData = KBox; #[inline(always)] - fn queue_rq(_queue_data: (), rq: ARef>, _is_last: bool) -> Result { - mq::Request::end_ok(rq) - .map_err(|_e| kernel::error::code::EIO) - // We take no refcounts on the request, so we expect to be able to - // end the request. The request reference must be unique at this - // point, and so `end_ok` cannot fail. - .expect("Fatal error - expected to be able to end request"); - + fn queue_rq(queue_data: &QueueData, rq: ARef>, _is_last: bool) -> Result { + match queue_data.irq_mode { + IRQMode::None => mq::Request::end_ok(rq) + .map_err(|_e| kernel::error::code::EIO) + // We take no refcounts on the request, so we expect to be able to + // end the request. The request reference must be unique at this + // point, and so `end_ok` cannot fail. + .expect("Fatal error - expected to be able to end request"), + IRQMode::Soft => mq::Request::complete(rq), + } Ok(()) } - fn commit_rqs(_queue_data: ()) {} + fn commit_rqs(_queue_data: &QueueData) {} fn complete(rq: ARef>) { mq::Request::end_ok(rq) From b0b4518c992eb5f316c6e40ff186cbb7a5009518 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 2 Sep 2025 21:09:30 +0800 Subject: [PATCH 036/164] block: use int to store blk_stack_limits() return value Change the 'ret' variable in blk_stack_limits() from unsigned int to int, as it needs to store negative value -1. Storing the negative error codes in unsigned type, or performing equality comparisons (e.g., ret == -1), doesn't cause an issue at runtime [1] but can be confusing. Additionally, assigning negative error codes to unsigned type may trigger a GCC warning when the -Wsign-conversion flag is enabled. No effect on runtime. Link: https://lore.kernel.org/all/x3wogjf6vgpkisdhg3abzrx7v7zktmdnfmqeih5kosszmagqfs@oh3qxrgzkikf/ #1 Signed-off-by: Qianfeng Rong Reviewed-by: John Garry Fixes: fe0b393f2c0a ("block: Correct handling of bottom device misaligment") Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20250902130930.68317-1-rongqianfeng@vivo.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index d6438e6c276d..693bc8d20acf 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -763,7 +763,8 @@ unsupported: int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t start) { - unsigned int top, bottom, alignment, ret = 0; + unsigned int top, bottom, alignment; + int ret = 0; t->features |= (b->features & BLK_FEAT_INHERIT_MASK); From 225dc96f35afce6ffe3d798ffc0064445847a63b Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 8 Aug 2025 09:32:50 -0600 Subject: [PATCH 037/164] ublk: inline __ublk_ch_uring_cmd() ublk_ch_uring_cmd_local() is a thin wrapper around __ublk_ch_uring_cmd() that copies the ublksrv_io_cmd from user-mapped memory to the stack using READ_ONCE(). 
This ublksrv_io_cmd is passed by pointer to __ublk_ch_uring_cmd() and __ublk_ch_uring_cmd() is a large function unlikely to be inlined, so __ublk_ch_uring_cmd() will have to load the ublksrv_io_cmd fields back from the stack. Inline __ublk_ch_uring_cmd() into ublk_ch_uring_cmd_local() and load the ublksrv_io_cmd fields into local variables with READ_ONCE(). This allows the compiler to delay loading the fields until they are needed and choose whether to store them in registers or on the stack. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250808153251.282107-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 62 +++++++++++++++------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 99abd67b708b..3cf6d344d1c0 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2265,23 +2265,28 @@ static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io, return ublk_start_io(ubq, req, io); } -static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, - unsigned int issue_flags, - const struct ublksrv_io_cmd *ub_cmd) +static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, + unsigned int issue_flags) { + /* May point to userspace-mapped memory */ + const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); u16 buf_idx = UBLK_INVALID_BUF_IDX; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; struct ublk_io *io; u32 cmd_op = cmd->cmd_op; - unsigned tag = ub_cmd->tag; + u16 q_id = READ_ONCE(ub_src->q_id); + u16 tag = READ_ONCE(ub_src->tag); + s32 result = READ_ONCE(ub_src->result); + u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */ struct request *req; int ret; bool compl; + WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); + pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", - __func__, cmd->cmd_op, ub_cmd->q_id, tag, - ub_cmd->result); + __func__, cmd->cmd_op, q_id, tag, result); ret = ublk_check_cmd_op(cmd_op); if (ret) @@ -2292,14 +2297,13 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * so no need to validate the q_id, tag, or task */ if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF) - return ublk_unregister_io_buf(cmd, ub, ub_cmd->addr, - issue_flags); + return ublk_unregister_io_buf(cmd, ub, addr, issue_flags); ret = -EINVAL; - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) + if (q_id >= ub->dev_info.nr_hw_queues) goto out; - ubq = ublk_get_queue(ub, ub_cmd->q_id); + ubq = ublk_get_queue(ub, q_id); if (tag >= ubq->q_depth) goto out; @@ -2307,10 +2311,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, io = &ubq->ios[tag]; /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { - ret = ublk_check_fetch_buf(ubq, ub_cmd->addr); + ret = ublk_check_fetch_buf(ubq, addr); if (ret) goto out; - ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr); + ret = ublk_fetch(cmd, ubq, io, addr); if (ret) goto out; @@ -2324,7 +2328,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * so can be handled on any task */ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) - return ublk_register_io_buf(cmd, ubq, io, ub_cmd->addr, + return ublk_register_io_buf(cmd, ubq, io, addr, issue_flags); goto out; @@ -2346,22 +2350,22 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return 
ublk_daemon_register_io_buf(cmd, ubq, io, ub_cmd->addr, + return ublk_daemon_register_io_buf(cmd, ubq, io, addr, issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: - ret = ublk_check_commit_and_fetch(ubq, io, ub_cmd->addr); + ret = ublk_check_commit_and_fetch(ubq, io, addr); if (ret) goto out; - io->res = ub_cmd->result; + io->res = result; req = ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, &buf_idx); + ret = ublk_config_io_buf(ubq, io, cmd, addr, &buf_idx); compl = ublk_need_complete_req(ubq, io); /* can't touch 'ublk_io' any more */ if (buf_idx != UBLK_INVALID_BUF_IDX) io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); if (req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = ub_cmd->zone_append_lba; + req->__sector = addr; if (compl) __ublk_complete_rq(req); @@ -2375,7 +2379,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, * request */ req = ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, ub_cmd->addr, NULL); + ret = ublk_config_io_buf(ubq, io, cmd, addr, NULL); WARN_ON_ONCE(ret); if (likely(ublk_get_data(ubq, io, req))) { __ublk_prep_compl_io_cmd(io, req); @@ -2426,26 +2430,6 @@ fail_put: return NULL; } -static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - /* - * Not necessary for async retry, but let's keep it simple and always - * copy the values to avoid any potential reuse. - */ - const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); - const struct ublksrv_io_cmd ub_cmd = { - .q_id = READ_ONCE(ub_src->q_id), - .tag = READ_ONCE(ub_src->tag), - .result = READ_ONCE(ub_src->result), - .addr = READ_ONCE(ub_src->addr) - }; - - WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED); - - return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); -} - static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, unsigned int issue_flags) { From 7d337eef4affc5e26e0570513168c69ddbc40f92 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 21 Aug 2025 14:06:11 +0800 Subject: [PATCH 038/164] blk-mq: fix elevator depth_updated method Current depth_updated has some problems: 1) depth_updated() will be called for each hctx, while all elevators will update async_depth for the disk level, this is not related to hctx; 2) In blk_mq_update_nr_requests(), if previous hctx update succeed and this hctx update failed, q->nr_requests will not be updated, while async_depth is already updated with new nr_reqeuests in previous depth_updated(); 3) All elevators are using q->nr_requests to calculate async_depth now, however, q->nr_requests is still the old value when depth_updated() is called from blk_mq_update_nr_requests(); Those problems are first from error path, then mq-deadline, and recently for bfq and kyber, fix those problems by: - pass in request_queue instead of hctx; - move depth_updated() after q->nr_requests is updated in blk_mq_update_nr_requests(); - add depth_updated() call inside init_sched() method to initialize async_depth; - remove init_hctx() method for mq-deadline and bfq that is useless now; Fixes: 77f1e0a52d26 ("bfq: update internal depth state when queue depth changes") Fixes: 39823b47bbd4 ("block/mq-deadline: Fix the tag reservation code") Fixes: 42e6c6ce03fd ("lib/sbitmap: convert shallow_depth from one word to the whole sbitmap") Signed-off-by: Yu Kuai Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan Reviewed-by: Nilay Shroff Link: https://lore.kernel.org/r/20250821060612.1729939-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 22 
+++++----------------- block/blk-mq-sched.h | 11 +++++++++++ block/blk-mq.c | 25 +++++++++++++------------ block/elevator.h | 2 +- block/kyber-iosched.c | 19 +++++++++---------- block/mq-deadline.c | 16 +++------------- 6 files changed, 42 insertions(+), 53 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50e51047e1fe..4a8d3d96bfe4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7109,9 +7109,10 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ -static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) +static void bfq_depth_updated(struct request_queue *q) { - unsigned int nr_requests = bfqd->queue->nr_requests; + struct bfq_data *bfqd = q->elevator->elevator_data; + unsigned int nr_requests = q->nr_requests; /* * In-word depths if no bfq_queue is being weight-raised: @@ -7143,21 +7144,8 @@ static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); -} -static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - - bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); -} - -static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) -{ - bfq_depth_updated(hctx); - return 0; + blk_mq_set_min_shallow_depth(q, 1); } static void bfq_exit_queue(struct elevator_queue *e) @@ -7369,6 +7357,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) goto out_free; bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + bfq_depth_updated(q); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); @@ -7628,7 +7617,6 @@ static struct elevator_type iosched_bfq_mq = { .request_merged = bfq_request_merged, .has_work = bfq_has_work, .depth_updated = bfq_depth_updated, - .init_hctx = bfq_init_hctx, .init_sched = bfq_init_queue, .exit_sched = bfq_exit_queue, }, diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index b554e1d55950..fe83187f41db 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -92,4 +92,15 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } +static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, + unsigned int depth) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + queue_for_each_hw_ctx(q, hctx, i) + sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, + depth); +} + #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index ba3a4b77f578..9055cd624700 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4951,20 +4951,21 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) false); } if (ret) - break; - if (q->elevator && q->elevator->type->ops.depth_updated) - q->elevator->type->ops.depth_updated(hctx); - } - if (!ret) { - q->nr_requests = nr; - if (blk_mq_is_shared_tags(set->flags)) { - if (q->elevator) - blk_mq_tag_update_sched_shared_tags(q); - else - blk_mq_tag_resize_shared_tags(set, nr); - } + goto out; } + 
q->nr_requests = nr; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); + + if (blk_mq_is_shared_tags(set->flags)) { + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q); + else + blk_mq_tag_resize_shared_tags(set, nr); + } + +out: blk_mq_unquiesce_queue(q); return ret; diff --git a/block/elevator.h b/block/elevator.h index adc5c157e17e..c4d20155065e 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -37,7 +37,7 @@ struct elevator_mq_ops { void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); - void (*depth_updated)(struct blk_mq_hw_ctx *); + void (*depth_updated)(struct request_queue *); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 70cbc7b2deb4..18efd6ef2a2b 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -399,6 +399,14 @@ err: return ERR_PTR(ret); } +static void kyber_depth_updated(struct request_queue *q) +{ + struct kyber_queue_data *kqd = q->elevator->elevator_data; + + kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U; + blk_mq_set_min_shallow_depth(q, kqd->async_depth); +} + static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) { struct kyber_queue_data *kqd; @@ -413,6 +421,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) eq->elevator_data = kqd; q->elevator = eq; + kyber_depth_updated(q); return 0; } @@ -440,15 +449,6 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - - kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); -} - static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct kyber_hctx_data *khd; @@ -493,7 +493,6 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - kyber_depth_updated(hctx); return 0; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b9b7cdf1d3c9..2e689b2c4021 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -507,22 +507,12 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) } /* Called by blk_mq_update_nr_requests(). */ -static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +static void dd_depth_updated(struct request_queue *q) { - struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; dd->async_depth = q->nr_requests; - - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); -} - -/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). 
*/ -static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - dd_depth_updated(hctx); - return 0; + blk_mq_set_min_shallow_depth(q, 1); } static void dd_exit_sched(struct elevator_queue *e) @@ -587,6 +577,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; + dd_depth_updated(q); return 0; } @@ -1048,7 +1039,6 @@ static struct elevator_type mq_deadline = { .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, - .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS From ba28afbd9eff2a6370f23ef4e6a036ab0cfda409 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 21 Aug 2025 14:06:12 +0800 Subject: [PATCH 039/164] blk-mq: fix blk_mq_tags double free while nr_requests grown In the case user trigger tags grow by queue sysfs attribute nr_requests, hctx->sched_tags will be freed directly and replaced with a new allocated tags, see blk_mq_tag_update_depth(). The problem is that hctx->sched_tags is from elevator->et->tags, while et->tags is still the freed tags, hence later elevator exit will try to free the tags again, causing kernel panic. Fix this problem by replacing et->tags with new allocated tags as well. Noted there are still some long term problems that will require some refactor to be fixed thoroughly[1]. [1] https://lore.kernel.org/all/20250815080216.410665-1-yukuai1@huaweicloud.com/ Fixes: f5a6604f7a44 ("block: fix lockdep warning caused by lock dependency in elv_iosched_store") Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Nilay Shroff Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan Link: https://lore.kernel.org/r/20250821060612.1729939-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d880c50629d6..5cffa5668d0c 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -622,6 +622,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, return -ENOMEM; blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); + hctx->queue->elevator->et->tags[hctx->queue_num] = new; *tagsptr = new; } else { /* From e57b225c28b270d46c84ec4742607eab4093ae7c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:26:59 +0800 Subject: [PATCH 040/164] md/md-bitmap: remove the parameter 'init' for bitmap_ops->resize() It's set to 'false' for all callers, hence it's useless and can be removed. 
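For example, raid1_resize() (see the hunk below) changes from

	ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);

to

	ret = mddev->bitmap_ops->resize(mddev, newsize, 0);

while the internal __bitmap_resize() is now simply called with init == false.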
Link: https://lore.kernel.org/linux-raid/20250707012711.376844-3-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/dm-raid.c | 2 +- drivers/md/md-bitmap.c | 5 ++--- drivers/md/md-bitmap.h | 3 +-- drivers/md/md-cluster.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 8 ++++---- drivers/md/raid5.c | 2 +- 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 79ea85d18e24..05811316dedd 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -4071,7 +4071,7 @@ static int raid_preresume(struct dm_target *ti) int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, - chunksize, false); + chunksize); if (r) DMERR("Failed to resize bitmap"); } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 334b71404930..4b87a09589d5 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2593,15 +2593,14 @@ err: return ret; } -static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, - bool init) +static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return 0; - return __bitmap_resize(bitmap, blocks, chunksize, init); + return __bitmap_resize(bitmap, blocks, chunksize, false); } static ssize_t diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 59e9dd45cfde..28c1f1c1cc83 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -64,8 +64,7 @@ struct md_bitmap_stats { struct bitmap_operations { bool (*enabled)(struct mddev *mddev); int (*create)(struct mddev *mddev); - int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, - bool init); + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); int (*load)(struct mddev *mddev); void (*destroy)(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5497eaee96e7..6a7deb7e545b 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -630,7 +630,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) ret = mddev->bitmap_ops->resize(mddev, le64_to_cpu(msg->high), - 0, false); + 0); break; default: ret = -1; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 408c26398321..ce24e3d01d4a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3330,7 +3330,7 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) return ret; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b60c30bfb6c7..269187faa011 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4239,7 +4239,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, size, 0, false); + ret = mddev->bitmap_ops->resize(mddev, size, 0); if (ret) return ret; @@ -4508,7 +4508,7 @@ static int raid10_start_reshape(struct mddev *mddev) newsize = raid10_size(mddev, 0, conf->geo.raid_disks); if (!mddev_is_clustered(mddev)) { - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; else @@ -4530,13 +4530,13 @@ static int 
raid10_start_reshape(struct mddev *mddev) MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); + ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; ret = mddev->cluster_ops->resize_bitmaps(mddev, newsize, oldsize); if (ret) { - mddev->bitmap_ops->resize(mddev, oldsize, 0, false); + mddev->bitmap_ops->resize(mddev, oldsize, 0); goto abort; } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 023649fe2476..bb95069e31f3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8322,7 +8322,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); + ret = mddev->bitmap_ops->resize(mddev, sectors, 0); if (ret) return ret; From 9307dbac0ea3fea977677d6c590b49f35a8203fc Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:00 +0800 Subject: [PATCH 041/164] md/md-bitmap: merge md_bitmap_group into bitmap_operations Now that all bitmap implementations are internal, it doesn't make sense to export md_bitmap_group anymore. Link: https://lore.kernel.org/linux-raid/20250707012711.376844-5-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md-bitmap.c | 5 ++++- drivers/md/md-bitmap.h | 2 ++ drivers/md/md.c | 6 +++++- drivers/md/md.h | 1 - 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 4b87a09589d5..30b6b42b9a48 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2989,7 +2989,8 @@ static struct attribute *md_bitmap_attrs[] = { &max_backlog_used.attr, NULL }; -const struct attribute_group md_bitmap_group = { + +static struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; @@ -3025,6 +3026,8 @@ static struct bitmap_operations bitmap_ops = { .copy_from_slot = bitmap_copy_from_slot, .set_pages = bitmap_set_pages, .free = md_bitmap_free, + + .group = &md_bitmap_group, }; void mddev_set_bitmap_ops(struct mddev *mddev) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 28c1f1c1cc83..0ceb9e97d21f 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -100,6 +100,8 @@ struct bitmap_operations { sector_t *hi, bool clear_bits); void (*set_pages)(void *data, unsigned long pages); void (*free)(void *data); + + struct attribute_group *group; }; /* the bitmap API */ diff --git a/drivers/md/md.c b/drivers/md/md.c index a9b31f02a9c7..ee1454948fc5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5801,7 +5801,6 @@ static const struct attribute_group md_redundancy_group = { static const struct attribute_group *md_attr_groups[] = { &md_default_group, - &md_bitmap_group, NULL, }; @@ -6055,6 +6054,11 @@ struct mddev *md_alloc(dev_t dev, char *name) return ERR_PTR(error); } + if (mddev->bitmap_ops && mddev->bitmap_ops->group) + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); diff --git a/drivers/md/md.h b/drivers/md/md.h index 51af29a03079..06364e15e8d1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -796,7 +796,6 @@ struct md_sysfs_entry { ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char 
*, size_t); }; -extern const struct attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { From 9c41ead04ec0d3ee54505039879aee20cf495e0a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:01 +0800 Subject: [PATCH 042/164] md/md-bitmap: add a new parameter 'flush' to bitmap_ops->enabled The method is only used from raid1/raid10 IO path, to check if write bio should be pluged, the parameter is always set to true for now, following patch will use this helper in other context like updating superblock. Link: https://lore.kernel.org/linux-raid/20250707012711.376844-6-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md-bitmap.c | 19 +++++++++++++------ drivers/md/md-bitmap.h | 2 +- drivers/md/raid1-10.c | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 30b6b42b9a48..69e4a43c38b4 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -232,20 +232,27 @@ static inline char *bmname(struct bitmap *bitmap) return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } -static bool __bitmap_enabled(struct bitmap *bitmap) +static bool __bitmap_enabled(struct bitmap *bitmap, bool flush) { - return bitmap->storage.filemap && - !test_bit(BITMAP_STALE, &bitmap->flags); + if (!flush) + return true; + + /* + * If caller want to flush bitmap pages to underlying disks, check if + * there are cached pages in filemap. + */ + return !test_bit(BITMAP_STALE, &bitmap->flags) && + bitmap->storage.filemap != NULL; } -static bool bitmap_enabled(struct mddev *mddev) +static bool bitmap_enabled(struct mddev *mddev, bool flush) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return false; - return __bitmap_enabled(bitmap); + return __bitmap_enabled(bitmap, flush); } /* @@ -1244,7 +1251,7 @@ static void __bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!__bitmap_enabled(bitmap)) + if (!__bitmap_enabled(bitmap, true)) return; /* look at each page to see if there are any set bits that need to be diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 0ceb9e97d21f..63d91831655f 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -62,7 +62,7 @@ struct md_bitmap_stats { }; struct bitmap_operations { - bool (*enabled)(struct mddev *mddev); + bool (*enabled)(struct mddev *mddev, bool flush); int (*create)(struct mddev *mddev); int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 52881e6032da..f3c3376cdd06 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. */ - if (!mddev->bitmap_ops->enabled(mddev)) { + if (!mddev->bitmap_ops->enabled(mddev, true)) { raid1_submit_write(bio); return true; } From 110332074dc6ad2f07a3cf9cc45b03adbce0e54f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:02 +0800 Subject: [PATCH 043/164] md/md-bitmap: add md_bitmap_registered/enabled() helper There are no functional changes, prepare to handle the case that mddev->bitmap_ops can be NULL, which is possible after introducing CONFIG_MD_BITMAP. 
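With these helpers, call sites can test for a usable bitmap without dereferencing mddev->bitmap_ops or mddev->bitmap directly. For instance, the raid1 plugging path (converted in the hunk below) now reads:

	if (!md_bitmap_enabled(mddev, true)) {
		raid1_submit_write(bio);
		return true;
	}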
Link: https://lore.kernel.org/linux-raid/20250707012711.376844-7-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md-bitmap.c | 16 ++++------------ drivers/md/md-bitmap.h | 19 ++++++++++++++++++- drivers/md/raid1-10.c | 2 +- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 69e4a43c38b4..613e767578aa 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -232,8 +232,10 @@ static inline char *bmname(struct bitmap *bitmap) return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; } -static bool __bitmap_enabled(struct bitmap *bitmap, bool flush) +static bool bitmap_enabled(void *data, bool flush) { + struct bitmap *bitmap = data; + if (!flush) return true; @@ -245,16 +247,6 @@ static bool __bitmap_enabled(struct bitmap *bitmap, bool flush) bitmap->storage.filemap != NULL; } -static bool bitmap_enabled(struct mddev *mddev, bool flush) -{ - struct bitmap *bitmap = mddev->bitmap; - - if (!bitmap) - return false; - - return __bitmap_enabled(bitmap, flush); -} - /* * check a page and, if necessary, allocate it (or hijack it if the alloc fails) * @@ -1251,7 +1243,7 @@ static void __bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!__bitmap_enabled(bitmap, true)) + if (!bitmap_enabled(bitmap, true)) return; /* look at each page to see if there are any set bits that need to be diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 63d91831655f..a36ed32ec0d5 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -62,7 +62,7 @@ struct md_bitmap_stats { }; struct bitmap_operations { - bool (*enabled)(struct mddev *mddev, bool flush); + bool (*enabled)(void *data, bool flush); int (*create)(struct mddev *mddev); int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); @@ -107,4 +107,21 @@ struct bitmap_operations { /* the bitmap API */ void mddev_set_bitmap_ops(struct mddev *mddev); +static inline bool md_bitmap_registered(struct mddev *mddev) +{ + return mddev->bitmap_ops != NULL; +} + +static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush) +{ + /* bitmap_ops must be registered before creating bitmap. */ + if (!md_bitmap_registered(mddev)) + return false; + + if (!mddev->bitmap) + return false; + + return mddev->bitmap_ops->enabled(mddev->bitmap, flush); +} + #endif diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index f3c3376cdd06..521625756128 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, * If bitmap is not enabled, it's safe to submit the io directly, and * this can get optimal performance. */ - if (!mddev->bitmap_ops->enabled(mddev, true)) { + if (!md_bitmap_enabled(mddev, true)) { raid1_submit_write(bio); return true; } From 5ae58d1500e3fc16df3267fad810955c949afdb2 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:03 +0800 Subject: [PATCH 044/164] md/md-bitmap: handle the case bitmap is not enabled before start_sync() This case can be handled without knowing internal implementation. Prepare to introduce CONFIG_MD_BITMAP. 
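When no bitmap is registered or enabled, the new md_bitmap_start_sync() wrapper keeps the old fallback of reporting a 1024-sector window as always needing resync, so the personalities only have to switch to the wrapper; e.g. raid1_sync_request() (hunk below) becomes:

	if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) &&
	    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* We can skip this block, and probably several more */
		*skipped = 1;
		return sync_blocks;
	}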
Link: https://lore.kernel.org/linux-raid/20250707012711.376844-8-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md-bitmap.c | 8 +------- drivers/md/md-bitmap.h | 12 ++++++++++++ drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 15 ++++++--------- drivers/md/raid5.c | 7 ++----- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 613e767578aa..625f74115352 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1787,15 +1787,9 @@ static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, bool degraded) { bitmap_counter_t *bmc; - bool rv; + bool rv = false; - if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ - *blocks = 1024; - return true; /* always resync if no bitmap */ - } spin_lock_irq(&bitmap->counts.lock); - - rv = false; bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc) { /* locked */ diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index a36ed32ec0d5..7a16de62ee35 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -124,4 +124,16 @@ static inline bool md_bitmap_enabled(struct mddev *mddev, bool flush) return mddev->bitmap_ops->enabled(mddev->bitmap, flush); } +static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + /* always resync if no bitmap */ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return true; + } + + return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded); +} + #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ce24e3d01d4a..1485c3ffe577 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2829,7 +2829,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* before building a request, check if we can skip these blocks.. 
* This call the bitmap_start_sync doesn't actually record anything */ - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ *skipped = 1; @@ -3004,8 +3004,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (len == 0) break; if (sync_blocks == 0) { - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, still_degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, + &sync_blocks, still_degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) break; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 269187faa011..eeaf7eaf8a07 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3351,9 +3351,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * we only need to recover the block if it is set in * the bitmap */ - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, - true); + must_sync = md_bitmap_start_sync(mddev, sect, + &sync_blocks, true); if (sync_blocks < max_sync) max_sync = sync_blocks; if (!must_sync && @@ -3396,9 +3395,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - must_sync = mddev->bitmap_ops->start_sync(mddev, sect, - &sync_blocks, still_degraded); - + md_bitmap_start_sync(mddev, sect, &sync_blocks, + still_degraded); any_working = 0; for (j=0; jcopies;j++) { int k; @@ -3574,9 +3572,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); - if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, - &sync_blocks, - mddev->degraded) && + if (!md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, + mddev->degraded) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bb95069e31f3..dbbf9290417a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6525,8 +6525,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && - !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - true) && + !md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, true) && sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); @@ -6557,9 +6556,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n still_degraded = true; } - mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, - still_degraded); - + md_bitmap_start_sync(mddev, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); From bb74b093c33cf20876e23ad8aa0d206b537ccb69 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:04 +0800 Subject: [PATCH 045/164] md/md-bitmap: handle the case bitmap is not enabled before end_sync() This case can be handled without knowing internal implementation. Prepare to introduce CONFIG_MD_BITMAP. 
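As with start_sync(), the no-bitmap fallback moves into an inline md_bitmap_end_sync() wrapper that sets *blocks to 1024 so loops over the resync range still make progress, and callers such as abort_sync_write() in raid1 keep their shape (hunk below):

	/* make sure these bits don't get cleared. */
	do {
		md_bitmap_end_sync(mddev, s, &sync_blocks);
		s += sync_blocks;
		sectors_to_go -= sync_blocks;
	} while (sectors_to_go > 0);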
Link: https://lore.kernel.org/linux-raid/20250707012711.376844-9-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md-bitmap.c | 4 ---- drivers/md/md-bitmap.h | 11 +++++++++++ drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 8 +++----- drivers/md/raid5.c | 4 ++-- 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 625f74115352..62b0eac28a7d 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1838,10 +1838,6 @@ static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, bitmap_counter_t *bmc; unsigned long flags; - if (bitmap == NULL) { - *blocks = 1024; - return; - } spin_lock_irqsave(&bitmap->counts.lock, flags); bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); if (bmc == NULL) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 7a16de62ee35..61cfc650c69c 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -136,4 +136,15 @@ static inline bool md_bitmap_start_sync(struct mddev *mddev, sector_t offset, return mddev->bitmap_ops->start_sync(mddev, offset, blocks, degraded); } +static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + if (!md_bitmap_enabled(mddev, false)) { + *blocks = 1024; + return; + } + + mddev->bitmap_ops->end_sync(mddev, offset, blocks); +} + #endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1485c3ffe577..0864da7d9adc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2057,7 +2057,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) /* make sure these bits don't get cleared. */ do { - mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); + md_bitmap_end_sync(mddev, s, &sync_blocks); s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); @@ -2804,8 +2804,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * We can find the current addess in mddev->curr_resync */ if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index eeaf7eaf8a07..d047e9dfdb85 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3221,15 +3221,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (mddev->curr_resync < max_sector) { /* aborted */ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) - mddev->bitmap_ops->end_sync(mddev, - mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else for (i = 0; i < conf->geo.raid_disks; i++) { sector_t sect = raid10_find_virt(conf, mddev->curr_resync, i); - mddev->bitmap_ops->end_sync(mddev, sect, - &sync_blocks); + md_bitmap_end_sync(mddev, sect, &sync_blocks); } } else { /* completed sync */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dbbf9290417a..955906bd4778 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6492,8 +6492,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n } if (mddev->curr_resync < max_sector) /* aborted */ - mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, - &sync_blocks); + md_bitmap_end_sync(mddev, mddev->curr_resync, + &sync_blocks); else /* completed sync */ conf->fullsync = 0; mddev->bitmap_ops->close_sync(mddev); From 
20cecae877a634ffc49b4cd7b0f6927209badbab Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:05 +0800 Subject: [PATCH 046/164] md/raid1: check bitmap before behind write behind write rely on bitmap, because the number of IO are recorded in bitmap->behind_writes, and callers rely on bitmap_wait_behind_writes() to wait for IO to be done. However, currently callers doesn't check if bitmap is enabeld before calling into behind methods. Hence if behind write start without bitmap, readers will not wait for slow write IO to be done and old data can be read in some corner cases. Link: https://lore.kernel.org/linux-raid/20250707012711.376844-10-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md-bitmap.c | 6 ------ drivers/md/raid1.c | 45 ++++++++++++++++++++++++++---------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 62b0eac28a7d..29de99e0de3e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2049,9 +2049,6 @@ static void bitmap_start_behind_write(struct mddev *mddev) struct bitmap *bitmap = mddev->bitmap; int bw; - if (!bitmap) - return; - atomic_inc(&bitmap->behind_writes); bw = atomic_read(&bitmap->behind_writes); if (bw > bitmap->behind_writes_used) @@ -2065,9 +2062,6 @@ static void bitmap_end_behind_write(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; - if (!bitmap) - return; - if (atomic_dec_and_test(&bitmap->behind_writes)) wake_up(&bitmap->behind_wait); pr_debug("dec write-behind count %d/%lu\n", diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0864da7d9adc..5599dcbd2991 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1366,7 +1366,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, (unsigned long long)r1_bio->sector, mirror->rdev->bdev); - if (test_bit(WriteMostly, &mirror->rdev->flags)) { + if (test_bit(WriteMostly, &mirror->rdev->flags) && + md_bitmap_enabled(mddev, false)) { /* * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' @@ -1452,6 +1453,30 @@ retry: return true; } +static void raid1_start_write_behind(struct mddev *mddev, struct r1bio *r1_bio, + struct bio *bio) +{ + unsigned long max_write_behind = mddev->bitmap_info.max_write_behind; + struct md_bitmap_stats stats; + int err; + + /* behind write rely on bitmap, see bitmap_operations */ + if (!md_bitmap_enabled(mddev, false)) + return; + + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); + if (err) + return; + + /* Don't do behind IO if reader is waiting, or there are too many. */ + if (!stats.behind_wait && stats.behind_writes < max_write_behind) + alloc_behind_master_bio(r1_bio, bio); + + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->start_behind_write(mddev); + +} + static void raid1_write_request(struct mddev *mddev, struct bio *bio, int max_write_sectors) { @@ -1612,22 +1637,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; if (first_clone) { - unsigned long max_write_behind = - mddev->bitmap_info.max_write_behind; - struct md_bitmap_stats stats; - int err; - - /* do behind I/O ? 
- * Not if there are too many, or cannot - * allocate memory, or a reader on WriteMostly - * is waiting for behind writes to flush */ - err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); - if (!err && write_behind && !stats.behind_wait && - stats.behind_writes < max_write_behind) - alloc_behind_master_bio(r1_bio, bio); - - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) - mddev->bitmap_ops->start_behind_write(mddev); + if (write_behind) + raid1_start_write_behind(mddev, r1_bio, bio); first_clone = 0; } From 8d31ed3b776eb2a4c4397f96dc92643f57e2e447 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:06 +0800 Subject: [PATCH 047/164] md/raid1: check before referencing mddev->bitmap_ops Prepare to introduce CONFIG_MD_BITMAP. Link: https://lore.kernel.org/linux-raid/20250707012711.376844-11-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/raid1.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 5599dcbd2991..0e792b9bfff8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2820,7 +2820,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); if (mddev_is_clustered(mddev)) { @@ -2857,10 +2858,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, /* we are incrementing sector_nr below. To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ - - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, - mddev_is_clustered(mddev) && - (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (raise_barrier(conf, sector_nr)) return 0; @@ -3335,15 +3337,17 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize = raid1_size(mddev, sectors, 0); - int ret; if (mddev->external_size && mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, newsize, 0); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, newsize, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && From 969f996243ae1d7b95b95368262f5e3a6e8bb0b4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:07 +0800 Subject: [PATCH 048/164] md/raid10: check before referencing mddev->bitmap_ops Prepare to introduce CONFIG_MD_BITMAP. 
Link: https://lore.kernel.org/linux-raid/20250707012711.376844-12-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/raid10.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d047e9dfdb85..2411399a7352 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3247,7 +3247,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } conf->fullsync = 0; } - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; return sectors_skipped; @@ -3566,7 +3567,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * safety reason, which ensures curr_resync_completed is * updated in bitmap_cond_end_sync. */ - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); @@ -4220,7 +4222,6 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) */ struct r10conf *conf = mddev->private; sector_t oldsize, size; - int ret; if (mddev->reshape_position != MaxSector) return -EBUSY; @@ -4234,9 +4235,12 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > size) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, size, 0); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, size, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && @@ -4502,7 +4506,8 @@ static int raid10_start_reshape(struct mddev *mddev) oldsize = raid10_size(mddev, 0, 0); newsize = raid10_size(mddev, 0, conf->geo.raid_disks); - if (!mddev_is_clustered(mddev)) { + if (!mddev_is_clustered(mddev) && + md_bitmap_enabled(mddev, false)) { ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; @@ -4525,6 +4530,7 @@ static int raid10_start_reshape(struct mddev *mddev) MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) goto out; + /* cluster can't be setup without bitmap */ ret = mddev->bitmap_ops->resize(mddev, newsize, 0); if (ret) goto abort; From bb9317b13ade9e5e550aa2efecbd209ff96697ff Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:08 +0800 Subject: [PATCH 049/164] md/raid5: check before referencing mddev->bitmap_ops Prepare to introduce CONFIG_MD_BITMAP. 
Link: https://lore.kernel.org/linux-raid/20250707012711.376844-13-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/raid5.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 955906bd4778..5285e72341a2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6496,7 +6496,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n &sync_blocks); else /* completed sync */ conf->fullsync = 0; - mddev->bitmap_ops->close_sync(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->close_sync(mddev); return 0; } @@ -6534,7 +6535,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return sync_blocks * RAID5_STRIPE_SECTORS(conf); } - mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); sh = raid5_get_active_stripe(conf, NULL, sector_nr, R5_GAS_NOBLOCK); @@ -6760,7 +6762,8 @@ static void raid5d(struct md_thread *thread) /* Now is a good time to flush some bitmap updates */ conf->seq_flush++; spin_unlock_irq(&conf->device_lock); - mddev->bitmap_ops->unplug(mddev, true); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->unplug(mddev, true); spin_lock_irq(&conf->device_lock); conf->seq_write = conf->seq_flush; activate_bit_delay(conf, conf->temp_inactive_list); @@ -8309,7 +8312,6 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) */ sector_t newsize; struct r5conf *conf = mddev->private; - int ret; if (raid5_has_log(conf) || raid5_has_ppl(conf)) return -EINVAL; @@ -8319,9 +8321,12 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) mddev->array_sectors > newsize) return -EINVAL; - ret = mddev->bitmap_ops->resize(mddev, sectors, 0); - if (ret) - return ret; + if (md_bitmap_enabled(mddev, false)) { + int ret = mddev->bitmap_ops->resize(mddev, sectors, 0); + + if (ret) + return ret; + } md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && From 0e18745420e332870eabed5a9d7efb45c0283bf0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:09 +0800 Subject: [PATCH 050/164] md/dm-raid: check before referencing mddev->bitmap_ops Prepare to introduce CONFIG_MD_BITMAP. 
Link: https://lore.kernel.org/linux-raid/20250707012711.376844-14-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/dm-raid.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 05811316dedd..967d72929909 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3953,9 +3953,11 @@ static int __load_dirty_region_bitmap(struct raid_set *rs) !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { struct mddev *mddev = &rs->md; - r = mddev->bitmap_ops->load(mddev); - if (r) - DMERR("Failed to load bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->load(mddev); + if (r) + DMERR("Failed to load bitmap"); + } } return r; @@ -4070,10 +4072,12 @@ static int raid_preresume(struct dm_target *ti) mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; - r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, - chunksize); - if (r) - DMERR("Failed to resize bitmap"); + if (md_bitmap_enabled(mddev, false)) { + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, + chunksize); + if (r) + DMERR("Failed to resize bitmap"); + } } /* Check for any resize/reshape on @rs and adjust/initiate */ From 26292657add33b8cb627bd582e7097755e85a3ac Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:10 +0800 Subject: [PATCH 051/164] md: check before referencing mddev->bitmap_ops Prepare to introduce CONFIG_MD_BITMAP. Link: https://lore.kernel.org/linux-raid/20250707012711.376844-15-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/md.c | 68 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ee1454948fc5..e750ae4883e6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1356,6 +1356,9 @@ static u64 md_bitmap_events_cleared(struct mddev *mddev) struct md_bitmap_stats stats; int err; + if (!md_bitmap_enabled(mddev, false)) + return 0; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return 0; @@ -2313,13 +2316,15 @@ static int super_1_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) { + struct mddev *mddev = rdev->mddev; + /* All necessary checks on new >= old have been done */ if (new_offset >= rdev->data_offset) return 1; /* with 1.0 metadata, there is no metadata to tread on * so we can always move back */ - if (rdev->mddev->minor_version == 0) + if (mddev->minor_version == 0) return 1; /* otherwise we must be sure not to step on @@ -2331,8 +2336,7 @@ super_1_allow_new_offset(struct md_rdev *rdev, if (rdev->sb_start + (32+4)*2 > new_offset) return 0; - if (!rdev->mddev->bitmap_info.file) { - struct mddev *mddev = rdev->mddev; + if (md_bitmap_registered(mddev) && !mddev->bitmap_info.file) { struct md_bitmap_stats stats; int err; @@ -2804,7 +2808,8 @@ repeat: mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: - mddev->bitmap_ops->update_sb(mddev->bitmap); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { if (rdev->sb_loaded != 1) continue; /* no noise on spare devices */ @@ -4680,6 +4685,9 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) unsigned long chunk, end_chunk; int err; + if (!md_bitmap_enabled(mddev, false)) + return len; + err = mddev_lock(mddev); if (err) 
return err; @@ -6054,7 +6062,7 @@ struct mddev *md_alloc(dev_t dev, char *name) return ERR_PTR(error); } - if (mddev->bitmap_ops && mddev->bitmap_ops->group) + if (md_bitmap_registered(mddev) && mddev->bitmap_ops->group) if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) pr_warn("md: cannot register extra bitmap attributes for %s\n", mdname(mddev)); @@ -6301,7 +6309,7 @@ int md_run(struct mddev *mddev) (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; } - if (err == 0 && pers->sync_request && + if (err == 0 && pers->sync_request && md_bitmap_registered(mddev) && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = mddev->bitmap_ops->create(mddev); if (err) @@ -6376,7 +6384,8 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; put_pers(pers); - mddev->bitmap_ops->destroy(mddev); + if (md_bitmap_registered(mddev)) + mddev->bitmap_ops->destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6396,10 +6405,12 @@ int do_md_run(struct mddev *mddev) if (err) goto out; - err = mddev->bitmap_ops->load(mddev); - if (err) { - mddev->bitmap_ops->destroy(mddev); - goto out; + if (md_bitmap_registered(mddev)) { + err = mddev->bitmap_ops->load(mddev); + if (err) { + mddev->bitmap_ops->destroy(mddev); + goto out; + } } if (mddev_is_clustered(mddev)) @@ -6550,7 +6561,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->pers->quiesce(mddev, 0); } - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || @@ -6577,7 +6589,8 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { - mddev->bitmap_ops->wait_behind_writes(mddev); + if (md_bitmap_enabled(mddev, false)) + mddev->bitmap_ops->wait_behind_writes(mddev); if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -6593,7 +6606,8 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - mddev->bitmap_ops->destroy(mddev); + if (md_bitmap_registered(mddev)) + mddev->bitmap_ops->destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -7311,6 +7325,9 @@ static int set_bitmap_file(struct mddev *mddev, int fd) { int err = 0; + if (!md_bitmap_registered(mddev)) + return -EINVAL; + if (mddev->pers) { if (!mddev->pers->quiesce || !mddev->thread) return -EBUSY; @@ -7661,6 +7678,14 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<pers->quiesce == NULL || mddev->thread == NULL) { rv = -EINVAL; goto err; @@ -8495,6 +8520,9 @@ static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) unsigned long chunk_kb; int err; + if (!md_bitmap_enabled(mddev, false)) + return; + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); if (err) return; @@ -8897,7 +8925,7 @@ static void md_end_clone_io(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -8924,7 +8952,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = 
bio_start_io_acct(*bio); - if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { + if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { md_io_clone->offset = (*bio)->bi_iter.bi_sector; md_io_clone->sectors = bio_sectors(*bio); md_bitmap_start(mddev, md_io_clone); @@ -8948,7 +8976,7 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; - if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); if (bio->bi_status && !orig_bio->bi_status) @@ -9681,7 +9709,7 @@ static void md_start_sync(struct work_struct *ws) * We are adding a device or devices to an array which has the bitmap * stored on all devices. So make sure all bitmap pages get written. */ - if (spares) + if (spares && md_bitmap_enabled(mddev, true)) mddev->bitmap_ops->write_all(mddev); name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? @@ -9769,7 +9797,7 @@ static void unregister_sync_thread(struct mddev *mddev) */ void md_check_recovery(struct mddev *mddev) { - if (mddev->bitmap) + if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { @@ -10149,7 +10177,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size)); if (ret) pr_info("md-cluster: resize failed\n"); - else + else if (md_bitmap_enabled(mddev, false)) mddev->bitmap_ops->update_sb(mddev->bitmap); } From c27474ac1d4609af3c1c38ccac252c2575b47b9e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 7 Jul 2025 09:27:11 +0800 Subject: [PATCH 052/164] md/md-bitmap: introduce CONFIG_MD_BITMAP Now that all implementations are internal, it's sensible to add a config option for md-bitmap, and it's a good way for isolation. Link: https://lore.kernel.org/linux-raid/20250707012711.376844-16-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni --- drivers/md/Kconfig | 18 ++++++++++++++++++ drivers/md/Makefile | 3 ++- drivers/md/md-bitmap.c | 23 +++++++++++++++++++++-- drivers/md/md-bitmap.h | 17 +++++++++++++++-- drivers/md/md.c | 40 ++++++++++++++++++++++++++++------------ drivers/md/md.h | 3 +-- 6 files changed, 85 insertions(+), 19 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index ddb37f6670de..f913579e731c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -37,6 +37,21 @@ config BLK_DEV_MD If unsure, say N. +config MD_BITMAP + bool "MD RAID bitmap support" + default y + depends on BLK_DEV_MD + help + If you say Y here, support for the write intent bitmap will be + enabled. The bitmap can be used to optimize resync speed after power + failure or readding a disk, limiting it to recorded dirty sectors in + bitmap. + + This feature can be added to existing MD array or MD array can be + created with bitmap via mdadm(8). + + If unsure, say Y. + config MD_AUTODETECT bool "Autodetect RAID arrays during kernel boot" depends on BLK_DEV_MD=y @@ -54,6 +69,7 @@ config MD_AUTODETECT config MD_BITMAP_FILE bool "MD bitmap file support (deprecated)" default y + depends on MD_BITMAP help If you say Y here, support for write intent bitmaps in files on an external file system is enabled. 
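With the option turned off, md-bitmap.o is simply not built: the new init/exit entry points compile to no-ops and no bitmap submodule is ever registered, so md_bitmap_registered() reports false and all the guarded call sites fall through (see the md-bitmap.h stubs in the hunk below):

	#ifdef CONFIG_MD_BITMAP
	int md_bitmap_init(void);
	void md_bitmap_exit(void);
	#else
	static inline int md_bitmap_init(void)
	{
		return 0;
	}
	static inline void md_bitmap_exit(void)
	{
	}
	#endif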
This is an alternative to the internal @@ -174,6 +190,7 @@ config MD_RAID456 config MD_CLUSTER tristate "Cluster Support for MD" + select MD_BITMAP depends on BLK_DEV_MD depends on DLM default n @@ -393,6 +410,7 @@ config DM_RAID select MD_RAID1 select MD_RAID10 select MD_RAID456 + select MD_BITMAP select BLK_DEV_MD help A dm target that supports RAID1, RAID10, RAID4, RAID5 and RAID6 mappings diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 87bdfc9fe14c..2e18147a9c40 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -27,7 +27,8 @@ dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o -md-mod-y += md.o md-bitmap.o +md-mod-y += md.o +md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o linear-y += md-linear.o diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 29de99e0de3e..5f62f2fd8f3f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -224,6 +224,8 @@ struct bitmap { int cluster_slot; }; +static struct workqueue_struct *md_bitmap_wq; + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, bool init); @@ -2979,6 +2981,12 @@ static struct attribute_group md_bitmap_group = { }; static struct bitmap_operations bitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_BITMAP, + .name = "bitmap", + }, + .enabled = bitmap_enabled, .create = bitmap_create, .resize = bitmap_resize, @@ -3013,7 +3021,18 @@ static struct bitmap_operations bitmap_ops = { .group = &md_bitmap_group, }; -void mddev_set_bitmap_ops(struct mddev *mddev) +int md_bitmap_init(void) { - mddev->bitmap_ops = &bitmap_ops; + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + return -ENOMEM; + + return register_md_submodule(&bitmap_ops.head); +} + +void md_bitmap_exit(void) +{ + destroy_workqueue(md_bitmap_wq); + unregister_md_submodule(&bitmap_ops.head); } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 61cfc650c69c..42f91755a341 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -62,6 +62,8 @@ struct md_bitmap_stats { }; struct bitmap_operations { + struct md_submodule_head head; + bool (*enabled)(void *data, bool flush); int (*create)(struct mddev *mddev); int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize); @@ -105,8 +107,6 @@ struct bitmap_operations { }; /* the bitmap API */ -void mddev_set_bitmap_ops(struct mddev *mddev); - static inline bool md_bitmap_registered(struct mddev *mddev) { return mddev->bitmap_ops != NULL; @@ -147,4 +147,17 @@ static inline void md_bitmap_end_sync(struct mddev *mddev, sector_t offset, mddev->bitmap_ops->end_sync(mddev, offset, blocks); } +#ifdef CONFIG_MD_BITMAP +int md_bitmap_init(void); +void md_bitmap_exit(void); +#else +static inline int md_bitmap_init(void) +{ + return 0; +} +static inline void md_bitmap_exit(void) +{ +} +#endif + #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index e750ae4883e6..fbc107b52819 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -94,7 +94,6 @@ static struct workqueue_struct *md_wq; * workqueue whith reconfig_mutex grabbed. 
*/ static struct workqueue_struct *md_misc_wq; -struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); @@ -677,15 +676,34 @@ static void active_io_release(struct percpu_ref *ref) static void no_op(struct percpu_ref *r) {} +static void mddev_set_bitmap_ops(struct mddev *mddev, enum md_submodule_id id) +{ + xa_lock(&md_submodule); + mddev->bitmap_ops = xa_load(&md_submodule, id); + xa_unlock(&md_submodule); + if (!mddev->bitmap_ops) + pr_warn_once("md: can't find bitmap id %d\n", id); +} + +static void mddev_clear_bitmap_ops(struct mddev *mddev) +{ + mddev->bitmap_ops = NULL; +} + int mddev_init(struct mddev *mddev) { + /* TODO: support more versions */ + mddev_set_bitmap_ops(mddev, ID_BITMAP); if (percpu_ref_init(&mddev->active_io, active_io_release, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + mddev_clear_bitmap_ops(mddev); return -ENOMEM; + } if (percpu_ref_init(&mddev->writes_pending, no_op, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + mddev_clear_bitmap_ops(mddev); percpu_ref_exit(&mddev->active_io); return -ENOMEM; } @@ -713,7 +731,6 @@ int mddev_init(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; - mddev_set_bitmap_ops(mddev); INIT_WORK(&mddev->sync_work, md_start_sync); INIT_WORK(&mddev->del_work, mddev_delayed_delete); @@ -724,6 +741,7 @@ EXPORT_SYMBOL_GPL(mddev_init); void mddev_destroy(struct mddev *mddev) { + mddev_clear_bitmap_ops(mddev); percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->writes_pending); } @@ -10121,8 +10139,12 @@ static void md_geninit(void) static int __init md_init(void) { - int ret = -ENOMEM; + int ret = md_bitmap_init(); + if (ret) + return ret; + + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -10131,11 +10153,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, - 0); - if (!md_bitmap_wq) - goto err_bitmap_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -10154,12 +10171,11 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_bitmap_wq); -err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_bitmap_exit(); return ret; } @@ -10465,8 +10481,8 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); + md_bitmap_exit(); } subsys_initcall(md_init); diff --git a/drivers/md/md.h b/drivers/md/md.h index 06364e15e8d1..081152c8de1f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -38,7 +38,7 @@ enum md_submodule_id { ID_RAID6 = 6, ID_RAID10 = 10, ID_CLUSTER, - ID_BITMAP, /* TODO */ + ID_BITMAP, ID_LLBITMAP, /* TODO */ }; @@ -1012,7 +1012,6 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; -extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); From d01acbce391767318c94fb1f6d648cfabb428f9d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:16 +0800 Subject: [PATCH 053/164] md: add a new parameter 'offset' to md_super_write() The parameter is always set to 0 for now, following patches will use this helper to write 
llbitmap to underlying disks, allow writing dirty sectors instead of the whole page. Also rename md_super_write to md_write_metadata since there is nothing super-block specific. Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-2-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan --- drivers/md/md-bitmap.c | 3 ++- drivers/md/md.c | 50 +++++++++++++++++++++++++----------------- drivers/md/md.h | 5 +++-- 3 files changed, 35 insertions(+), 23 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 5f62f2fd8f3f..b157119de123 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -485,7 +485,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, return -EINVAL; } - md_super_write(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), page); + md_write_metadata(mddev, rdev, sboff + ps, (int)min(size, bitmap_limit), + page, 0); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index fbc107b52819..528c99396458 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1038,15 +1038,26 @@ static void super_written(struct bio *bio) wake_up(&mddev->sb_wait); } -void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page) +/** + * md_write_metadata - write metadata to underlying disk, including + * array superblock, badblocks, bitmap superblock and bitmap bits. + * @mddev: the array to write + * @rdev: the underlying disk to write + * @sector: the offset to @rdev + * @size: the length of the metadata + * @page: the metadata + * @offset: the offset to @page + * + * Write @size bytes of @page start from @offset, to @sector of @rdev, Increment + * mddev->pending_writes before returning, and decrement it on completion, + * waking up sb_wait. Caller must call md_super_wait() after issuing io to all + * rdev. If an error occurred, md_error() will be called, and the @rdev will be + * kicked out from @mddev. + */ +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset) { - /* write first size bytes of page to sector of rdev - * Increment mddev->pending_writes before returning - * and decrement it on completion, waking up sb_wait - * if zero is reached. 
- * If an error occurred, call md_error - */ struct bio *bio; if (!page) @@ -1064,7 +1075,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - __bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, offset); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -1674,8 +1685,8 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) num_sectors = (sector_t)(2ULL << 32) - 2; do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; } @@ -2323,8 +2334,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) sb->super_offset = cpu_to_le64(rdev->sb_start); sb->sb_csum = calc_sb_1_csum(sb); do { - md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(rdev->mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); } while (md_super_wait(rdev->mddev) < 0); return num_sectors; @@ -2833,18 +2844,17 @@ rewrite: continue; /* no noise on spare devices */ if (!test_bit(Faulty, &rdev->flags)) { - md_super_write(mddev,rdev, - rdev->sb_start, rdev->sb_size, - rdev->sb_page); + md_write_metadata(mddev, rdev, rdev->sb_start, + rdev->sb_size, rdev->sb_page, 0); pr_debug("md: (write) %pg's sb offset: %llu\n", rdev->bdev, (unsigned long long)rdev->sb_start); rdev->sb_events = mddev->events; if (rdev->badblocks.size) { - md_super_write(mddev, rdev, - rdev->badblocks.sector, - rdev->badblocks.size << 9, - rdev->bb_page); + md_write_metadata(mddev, rdev, + rdev->badblocks.sector, + rdev->badblocks.size << 9, + rdev->bb_page, 0); rdev->badblocks.size = 0; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 081152c8de1f..cadd9bc99938 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -908,8 +908,9 @@ void md_account_bio(struct mddev *mddev, struct bio **bio); void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); -extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, - sector_t sector, int size, struct page *page); +void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, + sector_t sector, int size, struct page *page, + unsigned int offset); extern int md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, blk_opf_t opf, bool metadata_op); From 7797da149d4622bbe803eaee6eaf22a7f62f44ab Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:17 +0800 Subject: [PATCH 054/164] md: factor out a helper raid_is_456() There are no functional changes, the helper will be used by llbitmap in following patches. 
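As a rough illustration of where this helper is headed (a hedged sketch, not code from this patch: the function name below is hypothetical, and the BitDirty/BitNeedSync states are borrowed from the llbitmap patch later in this series), raid4/5/6 keep no full data copy, so a first write to an unwritten region has to be flagged as needing sync rather than merely dirty:

static enum llbitmap_state first_write_state(struct mddev *mddev)
{
	/*
	 * raid1/raid10 keep full copies, so new data only becomes Dirty;
	 * raid4/5/6 still have to build xor data, so mark it NeedSync.
	 */
	return raid_is_456(mddev) ? BitNeedSync : BitDirty;
}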
Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-3-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan --- drivers/md/md.c | 9 +-------- drivers/md/md.h | 6 ++++++ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 528c99396458..4bc35362f4d0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9121,19 +9121,12 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) static bool sync_io_within_limit(struct mddev *mddev) { - int io_sectors; - /* * For raid456, sync IO is stripe(4k) per IO, for other levels, it's * RESYNC_PAGES(64k) per IO. */ - if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) - io_sectors = 8; - else - io_sectors = 128; - return atomic_read(&mddev->recovery_active) < - io_sectors * sync_io_depth(mddev); + (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); } #define SYNC_MARKS 10 diff --git a/drivers/md/md.h b/drivers/md/md.h index cadd9bc99938..5ef73109d14d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -1033,6 +1033,12 @@ static inline bool mddev_is_dm(struct mddev *mddev) return !mddev->gendisk; } +static inline bool raid_is_456(struct mddev *mddev) +{ + return mddev->level == ID_RAID4 || mddev->level == ID_RAID5 || + mddev->level == ID_RAID6; +} + static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, sector_t sector) { From ac9dad8faaa7b2a7c7c6de0017f2d7d54525d33c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:18 +0800 Subject: [PATCH 055/164] md/md-bitmap: support discard for bitmap ops Use two new methods {start, end}_discard in bitmap_ops and a new field 'rw' in struct md_io_clone to handle discard IO, prepare to support new md bitmap. Since all bitmap functions to hanlde write IO are the same, also add typedef to make code cleaner. 
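To show why the typedef and the separate discard hooks help (a hedged sketch with hypothetical handler names, not code from this patch), a future bitmap implementation can plug in a discard path that differs from its write path, while the existing bitmap simply points both hooks at the same functions:

/* Hypothetical bitmap implementation with a distinct discard path */
static void example_start_write(struct mddev *mddev, sector_t offset,
				unsigned long sectors)
{
	/* mark the affected chunks dirty before the write is issued */
}

static void example_start_discard(struct mddev *mddev, sector_t offset,
				  unsigned long sectors)
{
	/* a discarded range can go straight back to an unwritten state */
}

static struct bitmap_operations example_ops = {
	.start_write	= example_start_write,
	.start_discard	= example_start_discard,
	/* remaining methods omitted in this sketch */
};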
Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-4-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan --- drivers/md/md-bitmap.c | 3 +++ drivers/md/md-bitmap.h | 12 ++++++++---- drivers/md/md.c | 15 +++++++++++---- drivers/md/md.h | 1 + 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b157119de123..dc050ff94d5b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -3005,6 +3005,9 @@ static struct bitmap_operations bitmap_ops = { .start_write = bitmap_start_write, .end_write = bitmap_end_write, + .start_discard = bitmap_start_write, + .end_discard = bitmap_end_write, + .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, .cond_end_sync = bitmap_cond_end_sync, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 42f91755a341..8616ced49077 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -61,6 +61,9 @@ struct md_bitmap_stats { struct file *file; }; +typedef void (md_bitmap_fn)(struct mddev *mddev, sector_t offset, + unsigned long sectors); + struct bitmap_operations { struct md_submodule_head head; @@ -81,10 +84,11 @@ struct bitmap_operations { void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); - void (*start_write)(struct mddev *mddev, sector_t offset, - unsigned long sectors); - void (*end_write)(struct mddev *mddev, sector_t offset, - unsigned long sectors); + md_bitmap_fn *start_write; + md_bitmap_fn *end_write; + md_bitmap_fn *start_discard; + md_bitmap_fn *end_discard; + bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); diff --git a/drivers/md/md.c b/drivers/md/md.c index 4bc35362f4d0..8a89dca8209c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8933,18 +8933,24 @@ EXPORT_SYMBOL_GPL(md_submit_discard_bio); static void md_bitmap_start(struct mddev *mddev, struct md_io_clone *md_io_clone) { + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? + mddev->bitmap_ops->start_discard : + mddev->bitmap_ops->start_write; + if (mddev->pers->bitmap_sector) mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, &md_io_clone->sectors); - mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) { - mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, - md_io_clone->sectors); + md_bitmap_fn *fn = unlikely(md_io_clone->rw == STAT_DISCARD) ? 
+ mddev->bitmap_ops->end_discard : + mddev->bitmap_ops->end_write; + + fn(mddev, md_io_clone->offset, md_io_clone->sectors); } static void md_end_clone_io(struct bio *bio) @@ -8983,6 +8989,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (bio_data_dir(*bio) == WRITE && md_bitmap_enabled(mddev, false)) { md_io_clone->offset = (*bio)->bi_iter.bi_sector; md_io_clone->sectors = bio_sectors(*bio); + md_io_clone->rw = op_stat_group(bio_op(*bio)); md_bitmap_start(mddev, md_io_clone); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 5ef73109d14d..1b767b5320cf 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -872,6 +872,7 @@ struct md_io_clone { unsigned long start_time; sector_t offset; unsigned long sectors; + enum stat_group rw; struct bio bio_clone; }; From 300bffa870c5dfaae87c169b1d5edc9c285a81a6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:19 +0800 Subject: [PATCH 056/164] md: add a new mddev field 'bitmap_id' Prepare to store the bitmap id selected by user, also refactor mddev_set_bitmap_ops a bit in case the value is invalid. Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-5-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan Reviewed-by: Xiao Ni --- drivers/md/md.c | 37 +++++++++++++++++++++++++++++++------ drivers/md/md.h | 2 ++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8a89dca8209c..806fa7fb25a5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -676,13 +676,33 @@ static void active_io_release(struct percpu_ref *ref) static void no_op(struct percpu_ref *r) {} -static void mddev_set_bitmap_ops(struct mddev *mddev, enum md_submodule_id id) +static bool mddev_set_bitmap_ops(struct mddev *mddev) { + struct md_submodule_head *head; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + return true; + xa_lock(&md_submodule); - mddev->bitmap_ops = xa_load(&md_submodule, id); + head = xa_load(&md_submodule, mddev->bitmap_id); + + if (!head) { + pr_warn("md: can't find bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + if (head->type != MD_BITMAP) { + pr_warn("md: invalid bitmap id %d\n", mddev->bitmap_id); + goto err; + } + + mddev->bitmap_ops = (void *)head; xa_unlock(&md_submodule); - if (!mddev->bitmap_ops) - pr_warn_once("md: can't find bitmap id %d\n", id); + return true; + +err: + xa_unlock(&md_submodule); + return false; } static void mddev_clear_bitmap_ops(struct mddev *mddev) @@ -692,8 +712,13 @@ static void mddev_clear_bitmap_ops(struct mddev *mddev) int mddev_init(struct mddev *mddev) { - /* TODO: support more versions */ - mddev_set_bitmap_ops(mddev, ID_BITMAP); + if (!IS_ENABLED(CONFIG_MD_BITMAP)) { + mddev->bitmap_id = ID_BITMAP_NONE; + } else { + mddev->bitmap_id = ID_BITMAP; + if (!mddev_set_bitmap_ops(mddev)) + return -EINVAL; + } if (percpu_ref_init(&mddev->active_io, active_io_release, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 1b767b5320cf..4fa5a3e68a0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -40,6 +40,7 @@ enum md_submodule_id { ID_CLUSTER, ID_BITMAP, ID_LLBITMAP, /* TODO */ + ID_BITMAP_NONE, }; struct md_submodule_head { @@ -565,6 +566,7 @@ struct mddev { struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ + enum md_submodule_id bitmap_id; void *bitmap; /* the bitmap for the device */ struct bitmap_operations *bitmap_ops; struct { From 180b82c1c7b2c2c5bffc1642239bed36d7e83f76 Mon Sep 17 
00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:20 +0800 Subject: [PATCH 057/164] md/md-bitmap: add a new sysfs api bitmap_type The api will be used by mdadm to set bitmap_type while creating new array or assembling array, prepare to add a new bitmap. Currently available options are: cat /sys/block/md0/md/bitmap_type none [bitmap] Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-6-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Hannes Reinecke Reviewed-by: Xiao Ni Reviewed-by: Li Nan --- Documentation/admin-guide/md.rst | 73 ++++++++++++++++------------ drivers/md/md.c | 81 ++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 30 deletions(-) diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 4ff2cc291d18..356d2a344f08 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -347,6 +347,49 @@ All md devices contain: active-idle like active, but no writes have been seen for a while (safe_mode_delay). + consistency_policy + This indicates how the array maintains consistency in case of unexpected + shutdown. It can be: + + none + Array has no redundancy information, e.g. raid0, linear. + + resync + Full resync is performed and all redundancy is regenerated when the + array is started after unclean shutdown. + + bitmap + Resync assisted by a write-intent bitmap. + + journal + For raid4/5/6, journal device is used to log transactions and replay + after unclean shutdown. + + ppl + For raid5 only, Partial Parity Log is used to close the write hole and + eliminate resync. + + The accepted values when writing to this file are ``ppl`` and ``resync``, + used to enable and disable PPL. + + uuid + This indicates the UUID of the array in the following format: + xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + + bitmap_type + [RW] When read, this file will display the current and available + bitmap for this array. The currently active bitmap will be enclosed + in [] brackets. Writing an bitmap name or ID to this file will switch + control of this array to that new bitmap. Note that writing a new + bitmap for created array is forbidden. + + none + No bitmap + bitmap + The default internal bitmap + +If bitmap_type is bitmap, then the md device will also contain: + bitmap/location This indicates where the write-intent bitmap for the array is stored. @@ -401,36 +444,6 @@ All md devices contain: once the array becomes non-degraded, and this fact has been recorded in the metadata. - consistency_policy - This indicates how the array maintains consistency in case of unexpected - shutdown. It can be: - - none - Array has no redundancy information, e.g. raid0, linear. - - resync - Full resync is performed and all redundancy is regenerated when the - array is started after unclean shutdown. - - bitmap - Resync assisted by a write-intent bitmap. - - journal - For raid4/5/6, journal device is used to log transactions and replay - after unclean shutdown. - - ppl - For raid5 only, Partial Parity Log is used to close the write hole and - eliminate resync. - - The accepted values when writing to this file are ``ppl`` and ``resync``, - used to enable and disable PPL. 
- - uuid - This indicates the UUID of the array in the following format: - xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - - As component devices are added to an md array, they appear in the ``md`` directory as new directories named:: diff --git a/drivers/md/md.c b/drivers/md/md.c index 806fa7fb25a5..17cb33054ecf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4207,6 +4207,86 @@ new_level_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry md_new_level = __ATTR(new_level, 0664, new_level_show, new_level_store); +static ssize_t +bitmap_type_show(struct mddev *mddev, char *page) +{ + struct md_submodule_head *head; + unsigned long i; + ssize_t len = 0; + + if (mddev->bitmap_id == ID_BITMAP_NONE) + len += sprintf(page + len, "[none] "); + else + len += sprintf(page + len, "none "); + + xa_lock(&md_submodule); + xa_for_each(&md_submodule, i, head) { + if (head->type != MD_BITMAP) + continue; + + if (mddev->bitmap_id == head->id) + len += sprintf(page + len, "[%s] ", head->name); + else + len += sprintf(page + len, "%s ", head->name); + } + xa_unlock(&md_submodule); + + len += sprintf(page + len, "\n"); + return len; +} + +static ssize_t +bitmap_type_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct md_submodule_head *head; + enum md_submodule_id id; + unsigned long i; + int err = 0; + + xa_lock(&md_submodule); + + if (mddev->bitmap_ops) { + err = -EBUSY; + goto out; + } + + if (cmd_match(buf, "none")) { + mddev->bitmap_id = ID_BITMAP_NONE; + goto out; + } + + xa_for_each(&md_submodule, i, head) { + if (head->type == MD_BITMAP && cmd_match(buf, head->name)) { + mddev->bitmap_id = head->id; + goto out; + } + } + + err = kstrtoint(buf, 10, &id); + if (err) + goto out; + + if (id == ID_BITMAP_NONE) { + mddev->bitmap_id = id; + goto out; + } + + head = xa_load(&md_submodule, id); + if (head && head->type == MD_BITMAP) { + mddev->bitmap_id = id; + goto out; + } + + err = -ENOENT; + +out: + xa_unlock(&md_submodule); + return err ? err : len; +} + +static struct md_sysfs_entry md_bitmap_type = +__ATTR(bitmap_type, 0664, bitmap_type_show, bitmap_type_store); + static ssize_t layout_show(struct mddev *mddev, char *page) { @@ -5813,6 +5893,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show, static struct attribute *md_default_attrs[] = { &md_level.attr, &md_new_level.attr, + &md_bitmap_type.attr, &md_layout.attr, &md_raid_disks.attr, &md_uuid.attr, From fb8cc3b0d9db94817a5aae2bcaf78dd079a89e52 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:21 +0800 Subject: [PATCH 058/164] md/md-bitmap: delay registration of bitmap_ops until creating bitmap Currently bitmap_ops is registered while allocating mddev, this is fine when there is only one bitmap_ops. Delay setting bitmap_ops until creating bitmap, so that user can choose which bitmap to use before running the array. 
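To make the new ordering concrete, here is a condensed, hedged sketch of the sequence this patch establishes (the names follow the diff below; only the ordering is the point):

/*
 * Sketch of the intended flow after this patch:
 *
 *  1. mddev_init()                  - only bitmap_id is recorded, no ops yet
 *  2. write to md/bitmap_type       - allowed while bitmap_ops is still NULL
 *  3. md_run() -> md_bitmap_create()
 *       - mddev_set_bitmap_ops() resolves bitmap_id through the xarray,
 *         creates the ops' sysfs group and emits KOBJ_CHANGE
 *       - bitmap_ops->create() sets up the bitmap itself
 *  4. md_bitmap_destroy()           - tears down the bitmap and removes
 *                                     the sysfs group again
 */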
Link: https://lore.kernel.org/linux-raid/20250721171557.34587-7-yukuai@kernel.org Signed-off-by: Yu Kuai Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan Reviewed-by: Xiao Ni --- Documentation/admin-guide/md.rst | 3 ++ drivers/md/md.c | 90 +++++++++++++++++++------------- 2 files changed, 56 insertions(+), 37 deletions(-) diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 356d2a344f08..001363f81850 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -388,6 +388,9 @@ All md devices contain: bitmap The default internal bitmap +If bitmap_type is not none, then additional bitmap attributes bitmap/xxx or +llbitmap/xxx will be created after md device KOBJ_CHANGE event. + If bitmap_type is bitmap, then the md device will also contain: bitmap/location diff --git a/drivers/md/md.c b/drivers/md/md.c index 17cb33054ecf..51dee0a98af1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -678,9 +678,11 @@ static void no_op(struct percpu_ref *r) {} static bool mddev_set_bitmap_ops(struct mddev *mddev) { + struct bitmap_operations *old = mddev->bitmap_ops; struct md_submodule_head *head; - if (mddev->bitmap_id == ID_BITMAP_NONE) + if (mddev->bitmap_id == ID_BITMAP_NONE || + (old && old->head.id == mddev->bitmap_id)) return true; xa_lock(&md_submodule); @@ -698,6 +700,18 @@ static bool mddev_set_bitmap_ops(struct mddev *mddev) mddev->bitmap_ops = (void *)head; xa_unlock(&md_submodule); + + if (!mddev_is_dm(mddev) && mddev->bitmap_ops->group) { + if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) + pr_warn("md: cannot register extra bitmap attributes for %s\n", + mdname(mddev)); + else + /* + * Inform user with KOBJ_CHANGE about new bitmap + * attributes. + */ + kobject_uevent(&mddev->kobj, KOBJ_CHANGE); + } return true; err: @@ -707,28 +721,26 @@ err: static void mddev_clear_bitmap_ops(struct mddev *mddev) { + if (!mddev_is_dm(mddev) && mddev->bitmap_ops && + mddev->bitmap_ops->group) + sysfs_remove_group(&mddev->kobj, mddev->bitmap_ops->group); + mddev->bitmap_ops = NULL; } int mddev_init(struct mddev *mddev) { - if (!IS_ENABLED(CONFIG_MD_BITMAP)) { + if (!IS_ENABLED(CONFIG_MD_BITMAP)) mddev->bitmap_id = ID_BITMAP_NONE; - } else { + else mddev->bitmap_id = ID_BITMAP; - if (!mddev_set_bitmap_ops(mddev)) - return -EINVAL; - } if (percpu_ref_init(&mddev->active_io, active_io_release, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - mddev_clear_bitmap_ops(mddev); + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) return -ENOMEM; - } if (percpu_ref_init(&mddev->writes_pending, no_op, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { - mddev_clear_bitmap_ops(mddev); percpu_ref_exit(&mddev->active_io); return -ENOMEM; } @@ -766,7 +778,6 @@ EXPORT_SYMBOL_GPL(mddev_init); void mddev_destroy(struct mddev *mddev) { - mddev_clear_bitmap_ops(mddev); percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->writes_pending); } @@ -6196,11 +6207,6 @@ struct mddev *md_alloc(dev_t dev, char *name) return ERR_PTR(error); } - if (md_bitmap_registered(mddev) && mddev->bitmap_ops->group) - if (sysfs_create_group(&mddev->kobj, mddev->bitmap_ops->group)) - pr_warn("md: cannot register extra bitmap attributes for %s\n", - mdname(mddev)); - kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); @@ -6279,6 +6285,26 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +static int md_bitmap_create(struct mddev 
*mddev) +{ + if (mddev->bitmap_id == ID_BITMAP_NONE) + return -EINVAL; + + if (!mddev_set_bitmap_ops(mddev)) + return -ENOENT; + + return mddev->bitmap_ops->create(mddev); +} + +static void md_bitmap_destroy(struct mddev *mddev) +{ + if (!md_bitmap_registered(mddev)) + return; + + mddev->bitmap_ops->destroy(mddev); + mddev_clear_bitmap_ops(mddev); +} + int md_run(struct mddev *mddev) { int err; @@ -6443,9 +6469,9 @@ int md_run(struct mddev *mddev) (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; } - if (err == 0 && pers->sync_request && md_bitmap_registered(mddev) && + if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -6518,8 +6544,7 @@ bitmap_abort: pers->free(mddev, mddev->private); mddev->private = NULL; put_pers(pers); - if (md_bitmap_registered(mddev)) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->io_clone_set); exit_sync_set: @@ -6542,7 +6567,7 @@ int do_md_run(struct mddev *mddev) if (md_bitmap_registered(mddev)) { err = mddev->bitmap_ops->load(mddev); if (err) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); goto out; } } @@ -6740,8 +6765,7 @@ static void __md_stop(struct mddev *mddev) { struct md_personality *pers = mddev->pers; - if (md_bitmap_registered(mddev)) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev_detach(mddev); spin_lock(&mddev->lock); mddev->pers = NULL; @@ -7518,16 +7542,16 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev); + err = md_bitmap_create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); if (err) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); fd = -1; } } else if (fd < 0) { - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } } @@ -7812,14 +7836,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = update_raid_disks(mddev, info->raid_disks); if ((state ^ info->state) & (1<pers->quiesce == NULL || mddev->thread == NULL) { rv = -EINVAL; goto err; @@ -7842,12 +7858,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev); + rv = md_bitmap_create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); if (rv) - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); } else { struct md_bitmap_stats stats; @@ -7873,7 +7889,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) put_cluster_ops(mddev); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } - mddev->bitmap_ops->destroy(mddev); + md_bitmap_destroy(mddev); mddev->bitmap_info.offset = 0; } } From f196d72888640e35002ac3511757cefaa9c5e339 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:22 +0800 Subject: [PATCH 059/164] md/md-bitmap: add a new method skip_sync_blocks() in bitmap_operations This method is used to check if blocks can be skipped before calling into pers->sync_request(), llbitmap will use this method to skip resync for unwritten/clean data blocks, and recovery/check/repair for unwritten data blocks; Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-8-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Christoph 
Hellwig Reviewed-by: Xiao Ni Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan --- drivers/md/md-bitmap.h | 1 + drivers/md/md.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 8616ced49077..95453696c68e 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -89,6 +89,7 @@ struct bitmap_operations { md_bitmap_fn *start_discard; md_bitmap_fn *end_discard; + sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); diff --git a/drivers/md/md.c b/drivers/md/md.c index 51dee0a98af1..ec80c8d6cbcb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9460,6 +9460,12 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; + if (mddev->bitmap_ops && mddev->bitmap_ops->skip_sync_blocks) { + sectors = mddev->bitmap_ops->skip_sync_blocks(mddev, j); + if (sectors) + goto update; + } + sectors = mddev->pers->sync_request(mddev, j, max_sectors, &skipped); if (sectors == 0) { @@ -9475,6 +9481,7 @@ void md_do_sync(struct md_thread *thread) if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) break; +update: j += sectors; if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ From a4dd9ba39ba4f86ae8848c63945d443f0569efb1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:23 +0800 Subject: [PATCH 060/164] md/md-bitmap: add a new method blocks_synced() in bitmap_operations Currently, raid456 must perform a whole array initial recovery to build initail xor data, then IO to the array won't have to read all the blocks in underlying disks. This behavior will affect IO performance a lot, and nowadays there are huge disks and the initial recovery can take a long time. Hence llbitmap will support lazy initial recovery in following patches. This method is used to check if data blocks is synced or not, if not then IO will still have to read all blocks for raid456. Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-9-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai --- drivers/md/md-bitmap.h | 1 + drivers/md/raid5.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 95453696c68e..5f41724cbcd8 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -90,6 +90,7 @@ struct bitmap_operations { md_bitmap_fn *end_discard; sector_t (*skip_sync_blocks)(struct mddev *mddev, sector_t offset); + bool (*blocks_synced)(struct mddev *mddev, sector_t offset); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5285e72341a2..672ab226e43c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4097,7 +4097,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - sector_t resync_offset = conf->mddev->resync_offset; + struct mddev *mddev = conf->mddev; + sector_t resync_offset = mddev->resync_offset; /* Check whether resync is now happening or should start. 
* If yes, then the array is dirty (after unclean shutdown or @@ -4116,6 +4117,12 @@ static int handle_stripe_dirtying(struct r5conf *conf, pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", conf->rmw_level, (unsigned long long)resync_offset, (unsigned long long)sh->sector); + } else if (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && + !mddev->bitmap_ops->blocks_synced(mddev, sh->sector)) { + /* The initial recover is not done, must read everything */ + rcw = 1; rmw = 2; + pr_debug("force RCW by lazy recovery, sh->sector=%llu\n", + sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; @@ -4148,7 +4155,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", + mddev_add_trace_msg(mddev, "raid5 rmw %llu %d", sh->sector, rmw); for (i = disks; i--; ) { @@ -4227,8 +4234,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_DELAYED, &sh->state); } } - if (rcw && !mddev_is_dm(conf->mddev)) - blk_add_trace_msg(conf->mddev->gendisk->queue, + if (rcw && !mddev_is_dm(mddev)) + blk_add_trace_msg(mddev->gendisk->queue, "raid5 rcw %llu %d %d %d", (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); From c951ccf0bf2df4f39b45dc77998dd71da7a85369 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:24 +0800 Subject: [PATCH 061/164] md: add a new recovery_flag MD_RECOVERY_LAZY_RECOVER This flag is used by llbitmap in later patches to skip raid456 initial recover and delay building initial xor data to first write. https://lore.kernel.org/linux-raid/20250829080426.1441678-10-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai --- drivers/md/md.c | 47 +++++++++++++++++++++++++++++++++++++++++++++- drivers/md/md.h | 2 ++ drivers/md/raid5.c | 19 +++++++++++++++---- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ec80c8d6cbcb..f5d002537989 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9199,6 +9199,39 @@ static sector_t md_sync_max_sectors(struct mddev *mddev, } } +/* + * If lazy recovery is requested and all rdevs are in sync, select the rdev with + * the higest index to perfore recovery to build initial xor data, this is the + * same as old bitmap. + */ +static bool mddev_select_lazy_recover_rdev(struct mddev *mddev) +{ + struct md_rdev *recover_rdev = NULL; + struct md_rdev *rdev; + bool ret = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev->raid_disk < 0) + continue; + + if (test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) + break; + + if (!recover_rdev || recover_rdev->raid_disk < rdev->raid_disk) + recover_rdev = rdev; + } + + if (recover_rdev) { + clear_bit(In_sync, &recover_rdev->flags); + ret = true; + } + + rcu_read_unlock(); + return ret; +} + static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) { sector_t start = 0; @@ -9230,6 +9263,14 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) start = rdev->recovery_offset; rcu_read_unlock(); + /* + * If there are no spares, and raid456 lazy initial recover is + * requested. 
+ */ + if (test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery) && + start == MaxSector && mddev_select_lazy_recover_rdev(mddev)) + start = 0; + /* If there is a bitmap, we need to make sure all * writes that started before we added a spare * complete before we start doing a recovery. @@ -9791,6 +9832,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9799,6 +9841,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); return true; } @@ -9808,7 +9851,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) * re-add. */ *spares = remove_and_add_spares(mddev, NULL); - if (*spares) { + if (*spares || test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) { clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); @@ -10021,6 +10064,7 @@ void md_check_recovery(struct mddev *mddev) } clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); @@ -10131,6 +10175,7 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + clear_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); /* * We call mddev->cluster_ops->update_size here because sync_size could * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, diff --git a/drivers/md/md.h b/drivers/md/md.h index 4fa5a3e68a0c..7b6357879a84 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -667,6 +667,8 @@ enum recovery_flags { MD_RECOVERY_RESHAPE, /* remote node is running resync thread */ MD_RESYNCING_REMOTE, + /* raid456 lazy initial recover */ + MD_RECOVERY_LAZY_RECOVER, }; enum md_ro_state { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 672ab226e43c..5112658ef5f6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4705,10 +4705,21 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) - /* in sync if before recovery_offset */ - set_bit(R5_Insync, &dev->flags); - else if (test_bit(R5_UPTODATE, &dev->flags) && + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= + rdev->recovery_offset) { + /* + * in sync if: + * - normal IO, or + * - resync IO that is not lazy recovery + * + * For lazy recovery, we have to mark the rdev without + * In_sync as failed, to build initial xor data. + */ + if (!test_bit(STRIPE_SYNCING, &sh->state) || + !test_bit(MD_RECOVERY_LAZY_RECOVER, + &conf->mddev->recovery)) + set_bit(R5_Insync, &dev->flags); + } else if (test_bit(R5_UPTODATE, &dev->flags) && test_bit(R5_Expanded, &dev->flags)) /* If we've reshaped into here, we assume it is Insync. 
* We will shortly update recovery_offset to make From 66be318e6659d10e90f487e2610409cef659dbfe Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:25 +0800 Subject: [PATCH 062/164] md/md-bitmap: make method bitmap_ops->daemon_work optional daemon_work() will be called by daemon thread, on the one hand, daemon thread doesn't have strict wake-up time; on the other hand, too much work are put to daemon thread, like handle sync IO, handle failed or specail normal IO, handle recovery, and so on. Hence daemon thread may be too busy to clear dirty bits in time. Make bitmap_ops->daemon_work() optional and following patches will use separate async work to clear dirty bits for the new bitmap. Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-11-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Li Nan --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f5d002537989..2f1907f3bc25 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9997,7 +9997,7 @@ static void unregister_sync_thread(struct mddev *mddev) */ void md_check_recovery(struct mddev *mddev) { - if (md_bitmap_enabled(mddev, false)) + if (md_bitmap_enabled(mddev, false) && mddev->bitmap_ops->daemon_work) mddev->bitmap_ops->daemon_work(mddev); if (signal_pending(current)) { From 5ab829f1971dc99f2aac10846c378e67fc875abc Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Aug 2025 16:04:26 +0800 Subject: [PATCH 063/164] md/md-llbitmap: introduce new lockless bitmap Redundant data is used to enhance data fault tolerance, and the storage method for redundant data vary depending on the RAID levels. And it's important to maintain the consistency of redundant data. Bitmap is used to record which data blocks have been synchronized and which ones need to be resynchronized or recovered. Each bit in the bitmap represents a segment of data in the array. When a bit is set, it indicates that the multiple redundant copies of that data segment may not be consistent. Data synchronization can be performed based on the bitmap after power failure or readding a disk. If there is no bitmap, a full disk synchronization is required. Due to known performance issues with md-bitmap and the unreasonable implementations: - self-managed IO submitting like filemap_write_page(); - global spin_lock I have decided not to continue optimizing based on the current bitmap implementation, this new bitmap is invented without locking from IO fast path and can be used with fast disks. For designs and details, see the comments in drivers/md-llbitmap.c. 
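A condensed, hedged sketch of the lockless fast path described in the new file below (the function name is hypothetical and error handling is simplified; struct llbitmap_page_ctl, its fields and BitDirty come from this patch, while the percpu_ref helpers are the standard kernel API):

static void fastpath_mark_dirty(struct llbitmap_page_ctl *pctl,
				unsigned int offset)
{
	/*
	 * Only contended when the daemon has suspended the page to clear
	 * dirty bits; otherwise grabbing the reference is the whole cost.
	 */
	while (!percpu_ref_tryget_live(&pctl->active))
		wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active));

	pctl->state[offset] = BitDirty;		/* one byte per bit: a plain store */
	pctl->expire = jiffies + 5 * HZ;	/* daemon clears it once the page idles */
	percpu_ref_put(&pctl->active);
}

Repeated writes to the same chunk then find the byte already Dirty and take no further action, which is the "only the first write has additional overhead" behaviour described in the Key Features comment below.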
Link: https://lore.kernel.org/linux-raid/20250829080426.1441678-12-yukuai1@huaweicloud.com Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- Documentation/admin-guide/md.rst | 20 + drivers/md/Kconfig | 11 + drivers/md/Makefile | 1 + drivers/md/md-bitmap.c | 9 - drivers/md/md-bitmap.h | 31 +- drivers/md/md-llbitmap.c | 1626 ++++++++++++++++++++++++++++++ drivers/md/md.c | 6 + drivers/md/md.h | 4 +- 8 files changed, 1696 insertions(+), 12 deletions(-) create mode 100644 drivers/md/md-llbitmap.c diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 001363f81850..1c2eacc94758 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -387,6 +387,8 @@ All md devices contain: No bitmap bitmap The default internal bitmap + llbitmap + The lockless internal bitmap If bitmap_type is not none, then additional bitmap attributes bitmap/xxx or llbitmap/xxx will be created after md device KOBJ_CHANGE event. @@ -447,6 +449,24 @@ If bitmap_type is bitmap, then the md device will also contain: once the array becomes non-degraded, and this fact has been recorded in the metadata. +If bitmap_type is llbitmap, then the md device will also contain: + + llbitmap/bits + This is read-only, show status of bitmap bits, the number of each + value. + + llbitmap/metadata + This is read-only, show bitmap metadata, include chunksize, chunkshift, + chunks, offset and daemon_sleep. + + llbitmap/daemon_sleep + This is read-write, time in seconds that daemon function will be + triggered to clear dirty bits. + + llbitmap/barrier_idle + This is read-write, time in seconds that page barrier will be idled, + means dirty bits in the page will be cleared. + As component devices are added to an md array, they appear in the ``md`` directory as new directories named:: diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index f913579e731c..07c19b2182ca 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -52,6 +52,17 @@ config MD_BITMAP If unsure, say Y. +config MD_LLBITMAP + bool "MD RAID lockless bitmap support" + depends on BLK_DEV_MD + help + If you say Y here, support for the lockless write intent bitmap will + be enabled. + + Note, this is an experimental feature. + + If unsure, say N. 
+ config MD_AUTODETECT bool "Autodetect RAID arrays during kernel boot" depends on BLK_DEV_MD=y diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 2e18147a9c40..5a51b3408b70 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,6 +29,7 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o md-mod-y += md.o md-mod-$(CONFIG_MD_BITMAP) += md-bitmap.o +md-mod-$(CONFIG_MD_LLBITMAP) += md-llbitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o linear-y += md-linear.o diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index dc050ff94d5b..84b7e2af6dba 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -34,15 +34,6 @@ #include "md-bitmap.h" #include "md-cluster.h" -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - /* * in-memory bitmap: * diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 5f41724cbcd8..b42a28fa83a0 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -9,10 +9,26 @@ #define BITMAP_MAGIC 0x6d746962 +/* + * version 3 is host-endian order, this is deprecated and not used for new + * array + */ +#define BITMAP_MAJOR_LO 3 +#define BITMAP_MAJOR_HOSTENDIAN 3 +/* version 4 is little-endian order, the default value */ +#define BITMAP_MAJOR_HI 4 +/* version 5 is only used for cluster */ +#define BITMAP_MAJOR_CLUSTERED 5 +/* version 6 is only used for lockless bitmap */ +#define BITMAP_MAJOR_LOCKLESS 6 + /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { - BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_FIRST_USE = 3, /* llbitmap is just created */ + BITMAP_CLEAN = 4, /* llbitmap is created with assume_clean */ + BITMAP_DAEMON_BUSY = 5, /* llbitmap daemon is not finished after daemon_sleep */ BITMAP_HOSTENDIAN =15, }; @@ -166,4 +182,17 @@ static inline void md_bitmap_exit(void) } #endif +#ifdef CONFIG_MD_LLBITMAP +int md_llbitmap_init(void); +void md_llbitmap_exit(void); +#else +static inline int md_llbitmap_init(void) +{ + return 0; +} +static inline void md_llbitmap_exit(void) +{ +} +#endif + #endif diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c new file mode 100644 index 000000000000..3337d5c7e7e5 --- /dev/null +++ b/drivers/md/md-llbitmap.c @@ -0,0 +1,1626 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "md.h" +#include "md-bitmap.h" + +/* + * #### Background + * + * Redundant data is used to enhance data fault tolerance, and the storage + * methods for redundant data vary depending on the RAID levels. And it's + * important to maintain the consistency of redundant data. + * + * Bitmap is used to record which data blocks have been synchronized and which + * ones need to be resynchronized or recovered. Each bit in the bitmap + * represents a segment of data in the array. When a bit is set, it indicates + * that the multiple redundant copies of that data segment may not be + * consistent. Data synchronization can be performed based on the bitmap after + * power failure or readding a disk. 
If there is no bitmap, a full disk + * synchronization is required. + * + * #### Key Features + * + * - IO fastpath is lockless, if user issues lots of write IO to the same + * bitmap bit in a short time, only the first write has additional overhead + * to update bitmap bit, no additional overhead for the following writes; + * - support only resync or recover written data, means in the case creating + * new array or replacing with a new disk, there is no need to do a full disk + * resync/recovery; + * + * #### Key Concept + * + * ##### State Machine + * + * Each bit is one byte, contain 6 different states, see llbitmap_state. And + * there are total 8 different actions, see llbitmap_action, can change state: + * + * llbitmap state machine: transitions between states + * + * | | Startwrite | Startsync | Endsync | Abortsync| + * | --------- | ---------- | --------- | ------- | ------- | + * | Unwritten | Dirty | x | x | x | + * | Clean | Dirty | x | x | x | + * | Dirty | x | x | x | x | + * | NeedSync | x | Syncing | x | x | + * | Syncing | x | Syncing | Dirty | NeedSync | + * + * | | Reload | Daemon | Discard | Stale | + * | --------- | -------- | ------ | --------- | --------- | + * | Unwritten | x | x | x | x | + * | Clean | x | x | Unwritten | NeedSync | + * | Dirty | NeedSync | Clean | Unwritten | NeedSync | + * | NeedSync | x | x | Unwritten | x | + * | Syncing | NeedSync | x | Unwritten | NeedSync | + * + * Typical scenarios: + * + * 1) Create new array + * All bits will be set to Unwritten by default, if --assume-clean is set, + * all bits will be set to Clean instead. + * + * 2) write data, raid1/raid10 have full copy of data, while raid456 doesn't and + * rely on xor data + * + * 2.1) write new data to raid1/raid10: + * Unwritten --StartWrite--> Dirty + * + * 2.2) write new data to raid456: + * Unwritten --StartWrite--> NeedSync + * + * Because the initial recover for raid456 is skipped, the xor data is not built + * yet, the bit must be set to NeedSync first and after lazy initial recover is + * finished, the bit will finally set to Dirty(see 5.1 and 5.4); + * + * 2.3) cover write + * Clean --StartWrite--> Dirty + * + * 3) daemon, if the array is not degraded: + * Dirty --Daemon--> Clean + * + * 4) discard + * {Clean, Dirty, NeedSync, Syncing} --Discard--> Unwritten + * + * 5) resync and recover + * + * 5.1) common process + * NeedSync --Startsync--> Syncing --Endsync--> Dirty --Daemon--> Clean + * + * 5.2) resync after power failure + * Dirty --Reload--> NeedSync + * + * 5.3) recover while replacing with a new disk + * By default, the old bitmap framework will recover all data, and llbitmap + * implements this by a new helper, see llbitmap_skip_sync_blocks: + * + * skip recover for bits other than dirty or clean; + * + * 5.4) lazy initial recover for raid5: + * By default, the old bitmap framework will only allow new recover when there + * are spares(new disk), a new recovery flag MD_RECOVERY_LAZY_RECOVER is added + * to perform raid456 lazy recover for set bits(from 2.2). + * + * 6. 
special handling for degraded array: + * + * - Dirty bits will never be cleared, daemon will just do nothing, so that if + * a disk is readded, Clean bits can be skipped with recovery; + * - Dirty bits will convert to Syncing from start write, to do data recovery + * for new added disks; + * - New write will convert bits to NeedSync directly; + * + * ##### Bitmap IO + * + * ##### Chunksize + * + * The default bitmap size is 128k, incluing 1k bitmap super block, and + * the default size of segment of data in the array each bit(chunksize) is 64k, + * and chunksize will adjust to twice the old size each time if the total number + * bits is not less than 127k.(see llbitmap_init) + * + * ##### READ + * + * While creating bitmap, all pages will be allocated and read for llbitmap, + * there won't be read afterwards + * + * ##### WRITE + * + * WRITE IO is divided into logical_block_size of the array, the dirty state + * of each block is tracked independently, for example: + * + * each page is 4k, contain 8 blocks; each block is 512 bytes contain 512 bit; + * + * | page0 | page1 | ... | page 31 | + * | | + * | \-----------------------\ + * | | + * | block0 | block1 | ... | block 8| + * | | + * | \-----------------\ + * | | + * | bit0 | bit1 | ... | bit511 | + * + * From IO path, if one bit is changed to Dirty or NeedSync, the corresponding + * subpage will be marked dirty, such block must write first before the IO is + * issued. This behaviour will affect IO performance, to reduce the impact, if + * multiple bits are changed in the same block in a short time, all bits in this + * block will be changed to Dirty/NeedSync, so that there won't be any overhead + * until daemon clears dirty bits. + * + * ##### Dirty Bits synchronization + * + * IO fast path will set bits to dirty, and those dirty bits will be cleared + * by daemon after IO is done. llbitmap_page_ctl is used to synchronize between + * IO path and daemon; + * + * IO path: + * 1) try to grab a reference, if succeed, set expire time after 5s and return; + * 2) if failed to grab a reference, wait for daemon to finish clearing dirty + * bits; + * + * Daemon (Daemon will be woken up every daemon_sleep seconds): + * For each page: + * 1) check if page expired, if not skip this page; for expired page: + * 2) suspend the page and wait for inflight write IO to be done; + * 3) change dirty page to clean; + * 4) resume the page; + */ + +#define BITMAP_DATA_OFFSET 1024 + +/* 64k is the max IO size of sync IO for raid1/raid10 */ +#define MIN_CHUNK_SIZE (64 * 2) + +/* By default, daemon will be woken up every 30s */ +#define DEFAULT_DAEMON_SLEEP 30 + +/* + * Dirtied bits that have not been accessed for more than 5s will be cleared + * by daemon. 
+ */ +#define DEFAULT_BARRIER_IDLE 5 + +enum llbitmap_state { + /* No valid data, init state after assemble the array */ + BitUnwritten = 0, + /* data is consistent */ + BitClean, + /* data will be consistent after IO is done, set directly for writes */ + BitDirty, + /* + * data need to be resynchronized: + * 1) set directly for writes if array is degraded, prevent full disk + * synchronization after readding a disk; + * 2) reassemble the array after power failure, and dirty bits are + * found after reloading the bitmap; + * 3) set for first write for raid5, to build initial xor data lazily + */ + BitNeedSync, + /* data is synchronizing */ + BitSyncing, + BitStateCount, + BitNone = 0xff, +}; + +enum llbitmap_action { + /* User write new data, this is the only action from IO fast path */ + BitmapActionStartwrite = 0, + /* Start recovery */ + BitmapActionStartsync, + /* Finish recovery */ + BitmapActionEndsync, + /* Failed recovery */ + BitmapActionAbortsync, + /* Reassemble the array */ + BitmapActionReload, + /* Daemon thread is trying to clear dirty bits */ + BitmapActionDaemon, + /* Data is deleted */ + BitmapActionDiscard, + /* + * Bitmap is stale, mark all bits in addition to BitUnwritten to + * BitNeedSync. + */ + BitmapActionStale, + BitmapActionCount, + /* Init state is BitUnwritten */ + BitmapActionInit, +}; + +enum llbitmap_page_state { + LLPageFlush = 0, + LLPageDirty, +}; + +struct llbitmap_page_ctl { + char *state; + struct page *page; + unsigned long expire; + unsigned long flags; + wait_queue_head_t wait; + struct percpu_ref active; + /* Per block size dirty state, maximum 64k page / 1 sector = 128 */ + unsigned long dirty[]; +}; + +struct llbitmap { + struct mddev *mddev; + struct llbitmap_page_ctl **pctl; + + unsigned int nr_pages; + unsigned int io_size; + unsigned int blocks_per_page; + + /* shift of one chunk */ + unsigned long chunkshift; + /* size of one chunk in sector */ + unsigned long chunksize; + /* total number of chunks */ + unsigned long chunks; + unsigned long last_end_sync; + /* + * time in seconds that dirty bits will be cleared if the page is not + * accessed. 
+ */ + unsigned long barrier_idle; + /* fires on first BitDirty state */ + struct timer_list pending_timer; + struct work_struct daemon_work; + + unsigned long flags; + __u64 events_cleared; + + /* for slow disks */ + atomic_t behind_writes; + wait_queue_head_t behind_wait; +}; + +struct llbitmap_unplug_work { + struct work_struct work; + struct llbitmap *llbitmap; + struct completion *done; +}; + +static struct workqueue_struct *md_llbitmap_io_wq; +static struct workqueue_struct *md_llbitmap_unplug_wq; + +static char state_machine[BitStateCount][BitmapActionCount] = { + [BitUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitNone, + [BitmapActionStale] = BitNone, + }, + [BitClean] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitDirty] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitClean, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, + [BitNeedSync] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNone, + }, + [BitSyncing] = { + [BitmapActionStartwrite] = BitNone, + [BitmapActionStartsync] = BitSyncing, + [BitmapActionEndsync] = BitDirty, + [BitmapActionAbortsync] = BitNeedSync, + [BitmapActionReload] = BitNeedSync, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitNeedSync, + }, +}; + +static void __llbitmap_flush(struct mddev *mddev); + +static enum llbitmap_state llbitmap_read(struct llbitmap *llbitmap, loff_t pos) +{ + unsigned int idx; + unsigned int offset; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + offset = offset_in_page(pos); + + return llbitmap->pctl[idx]->state[offset]; +} + +/* set all the bits in the subpage as dirty */ +static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, + struct llbitmap_page_ctl *pctl, + unsigned int block) +{ + bool level_456 = raid_is_456(llbitmap->mddev); + unsigned int io_size = llbitmap->io_size; + int pos; + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + switch (pctl->state[pos]) { + case BitUnwritten: + pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; + break; + case BitClean: + pctl->state[pos] = BitDirty; + break; + }; + } +} + +static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, + int offset) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + unsigned int io_size = llbitmap->io_size; + int block = offset / io_size; + int pos; + + if (!test_bit(LLPageDirty, &pctl->flags)) + set_bit(LLPageDirty, &pctl->flags); + + /* + * For degraded array, dirty bits will never be cleared, and we must + * resync all the dirty bits, hence skip infect new dirty bits to + * prevent resync unnecessary data. 
+ */ + if (llbitmap->mddev->degraded) { + set_bit(block, pctl->dirty); + return; + } + + /* + * The subpage usually contains a total of 512 bits. If any single bit + * within the subpage is marked as dirty, the entire sector will be + * written. To avoid impacting write performance, when multiple bits + * within the same sector are modified within llbitmap->barrier_idle, + * all bits in the sector will be collectively marked as dirty at once. + */ + if (test_and_set_bit(block, pctl->dirty)) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + + for (pos = block * io_size; pos < (block + 1) * io_size; pos++) { + if (pos == offset) + continue; + if (pctl->state[pos] == BitDirty || + pctl->state[pos] == BitNeedSync) { + llbitmap_infect_dirty_bits(llbitmap, pctl, block); + return; + } + } +} + +static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, + loff_t pos) +{ + unsigned int idx; + unsigned int bit; + + pos += BITMAP_DATA_OFFSET; + idx = pos >> PAGE_SHIFT; + bit = offset_in_page(pos); + + llbitmap->pctl[idx]->state[bit] = state; + if (state == BitDirty || state == BitNeedSync) + llbitmap_set_page_dirty(llbitmap, idx, bit); +} + +static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) +{ + struct mddev *mddev = llbitmap->mddev; + struct page *page = NULL; + struct md_rdev *rdev; + + if (llbitmap->pctl && llbitmap->pctl[idx]) + page = llbitmap->pctl[idx]->page; + if (page) + return page; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return ERR_PTR(-ENOMEM); + + rdev_for_each(rdev, mddev) { + sector_t sector; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + + (idx << PAGE_SECTORS_SHIFT); + + if (sync_page_io(rdev, sector, PAGE_SIZE, page, REQ_OP_READ, + true)) + return page; + + md_error(mddev, rdev); + } + + __free_page(page); + return ERR_PTR(-EIO); +} + +static void llbitmap_write_page(struct llbitmap *llbitmap, int idx) +{ + struct page *page = llbitmap->pctl[idx]->page; + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + int block; + + for (block = 0; block < llbitmap->blocks_per_page; block++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (!test_and_clear_bit(block, pctl->dirty)) + continue; + + rdev_for_each(rdev, mddev) { + sector_t sector; + sector_t bit_sector = llbitmap->io_size >> SECTOR_SHIFT; + + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + sector = mddev->bitmap_info.offset + rdev->sb_start + + (idx << PAGE_SECTORS_SHIFT) + + block * bit_sector; + md_write_metadata(mddev, rdev, sector, + llbitmap->io_size, page, + block * llbitmap->io_size); + } + } +} + +static void active_release(struct percpu_ref *ref) +{ + struct llbitmap_page_ctl *pctl = + container_of(ref, struct llbitmap_page_ctl, active); + + wake_up(&pctl->wait); +} + +static void llbitmap_free_pages(struct llbitmap *llbitmap) +{ + int i; + + if (!llbitmap->pctl) + return; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + if (!pctl || !pctl->page) + break; + + __free_page(pctl->page); + percpu_ref_exit(&pctl->active); + } + + kfree(llbitmap->pctl[0]); + kfree(llbitmap->pctl); + llbitmap->pctl = NULL; +} + +static int llbitmap_cache_pages(struct llbitmap *llbitmap) +{ + struct llbitmap_page_ctl *pctl; + unsigned int nr_pages = DIV_ROUND_UP(llbitmap->chunks + + BITMAP_DATA_OFFSET, PAGE_SIZE); + unsigned int size = struct_size(pctl, dirty, BITS_TO_LONGS( + 
llbitmap->blocks_per_page)); + int i; + + llbitmap->pctl = kmalloc_array(nr_pages, sizeof(void *), + GFP_KERNEL | __GFP_ZERO); + if (!llbitmap->pctl) + return -ENOMEM; + + size = round_up(size, cache_line_size()); + pctl = kmalloc_array(nr_pages, size, GFP_KERNEL | __GFP_ZERO); + if (!pctl) { + kfree(llbitmap->pctl); + return -ENOMEM; + } + + llbitmap->nr_pages = nr_pages; + + for (i = 0; i < nr_pages; i++, pctl = (void *)pctl + size) { + struct page *page = llbitmap_read_page(llbitmap, i); + + llbitmap->pctl[i] = pctl; + + if (IS_ERR(page)) { + llbitmap_free_pages(llbitmap); + return PTR_ERR(page); + } + + if (percpu_ref_init(&pctl->active, active_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { + __free_page(page); + llbitmap_free_pages(llbitmap); + return -ENOMEM; + } + + pctl->page = page; + pctl->state = page_address(page); + init_waitqueue_head(&pctl->wait); + } + + return 0; +} + +static void llbitmap_init_state(struct llbitmap *llbitmap) +{ + enum llbitmap_state state = BitUnwritten; + unsigned long i; + + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + state = BitClean; + + for (i = 0; i < llbitmap->chunks; i++) + llbitmap_write(llbitmap, state, i); +} + +/* The return value is only used from resync, where @start == @end. */ +static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, + unsigned long start, + unsigned long end, + enum llbitmap_action action) +{ + struct mddev *mddev = llbitmap->mddev; + enum llbitmap_state state = BitNone; + bool level_456 = raid_is_456(llbitmap->mddev); + bool need_resync = false; + bool need_recovery = false; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return BitNone; + + if (action == BitmapActionInit) { + llbitmap_init_state(llbitmap); + return BitNone; + } + + while (start <= end) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) { + pr_err("%s: invalid bit %lu state %d action %d, forcing resync\n", + __func__, start, c, action); + state = BitNeedSync; + goto write_bitmap; + } + + if (c == BitNeedSync) + need_resync = !mddev->degraded; + + state = state_machine[c][action]; + +write_bitmap: + if (unlikely(mddev->degraded)) { + /* For degraded array, mark new data as need sync. */ + if (state == BitDirty && + action == BitmapActionStartwrite) + state = BitNeedSync; + /* + * For degraded array, resync dirty data as well, noted + * if array is still degraded after resync is done, all + * new data will still be dirty until array is clean. + */ + else if (c == BitDirty && + action == BitmapActionStartsync) + state = BitSyncing; + } else if (c == BitUnwritten && state == BitDirty && + action == BitmapActionStartwrite && level_456) { + /* Delay raid456 initial recovery to first write. 
*/ + state = BitNeedSync; + } + + if (state == BitNone) { + start++; + continue; + } + + llbitmap_write(llbitmap, state, start); + + if (state == BitNeedSync) + need_resync = !mddev->degraded; + else if (state == BitDirty && + !timer_pending(&llbitmap->pending_timer)) + mod_timer(&llbitmap->pending_timer, + jiffies + mddev->bitmap_info.daemon_sleep * HZ); + + start++; + } + + if (need_resync && level_456) + need_recovery = true; + + if (need_recovery) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } else if (need_resync) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } + + return state; +} + +static void llbitmap_raise_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + +retry: + if (likely(percpu_ref_tryget_live(&pctl->active))) { + WRITE_ONCE(pctl->expire, jiffies + llbitmap->barrier_idle * HZ); + return; + } + + wait_event(pctl->wait, !percpu_ref_is_dying(&pctl->active)); + goto retry; +} + +static void llbitmap_release_barrier(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_put(&pctl->active); +} + +static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + percpu_ref_kill(&pctl->active); + + if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) + return -ETIMEDOUT; + + return 0; +} + +static void llbitmap_resume(struct llbitmap *llbitmap, int page_idx) +{ + struct llbitmap_page_ctl *pctl = llbitmap->pctl[page_idx]; + + pctl->expire = LONG_MAX; + percpu_ref_resurrect(&pctl->active); + wake_up(&pctl->wait); +} + +static int llbitmap_check_support(struct mddev *mddev) +{ + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/llbitmap: %s: array with journal cannot have bitmap\n", + mdname(mddev)); + return -EBUSY; + } + + if (mddev->bitmap_info.space == 0) { + if (mddev->bitmap_info.default_space == 0) { + pr_notice("md/llbitmap: %s: no space for bitmap\n", + mdname(mddev)); + return -ENOSPC; + } + } + + if (!mddev->persistent) { + pr_notice("md/llbitmap: %s: array must be persistent\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.file) { + pr_notice("md/llbitmap: %s: doesn't support bitmap file\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev->bitmap_info.external) { + pr_notice("md/llbitmap: %s: doesn't support external metadata\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + if (mddev_is_dm(mddev)) { + pr_notice("md/llbitmap: %s: doesn't support dm-raid\n", + mdname(mddev)); + return -EOPNOTSUPP; + } + + return 0; +} + +static int llbitmap_init(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + sector_t blocks = mddev->resync_max_sectors; + unsigned long chunksize = MIN_CHUNK_SIZE; + unsigned long chunks = DIV_ROUND_UP(blocks, chunksize); + unsigned long space = mddev->bitmap_info.space << SECTOR_SHIFT; + int ret; + + while (chunks > space) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP(blocks, chunksize); + } + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + mddev->bitmap_info.daemon_sleep = DEFAULT_DAEMON_SLEEP; + + ret = 
llbitmap_cache_pages(llbitmap); + if (ret) + return ret; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionInit); + /* flush initial llbitmap to disk */ + __llbitmap_flush(mddev); + + return 0; +} + +static int llbitmap_read_sb(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + unsigned long daemon_sleep; + unsigned long chunksize; + unsigned long events; + struct page *sb_page; + bitmap_super_t *sb; + int ret = -EINVAL; + + if (!mddev->bitmap_info.offset) { + pr_err("md/llbitmap: %s: no super block found", mdname(mddev)); + return -EINVAL; + } + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("md/llbitmap: %s: read super block failed", + mdname(mddev)); + return -EIO; + } + + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_err("md/llbitmap: %s: invalid super block magic number", + mdname(mddev)); + goto out_put_page; + } + + if (sb->version != cpu_to_le32(BITMAP_MAJOR_LOCKLESS)) { + pr_err("md/llbitmap: %s: invalid super block version", + mdname(mddev)); + goto out_put_page; + } + + if (memcmp(sb->uuid, mddev->uuid, 16)) { + pr_err("md/llbitmap: %s: bitmap superblock UUID mismatch\n", + mdname(mddev)); + goto out_put_page; + } + + if (mddev->bitmap_info.space == 0) { + int room = le32_to_cpu(sb->sectors_reserved); + + if (room) + mddev->bitmap_info.space = room; + else + mddev->bitmap_info.space = mddev->bitmap_info.default_space; + } + llbitmap->flags = le32_to_cpu(sb->state); + if (test_and_clear_bit(BITMAP_FIRST_USE, &llbitmap->flags)) { + ret = llbitmap_init(llbitmap); + goto out_put_page; + } + + chunksize = le32_to_cpu(sb->chunksize); + if (!is_power_of_2(chunksize)) { + pr_err("md/llbitmap: %s: chunksize not a power of 2", + mdname(mddev)); + goto out_put_page; + } + + if (chunksize < DIV_ROUND_UP(mddev->resync_max_sectors, + mddev->bitmap_info.space << SECTOR_SHIFT)) { + pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", + mdname(mddev), chunksize, mddev->resync_max_sectors, + mddev->bitmap_info.space); + goto out_put_page; + } + + daemon_sleep = le32_to_cpu(sb->daemon_sleep); + if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) { + pr_err("md/llbitmap: %s: daemon sleep %lu period out of range", + mdname(mddev), daemon_sleep); + goto out_put_page; + } + + events = le64_to_cpu(sb->events); + if (events < mddev->events) { + pr_warn("md/llbitmap :%s: bitmap file is out of date (%lu < %llu) -- forcing full recovery", + mdname(mddev), events, mddev->events); + set_bit(BITMAP_STALE, &llbitmap->flags); + } + + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + mddev->bitmap_info.chunksize = chunksize; + mddev->bitmap_info.daemon_sleep = daemon_sleep; + + llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; + llbitmap->chunksize = chunksize; + llbitmap->chunks = DIV_ROUND_UP(mddev->resync_max_sectors, chunksize); + llbitmap->chunkshift = ffz(~chunksize); + ret = llbitmap_cache_pages(llbitmap); + +out_put_page: + __free_page(sb_page); + kunmap_local(sb); + return ret; +} + +static void llbitmap_pending_timer_fn(struct timer_list *pending_timer) +{ + struct llbitmap *llbitmap = + container_of(pending_timer, struct llbitmap, pending_timer); + + if (work_busy(&llbitmap->daemon_work)) { + pr_warn("md/llbitmap: %s daemon_work not finished in %lu seconds\n", + mdname(llbitmap->mddev), + llbitmap->mddev->bitmap_info.daemon_sleep); + set_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags); + return; + } + + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); +} + 
+static void md_llbitmap_daemon_fn(struct work_struct *work) +{ + struct llbitmap *llbitmap = + container_of(work, struct llbitmap, daemon_work); + unsigned long start; + unsigned long end; + bool restart; + int idx; + + if (llbitmap->mddev->degraded) + return; +retry: + start = 0; + end = min(llbitmap->chunks, PAGE_SIZE - BITMAP_DATA_OFFSET) - 1; + restart = false; + + for (idx = 0; idx < llbitmap->nr_pages; idx++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; + + if (idx > 0) { + start = end + 1; + end = min(end + PAGE_SIZE, llbitmap->chunks - 1); + } + + if (!test_bit(LLPageFlush, &pctl->flags) && + time_before(jiffies, pctl->expire)) { + restart = true; + continue; + } + + if (llbitmap_suspend_timeout(llbitmap, idx) < 0) { + pr_warn("md/llbitmap: %s: %s waiting for page %d timeout\n", + mdname(llbitmap->mddev), __func__, idx); + continue; + } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDaemon); + llbitmap_resume(llbitmap, idx); + } + + /* + * If the daemon took a long time to finish, retry to prevent missing + * clearing dirty bits. + */ + if (test_and_clear_bit(BITMAP_DAEMON_BUSY, &llbitmap->flags)) + goto retry; + + /* If some page is dirty but not expired, setup timer again */ + if (restart) + mod_timer(&llbitmap->pending_timer, + jiffies + llbitmap->mddev->bitmap_info.daemon_sleep * HZ); +} + +static int llbitmap_create(struct mddev *mddev) +{ + struct llbitmap *llbitmap; + int ret; + + ret = llbitmap_check_support(mddev); + if (ret) + return ret; + + llbitmap = kzalloc(sizeof(*llbitmap), GFP_KERNEL); + if (!llbitmap) + return -ENOMEM; + + llbitmap->mddev = mddev; + llbitmap->io_size = bdev_logical_block_size(mddev->gendisk->part0); + llbitmap->blocks_per_page = PAGE_SIZE / llbitmap->io_size; + + timer_setup(&llbitmap->pending_timer, llbitmap_pending_timer_fn, 0); + INIT_WORK(&llbitmap->daemon_work, md_llbitmap_daemon_fn); + atomic_set(&llbitmap->behind_writes, 0); + init_waitqueue_head(&llbitmap->behind_wait); + + mutex_lock(&mddev->bitmap_info.mutex); + mddev->bitmap = llbitmap; + ret = llbitmap_read_sb(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); + if (ret) { + kfree(llbitmap); + mddev->bitmap = NULL; + } + + return ret; +} + +static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long chunks; + + if (chunksize == 0) + chunksize = llbitmap->chunksize; + + /* If there is enough space, leave the chunksize unchanged. 
*/ + chunks = DIV_ROUND_UP(blocks, chunksize); + while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { + chunksize = chunksize << 1; + chunks = DIV_ROUND_UP(blocks, chunksize); + } + + llbitmap->chunkshift = ffz(~chunksize); + llbitmap->chunksize = chunksize; + llbitmap->chunks = chunks; + + return 0; +} + +static int llbitmap_load(struct mddev *mddev) +{ + enum llbitmap_action action = BitmapActionReload; + struct llbitmap *llbitmap = mddev->bitmap; + + if (test_and_clear_bit(BITMAP_STALE, &llbitmap->flags)) + action = BitmapActionStale; + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, action); + return 0; +} + +static void llbitmap_destroy(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + mutex_lock(&mddev->bitmap_info.mutex); + + timer_delete_sync(&llbitmap->pending_timer); + flush_workqueue(md_llbitmap_io_wq); + flush_workqueue(md_llbitmap_unplug_wq); + + mddev->bitmap = NULL; + llbitmap_free_pages(llbitmap); + kfree(llbitmap); + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static void llbitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = offset >> llbitmap->chunkshift; + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); + + while (page_start <= page_end) { + llbitmap_raise_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, + unsigned long sectors) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize); + unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; + int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; + + while (page_start <= page_end) { + llbitmap_release_barrier(llbitmap, page_start); + page_start++; + } +} + +static void llbitmap_unplug_fn(struct work_struct *work) +{ + struct llbitmap_unplug_work *unplug_work = + container_of(work, struct llbitmap_unplug_work, work); + struct llbitmap *llbitmap = unplug_work->llbitmap; + struct blk_plug plug; + int i; + + blk_start_plug(&plug); + + for (i = 0; i < 
llbitmap->nr_pages; i++) { + if (!test_bit(LLPageDirty, &llbitmap->pctl[i]->flags) || + !test_and_clear_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + continue; + + llbitmap_write_page(llbitmap, i); + } + + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); + complete(unplug_work->done); +} + +static bool llbitmap_dirty(struct llbitmap *llbitmap) +{ + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + if (test_bit(LLPageDirty, &llbitmap->pctl[i]->flags)) + return true; + + return false; +} + +static void llbitmap_unplug(struct mddev *mddev, bool sync) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct llbitmap *llbitmap = mddev->bitmap; + struct llbitmap_unplug_work unplug_work = { + .llbitmap = llbitmap, + .done = &done, + }; + + if (!llbitmap_dirty(llbitmap)) + return; + + /* + * Issue new bitmap IO under submit_bio() context will deadlock: + * - the bio will wait for bitmap bio to be done, before it can be + * issued; + * - bitmap bio will be added to current->bio_list and wait for this + * bio to be issued; + */ + INIT_WORK_ONSTACK(&unplug_work.work, llbitmap_unplug_fn); + queue_work(md_llbitmap_unplug_wq, &unplug_work.work); + wait_for_completion(&done); + destroy_work_on_stack(&unplug_work.work); +} + +/* + * Force to write all bitmap pages to disk, called when stopping the array, or + * every daemon_sleep seconds when sync_thread is running. + */ +static void __llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + struct blk_plug plug; + int i; + + blk_start_plug(&plug); + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* mark all blocks as dirty */ + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + llbitmap_write_page(llbitmap, i); + } + blk_finish_plug(&plug); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_flush(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) + set_bit(LLPageFlush, &llbitmap->pctl[i]->flags); + + timer_delete_sync(&llbitmap->pending_timer); + queue_work(md_llbitmap_io_wq, &llbitmap->daemon_work); + flush_work(&llbitmap->daemon_work); + + __llbitmap_flush(mddev); +} + +/* This is used for raid5 lazy initial recovery */ +static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + return c == BitClean || c == BitDirty; +} + +static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + int blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + enum llbitmap_state c = llbitmap_read(llbitmap, p); + + /* always skip unwritten blocks */ + if (c == BitUnwritten) + return blocks; + + /* For degraded array, don't skip */ + if (mddev->degraded) + return 0; + + /* For resync also skip clean/dirty blocks */ + if ((c == BitClean || c == BitDirty) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + return blocks; + + return 0; +} + +static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks, bool degraded) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + /* + * Handle one bit at a time, this is much simpler. 
And it doesn't matter + * if md_do_sync() loop more times. + */ + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + return llbitmap_state_machine(llbitmap, p, p, + BitmapActionStartsync) == BitSyncing; +} + +/* Something is wrong, sync_thread stop at @offset */ +static void llbitmap_end_sync(struct mddev *mddev, sector_t offset, + sector_t *blocks) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long p = offset >> llbitmap->chunkshift; + + *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); + llbitmap_state_machine(llbitmap, p, llbitmap->chunks - 1, + BitmapActionAbortsync); +} + +/* A full sync_thread is finished */ +static void llbitmap_close_sync(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + int i; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + /* let daemon_fn clear dirty bits immediately */ + WRITE_ONCE(pctl->expire, jiffies); + } + + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionEndsync); +} + +/* + * sync_thread have reached @sector, update metadata every daemon_sleep seconds, + * just in case sync_thread have to restart after power failure. + */ +static void llbitmap_cond_end_sync(struct mddev *mddev, sector_t sector, + bool force) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (sector == 0) { + llbitmap->last_end_sync = jiffies; + return; + } + + if (time_before(jiffies, llbitmap->last_end_sync + + HZ * mddev->bitmap_info.daemon_sleep)) + return; + + wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + + mddev->curr_resync_completed = sector; + set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); + llbitmap_state_machine(llbitmap, 0, sector >> llbitmap->chunkshift, + BitmapActionEndsync); + __llbitmap_flush(mddev); + + llbitmap->last_end_sync = jiffies; + sysfs_notify_dirent_safe(mddev->sysfs_completed); +} + +static bool llbitmap_enabled(void *data, bool flush) +{ + struct llbitmap *llbitmap = data; + + return llbitmap && !test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); +} + +static void llbitmap_dirty_bits(struct mddev *mddev, unsigned long s, + unsigned long e) +{ + llbitmap_state_machine(mddev->bitmap, s, e, BitmapActionStartwrite); +} + +static void llbitmap_write_sb(struct llbitmap *llbitmap) +{ + int nr_blocks = DIV_ROUND_UP(BITMAP_DATA_OFFSET, llbitmap->io_size); + + bitmap_fill(llbitmap->pctl[0]->dirty, nr_blocks); + llbitmap_write_page(llbitmap, 0); + md_super_wait(llbitmap->mddev); +} + +static void llbitmap_update_sb(void *data) +{ + struct llbitmap *llbitmap = data; + struct mddev *mddev = llbitmap->mddev; + struct page *sb_page; + bitmap_super_t *sb; + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) + return; + + sb_page = llbitmap_read_page(llbitmap, 0); + if (IS_ERR(sb_page)) { + pr_err("%s: %s: read super block failed", __func__, + mdname(mddev)); + set_bit(BITMAP_WRITE_ERROR, &llbitmap->flags); + return; + } + + if (mddev->events < llbitmap->events_cleared) + llbitmap->events_cleared = mddev->events; + + sb = kmap_local_page(sb_page); + sb->events = cpu_to_le64(mddev->events); + sb->state = cpu_to_le32(llbitmap->flags); + sb->chunksize = cpu_to_le32(llbitmap->chunksize); + sb->sync_size = cpu_to_le64(mddev->resync_max_sectors); + sb->events_cleared = cpu_to_le64(llbitmap->events_cleared); + sb->sectors_reserved = cpu_to_le32(mddev->bitmap_info.space); + sb->daemon_sleep = cpu_to_le32(mddev->bitmap_info.daemon_sleep); + + kunmap_local(sb); + llbitmap_write_sb(llbitmap); +} + 
+static int llbitmap_get_stats(void *data, struct md_bitmap_stats *stats) +{ + struct llbitmap *llbitmap = data; + + memset(stats, 0, sizeof(*stats)); + + stats->missing_pages = 0; + stats->pages = llbitmap->nr_pages; + stats->file_pages = llbitmap->nr_pages; + + stats->behind_writes = atomic_read(&llbitmap->behind_writes); + stats->behind_wait = wq_has_sleeper(&llbitmap->behind_wait); + stats->events_cleared = llbitmap->events_cleared; + + return 0; +} + +/* just flag all pages as needing to be written */ +static void llbitmap_write_all(struct mddev *mddev) +{ + int i; + struct llbitmap *llbitmap = mddev->bitmap; + + for (i = 0; i < llbitmap->nr_pages; i++) { + struct llbitmap_page_ctl *pctl = llbitmap->pctl[i]; + + set_bit(LLPageDirty, &pctl->flags); + bitmap_fill(pctl->dirty, llbitmap->blocks_per_page); + } +} + +static void llbitmap_start_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + atomic_inc(&llbitmap->behind_writes); +} + +static void llbitmap_end_behind_write(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (atomic_dec_and_test(&llbitmap->behind_writes)) + wake_up(&llbitmap->behind_wait); +} + +static void llbitmap_wait_behind_writes(struct mddev *mddev) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + if (!llbitmap) + return; + + wait_event(llbitmap->behind_wait, + atomic_read(&llbitmap->behind_writes) == 0); + +} + +static ssize_t bits_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + int bits[BitStateCount] = {0}; + loff_t start = 0; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + if (test_bit(BITMAP_WRITE_ERROR, &llbitmap->flags)) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "bitmap io error\n"); + } + + while (start < llbitmap->chunks) { + enum llbitmap_state c = llbitmap_read(llbitmap, start); + + if (c < 0 || c >= BitStateCount) + pr_err("%s: invalid bit %llu state %d\n", + __func__, start, c); + else + bits[c]++; + start++; + } + + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + bits[BitUnwritten], bits[BitClean], bits[BitDirty], + bits[BitNeedSync], bits[BitSyncing]); +} + +static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap; + ssize_t ret; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap) { + mutex_unlock(&mddev->bitmap_info.mutex); + return sprintf(page, "no bitmap\n"); + } + + ret = sprintf(page, "chunksize %lu\nchunkshift %lu\nchunks %lu\noffset %llu\ndaemon_sleep %lu\n", + llbitmap->chunksize, llbitmap->chunkshift, + llbitmap->chunks, mddev->bitmap_info.offset, + llbitmap->mddev->bitmap_info.daemon_sleep); + mutex_unlock(&mddev->bitmap_info.mutex); + + return ret; +} + +static struct md_sysfs_entry llbitmap_metadata = __ATTR_RO(metadata); + +static ssize_t +daemon_sleep_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.daemon_sleep); +} + +static ssize_t +daemon_sleep_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + mddev->bitmap_info.daemon_sleep = timeout; + return len; +} + +static struct md_sysfs_entry llbitmap_daemon_sleep = 
__ATTR_RW(daemon_sleep); + +static ssize_t +barrier_idle_show(struct mddev *mddev, char *page) +{ + struct llbitmap *llbitmap = mddev->bitmap; + + return sprintf(page, "%lu\n", llbitmap->barrier_idle); +} + +static ssize_t +barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap = mddev->bitmap; + unsigned long timeout; + int rv = kstrtoul(buf, 10, &timeout); + + if (rv) + return rv; + + llbitmap->barrier_idle = timeout; + return len; +} + +static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); + +static struct attribute *md_llbitmap_attrs[] = { + &llbitmap_bits.attr, + &llbitmap_metadata.attr, + &llbitmap_daemon_sleep.attr, + &llbitmap_barrier_idle.attr, + NULL +}; + +static struct attribute_group md_llbitmap_group = { + .name = "llbitmap", + .attrs = md_llbitmap_attrs, +}; + +static struct bitmap_operations llbitmap_ops = { + .head = { + .type = MD_BITMAP, + .id = ID_LLBITMAP, + .name = "llbitmap", + }, + + .enabled = llbitmap_enabled, + .create = llbitmap_create, + .resize = llbitmap_resize, + .load = llbitmap_load, + .destroy = llbitmap_destroy, + + .start_write = llbitmap_start_write, + .end_write = llbitmap_end_write, + .start_discard = llbitmap_start_discard, + .end_discard = llbitmap_end_discard, + .unplug = llbitmap_unplug, + .flush = llbitmap_flush, + + .start_behind_write = llbitmap_start_behind_write, + .end_behind_write = llbitmap_end_behind_write, + .wait_behind_writes = llbitmap_wait_behind_writes, + + .blocks_synced = llbitmap_blocks_synced, + .skip_sync_blocks = llbitmap_skip_sync_blocks, + .start_sync = llbitmap_start_sync, + .end_sync = llbitmap_end_sync, + .close_sync = llbitmap_close_sync, + .cond_end_sync = llbitmap_cond_end_sync, + + .update_sb = llbitmap_update_sb, + .get_stats = llbitmap_get_stats, + .dirty_bits = llbitmap_dirty_bits, + .write_all = llbitmap_write_all, + + .group = &md_llbitmap_group, +}; + +int md_llbitmap_init(void) +{ + md_llbitmap_io_wq = alloc_workqueue("md_llbitmap_io", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_io_wq) + return -ENOMEM; + + md_llbitmap_unplug_wq = alloc_workqueue("md_llbitmap_unplug", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!md_llbitmap_unplug_wq) { + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + return -ENOMEM; + } + + return register_md_submodule(&llbitmap_ops.head); +} + +void md_llbitmap_exit(void) +{ + destroy_workqueue(md_llbitmap_io_wq); + md_llbitmap_io_wq = NULL; + destroy_workqueue(md_llbitmap_unplug_wq); + md_llbitmap_unplug_wq = NULL; + unregister_md_submodule(&llbitmap_ops.head); +} diff --git a/drivers/md/md.c b/drivers/md/md.c index 2f1907f3bc25..1de550108756 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -10328,6 +10328,10 @@ static int __init md_init(void) if (ret) return ret; + ret = md_llbitmap_init(); + if (ret) + goto err_bitmap; + ret = -ENOMEM; md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) @@ -10359,6 +10363,8 @@ err_md: err_misc_wq: destroy_workqueue(md_wq); err_wq: + md_llbitmap_exit(); +err_bitmap: md_bitmap_exit(); return ret; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 7b6357879a84..1979c2d4fe89 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -26,7 +26,7 @@ enum md_submodule_type { MD_PERSONALITY = 0, MD_CLUSTER, - MD_BITMAP, /* TODO */ + MD_BITMAP, }; enum md_submodule_id { @@ -39,7 +39,7 @@ enum md_submodule_id { ID_RAID10 = 10, ID_CLUSTER, ID_BITMAP, - ID_LLBITMAP, /* TODO */ + ID_LLBITMAP, ID_BITMAP_NONE, }; From aba19ee71cd7c0253612d6a2a0b3a828ba9ece29 
Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Aug 2025 10:18:19 +0800 Subject: [PATCH 064/164] blk-mq: Move flush queue allocation into blk_mq_init_hctx() Move flush queue allocation into blk_mq_init_hctx() and its release into blk_mq_exit_hctx(), and prepare for replacing tags->lock with SRCU to draining inflight request walking. blk_mq_exit_hctx() is the last chance for us to get valid `tag_set` reference, and we need to add one SRCU to `tag_set` for freeing flush request via call_srcu(). It is safe to move flush queue & request release into blk_mq_exit_hctx(), because blk_mq_clear_flush_rq_mapping() clears the flush request reference int driver tags inflight request table, meantime inflight request walking is drained. Reviewed-by: Hannes Reinecke Reviewed-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 1 - block/blk-mq.c | 20 +++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 5c399ac562ea..58ec293373c6 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -34,7 +34,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); kfree(hctx->ctxs); diff --git a/block/blk-mq.c b/block/blk-mq.c index 9055cd624700..72f9274315f1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3939,6 +3939,9 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + blk_free_flush_queue(hctx->fq); + hctx->fq = NULL; + xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); @@ -3964,13 +3967,19 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; + + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); + if (!hctx->fq) + goto fail; + hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) - goto fail; + goto fail_free_fq; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) @@ -3987,6 +3996,9 @@ static int blk_mq_init_hctx(struct request_queue *q, exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + fail_free_fq: + blk_free_flush_queue(hctx->fq); + hctx->fq = NULL; fail: return -1; } @@ -4038,16 +4050,10 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); - hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); - if (!hctx->fq) - goto free_bitmap; - blk_mq_hctx_kobj_init(hctx); return hctx; - free_bitmap: - sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: From 9ad8e5af327904dcc52e64ee5ab731c7018ffb0f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Aug 2025 10:18:20 +0800 Subject: [PATCH 065/164] blk-mq: Pass tag_set to blk_mq_free_rq_map/tags To prepare for converting the tag->rqs freeing to be SRCU-based, the tag_set is needed in the freeing helper functions. This patch adds 'struct blk_mq_tag_set *' as the first parameter to blk_mq_free_rq_map() and blk_mq_free_tags(), and updates all their call sites. 
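As a rough illustration of the new calling convention (not part of the
patch; demo_set and demo_tags are hypothetical stand-ins for
blk_mq_tag_set and blk_mq_tags):

    #include <linux/slab.h>

    /* hypothetical stand-ins for struct blk_mq_tag_set / struct blk_mq_tags */
    struct demo_set { int id; };
    struct demo_tags { void *rqs; };

    /* old form: demo_free_tags(tags) had no way to reach per-set state */
    static void demo_free_tags(struct demo_set *set, struct demo_tags *tags)
    {
            /*
             * With the owning set in hand, a follow-up patch can route the
             * final kfree() through call_srcu(&set->tags_srcu, ...).
             */
            kfree(tags);
    }

    static void demo_free_rq_map(struct demo_set *set, struct demo_tags *tags)
    {
            kfree(tags->rqs);
            tags->rqs = NULL;
            demo_free_tags(set, tags);      /* call sites pass the set down */
    }
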
This allows access to the tag_set's SRCU structure in the next step, which will be used to free the tag maps after a grace period. No functional change is intended in this patch. Reviewed-by: Hannes Reinecke Reviewed-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 2 +- block/blk-mq.c | 10 +++++----- block/blk-mq.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 5cffa5668d0c..f09a4cbe486f 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -576,7 +576,7 @@ out_free_tags: return NULL; } -void blk_mq_free_tags(struct blk_mq_tags *tags) +void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); diff --git a/block/blk-mq.c b/block/blk-mq.c index 72f9274315f1..5efa0712aac7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3491,14 +3491,14 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } -void blk_mq_free_rq_map(struct blk_mq_tags *tags) +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, @@ -3560,7 +3560,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, err_free_rqs: kfree(tags->rqs); err_free_tags: - blk_mq_free_tags(tags); + blk_mq_free_tags(set, tags); return NULL; } @@ -4107,7 +4107,7 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); return NULL; } @@ -4135,7 +4135,7 @@ void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); - blk_mq_free_rq_map(tags); + blk_mq_free_rq_map(set, tags); } } diff --git a/block/blk-mq.h b/block/blk-mq.h index affb2e14b56e..b96a753809ab 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -59,7 +59,7 @@ void blk_mq_put_rq_ref(struct request *rq); */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -void blk_mq_free_rq_map(struct blk_mq_tags *tags); +void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, @@ -162,7 +162,7 @@ struct blk_mq_alloc_data { struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, unsigned int flags, int node); -void blk_mq_free_tags(struct blk_mq_tags *tags); +void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, From ad0d05dbddc1bf86e92220fea873176de6b12f78 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Aug 2025 10:18:21 +0800 Subject: [PATCH 066/164] blk-mq: Defer freeing of tags page_list to SRCU callback Tag iterators can race with the freeing of the request pages(tags->page_list), potentially leading to use-after-free issues. Defer the freeing of the page list and the tags structure itself until after an SRCU grace period has passed. 
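For reference, a minimal sketch of the deferred-free pattern being
introduced here, using a hypothetical struct foo and a standalone
srcu_struct in place of struct blk_mq_tags and set->tags_srcu:

    #include <linux/srcu.h>
    #include <linux/slab.h>

    struct foo {
            int data;
            struct rcu_head rcu_head;   /* mirrors the new member in blk_mq_tags */
    };

    DEFINE_STATIC_SRCU(foo_srcu);       /* analogue of set->tags_srcu */

    static void foo_free_callback(struct rcu_head *head)
    {
            /* runs only after every reader that started before call_srcu() */
            kfree(container_of(head, struct foo, rcu_head));
    }

    static void foo_free(struct foo *f)
    {
            /* defer the actual free until an SRCU grace period has elapsed */
            call_srcu(&foo_srcu, &f->rcu_head, foo_free_callback);
    }

    static int foo_read(struct foo *f)
    {
            int idx, val;

            idx = srcu_read_lock(&foo_srcu);        /* iterator side */
            val = READ_ONCE(f->data);
            srcu_read_unlock(&foo_srcu, idx);
            return val;
    }

On teardown the owner must still wait for pending callbacks
(srcu_barrier()) before cleaning up the srcu_struct, which is what the
blk_mq_free_tag_set() hunk below does.
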
This ensures that any concurrent tag iterators have completed before the memory is released. With this way, we can replace the big tags->lock in tags iterator code path with srcu for solving the issue. This is achieved by: - Adding a new `srcu_struct tags_srcu` to `blk_mq_tag_set` to protect tag map iteration. - Adding an `rcu_head` to `struct blk_mq_tags` to be used with `call_srcu`. - Moving the page list freeing logic and the `kfree(tags)` call into a new callback function, `blk_mq_free_tags_callback`. - In `blk_mq_free_tags`, invoking `call_srcu` to schedule the new callback for deferred execution. The read-side protection for the tag iterators will be added in a subsequent patch. Reviewed-by: Hannes Reinecke Reviewed-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 24 +++++++++++++++++++++++- block/blk-mq.c | 26 +++++++++++++------------- include/linux/blk-mq.h | 2 ++ 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f09a4cbe486f..3c2ec6e86d54 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -8,6 +8,9 @@ */ #include #include +#include +#include +#include #include #include "blk.h" @@ -576,11 +579,30 @@ out_free_tags: return NULL; } +static void blk_mq_free_tags_callback(struct rcu_head *head) +{ + struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags, + rcu_head); + struct page *page; + + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_alloc_rqs(). + */ + kmemleak_free(page_address(page)); + __free_pages(page, page->private); + } + kfree(tags); +} + void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); - kfree(tags); + call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq.c b/block/blk-mq.c index 5efa0712aac7..e1b44173029c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3454,7 +3454,6 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; - struct page *page; if (list_empty(&tags->page_list)) return; @@ -3478,17 +3477,10 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } blk_mq_clear_rq_mapping(drv_tags, tags); - - while (!list_empty(&tags->page_list)) { - page = list_first_entry(&tags->page_list, struct page, lru); - list_del_init(&page->lru); - /* - * Remove kmemleak object previously allocated in - * blk_mq_alloc_rqs(). - */ - kmemleak_free(page_address(page)); - __free_pages(page, page->private); - } + /* + * Free request pages in SRCU callback, which is called from + * blk_mq_free_tags(). 
+ */ } void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) @@ -4834,6 +4826,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_srcu; } + ret = init_srcu_struct(&set->tags_srcu); + if (ret) + goto out_cleanup_srcu; init_rwsem(&set->update_nr_hwq_lock); @@ -4842,7 +4837,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out_cleanup_srcu; + goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, @@ -4871,6 +4866,8 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_tags_srcu: + cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); @@ -4916,6 +4913,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + + srcu_barrier(&set->tags_srcu); + cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2a5a828f19a0..1325ceeb743a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -531,6 +531,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; + struct srcu_struct tags_srcu; struct rw_semaphore update_nr_hwq_lock; }; @@ -767,6 +768,7 @@ struct blk_mq_tags { * request pool */ spinlock_t lock; + struct rcu_head rcu_head; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, From 135b8521f21d4d4d4fde74e73b80d8e4d417e20a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Aug 2025 10:18:22 +0800 Subject: [PATCH 067/164] blk-mq: Defer freeing flush queue to SRCU callback The freeing of the flush queue/request in blk_mq_exit_hctx() can race with tag iterators that may still be accessing it. To prevent a potential use-after-free, the deallocation should be deferred until after a grace period. With this way, we can replace the big tags->lock in tags iterator code path with srcu for solving the issue. This patch introduces an SRCU-based deferred freeing mechanism for the flush queue. The changes include: - Adding a `rcu_head` to `struct blk_flush_queue`. - Creating a new callback function, `blk_free_flush_queue_callback`, to handle the actual freeing. - Replacing the direct call to `blk_free_flush_queue()` in `blk_mq_exit_hctx()` with `call_srcu()`, using the `tags_srcu` instance to ensure synchronization with tag iterators. Reviewed-by: Hannes Reinecke Reviewed-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++++- block/blk.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index e1b44173029c..1c3cdf17af79 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3912,6 +3912,14 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, spin_unlock_irqrestore(&tags->lock, flags); } +static void blk_free_flush_queue_callback(struct rcu_head *head) +{ + struct blk_flush_queue *fq = + container_of(head, struct blk_flush_queue, rcu_head); + + blk_free_flush_queue(fq); +} + /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, @@ -3931,7 +3939,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_free_flush_queue(hctx->fq); + call_srcu(&set->tags_srcu, &hctx->fq->rcu_head, + blk_free_flush_queue_callback); hctx->fq = NULL; xa_erase(&q->hctx_table, hctx_idx); diff --git a/block/blk.h b/block/blk.h index 46f566f9b126..7d420c247d81 100644 --- a/block/blk.h +++ b/block/blk.h @@ -41,6 +41,7 @@ struct blk_flush_queue { struct list_head flush_queue[2]; unsigned long flush_data_in_flight; struct request *flush_rq; + struct rcu_head rcu_head; }; bool is_flush_rq(struct request *req); From 995412e23bb2560a73db444e4c60bf5826b33aaa Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Aug 2025 10:18:23 +0800 Subject: [PATCH 068/164] blk-mq: Replace tags->lock with SRCU for tag iterators Replace the spinlock in blk_mq_find_and_get_req() with an SRCU read lock around the tag iterators. This is done by: - Holding the SRCU read lock in blk_mq_queue_tag_busy_iter(), blk_mq_tagset_busy_iter(), and blk_mq_hctx_has_requests(). - Removing the now-redundant tags->lock from blk_mq_find_and_get_req(). This change fixes lockup issue in scsi_host_busy() in case of shost->host_blocked. Also avoids big tags->lock when reading disk sysfs attribute `inflight`. Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Yu Kuai Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 12 ++++++++---- block/blk-mq.c | 24 ++++-------------------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 3c2ec6e86d54..086c67849e06 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -256,13 +256,10 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; - unsigned long flags; - spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; - spin_unlock_irqrestore(&tags->lock, flags); return rq; } @@ -440,7 +437,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; - int i, nr_tags; + int i, nr_tags, srcu_idx; + + srcu_idx = srcu_read_lock(&tagset->tags_srcu); nr_tags = blk_mq_is_shared_tags(flags) ? 
1 : tagset->nr_hw_queues; @@ -449,6 +448,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } + srcu_read_unlock(&tagset->tags_srcu, srcu_idx); } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); @@ -499,6 +499,8 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { + int srcu_idx; + /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid @@ -507,6 +509,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return; + srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu); if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; @@ -536,6 +539,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, bt_for_each(hctx, q, btags, fn, priv, false); } } + srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx); blk_queue_exit(q); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 1c3cdf17af79..9abcd4c5d6a2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3415,7 +3415,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; - unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized @@ -3439,15 +3438,6 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, } } } - - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. - */ - spin_lock_irqsave(&drv_tags->lock, flags); - spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, @@ -3670,8 +3660,12 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) struct rq_iter_data data = { .hctx = hctx, }; + int srcu_idx; + srcu_idx = srcu_read_lock(&hctx->queue->tag_set->tags_srcu); blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + srcu_read_unlock(&hctx->queue->tag_set->tags_srcu, srcu_idx); + return data.has_rq; } @@ -3891,7 +3885,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; - unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) @@ -3901,15 +3894,6 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); - - /* - * Wait until all pending iteration is done. - * - * Request reference is cleared and it is guaranteed to be observed - * after the ->lock is released. - */ - spin_lock_irqsave(&tags->lock, flags); - spin_unlock_irqrestore(&tags->lock, flags); } static void blk_free_flush_queue_callback(struct rcu_head *head) From 7942b226e6b84df13b46b76c01d3b6e07a1b349e Mon Sep 17 00:00:00 2001 From: Genjian Zhang Date: Fri, 15 Aug 2025 17:07:32 +0800 Subject: [PATCH 069/164] null_blk: Fix the description of the cache_size module argument When executing modinfo null_blk, there is an error in the description of module parameter mbps, and the output information of cache_size is incomplete.The output of modinfo before and after applying this patch is as follows: Before: [...] parm: cache_size:ulong [...] 
parm: mbps:Cache size in MiB for memory-backed device. Default: 0 (none) (uint) [...] After: [...] parm: cache_size:Cache size in MiB for memory-backed device. Default: 0 (none) (ulong) [...] parm: mbps:Limit maximum bandwidth (in MiB/s). Default: 0 (no limit) (uint) [...] Fixes: 058efe000b31 ("null_blk: add module parameters for 4 options") Signed-off-by: Genjian Zhang Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 91642c9a3b29..f982027e8c85 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -223,7 +223,7 @@ MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed nu static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); -MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); +MODULE_PARM_DESC(cache_size, "Cache size in MiB for memory-backed device. Default: 0 (none)"); static bool g_fua = true; module_param_named(fua, g_fua, bool, 0444); From bd9fd5be6bc0836820500f68fff144609fbd85a9 Mon Sep 17 00:00:00 2001 From: Han Guangjiang Date: Fri, 5 Sep 2025 18:24:11 +0800 Subject: [PATCH 070/164] blk-throttle: fix access race during throttle policy activation On repeated cold boots we occasionally hit a NULL pointer crash in blk_should_throtl() when throttling is consulted before the throttle policy is fully enabled for the queue. Checking only q->td != NULL is insufficient during early initialization, so blkg_to_pd() for the throttle policy can still return NULL and blkg_to_tg() becomes NULL, which later gets dereferenced. Unable to handle kernel NULL pointer dereference at virtual address 0000000000000156 ... pc : submit_bio_noacct+0x14c/0x4c8 lr : submit_bio_noacct+0x48/0x4c8 sp : ffff800087f0b690 x29: ffff800087f0b690 x28: 0000000000005f90 x27: ffff00068af393c0 x26: 0000000000080000 x25: 000000000002fbc0 x24: ffff000684ddcc70 x23: 0000000000000000 x22: 0000000000000000 x21: 0000000000000000 x20: 0000000000080000 x19: ffff000684ddcd08 x18: ffffffffffffffff x17: 0000000000000000 x16: ffff80008132a550 x15: 0000ffff98020fff x14: 0000000000000000 x13: 1fffe000d11d7021 x12: ffff000688eb810c x11: ffff00077ec4bb80 x10: ffff000688dcb720 x9 : ffff80008068ef60 x8 : 00000a6fb8a86e85 x7 : 000000000000111e x6 : 0000000000000002 x5 : 0000000000000246 x4 : 0000000000015cff x3 : 0000000000394500 x2 : ffff000682e35e40 x1 : 0000000000364940 x0 : 000000000000001a Call trace: submit_bio_noacct+0x14c/0x4c8 verity_map+0x178/0x2c8 __map_bio+0x228/0x250 dm_submit_bio+0x1c4/0x678 __submit_bio+0x170/0x230 submit_bio_noacct_nocheck+0x16c/0x388 submit_bio_noacct+0x16c/0x4c8 submit_bio+0xb4/0x210 f2fs_submit_read_bio+0x4c/0xf0 f2fs_mpage_readpages+0x3b0/0x5f0 f2fs_readahead+0x90/0xe8 Tighten blk_throtl_activated() to also require that the throttle policy bit is set on the queue: return q->td != NULL && test_bit(blkcg_policy_throtl.plid, q->blkcg_pols); This prevents blk_should_throtl() from accessing throttle group state until policy data has been attached to blkgs. 
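Illustrative only (the real change is in the blk-throttle.h hunk below):
the tightened helper makes the bio fast path bail out until the policy
bit is set, closing the window where q->td is already assigned but
per-blkg policy data has not been attached yet.

    static inline bool blk_throtl_activated(struct request_queue *q)
    {
            /*
             * blk_throtl_init() assigns q->td before blkcg_activate_policy()
             * attaches policy data to the blkgs; requiring the policy bit as
             * well keeps blk_should_throtl() from dereferencing a NULL
             * throttle group in that window.
             */
            return q->td != NULL &&
                   test_bit(blkcg_policy_throtl.plid, q->blkcg_pols);
    }
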
Fixes: a3166c51702b ("blk-throttle: delay initialization until configuration") Co-developed-by: Liang Jie Signed-off-by: Liang Jie Signed-off-by: Han Guangjiang Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 ------ block/blk-cgroup.h | 6 ++++++ block/blk-throttle.c | 6 +----- block/blk-throttle.h | 18 +++++++++++------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fe9ebd6a2e14..7246fc256315 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -110,12 +110,6 @@ static struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } -static bool blkcg_policy_enabled(struct request_queue *q, - const struct blkcg_policy *pol) -{ - return pol && test_bit(pol->plid, q->blkcg_pols); -} - static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 81868ad86330..83367086cb6a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -459,6 +459,12 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } +static inline bool blkcg_policy_enabled(struct request_queue *q, + const struct blkcg_policy *pol) +{ + return pol && test_bit(pol->plid, q->blkcg_pols); +} + void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 397b6a410f9e..cfa1cd60d2c5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1327,17 +1327,13 @@ static int blk_throtl_init(struct gendisk *disk) INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); - /* - * Freeze queue before activating policy, to synchronize with IO path, - * which is protected by 'q_usage_counter'. - */ memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; - /* activate policy */ + /* activate policy, blk_throtl_activated() will return true */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; diff --git a/block/blk-throttle.h b/block/blk-throttle.h index 3b27755bfbff..9d7a42c039a1 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -156,7 +156,13 @@ void blk_throtl_cancel_bios(struct gendisk *disk); static inline bool blk_throtl_activated(struct request_queue *q) { - return q->td != NULL; + /* + * q->td guarantees that the blk-throttle module is already loaded, + * and the plid of blk-throttle is assigned. + * blkcg_policy_enabled() guarantees that the policy is activated + * in the request_queue. + */ + return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl); } static inline bool blk_should_throtl(struct bio *bio) @@ -164,11 +170,6 @@ static inline bool blk_should_throtl(struct bio *bio) struct throtl_grp *tg; int rw = bio_data_dir(bio); - /* - * This is called under bio_queue_enter(), and it's synchronized with - * the activation of blk-throtl, which is protected by - * blk_mq_freeze_queue(). - */ if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) return false; @@ -194,7 +195,10 @@ static inline bool blk_should_throtl(struct bio *bio) static inline bool blk_throtl_bio(struct bio *bio) { - + /* + * block throttling takes effect if the policy is activated + * in the bio's request_queue. 
+ */ if (!blk_should_throtl(bio)) return false; From 9f7c02e031570e8291a63162c6c046dc15ff85b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2025 13:22:43 +0000 Subject: [PATCH 071/164] nbd: restrict sockets to TCP and UNIX Recently, syzbot started to abuse NBD with all kinds of sockets. Commit cf1b2326b734 ("nbd: verify socket is supported during setup") made sure the socket supported a shutdown() method. Explicitly accept TCP and UNIX stream sockets. Fixes: cf1b2326b734 ("nbd: verify socket is supported during setup") Reported-by: syzbot+e1cd6bd8493060bd701d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/CANn89iJ+76eE3A_8S_zTpSyW5hvPRn6V57458hCZGY5hbH_bFA@mail.gmail.com/T/#m081036e8747cd7e2626c1da5d78c8b9d1e55b154 Signed-off-by: Eric Dumazet Cc: Mike Christie Cc: Richard W.M. Jones Cc: Jens Axboe Cc: Yu Kuai Cc: linux-block@vger.kernel.org Cc: nbd@other.debian.org Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6463d0e8d0ce..87b0b78249da 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1217,6 +1217,14 @@ static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd, if (!sock) return NULL; + if (!sk_is_tcp(sock->sk) && + !sk_is_stream_unix(sock->sk)) { + dev_err(disk_to_dev(nbd->disk), "Unsupported socket: should be TCP or UNIX.\n"); + *err = -EINVAL; + sockfd_put(sock); + return NULL; + } + if (sock->ops->shutdown == sock_no_shutdown) { dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n"); *err = -EINVAL; From 70a6f71b1a77decfc5b1db426ccbe914b58adb38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Sep 2025 12:56:38 +0200 Subject: [PATCH 072/164] block: add a bio_init_inline helper Just a simpler wrapper around bio_init for callers that want to initialize a bio with inline bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- block/blk-crypto-fallback.c | 3 +-- block/blk-map.c | 8 ++++---- drivers/md/bcache/debug.c | 3 +-- drivers/md/bcache/io.c | 3 +-- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/bcache/writeback.c | 2 +- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 4 ++-- drivers/target/target_core_pscsi.c | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/super-io.c | 2 +- fs/squashfs/block.c | 2 +- include/linux/bio.h | 5 +++++ 20 files changed, 32 insertions(+), 27 deletions(-) diff --git a/block/bio.c b/block/bio.c index 44c43b970387..bd8bf179981a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -462,7 +462,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->nr--; put_cpu(); - bio_init(bio, bdev, nr_vecs ?
bio->bi_inline_vecs : NULL, nr_vecs, opf); + if (nr_vecs) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -578,7 +581,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { - bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 005c9157ffb3..dbc2d8784dab 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -167,8 +167,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, - bio_src->bi_opf); + bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; diff --git a/block/blk-map.c b/block/blk-map.c index 23e5d5ebe59e..92b7e19b1a1f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -157,7 +157,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); + bio_init_inline(bio, NULL, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1U << map_data->page_order; @@ -264,7 +264,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + bio_init_inline(bio, NULL, nr_vecs, req_op(rq)); } return bio; } @@ -326,7 +326,7 @@ static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + bio_init_inline(bio, NULL, nr_vecs, op); if (is_vmalloc_addr(data)) { bio->bi_private = data; if (!bio_add_vmalloc(bio, data, len)) { @@ -392,7 +392,7 @@ static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); + bio_init_inline(bio, NULL, nr_pages, op); while (len) { struct page *page; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 7510d1c983a5..f327456fc4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, - REQ_OP_READ); + bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 020712c5203f..2386d08bf4e4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, - meta_bucket_pages(&c->cache->sb), 0); + bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git 
a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7ff14bd2feb8..d50eb82ccb4f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); + bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_iter.bi_size = bucket_bytes(ca); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 26a6a535ec32..4fc80c6d5b31 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1492c8552255..6d250e366412 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* * When the cache disk is first registered, ca->sb.njournal_buckets diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 302e75f1fc4b..36dd8f14a6df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff7595caf440..8f3a23f4b168 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1342,7 +1342,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, use_dmio(b, op, sector, n_sectors, offset, ioprio); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); + bio_init_inline(bio, b->c->bdev, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index cf17fd46e255..08925aca838c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -441,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 408c26398321..bc11aaa38615 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -167,7 +167,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c 
index b60c30bfb6c7..c52ccd12d86b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index f991cf759836..db4e09042469 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -861,7 +861,7 @@ new_bio: bio = bio_kmalloc(nr_vecs, GFP_KERNEL); if (!bio) goto fail; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, + bio_init_inline(bio, NULL, nr_vecs, rw ? REQ_OP_WRITE : REQ_OP_READ); bio->bi_end_io = pscsi_bi_endio; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 590cd29f3e86..e701e6d6e321 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2084,7 +2084,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, INIT_WORK(&scrub->work, btree_node_scrub_work); - bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bio_init_inline(&scrub->bio, ca->disk_sb.bdev, vecs, REQ_OP_READ); bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); scrub->bio.bi_iter.bi_sector = pick.ptr.offset; scrub->bio.bi_end_io = btree_node_scrub_endio; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ddfeb0dafc9d..3dbf9faaaa4c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1634,7 +1634,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; - bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); + bio_init_inline(&ja->bio[i]->bio, NULL, nr_bvecs, 0); } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9e028dbcc3d0..ce8888d518c3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1071,7 +1071,7 @@ reread: bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) return bch_err_throw(c, ENOMEM_journal_read_bucket); - bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); + bio_init_inline(bio, ca->disk_sb.bdev, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6c2e1d647403..e3fcffb93d2b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -232,7 +232,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (!bio) return -BCH_ERR_ENOMEM_sb_bio_realloc; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + bio_init_inline(bio, NULL, nr_bvecs, 0); kfree(sb->bio); sb->bio = bio; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b69c294e3ef0..a05e3793f93a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -231,7 +231,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; - bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); + bio_init_inline(bio, sb->s_bdev, 
page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 46ffac5caab7..eb7f4fbd8aa9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,6 +405,11 @@ struct request_queue; void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); +static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, + unsigned short max_vecs, blk_opf_t opf) +{ + bio_init(bio, bdev, bio->bi_inline_vecs, max_vecs, opf); +} extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); From d86eaa0f3c56da286853b698b45c8ce404291082 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Sep 2025 12:56:39 +0200 Subject: [PATCH 073/164] block: remove the bi_inline_vecs variable sized array from struct bio Bios are embedded into other structures, and at least spare is unhappy about embedding structures with variable sized arrays. There's no real need to the array anyway, we can replace it with a helper pointing to the memory just behind the bio, and with the previous cleanups there is very few site doing anything special with it. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- drivers/md/bcache/movinggc.c | 6 +++--- drivers/md/bcache/writeback.c | 6 +++--- drivers/md/dm-vdo/vio.c | 2 +- fs/bcachefs/data_update.h | 1 - fs/bcachefs/journal.c | 4 ++-- include/linux/bio.h | 2 +- include/linux/blk_types.h | 12 +++++------- 8 files changed, 17 insertions(+), 19 deletions(-) diff --git a/block/bio.c b/block/bio.c index bd8bf179981a..971d96afaf8d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -617,7 +617,8 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; - return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); + return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), + gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 4fc80c6d5b31..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 36dd8f14a6df..6ba73dc1a3df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e7f4153e55e3..8fc22fb14196 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t return VDO_SUCCESS; bio->bi_ioprio = 0; - 
bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_io_vec = bio_inline_vecs(bio); bio->bi_max_vecs = vio->block_count + 1; if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", size, vio_size) != VDO_SUCCESS) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 5e14d13568de..1b37780abfda 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -62,7 +62,6 @@ struct promote_op { struct work_struct work; struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3dbf9faaaa4c..474e0e867b68 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1627,8 +1627,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, - nr_bvecs), GFP_KERNEL); + ja->bio[i] = kzalloc(sizeof(*ja->bio[i]) + + sizeof(struct bio_vec) * nr_bvecs, GFP_KERNEL); if (!ja->bio[i]) return bch_err_throw(c, ENOMEM_dev_journal_init); diff --git a/include/linux/bio.h b/include/linux/bio.h index eb7f4fbd8aa9..27cbff5b0356 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,7 +408,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, unsigned short max_vecs, blk_opf_t opf) { - bio_init(bio, bdev, bio->bi_inline_vecs, max_vecs, opf); + bio_init(bio, bdev, bio_inline_vecs(bio), max_vecs, opf); } extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 930daff207df..bbb7893e0542 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -269,18 +269,16 @@ struct bio { struct bio_vec *bi_io_vec; /* the actual vec list */ struct bio_set *bi_pool; - - /* - * We can inline a number of vecs at the end of the bio, to avoid - * double allocations for a small number of bio_vecs. This member - * MUST obviously be kept at the very end of the bio. - */ - struct bio_vec bi_inline_vecs[]; }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) #define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT) +static inline struct bio_vec *bio_inline_vecs(struct bio *bio) +{ + return (struct bio_vec *)(bio + 1); +} + /* * bio flags */ From 199c9a8d26638845f509b76e3c176c27e7baafd7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Sep 2025 20:33:10 +0800 Subject: [PATCH 074/164] blk-mq: Document tags_srcu member in blk_mq_tag_set structure Add missing documentation for the tags_srcu member that was introduced to defer freeing of tags page_list to prevent use-after-free when iterating tags. Fixes htmldocs warning: WARNING: include/linux/blk-mq.h:536 struct member 'tags_srcu' not described in 'blk_mq_tag_set' Reported-by: Stephen Rothwell Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1325ceeb743a..b25d12545f46 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -507,6 +507,8 @@ enum hctx_type { * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). 
+ * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent + * use-after-free when iterating tags. * @update_nr_hwq_lock: * Synchronize updating nr_hw_queues with add/del disk & * switching elevator. From 6214cadd79c610ca903f993c7a6c46a713715e7b Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 8 Sep 2025 22:10:20 +0200 Subject: [PATCH 075/164] block: floppy: Replace kmalloc() + copy_from_user() with memdup_user() Replace kmalloc() followed by copy_from_user() with memdup_user() to improve and simplify raw_cmd_copyin(). No functional changes intended. Signed-off-by: Thorsten Blum Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index ae8d598ee317..5336c3c5ca36 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3090,16 +3090,13 @@ static int raw_cmd_copyin(int cmd, void __user *param, *rcmd = NULL; loop: - ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL); - if (!ptr) - return -ENOMEM; + ptr = memdup_user(param, sizeof(*ptr)); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); *rcmd = ptr; - ret = copy_from_user(ptr, param, sizeof(*ptr)); ptr->next = NULL; ptr->buffer_length = 0; ptr->kernel_data = NULL; - if (ret) - return -EFAULT; param += sizeof(struct floppy_raw_cmd); if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE) return -EINVAL; From 51723bf92679427bba09a76fc13e1b0a93b680bc Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:51:39 +0200 Subject: [PATCH 076/164] drivers/block: replace use of system_wq with system_percpu_wq Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistentcy cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Adding system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: whether the user still use the old wq a warn will be printed along with a wq redirect to the new one. The old system_unbound_wq will be kept for a few release cycles. 
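As a minimal illustration of the intended spelling after this series (a sketch, not taken from any one driver; the work item and handler are placeholders):

static void example_handler(struct work_struct *work)
{
}
static DECLARE_WORK(example_work, example_handler);

static void queue_examples(void)
{
	/* per-CPU execution, now requested explicitly */
	queue_work(system_percpu_wq, &example_work);

	/* unbound execution, the preferred default for random work */
	queue_work(system_dfl_wq, &example_work);
}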
Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- drivers/block/sunvdc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 87b0b78249da..1188f32a5e5e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -311,7 +311,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, if (args) { INIT_WORK(&args->work, nbd_dead_link_work); args->index = nbd->index; - queue_work(system_wq, &args->work); + queue_work(system_percpu_wq, &args->work); } } if (!nsock->dead) { diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 107751721178..3ab46bb0c23d 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -1188,7 +1188,7 @@ static void vdc_ldc_reset(struct vdc_port *port) } if (port->ldc_timeout) - mod_delayed_work(system_wq, &port->ldc_reset_timer_work, + mod_delayed_work(system_percpu_wq, &port->ldc_reset_timer_work, round_jiffies(jiffies + HZ * port->ldc_timeout)); mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); return; From 456cefcb312d90d12dbcf7eaf6b3f7cfae6f622b Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:51:40 +0200 Subject: [PATCH 077/164] drivers/block: replace use of system_unbound_wq with system_dfl_wq Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistentcy cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Adding system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: whether the user still use the old wq a warn will be printed along with a wq redirect to the new one. The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8acad3cc6e6e..78184d3c526a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1085,7 +1085,7 @@ static int read_from_bdev_sync(struct zram *zram, struct page *page, work.entry = entry; INIT_WORK_ONSTACK(&work.work, zram_sync_read); - queue_work(system_unbound_wq, &work.work); + queue_work(system_dfl_wq, &work.work); flush_work(&work.work); destroy_work_on_stack(&work.work); From d7b1cdc9108f46f47a0899597d6fa270f64dd98c Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 10:51:41 +0200 Subject: [PATCH 078/164] drivers/block: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently if a user enqueue a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistentcy cannot be addressed without refactoring the API. 
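In code form, the inconsistency referred to above (illustrative sketch only; the delayed work item and handler are placeholders):

static void example_handler(struct work_struct *work)
{
}
static DECLARE_DELAYED_WORK(example_dwork, example_handler);

static void show_inconsistency(void)
{
	/* schedule_delayed_work() hard-codes the per-CPU system_wq */
	schedule_delayed_work(&example_dwork, HZ);

	/*
	 * queue_delayed_work() only leaves the CPU unspecified
	 * (WORK_CPU_UNBOUND); the workqueue itself must still be named.
	 */
	queue_delayed_work(system_wq, &example_dwork, HZ);
}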
alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request the use of the per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Jens Axboe --- drivers/block/aoe/aoemain.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/block/sunvdc.c | 2 +- drivers/block/virtio_blk.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c index cdf6e4041bb9..3b21750038ee 100644 --- a/drivers/block/aoe/aoemain.c +++ b/drivers/block/aoe/aoemain.c @@ -44,7 +44,7 @@ aoe_init(void) { int ret; - aoe_wq = alloc_workqueue("aoe_wq", 0, 0); + aoe_wq = alloc_workqueue("aoe_wq", WQ_PERCPU, 0); if (!aoe_wq) return -ENOMEM; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index faafd7ff43d6..af0e21149dbc 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7389,7 +7389,7 @@ static int __init rbd_init(void) * The number of active work items is limited by the number of * rbd devices * queue depth, so leave @max_active at default. 
*/ - rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); + rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rbd_wq) { rc = -ENOMEM; goto err_out_slab; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 119b7e27023f..f1409e54010a 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1809,7 +1809,7 @@ static int __init rnbd_client_init(void) unregister_blkdev(rnbd_client_major, "rnbd"); return err; } - rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0); + rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", WQ_PERCPU, 0); if (!rnbd_clt_wq) { pr_err("Failed to load module, alloc_workqueue failed.\n"); rnbd_clt_destroy_sysfs_files(); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 3ab46bb0c23d..db1fe9772a4d 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -1216,7 +1216,7 @@ static int __init vdc_init(void) { int err; - sunvdc_wq = alloc_workqueue("sunvdc", 0, 0); + sunvdc_wq = alloc_workqueue("sunvdc", WQ_PERCPU, 0); if (!sunvdc_wq) return -ENOMEM; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 85efa7e535f8..f061420dfb10 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1682,7 +1682,7 @@ static int __init virtio_blk_init(void) { int error; - virtblk_wq = alloc_workqueue("virtio-blk", 0, 0); + virtblk_wq = alloc_workqueue("virtio-blk", WQ_PERCPU, 0); if (!virtblk_wq) return -ENOMEM; From fec2e705729dc93de5399d8b139e4746805c3d81 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:51 -0700 Subject: [PATCH 079/164] block: check for valid bio while splitting We're already iterating every segment, so check these for a valid IO lengths at the same time. Individual segment lengths will not be checked on passthrough commands. The read/write command segments must be sized to the dma alignment. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- block/blk-merge.c | 21 +++++++++++++++++---- include/linux/bio.h | 4 ++-- include/linux/blkdev.h | 7 +++++++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 92b7e19b1a1f..ca4f5cf00038 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -443,7 +443,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) int ret; /* check that the data layout matches the hardware restrictions */ - ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + ret = bio_split_io_at(bio, lim, &nr_segs, max_bytes, 0); if (ret) { /* if we would have to split the bio, copy instead */ if (ret > 0) diff --git a/block/blk-merge.c b/block/blk-merge.c index 70d704615be5..cffc0fe48d8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -279,25 +279,30 @@ static unsigned int bio_split_alignment(struct bio *bio, } /** - * bio_split_rw_at - check if and where to split a read/write bio + * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors * @max_bytes: [in] maximum number of bytes per bio + * @len_align_mask: [in] length alignment mask for each vector * * Find out if @bio needs to be split to fit the queue limits in @lim and a * maximum size of @max_bytes. 
Returns a negative error number if @bio can't be * split, 0 if the bio doesn't have to be split, or a positive sector offset if * @bio needs to be split. */ -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes) +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { + if (bv.bv_offset & lim->dma_alignment || + bv.bv_len & len_align_mask) + return -EINVAL; + /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -339,8 +344,16 @@ split: * Individual bvecs might not be logical block aligned. Round down the * split size so that each bio is properly block size aligned, even if * we do not use the full hardware limits. + * + * It is possible to submit a bio that can't be split into a valid io: + * there may either be too many discontiguous vectors for the max + * segments limit, or contain virtual boundary gaps without having a + * valid block sized split. A zero byte result means one of those + * conditions occured. */ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); + if (!bytes) + return -EINVAL; /* * Bio splitting may cause subtle trouble such as hang when doing sync @@ -350,7 +363,7 @@ split: bio_clear_polled(bio); return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw_at); +EXPORT_SYMBOL_GPL(bio_split_io_at); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) diff --git a/include/linux/bio.h b/include/linux/bio.h index 27cbff5b0356..13d1df02656a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -322,8 +322,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes); +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7709d55adc23..9efacabaa2f7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1870,6 +1870,13 @@ bdev_atomic_write_unit_max_bytes(struct block_device *bdev) return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); } +static inline int bio_split_rw_at(struct bio *bio, + const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) +{ + return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ From 743bf2e0c49c835cb7c4e4ac7d5a2610587047be Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:52 -0700 Subject: [PATCH 080/164] block: add size alignment to bio_iov_iter_get_pages The block layer tries to align bio vectors to the block device's logical block size. Some cases don't have a block device, or we may need to align to something larger, which we can't derive it from the queue limits. Have the caller specify what they want, or allow any length alignment if nothing was specified. 
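The mask follows the usual power-of-two convention: a caller with a block device passes bdev_logical_block_size(bdev) - 1, while a caller with no constraint passes 0. A rough sketch of the two call styles, assuming the bio_iov_iter_get_pages_aligned() helper added below:

static int example_fill_bio(struct bio *bio, struct iov_iter *iter,
			    struct block_device *bdev)
{
	/* align the gathered length to the device's logical block size */
	if (bdev)
		return bio_iov_iter_get_pages_aligned(bio, iter,
				bdev_logical_block_size(bdev) - 1);

	/* no block device to consult, accept any length */
	return bio_iov_iter_get_pages_aligned(bio, iter, 0);
}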
Since the most common use case relies on the block device's limits, a helper function is provided. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio.c | 19 +++++++++++-------- block/fops.c | 6 +++--- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 9 ++++++++- include/linux/blkdev.h | 7 +++++++ 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/block/bio.c b/block/bio.c index 971d96afaf8d..f91dc9f32bdc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1208,7 +1208,8 @@ static unsigned int get_contig_folio_len(unsigned int *num_pages, * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. */ -static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; @@ -1217,7 +1218,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; - size_t offset, folio_offset, left, len; + size_t offset, folio_offset, left, len, trim; int ret = 0; /* @@ -1246,8 +1247,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - if (bio->bi_bdev) { - size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + trim = size & len_align_mask; + if (trim) { iov_iter_revert(iter, trim); size -= trim; } @@ -1302,9 +1303,10 @@ out: } /** - * bio_iov_iter_get_pages - add user or kernel pages to a bio + * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added + * @len_align_mask: the mask to align each vector size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and @@ -1321,7 +1323,8 @@ out: * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { int ret = 0; @@ -1337,12 +1340,12 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { - ret = __bio_iov_iter_get_pages(bio, iter); + ret = __bio_iov_iter_get_pages(bio, iter, len_align_mask); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); return bio->bi_vcnt ? 
0 : ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned); static void submit_bio_wait_endio(struct bio *bio) { diff --git a/block/fops.c b/block/fops.c index 82451ac8ff25..d136fb5f6b6a 100644 --- a/block/fops.c +++ b/block/fops.c @@ -78,7 +78,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; - ret = bio_iov_iter_get_pages(&bio, iter); + ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; @@ -212,7 +212,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -348,7 +348,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, */ bio_iov_bvec_set(bio, iter); } else { - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b84f6af2eb4c..fea23fa6a402 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -434,7 +434,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - ret = bio_iov_iter_get_pages(bio, dio->submit.iter); + ret = bio_iov_iter_get_bdev_pages(bio, dio->submit.iter, iomap->bdev); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall diff --git a/include/linux/bio.h b/include/linux/bio.h index 13d1df02656a..a64a30131031 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -446,7 +446,14 @@ int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask); + +static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, 0); +} + void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9efacabaa2f7..44e1066f7446 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1877,6 +1877,13 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } +static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, + struct iov_iter *iter, struct block_device *bdev) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, + bdev_logical_block_size(bdev) - 1); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ From 20a0e6276edba4318c13486df02c31e5f3c09431 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:53 -0700 Subject: [PATCH 081/164] block: align the bio after building it Instead of ensuring each vector is block size aligned while constructing the bio, just ensure the entire size is aligned after it's built. 
This makes getting bio pages more flexible to accepting device valid io vectors that would otherwise get rejected by alignment checks. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio.c | 65 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/block/bio.c b/block/bio.c index f91dc9f32bdc..9603ca3ec770 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1208,8 +1208,7 @@ static unsigned int get_contig_folio_len(unsigned int *num_pages, * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. */ -static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, - unsigned len_align_mask) +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; @@ -1218,7 +1217,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; - size_t offset, folio_offset, left, len, trim; + size_t offset, folio_offset, left, len; int ret = 0; /* @@ -1232,13 +1231,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) extraction_flags |= ITER_ALLOW_P2PDMA; - /* - * Each segment in the iov is required to be a block size multiple. - * However, we may not be able to get the entire segment if it spans - * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the - * result to ensure the bio's total size is correct. The remainder of - * the iov data will be picked up in the next bio iteration. - */ size = iov_iter_extract_pages(iter, &pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); @@ -1246,18 +1238,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - - trim = size & len_align_mask; - if (trim) { - iov_iter_revert(iter, trim); - size -= trim; - } - - if (unlikely(!size)) { - ret = -EFAULT; - goto out; - } - for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); @@ -1302,11 +1282,44 @@ out: return ret; } +/* + * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that + * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length + * for the next iteration. 
+ */ +static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) +{ + size_t nbytes = bio->bi_iter.bi_size & len_align_mask; + + if (!nbytes) + return 0; + + iov_iter_revert(iter, nbytes); + bio->bi_iter.bi_size -= nbytes; + do { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (nbytes < bv->bv_len) { + bv->bv_len -= nbytes; + break; + } + + bio_release_page(bio, bv->bv_page); + bio->bi_vcnt--; + nbytes -= bv->bv_len; + } while (nbytes); + + if (!bio->bi_vcnt) + return -EFAULT; + return 0; +} + /** * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added - * @len_align_mask: the mask to align each vector size to, 0 for any length + * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and @@ -1340,10 +1353,12 @@ int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { - ret = __bio_iov_iter_get_pages(bio, iter, len_align_mask); + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - return bio->bi_vcnt ? 0 : ret; + if (bio->bi_vcnt) + return bio_iov_iter_align_down(bio, iter, len_align_mask); + return ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned); From 5ff3f74e145adc79b49668adb8de276446acf6be Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:54 -0700 Subject: [PATCH 082/164] block: simplify direct io validity check The block layer checks all the segments for validity later, so no need for an early check. Just reduce it to a simple position and total length check, and defer the more invasive segment checks to the block layer. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/fops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index d136fb5f6b6a..19814bddd77e 100644 --- a/block/fops.c +++ b/block/fops.c @@ -38,8 +38,8 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, struct iov_iter *iter) { - return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || - !bdev_iter_is_aligned(bdev, iter); + return (iocb->ki_pos | iov_iter_count(iter)) & + (bdev_logical_block_size(bdev) - 1); } #define DIO_INLINE_BIO_VECS 4 From 7eac331869575d81eaa2dd68b19e7468f8fa93cb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:55 -0700 Subject: [PATCH 083/164] iomap: simplify direct io validity check The block layer checks all the segments for validity later, so no need for an early check. Just reduce it to a simple position and total length check, and defer the more invasive segment checks to the block layer. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index fea23fa6a402..c06e41fd4d0a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -337,8 +337,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) u64 copied = 0; size_t orig_count; - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || - !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) + if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1)) return -EINVAL; if (dio->flags & IOMAP_DIO_WRITE) { From 9eab1d4e0d15b633adc170c458c51e8be3b1c553 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:56 -0700 Subject: [PATCH 084/164] block: remove bdev_iter_is_aligned No more callers. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 44e1066f7446..c8cb08b2ed29 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1590,13 +1590,6 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev) return queue_dma_alignment(bdev_get_queue(bdev)); } -static inline bool bdev_iter_is_aligned(struct block_device *bdev, - struct iov_iter *iter) -{ - return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev), - bdev_logical_block_size(bdev) - 1); -} - static inline unsigned int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { From 69d7ed5b9ef661230264bfa0db4c96fa25b8efa4 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:57 -0700 Subject: [PATCH 085/164] blk-integrity: use simpler alignment check We're checking length and addresses against the same alignment value, so use the more simple iterator check. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6b077ca937f6..6d069a49b4aa 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -262,7 +262,6 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; size_t offset, bytes = iter->count; @@ -285,7 +284,8 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) pages = NULL; } - copy = !iov_iter_is_aligned(iter, align, align); + copy = iov_iter_alignment(iter) & + blk_lim_dma_alignment_and_pad(&q->limits); ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); if (unlikely(ret < 0)) goto free_bvec; From b475272f03ca5d0c437c8f899ff229b21010ec83 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:58 -0700 Subject: [PATCH 086/164] iov_iter: remove iov_iter_is_aligned No more callers. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/uio.h | 2 - lib/iov_iter.c | 95 --------------------------------------------- 2 files changed, 97 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 2e86c653186c..5b127043a151 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -286,8 +286,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f9193f952f49..2fe66a6b8789 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -784,101 +784,6 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) } EXPORT_SYMBOL(iov_iter_discard); -static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct iovec *iov = iter_iov(i); - size_t size = i->count; - size_t skip = i->iov_offset; - - do { - size_t len = iov->iov_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(iov->iov_base + skip) & addr_mask) - return false; - - iov++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct bio_vec *bvec = i->bvec; - unsigned skip = i->iov_offset; - size_t size = i->count; - - do { - size_t len = bvec->bv_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) - return false; - - bvec++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -/** - * iov_iter_is_aligned() - Check if the addresses and lengths of each segments - * are aligned to the parameters. - * - * @i: &struct iov_iter to restore - * @addr_mask: bit mask to check against the iov element's addresses - * @len_mask: bit mask to check against the iov element's lengths - * - * Return: false if any addresses or lengths intersect with the provided masks - */ -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - if (likely(iter_is_ubuf(i))) { - if (i->count & len_mask) - return false; - if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) - return false; - return true; - } - - if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return iov_iter_aligned_iovec(i, addr_mask, len_mask); - - if (iov_iter_is_bvec(i)) - return iov_iter_aligned_bvec(i, addr_mask, len_mask); - - /* With both xarray and folioq types, we're dealing with whole folios. 
*/ - if (iov_iter_is_xarray(i)) { - if (i->count & len_mask) - return false; - if ((i->xarray_start + i->iov_offset) & addr_mask) - return false; - } - if (iov_iter_is_folioq(i)) { - if (i->count & len_mask) - return false; - if (i->iov_offset & addr_mask) - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(iov_iter_is_aligned); - static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); From 05ceea5d3ec9a1b1d6858ffd4739fdb0ed1b8eaf Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 3 Sep 2025 12:33:16 -0700 Subject: [PATCH 087/164] blk-integrity: enable p2p source and destination Set the extraction flags to allow p2p pages for the metadata buffer if the block device allows it. Similar to data payloads, ensure the bio does not use merging if we see a p2p page. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/bio-integrity.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 6d069a49b4aa..bed26f1ec869 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -230,7 +230,8 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, } static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, - int nr_vecs, ssize_t bytes, ssize_t offset) + int nr_vecs, ssize_t bytes, ssize_t offset, + bool *is_p2p) { unsigned int nr_bvecs = 0; int i, j; @@ -251,6 +252,9 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, bytes -= next; } + if (is_pci_p2pdma_page(pages[i])) + *is_p2p = true; + bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); offset = 0; nr_bvecs++; @@ -264,10 +268,11 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + iov_iter_extraction_t extraction_flags = 0; size_t offset, bytes = iter->count; + bool copy, is_p2p = false; unsigned int nr_bvecs; int ret, nr_vecs; - bool copy; if (bio_integrity(bio)) return -EINVAL; @@ -286,15 +291,23 @@ int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) copy = iov_iter_alignment(iter) & blk_lim_dma_alignment_and_pad(&q->limits); - ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); + + if (blk_queue_pci_p2pdma(q)) + extraction_flags |= ITER_ALLOW_P2PDMA; + + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, + extraction_flags, &offset); if (unlikely(ret < 0)) goto free_bvec; - nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset, + &is_p2p); if (pages != stack_pages) kvfree(pages); if (nr_bvecs > queue_max_integrity_segments(q)) copy = true; + if (is_p2p) + bio->bi_opf |= REQ_NOMERGE; if (copy) ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes); From d57447ffb5fadffdba920f2fb933296fb6c5ff57 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 3 Sep 2025 12:33:17 -0700 Subject: [PATCH 088/164] blk-mq-dma: bring back p2p request flags We only need to consider data and metadata dma mapping types separately. The request and bio integrity payload have enough flag bits to internally track the mapping type for each. Use these so the caller doesn't need to track them, and provide separete request and integrity helpers to the common code. 
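A condensed sketch of the resulting unmap call sites (modeled on the NVMe changes below; the iod layout and the free_*() fallbacks are placeholders):

static void example_unmap(struct request *req, struct device *dma_dev,
			  struct example_iod *iod)
{
	/* data: REQ_P2PDMA is tracked on the request and consulted here */
	if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len))
		free_data_descriptors(req);

	/* metadata: BIP_P2P_DMA is tracked in the bio integrity payload */
	if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state,
					iod->meta_total_len))
		free_meta_descriptors(req);
}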
This will make it easier to scale new mappings, like the proposed MMIO attribute, without burdening the caller to track such things. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Leon Romanovsky Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++++ drivers/nvme/host/pci.c | 21 ++++----------------- include/linux/bio-integrity.h | 1 + include/linux/blk-integrity.h | 15 +++++++++++++++ include/linux/blk-mq-dma.h | 11 +++++++++-- include/linux/blk_types.h | 2 ++ 6 files changed, 35 insertions(+), 19 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 660b5e200ccf..449950029872 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -174,6 +174,10 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: + if (iter->iter.is_integrity) + bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; + else + req->cmd_flags |= REQ_P2PDMA; return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d8a9dee55de3..28e203b894eb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -260,12 +260,6 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, - /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ - IOD_P2P_BUS_ADDR = 1U << 3, - - /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ - IOD_META_P2P_BUS_ADDR = 1U << 4, - /* Metadata using non-coalesced MPTR */ IOD_SINGLE_META_SEGMENT = 1U << 5, }; @@ -737,9 +731,8 @@ static void nvme_unmap_metadata(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, - iod->meta_total_len, - iod->flags & IOD_META_P2P_BUS_ADDR)) { + if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len)) { if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) nvme_free_sgls(req, sge, &sge[1]); else @@ -766,8 +759,7 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, - iod->flags & IOD_P2P_BUS_ADDR)) { + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req, iod->descriptors[0], &iod->cmd.common.dptr.sgl); @@ -1043,9 +1035,6 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; - if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) - iod->flags |= IOD_P2P_BUS_ADDR; - if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) @@ -1068,9 +1057,7 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) &iod->meta_dma_state, &iter)) return iter.status; - if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) - iod->flags |= IOD_META_P2P_BUS_ADDR; - else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) entries = 1; /* diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0a25716820fe..851254f36eb3 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,6 +13,7 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ + BIP_P2P_DMA = 1 << 8, /* using P2P address */ }; struct 
bio_integrity_payload { diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 78fe2459e661..b659373788f6 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -27,6 +27,15 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); + +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); +} + int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -115,6 +124,12 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return false; +} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 0f45ea110ca1..51829958d872 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -43,7 +43,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_rq_dma_unmap - try to DMA unmap a request + * blk_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state @@ -53,7 +53,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. 
*/ -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, +static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { if (is_p2p) @@ -68,4 +68,11 @@ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, return !dma_need_unmap(dma_dev); } +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + req->cmd_flags & REQ_P2PDMA); +} + #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index bbb7893e0542..4bd098fd61cb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -384,6 +384,7 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ + __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -416,6 +417,7 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) +#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) From d0d1d522316e91f2b935a78bbf962b8e529d8c4f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 3 Sep 2025 13:27:46 -0700 Subject: [PATCH 089/164] blk-map: provide the bdev to bio if one exists We can now safely provide a block device when extracting user pages for driver and user passthrough commands. Set the bdev so the caller doesn't have to do that later. This has an additional benefit of being able to extract P2P pages in the passthrough path. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-map.c | 5 +++-- drivers/nvme/host/ioctl.c | 5 ----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index ca4f5cf00038..165f2234f00f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -253,10 +253,11 @@ static void blk_mq_map_bio_put(struct bio *bio) static struct bio *blk_rq_map_bio_alloc(struct request *rq, unsigned int nr_vecs, gfp_t gfp_mask) { + struct block_device *bdev = rq->q->disk ? 
rq->q->disk->part0 : NULL; struct bio *bio; if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) { - bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, + bio = bio_alloc_bioset(bdev, nr_vecs, rq->cmd_flags, gfp_mask, &fs_bio_set); if (!bio) return NULL; @@ -264,7 +265,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; - bio_init_inline(bio, NULL, nr_vecs, req_op(rq)); + bio_init_inline(bio, bdev, nr_vecs, req_op(rq)); } return bio; } diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 6b3ac8ae3f34..f778f3b5214b 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -142,14 +142,9 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, ret = blk_rq_map_user_io(req, NULL, nvme_to_user_ptr(ubuffer), bufflen, GFP_KERNEL, flags & NVME_IOCTL_VEC, 0, 0, rq_data_dir(req)); - if (ret) return ret; - bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); - if (has_metadata) { ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len); if (ret) From 1733e88874838ddebf7774440c285700865e6b08 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:41 +0800 Subject: [PATCH 090/164] block: cleanup bio_issue Now that bio->bi_issue is only used by blk-iolatency to get bio issue time, replace bio_issue with u64 time directly and remove bio_issue to make code cleaner. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-cgroup.h | 2 +- block/blk-iolatency.c | 14 +++---------- block/blk.h | 42 --------------------------------------- include/linux/blk_types.h | 7 ++----- 5 files changed, 7 insertions(+), 60 deletions(-) diff --git a/block/bio.c b/block/bio.c index 9603ca3ec770..3a1a848940dd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; - bio->bi_issue.value = 0; + bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 83367086cb6a..8328427e3165 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -372,7 +372,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) static inline void blkcg_bio_issue_init(struct bio *bio) { - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio->issue_time_ns = blk_time_get_ns(); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f8fdecdd7a9..554b191a6892 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) mod_timer(&blkiolat->timer, jiffies + HZ); } -static void iolatency_record_time(struct iolatency_grp *iolat, - struct bio_issue *issue, u64 now, - bool issue_as_root) +static void iolatency_record_time(struct iolatency_grp *iolat, u64 start, + u64 now, bool issue_as_root) { - u64 start = bio_issue_time(issue); u64 req_time; - /* - * Have to do this so we are truncated to the correct time that our - * issue is truncated to. - */ - now = __bio_issue_time(now); - if (now <= start) return; @@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) * submitted, so do not account for it. 
*/ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { - iolatency_record_time(iolat, &bio->bi_issue, now, + iolatency_record_time(iolat, bio->issue_time_ns, now, issue_as_root); window_start = atomic64_read(&iolat->window_start); if (now > window_start && diff --git a/block/blk.h b/block/blk.h index 7d420c247d81..41397688b941 100644 --- a/block/blk.h +++ b/block/blk.h @@ -681,48 +681,6 @@ static inline ktime_t blk_time_get(void) return ns_to_ktime(blk_time_get_ns()); } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4bd098fd61cb..8e8d1cc8b06c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -198,10 +198,6 @@ static inline bool blk_path_error(blk_status_t error) return true; } -struct bio_issue { - u64 value; -}; - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; @@ -242,7 +238,8 @@ struct bio { * on release of the bio. */ struct blkcg_gq *bi_blkg; - struct bio_issue bi_issue; + /* Time that this bio was issued. */ + u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif From 1f963bdd6420b6080bcfd0ee84a75c96f35545a6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:42 +0800 Subject: [PATCH 091/164] block: initialize bio issue time in blk_mq_submit_bio() bio->issue_time_ns is only used by blk-iolatency, which can only be enabled for rq-based disk, hence it's not necessary to initialize the time for bio-based disk. Meanwhile, if bio is split by blk_crypto_fallback_split_bio_if_needed(), the issue time is not initialized for new split bio, this can be fixed as well. Noted the next patch will optimize better that bio issue time will only be used when blk-iolatency is really enabled by the disk. 
Fixes: 488f6682c832 ("block: blk-crypto-fallback for Inline Encryption") Signed-off-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 6 ------ block/blk-core.c | 1 - block/blk-merge.c | 1 - block/blk-mq.c | 8 ++++++++ 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8328427e3165..1cce3294634d 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -370,11 +370,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio->issue_time_ns = blk_time_get_ns(); -} - static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) @@ -497,7 +492,6 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } -static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } diff --git a/block/blk-core.c b/block/blk-core.c index 4201504158a1..83c262a3dfd9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -728,7 +728,6 @@ static void __submit_bio_noacct_mq(struct bio *bio) void submit_bio_noacct_nocheck(struct bio *bio) { blk_cgroup_bio_start(bio); - blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index cffc0fe48d8a..354a3070e6a1 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -119,7 +119,6 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors) goto error; } split->bi_opf |= REQ_NOMERGE; - blkcg_bio_issue_init(split); bio_chain(split, bio); trace_block_split(split, bio->bi_iter.bi_sector); WARN_ON_ONCE(bio_zone_write_plugging(bio)); diff --git a/block/blk-mq.c b/block/blk-mq.c index 9abcd4c5d6a2..2c0c8ead7080 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -396,6 +396,13 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) #endif } +static inline void blk_mq_bio_issue_init(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + bio->issue_time_ns = blk_time_get_ns(); +#endif +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { @@ -3168,6 +3175,7 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) goto queue_exit; + blk_mq_bio_issue_init(bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; From ea3d1f104db60f9d5074b33819ccea3c216e0bee Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:43 +0800 Subject: [PATCH 092/164] blk-mq: add QUEUE_FLAG_BIO_ISSUE_TIME bio->issue_time_ns is initialized for every bio, however, it's only used by blk-iolatency. Add a new queue_flag and only set this flag when blk-iolatency is enabled, so that extra blk_time_get_ns() can be saved for disks that blk-iolatency is not enabled. 
Signed-off-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 5 +++++ block/blk-mq-debugfs.c | 1 + block/blk-mq.c | 8 +++++--- include/linux/blkdev.h | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 554b191a6892..45bd18f68541 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -742,10 +742,15 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { + struct request_queue *q = blkiolat->rqos.disk->queue; unsigned int memflags; memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; + if (enabled) + blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q); + else + blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q); blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags); } } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 32c65efdda46..4896525b1c05 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -96,6 +96,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), + QUEUE_FLAG_NAME(BIO_ISSUE_TIME), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c0c8ead7080..31cc743ffad7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -396,10 +396,12 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) #endif } -static inline void blk_mq_bio_issue_init(struct bio *bio) +static inline void blk_mq_bio_issue_init(struct request_queue *q, + struct bio *bio) { #ifdef CONFIG_BLK_CGROUP - bio->issue_time_ns = blk_time_get_ns(); + if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) + bio->issue_time_ns = blk_time_get_ns(); #endif } @@ -3175,7 +3177,7 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) goto queue_exit; - blk_mq_bio_issue_init(bio); + blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8cb08b2ed29..7c542b1851fa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -657,6 +657,7 @@ enum { QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ + QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ QUEUE_FLAG_MAX }; From 22f166218f7313e8fe2d19213b5f4b3265f8c39e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:44 +0800 Subject: [PATCH 093/164] md: fix mssing blktrace bio split events If bio is split by internal handling like chunksize or badblocks, the corresponding trace_block_split() is missing, resulting in blktrace inability to catch BIO split events and making it harder to analyze the BIO sequence. 
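For reference, each converted call site below ends up following roughly this
pattern (a sketch mirroring the hunks, not additional code in the patch):

	split = bio_split(bio, sectors, GFP_NOIO, bs);
	if (!IS_ERR(split)) {
		bio_chain(split, bio);
		/* emit the event so blktrace sees both halves of the split */
		trace_block_split(split, bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
		bio = split;
	}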
Cc: stable@vger.kernel.org Fixes: 4b1faf931650 ("block: Kill bio_pair_split()") Signed-off-by: Yu Kuai Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/md-linear.c | 1 + drivers/md/raid0.c | 4 ++++ drivers/md/raid1.c | 4 ++++ drivers/md/raid10.c | 8 ++++++++ drivers/md/raid5.c | 2 ++ 5 files changed, 19 insertions(+) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 5d9b08115375..59d7963c7843 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -266,6 +266,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) } bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f1d8811a542a..1ba7d0c090f7 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -472,7 +472,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; end = zone->zone_end; @@ -620,7 +622,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); return true; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); raid0_map_submit_bio(mddev, bio); bio = split; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f096f4d0fca7..64a1e0b47e87 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1384,7 +1384,9 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; @@ -1616,7 +1618,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 312e8a0e93e7..3d7b24d6b4eb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); submit_bio_noacct(bio); wait_barrier(conf, false); @@ -1495,7 +1497,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, error = PTR_ERR(split); goto err_handle; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); submit_bio_noacct(bio); wait_barrier(conf, false); @@ -1679,7 +1683,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the fist split part */ submit_bio_noacct(split); @@ -1694,7 +1700,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); return 0; } + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); allow_barrier(conf); /* Resend the second split part */ submit_bio_noacct(bio); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5112658ef5f6..96a6d63b3d62 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5493,8 +5493,10 @@ static struct bio *chunk_aligned_read(struct 
mddev *mddev, struct bio *raid_bio) if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; + split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, raid_bio); + trace_block_split(split, raid_bio->bi_iter.bi_sector); submit_bio_noacct(raid_bio); raid_bio = split; } From 06d712d297649f48ebf1381d19bd24e942813b37 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:45 +0800 Subject: [PATCH 094/164] blk-crypto: fix missing blktrace bio split events trace_block_split() is missing, resulting in blktrace inability to catch BIO split events and making it harder to analyze the BIO sequence. Cc: stable@vger.kernel.org Fixes: 488f6682c832 ("block: blk-crypto-fallback for Inline Encryption") Signed-off-by: Yu Kuai Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index dbc2d8784dab..27fa1ec4b264 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "blk-cgroup.h" #include "blk-crypto-internal.h" @@ -230,7 +231,9 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) bio->bi_status = BLK_STS_RESOURCE; return false; } + bio_chain(split_bio, bio); + trace_block_split(split_bio, bio->bi_iter.bi_sector); submit_bio_noacct(bio); *bio_ptr = split_bio; } From e37b5596a19be9a150cb194ec32e78f295a3574b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:46 +0800 Subject: [PATCH 095/164] block: factor out a helper bio_submit_split_bioset() No functional changes are intended, some drivers like mdraid will split bio by internal processing, prepare to unify bio split codes. Signed-off-by: Yu Kuai Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 59 ++++++++++++++++++++++++++++-------------- include/linux/blkdev.h | 2 ++ 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 354a3070e6a1..f680ccf81864 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -104,33 +104,54 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } +/* + * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector + * @bio: the original bio to be submitted and split + * @split_sectors: the sector count at which to split + * @bs: the bio set used for allocating the new split bio + * + * The original bio is modified to contain the remaining sectors and submitted. + * The caller is responsible for submitting the returned bio. + * + * If succeed, the newly allocated bio representing the initial part will be + * returned, on failure NULL will be returned and original bio will fail. 
+ */ +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs) +{ + struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return NULL; + } + + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + + return split; +} +EXPORT_SYMBOL_GPL(bio_submit_split_bioset); + static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { - if (unlikely(split_sectors < 0)) - goto error; + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } if (split_sectors) { - struct bio *split; - - split = bio_split(bio, split_sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, split_sectors, &bio->bi_bdev->bd_disk->bio_split); - if (IS_ERR(split)) { - split_sectors = PTR_ERR(split); - goto error; - } - split->bi_opf |= REQ_NOMERGE; - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; + if (bio) + bio->bi_opf |= REQ_NOMERGE; } return bio; -error: - bio->bi_status = errno_to_blk_status(split_sectors); - bio_endio(bio); - return NULL; } struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7c542b1851fa..066e5309bd45 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1000,6 +1000,8 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); From 5b38ee5a4a12cfdefd848f7ec09da3e9007ad55f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:47 +0800 Subject: [PATCH 096/164] md/raid0: convert raid0_handle_discard() to use bio_submit_split_bioset() Unify bio split code, and prepare to fix ordering of split IO Noted commit 319ff40a5427 ("md/raid0: Fix performance regression for large sequential writes") already fix ordering of split IO by remapping bio to underlying disks before resubmitting it, with the respect md_submit_bio() already split it by sectors, and raid0_make_request() will split at most once for unaligned IO. This is a bit hacky and we'll convert this to solution in general later. 
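The conversion pattern used here, and repeated by the following patches, is
roughly the sketch below; bio_submit_split_bioset() takes over chaining,
tracing and resubmitting the remainder:

	bio = bio_submit_split_bioset(bio, split_sectors, bs);
	if (!bio)
		return;	/* split failed, the original bio was already ended */

	/* keep working on the front part returned by the helper */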
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/raid0.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 1ba7d0c090f7..ca08ec2e1f27 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -463,23 +463,16 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { - struct bio *split = bio_split(bio, - zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, - &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, + zone->zone_end - bio->bi_iter.bi_sector, + &mddev->bio_set); + if (!bio) return; - } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; end = zone->zone_end; - } else + } else { end = bio_end_sector(bio); + } orig_end = end; if (zone != conf->strip_zone) From a6fcc160d6fd9b4ddd229e351518daee21eecad7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:48 +0800 Subject: [PATCH 097/164] md/raid1: convert to use bio_submit_split_bioset() Unify bio split code, and prepare to fix ordering of split IO. Noted that bio_submit_split_bioset() can fail the original bio directly by split error, set R1BIO_Returned in this case to notify raid_end_bio_io() that the original bio is returned already. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 38 +++++++++++--------------------------- drivers/md/raid1.h | 4 +++- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 64a1e0b47e87..3f4ed2272ac4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1317,7 +1317,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct raid1_info *mirror; struct bio *read_bio; int max_sectors; - int rdisk, error; + int rdisk; bool r1bio_existed = !!r1_bio; /* @@ -1377,18 +1377,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1416,8 +1411,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, err_handle: atomic_dec(&mirror->rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } @@ -1484,7 +1477,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; - int i, disks, k, error; + int i, disks, k; unsigned long flags; int first_clone; int max_sectors; @@ -1588,10 +1581,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. 
*/ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) @@ -1611,18 +1602,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, max_sectors = min_t(int, max_sectors, BIO_MAX_VECS * (PAGE_SIZE >> 9)); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - GFP_NOIO, &conf->bio_split); - - if (IS_ERR(split)) { - error = PTR_ERR(split); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + if (!bio) { + set_bit(R1BIO_Returned, &r1_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; } @@ -1698,8 +1684,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R1BIO_Uptodate, &r1_bio->state); raid_end_bio_io(r1_bio); } diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index d236ef179cfb..2ebe35aaa534 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -178,7 +178,9 @@ enum r1bio_state { * any write was successful. Otherwise we call when * any write-behind write succeeds, otherwise we call * with failure when last write completes (and all failed). - * Record that bi_end_io was called with this flag... + * + * And for bio_split errors, record that bi_end_io was called + * with this flag... */ R1BIO_Returned, /* If a write for this request means we can clear some From deeeab3028afebf2f13428f69dcba9f572f0463b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:49 +0800 Subject: [PATCH 098/164] md/raid10: add a new r10bio flag R10BIO_Returned The new helper bio_submit_split_bioset() can failed the orginal bio on split errors, prepare to handle this case in raid_end_bio_io(). The flag name is refer to the r1bio flag name. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 8 +++++--- drivers/md/raid10.h | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3d7b24d6b4eb..cac5c6df75bb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -322,10 +322,12 @@ static void raid_end_bio_io(struct r10bio *r10_bio) struct bio *bio = r10_bio->master_bio; struct r10conf *conf = r10_bio->mddev->private; - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - bio->bi_status = BLK_STS_IOERR; + if (!test_and_set_bit(R10BIO_Returned, &r10_bio->state)) { + if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } - bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 3f16ad6904a9..da00a55f7a55 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -165,6 +165,8 @@ enum r10bio_state { * so that raid10d knows what to do with them. */ R10BIO_ReadError, +/* For bio_split errors, record that bi_end_io was called. */ + R10BIO_Returned, /* If a write for this request means we can clear some * known-bad-block records, we set this flag. 
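The intended interplay with the split helper is roughly the following sketch
(this is what the next patch does with the new flag):

	bio = bio_submit_split_bioset(bio, max_sectors, &conf->bio_split);
	if (!bio) {
		/* the helper already ended the original bio with an error */
		set_bit(R10BIO_Returned, &r10_bio->state);
		goto err_handle;
	}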
*/ From 6fc07785d9b89255bba45fc84475bb32f9737a90 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:50 +0800 Subject: [PATCH 099/164] md/raid10: convert read/write to use bio_submit_split_bioset() Unify bio split code, prepare to fix ordering of split IO, the error path is modified a bit, however no functional changes are intended: - bio_submit_split_bioset() can fail the original bio directly by split error, set R10BIO_Uptodate in this case to notify raid_end_bio_io() that the original bio is returned already. - set R10BIO_Uptodate and set error value to -EIO is useless now, for r10_bio without R10BIO_Uptodate, -EIO will be returned for original bio. And discard is not handled, because discard is only split for unaligned head and tail, and this can be considered slow path, the reorder here does not matter much. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index cac5c6df75bb..8996fa0dff60 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1156,7 +1156,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - int error; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1205,19 +1204,15 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, rdev->bdev, (unsigned long long)r10_bio->sector); if (max_sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, max_sectors, - gfp, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); + allow_barrier(conf); + bio = bio_submit_split_bioset(bio, max_sectors, + &conf->bio_split); + wait_barrier(conf, false); + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - allow_barrier(conf); - submit_bio_noacct(bio); - wait_barrier(conf, false); - bio = split; r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; } @@ -1245,8 +1240,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, return; err_handle: atomic_dec(&rdev->nr_pending); - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } @@ -1355,7 +1348,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, int i, k; sector_t sectors; int max_sectors; - int error; if ((mddev_is_clustered(mddev) && mddev->cluster_ops->area_resyncing(mddev, WRITE, @@ -1469,10 +1461,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * complexity of supporting that is not worth * the benefit. 
*/ - if (bio->bi_opf & REQ_ATOMIC) { - error = -EIO; + if (bio->bi_opf & REQ_ATOMIC) goto err_handle; - } good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) @@ -1493,19 +1483,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->sectors = max_sectors; if (r10_bio->sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, r10_bio->sectors, - GFP_NOIO, &conf->bio_split); - if (IS_ERR(split)) { - error = PTR_ERR(split); + allow_barrier(conf); + bio = bio_submit_split_bioset(bio, r10_bio->sectors, + &conf->bio_split); + wait_barrier(conf, false); + if (!bio) { + set_bit(R10BIO_Returned, &r10_bio->state); goto err_handle; } - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - allow_barrier(conf); - submit_bio_noacct(bio); - wait_barrier(conf, false); - bio = split; r10_bio->master_bio = bio; } @@ -1537,8 +1523,6 @@ err_handle: } } - bio->bi_status = errno_to_blk_status(error); - set_bit(R10BIO_Uptodate, &r10_bio->state); raid_end_bio_io(r10_bio); } From 9e8a5b37c9ea3ae9db9028ea756ececf221d9a5a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:51 +0800 Subject: [PATCH 100/164] md/raid5: convert to use bio_submit_split_bioset() Unify bio split code, prepare to fix ordering of split IO. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/raid5.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 96a6d63b3d62..04add718be77 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5486,7 +5486,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { - struct bio *split; sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); @@ -5494,11 +5493,10 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; - split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, raid_bio); - trace_block_split(split, raid_bio->bi_iter.bi_sector); - submit_bio_noacct(raid_bio); - raid_bio = split; + raid_bio = bio_submit_split_bioset(raid_bio, sectors, + &conf->bio_split); + if (!raid_bio) + return NULL; } if (!raid5_read_one_chunk(mddev, raid_bio)) From 6529d41d87827f9a27f6c0c6d34c2b77b250b6c6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:52 +0800 Subject: [PATCH 101/164] md/md-linear: convert to use bio_submit_split_bioset() Unify bio split code, prepare to fix reordered split IO. 
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/md-linear.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 59d7963c7843..76f85cc32942 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -256,19 +256,10 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio_end_sector(bio) > end_sector)) { /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + bio = bio_submit_split_bioset(bio, end_sector - bio_sector, + &mddev->bio_set); + if (!bio) return true; - } - - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - bio = split; } md_account_bio(mddev, &bio); From e3290419d9be6cbd7a42c0691504dd66825cabf5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:53 +0800 Subject: [PATCH 102/164] blk-crypto: convert to use bio_submit_split_bioset() Unify bio split code, prepare to fix ordering of split IO. Signed-off-by: Yu Kuai Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 27fa1ec4b264..86b27f96051a 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "blk-cgroup.h" #include "blk-crypto-internal.h" @@ -222,20 +221,14 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) if (++i == BIO_MAX_VECS) break; } + if (num_sectors < bio_sectors(bio)) { - struct bio *split_bio; - - split_bio = bio_split(bio, num_sectors, GFP_NOIO, - &crypto_bio_split); - if (IS_ERR(split_bio)) { - bio->bi_status = BLK_STS_RESOURCE; + bio = bio_submit_split_bioset(bio, num_sectors, + &crypto_bio_split); + if (!bio) return false; - } - bio_chain(split_bio, bio); - trace_block_split(split_bio, bio->bi_iter.bi_sector); - submit_bio_noacct(bio); - *bio_ptr = split_bio; + *bio_ptr = bio; } return true; From 0b64682e78f7a53ea863e368b1aa66f05767858d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:54 +0800 Subject: [PATCH 103/164] block: skip unnecessary checks for split bio Lots of checks are already done while submitting this bio the first time, and there is no need to check them again when this bio is resubmitted after split. Hence open code should_fail_bio() and blk_throtl_bio() that are still necessary from submit_bio_split_bioset(). 
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 6 +++++- block/blk.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 83c262a3dfd9..1021a09c5958 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -539,7 +539,7 @@ static inline void bio_check_ro(struct bio *bio) } } -static noinline int should_fail_bio(struct bio *bio) +int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; diff --git a/block/blk-merge.c b/block/blk-merge.c index f680ccf81864..c16f4bdf251f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -130,7 +130,11 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, bio_chain(split, bio); trace_block_split(split, bio->bi_iter.bi_sector); WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); + + if (should_fail_bio(bio)) + bio_io_error(bio); + else if (!blk_throtl_bio(bio)) + submit_bio_noacct_nocheck(bio); return split; } diff --git a/block/blk.h b/block/blk.h index 41397688b941..694ae6c9bb0f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -616,6 +616,7 @@ extern const struct address_space_operations def_blk_aops; int disk_register_independent_access_ranges(struct gendisk *disk); void disk_unregister_independent_access_ranges(struct gendisk *disk); +int should_fail_bio(struct bio *bio); #ifdef CONFIG_FAIL_MAKE_REQUEST bool should_fail_request(struct block_device *part, unsigned int bytes); #else /* CONFIG_FAIL_MAKE_REQUEST */ From b2f5974079d82a4761f002e80601064d4e39a81f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:55 +0800 Subject: [PATCH 104/164] block: fix ordering of recursive split IO Currently, split bio will be chained to original bio, and original bio will be resubmitted to the tail of current->bio_list, waiting for split bio to be issued. However, if split bio get split again, the IO order will be messed up. This problem, on the one hand, will cause performance degradation, especially for mdraid with large IO size; on the other hand, will cause write errors for zoned block devices[1]. For example, in raid456 IO will first be split by max_sector from md_submit_bio(), and then later be split again by chunksize for internal handling: For example, assume max_sectors is 1M, and chunksize is 512k 1) issue a 2M IO: bio issuing: 0+2M current->bio_list: NULL 2) md_submit_bio() split by max_sector: bio issuing: 0+1M current->bio_list: 1M+1M 3) chunk_aligned_read() split by chunksize: bio issuing: 0+512k current->bio_list: 1M+1M -> 512k+512k 4) after first bio issued, __submit_bio_noacct() will contuine issuing next bio: bio issuing: 1M+1M current->bio_list: 512k+512k bio issued: 0+512k 5) chunk_aligned_read() split by chunksize: bio issuing: 1M+512k current->bio_list: 512k+512k -> 1536k+512k bio issued: 0+512k 6) no split afterwards, finally the issue order is: 0+512k -> 1M+512k -> 512k+512k -> 1536k+512k This behaviour will cause large IO read on raid456 endup to be small discontinuous IO in underlying disks. Fix this problem by placing split bio to the head of current->bio_list. 
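With the split bio placed at the head of current->bio_list, the same example
(illustrative, same 1M max_sectors and 512k chunksize) works out to:

   after chunk_aligned_read() splits the first 1M:
	bio issuing: 0+512k
	current->bio_list: 512k+512k -> 1M+1M

   each later split likewise queues its remainder behind the front part,
   so the final issue order is sequential:
	0+512k -> 512k+512k -> 1M+512k -> 1536k+512k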
Test script: test on 8 disk raid5 with 64k chunksize dd if=/dev/md0 of=/dev/null bs=4480k iflag=direct Test results: Before this patch 1) iostat results: Device r/s rMB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util md0 52430.00 3276.87 0.00 0.00 0.62 64.00 32.60 80.10 sd* 4487.00 409.00 2054.00 31.40 0.82 93.34 3.68 71.20 2) blktrace G stage: 8,0 0 486445 11.357392936 843 G R 14071424 + 128 [dd] 8,0 0 486451 11.357466360 843 G R 14071168 + 128 [dd] 8,0 0 486454 11.357515868 843 G R 14071296 + 128 [dd] 8,0 0 486468 11.357968099 843 G R 14072192 + 128 [dd] 8,0 0 486474 11.358031320 843 G R 14071936 + 128 [dd] 8,0 0 486480 11.358096298 843 G R 14071552 + 128 [dd] 8,0 0 486490 11.358303858 843 G R 14071808 + 128 [dd] 3) io seek for sdx: Noted io seek is the result from blktrace D stage, statistic of: ABS((offset of next IO) - (offset + len of previous IO)) Read|Write seek cnt 55175, zero cnt 25079 >=(KB) .. <(KB) : count ratio |distribution | 0 .. 1 : 25079 45.5% |########################################| 1 .. 2 : 0 0.0% | | 2 .. 4 : 0 0.0% | | 4 .. 8 : 0 0.0% | | 8 .. 16 : 0 0.0% | | 16 .. 32 : 0 0.0% | | 32 .. 64 : 12540 22.7% |##################### | 64 .. 128 : 2508 4.5% |##### | 128 .. 256 : 0 0.0% | | 256 .. 512 : 10032 18.2% |################# | 512 .. 1024 : 5016 9.1% |######### | After this patch: 1) iostat results: Device r/s rMB/s rrqm/s %rrqm r_await rareq-sz aqu-sz %util md0 87965.00 5271.88 0.00 0.00 0.16 61.37 14.03 90.60 sd* 6020.00 658.44 5117.00 45.95 0.44 112.00 2.68 86.50 2) blktrace G stage: 8,0 0 206296 5.354894072 664 G R 7156992 + 128 [dd] 8,0 0 206305 5.355018179 664 G R 7157248 + 128 [dd] 8,0 0 206316 5.355204438 664 G R 7157504 + 128 [dd] 8,0 0 206319 5.355241048 664 G R 7157760 + 128 [dd] 8,0 0 206333 5.355500923 664 G R 7158016 + 128 [dd] 8,0 0 206344 5.355837806 664 G R 7158272 + 128 [dd] 8,0 0 206353 5.355960395 664 G R 7158528 + 128 [dd] 8,0 0 206357 5.356020772 664 G R 7158784 + 128 [dd] 3) io seek for sdx Read|Write seek cnt 28644, zero cnt 21483 >=(KB) .. <(KB) : count ratio |distribution | 0 .. 1 : 21483 75.0% |########################################| 1 .. 2 : 0 0.0% | | 2 .. 4 : 0 0.0% | | 4 .. 8 : 0 0.0% | | 8 .. 16 : 0 0.0% | | 16 .. 32 : 0 0.0% | | 32 .. 64 : 7161 25.0% |############## | BTW, this looks like a long term problem from day one, and large sequential IO read is pretty common case like video playing. And even with this patch, in this test case IO is merged to at most 128k is due to block layer plug limit BLK_PLUG_FLUSH_SIZE, increase such limit can get even better performance. However, we'll figure out how to do this properly later. [1] https://lore.kernel.org/all/e40b076d-583d-406b-b223-005910a9f46f@acm.org/ Fixes: d89d87965dcb ("When stacked block devices are in-use (e.g. 
md or dm), the recursive calls") Reported-by: Tie Ren Closes: https://lore.kernel.org/all/7dro5o7u5t64d6bgiansesjavxcuvkq5p2pok7dtwkav7b7ape@3isfr44b6352/ Signed-off-by: Yu Kuai Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++++++++------ block/blk-merge.c | 2 +- block/blk-throttle.c | 2 +- block/blk.h | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 1021a09c5958..dd39ff651095 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -725,7 +725,7 @@ static void __submit_bio_noacct_mq(struct bio *bio) current->bio_list = NULL; } -void submit_bio_noacct_nocheck(struct bio *bio) +void submit_bio_noacct_nocheck(struct bio *bio, bool split) { blk_cgroup_bio_start(bio); @@ -744,12 +744,16 @@ void submit_bio_noacct_nocheck(struct bio *bio) * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. */ - if (current->bio_list) - bio_list_add(¤t->bio_list[0], bio); - else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) + if (current->bio_list) { + if (split) + bio_list_add_head(¤t->bio_list[0], bio); + else + bio_list_add(¤t->bio_list[0], bio); + } else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { __submit_bio_noacct_mq(bio); - else + } else { __submit_bio_noacct(bio); + } } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, @@ -870,7 +874,7 @@ void submit_bio_noacct(struct bio *bio) if (blk_throtl_bio(bio)) return; - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); return; not_supported: diff --git a/block/blk-merge.c b/block/blk-merge.c index c16f4bdf251f..37864c5d287e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -134,7 +134,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, if (should_fail_bio(bio)) bio_io_error(bio); else if (!blk_throtl_bio(bio)) - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, true); return split; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index cfa1cd60d2c5..f510ae072868 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1224,7 +1224,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct_nocheck(bio); + submit_bio_noacct_nocheck(bio, false); blk_finish_plug(&plug); } } diff --git a/block/blk.h b/block/blk.h index 694ae6c9bb0f..170794632135 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,7 +55,7 @@ bool blk_queue_start_drain(struct request_queue *q); bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int __bio_queue_enter(struct request_queue *q, struct bio *bio); -void submit_bio_noacct_nocheck(struct bio *bio); +void submit_bio_noacct_nocheck(struct bio *bio, bool split); void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) From e0ed2bca7bef9267da0928a8ed6d1de41f19ecf6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:56 +0800 Subject: [PATCH 105/164] md/raid0: convert raid0_make_request() to use bio_submit_split_bioset() Currently, raid0_make_request() will remap the original bio to underlying disks to prevent reordered IO. 
Now that bio_submit_split_bioset() will put original bio to the head of current->bio_list, it's safe converting to use this helper and bio will still be ordered. CC: Jan Kara Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/raid0.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ca08ec2e1f27..adc9e68d064d 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -607,19 +607,10 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) : sector_div(sector, chunk_sects)); if (sectors < bio_sectors(bio)) { - struct bio *split = bio_split(bio, sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, sectors, &mddev->bio_set); - - if (IS_ERR(split)) { - bio->bi_status = errno_to_blk_status(PTR_ERR(split)); - bio_endio(bio); + if (!bio) return true; - } - - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - raid0_map_submit_bio(mddev, bio); - bio = split; } raid0_map_submit_bio(mddev, bio); From 97e8ba31b8f1b743c918bf31586cdc272376226b Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 8 Sep 2025 12:45:41 -0600 Subject: [PATCH 106/164] ublk: consolidate nr_io_ready and nr_queues_ready ublk_mark_io_ready() tracks whether all the ublk_device's I/Os have been fetched by incrementing ublk_queue's nr_io_ready count and incrementing ublk_device's nr_queues_ready count if the whole queue is ready. Simplify the logic by just tracking the total number of fetched I/Os on each ublk_device. When this count reaches nr_hw_queues * queue_depth, the ublk_device is ready to receive I/O. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3cf6d344d1c0..aa64f530d5e9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -201,7 +201,6 @@ struct ublk_queue { bool force_abort; bool canceling; bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ - unsigned short nr_io_ready; /* how many ios setup */ spinlock_t cancel_lock; struct ublk_device *dev; struct ublk_io ios[]; @@ -234,7 +233,7 @@ struct ublk_device { struct ublk_params params; struct completion completion; - unsigned int nr_queues_ready; + u32 nr_io_ready; bool unprivileged_daemons; struct mutex cancel_mutex; bool canceling; @@ -1499,9 +1498,6 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; - /* All old ioucmds have to be completed */ - ubq->nr_io_ready = 0; - for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1550,7 +1546,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; - ub->nr_queues_ready = 0; + ub->nr_io_ready = 0; ub->unprivileged_daemons = false; ub->ublksrv_tgid = -1; } @@ -1848,9 +1844,11 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } -static inline bool ublk_queue_ready(struct ublk_queue *ubq) +static inline bool ublk_dev_ready(const struct ublk_device *ub) { - return ubq->nr_io_ready == ubq->q_depth; + u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth; + + return ub->nr_io_ready == total; } static void ublk_cancel_queue(struct ublk_queue *ubq) @@ -1974,16 +1972,14 @@ static void 
ublk_reset_io_flags(struct ublk_device *ub) } /* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) +static void ublk_mark_io_ready(struct ublk_device *ub) __must_hold(&ub->mutex) { - ubq->nr_io_ready++; - if (ublk_queue_ready(ubq)) - ub->nr_queues_ready++; if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) ub->unprivileged_daemons = true; - if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) { + ub->nr_io_ready++; + if (ublk_dev_ready(ub)) { /* now we are ready for handling ublk io request */ ublk_reset_io_flags(ub); complete_all(&ub->completion); @@ -2189,8 +2185,8 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, * FETCH, so it is fine even for IO_URING_F_NONBLOCK. */ mutex_lock(&ub->mutex); - /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ - if (ublk_queue_ready(ubq)) { + /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ + if (ublk_dev_ready(ub)) { ret = -EBUSY; goto out; } @@ -2209,7 +2205,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, goto out; WRITE_ONCE(io->task, get_task_struct(current)); - ublk_mark_io_ready(ub, ubq); + ublk_mark_io_ready(ub); out: mutex_unlock(&ub->mutex); return ret; From dc1dd13d44fa4e4d466476c0f3517c1230c237e4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:36 +0800 Subject: [PATCH 107/164] blk-mq: remove useless checking in queue_requests_store() blk_mq_queue_attr_visible() already checked queue_is_mq(), no need to check this again in queue_requests_store(). Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c94b8b6ab024..1ffa65feca4f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -69,9 +69,6 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) unsigned int memflags; struct request_queue *q = disk->queue; - if (!queue_is_mq(q)) - return -EINVAL; - ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; From 8bd7195fea6d9662aa3b32498a3828bfd9b63185 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:37 +0800 Subject: [PATCH 108/164] blk-mq: remove useless checkings in blk_mq_update_nr_requests() 1) queue_requests_store() is the only caller of blk_mq_update_nr_requests(), where queue is already freezed, no need to check mq_freeze_depth; 2) q->tag_set must be set for request based device, and queue_is_mq() is already checked in blk_mq_queue_attr_visible(), no need to check q->tag_set. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 31cc743ffad7..55ccc9f4435d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4930,21 +4930,14 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; - int ret; + int ret = 0; unsigned long i; - if (WARN_ON_ONCE(!q->mq_freeze_depth)) - return -EINVAL; - - if (!set) - return -EINVAL; - if (q->nr_requests == nr) return 0; blk_mq_quiesce_queue(q); - ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; From b46d4c447db76e36906ed59ebb9b3ef8f3383322 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:38 +0800 Subject: [PATCH 109/164] blk-mq: check invalid nr_requests in queue_requests_store() queue_requests_store() is the only caller of blk_mq_update_nr_requests(), and blk_mq_update_nr_requests() is the only caller of blk_mq_tag_update_depth(), however, they all have checkings for nr_requests input by user. Make code cleaner by moving all the checkings to the top function: 1) nr_requests > reserved tags; 2) if there is elevator, 4 <= nr_requests <= 2048; 3) if elevator is none, 4 <= nr_requests <= tag_set->queue_depth; Meanwhile, case 2 is the only case tags can grow and -ENOMEM might be returned. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 16 +--------------- block/blk-mq.c | 8 ++------ block/blk-mq.h | 2 +- block/blk-sysfs.c | 13 +++++++++++++ 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 086c67849e06..56f9bc839000 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -610,14 +610,10 @@ void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tagsptr, unsigned int tdepth, - bool can_grow) + struct blk_mq_tags **tagsptr, unsigned int tdepth) { struct blk_mq_tags *tags = *tagsptr; - if (tdepth <= tags->nr_reserved_tags) - return -EINVAL; - /* * If we are allowed to grow beyond the original size, allocate * a new set of tags before freeing the old one. @@ -626,16 +622,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; - if (!can_grow) - return -EINVAL; - - /* - * We need some sort of upper limit, set it high enough that - * no valid use cases should require more. - */ - if (tdepth > MAX_SCHED_RQ) - return -EINVAL; - /* * Only the sbitmap needs resizing since we allocated the max * initially. 
diff --git a/block/blk-mq.c b/block/blk-mq.c index 55ccc9f4435d..9b97f2f3f2c9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4933,9 +4933,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) int ret = 0; unsigned long i; - if (q->nr_requests == nr) - return 0; - blk_mq_quiesce_queue(q); queue_for_each_hw_ctx(q, hctx, i) { @@ -4947,10 +4944,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) */ if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); + nr); } else { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr); } if (ret) goto out; diff --git a/block/blk-mq.h b/block/blk-mq.h index b96a753809ab..5d42c7d3a952 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -171,7 +171,7 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tags, unsigned int depth, bool can_grow); + struct blk_mq_tags **tags, unsigned int depth); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1ffa65feca4f..f99519f7a820 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -75,12 +75,25 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) memflags = blk_mq_freeze_queue(q); mutex_lock(&q->elevator_lock); + + if (nr == q->nr_requests) + goto unlock; + if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; + if (nr <= q->tag_set->reserved_tags || + (q->elevator && nr > MAX_SCHED_RQ) || + (!q->elevator && nr > q->tag_set->queue_depth)) { + ret = -EINVAL; + goto unlock; + } + err = blk_mq_update_nr_requests(disk->queue, nr); if (err) ret = err; + +unlock: mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); return ret; From 626ff4f8ebcb7207f01e7810acb85812ccf06bd8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:39 +0800 Subject: [PATCH 110/164] blk-mq: convert to serialize updating nr_requests with update_nr_hwq_lock request_queue->nr_requests can be changed by: a) switch elevator by updating nr_hw_queues b) switch elevator by elevator sysfs attribute c) configue queue sysfs attribute nr_requests Current lock order is: 1) update_nr_hwq_lock, case a,b 2) freeze_queue 3) elevator_lock, case a,b,c And update nr_requests is seriablized by elevator_lock() already, however, in the case c, we'll have to allocate new sched_tags if nr_requests grow, and do this with elevator_lock held and queue freezed has the risk of deadlock. Hence use update_nr_hwq_lock instead, make it possible to allocate memory if tags grow, meanwhile also prevent nr_requests to be changed concurrently. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f99519f7a820..ff66d1b47169 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -68,13 +68,17 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) int ret, err; unsigned int memflags; struct request_queue *q = disk->queue; + struct blk_mq_tag_set *set = q->tag_set; ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; - memflags = blk_mq_freeze_queue(q); - mutex_lock(&q->elevator_lock); + /* + * Serialize updating nr_requests with concurrent queue_requests_store() + * and switching elevator. + */ + down_write(&set->update_nr_hwq_lock); if (nr == q->nr_requests) goto unlock; @@ -82,20 +86,31 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - if (nr <= q->tag_set->reserved_tags || + /* + * Switching elevator is protected by update_nr_hwq_lock: + * - read lock is held from elevator sysfs attribute; + * - write lock is held from updating nr_hw_queues; + * Hence it's safe to access q->elevator here with write lock held. + */ + if (nr <= set->reserved_tags || (q->elevator && nr > MAX_SCHED_RQ) || - (!q->elevator && nr > q->tag_set->queue_depth)) { + (!q->elevator && nr > set->queue_depth)) { ret = -EINVAL; goto unlock; } + memflags = blk_mq_freeze_queue(q); + mutex_lock(&q->elevator_lock); + err = blk_mq_update_nr_requests(disk->queue, nr); if (err) ret = err; -unlock: mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); + +unlock: + up_write(&set->update_nr_hwq_lock); return ret; } From 7f2799c546dba9e12f9ff4d07936601e416c640d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:40 +0800 Subject: [PATCH 111/164] blk-mq: cleanup shared tags case in blk_mq_update_nr_requests() For shared tags case, all hctx->sched_tags/tags are the same, it doesn't make sense to call into blk_mq_tag_update_depth() multiple times for the same tags. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 7 ------- block/blk-mq.c | 43 ++++++++++++++++++++++--------------------- 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 56f9bc839000..936b68273cad 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -622,13 +622,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; - /* - * Only the sbitmap needs resizing since we allocated the max - * initially. - */ - if (blk_mq_is_shared_tags(set->flags)) - return 0; - new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; diff --git a/block/blk-mq.c b/block/blk-mq.c index 9b97f2f3f2c9..80c20700bce8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4935,34 +4935,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) blk_mq_quiesce_queue(q); - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->tags) - continue; - /* - * If we're using an MQ scheduler, just update the scheduler - * queue depth. This is similar to what the old code would do. 
- */ - if (hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr); - } else { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr); - } - if (ret) - goto out; - } - - q->nr_requests = nr; - if (q->elevator && q->elevator->type->ops.depth_updated) - q->elevator->type->ops.depth_updated(q); - if (blk_mq_is_shared_tags(set->flags)) { if (q->elevator) blk_mq_tag_update_sched_shared_tags(q); else blk_mq_tag_resize_shared_tags(set, nr); + } else { + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; + /* + * If we're using an MQ scheduler, just update the + * scheduler queue depth. This is similar to what the + * old code would do. + */ + if (hctx->sched_tags) + ret = blk_mq_tag_update_depth(hctx, + &hctx->sched_tags, nr); + else + ret = blk_mq_tag_update_depth(hctx, + &hctx->tags, nr); + if (ret) + goto out; + } } + q->nr_requests = nr; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); + out: blk_mq_unquiesce_queue(q); From e63200404477456ec60c62dd8b3b1092aba2e211 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:41 +0800 Subject: [PATCH 112/164] blk-mq: split bitmap grow and resize case in blk_mq_update_nr_requests() No functional changes are intended, make code cleaner and prepare to fix the grow case in following patches. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 80c20700bce8..299aac458185 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4936,25 +4936,40 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) blk_mq_quiesce_queue(q); if (blk_mq_is_shared_tags(set->flags)) { + /* + * Shared tags, for sched tags, we allocate max initially hence + * tags can't grow, see blk_mq_alloc_sched_tags(). + */ if (q->elevator) blk_mq_tag_update_sched_shared_tags(q); else blk_mq_tag_resize_shared_tags(set, nr); - } else { + } else if (!q->elevator) { + /* + * Non-shared hardware tags, nr is already checked from + * queue_requests_store() and tags can't grow. + */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; - /* - * If we're using an MQ scheduler, just update the - * scheduler queue depth. This is similar to what the - * old code would do. - */ - if (hctx->sched_tags) - ret = blk_mq_tag_update_depth(hctx, - &hctx->sched_tags, nr); - else - ret = blk_mq_tag_update_depth(hctx, - &hctx->tags, nr); + sbitmap_queue_resize(&hctx->tags->bitmap_tags, + nr - hctx->tags->nr_reserved_tags); + } + } else if (nr <= q->elevator->et->nr_requests) { + /* Non-shared sched tags, and tags don't grow. */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags, + nr - hctx->sched_tags->nr_reserved_tags); + } + } else { + /* Non-shared sched tags, and tags grow */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->sched_tags) + continue; + ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, + nr); if (ret) goto out; } From 6293e336f6d7d3f3415346ce34993b3398846166 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:42 +0800 Subject: [PATCH 113/164] blk-mq-sched: add new parameter nr_requests in blk_mq_alloc_sched_tags() This helper only support to allocate the default number of requests, add a new parameter to support specific number of requests. 
Prepare to fix potential deadlock in the case nr_requests grow. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 14 +++++--------- block/blk-mq-sched.h | 2 +- block/blk-mq.h | 11 +++++++++++ block/elevator.c | 3 ++- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e2ce4a28e6c9..d06bb137a743 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -454,7 +454,7 @@ void blk_mq_free_sched_tags_batch(struct xarray *et_table, } struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, - unsigned int nr_hw_queues) + unsigned int nr_hw_queues, unsigned int nr_requests) { unsigned int nr_tags; int i; @@ -470,13 +470,8 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, nr_tags * sizeof(struct blk_mq_tags *), gfp); if (!et) return NULL; - /* - * Default to double of smaller one between hw queue_depth and - * 128, since we don't split into sync/async like the old code - * did. Additionally, this is a per-hw queue depth. - */ - et->nr_requests = 2 * min_t(unsigned int, set->queue_depth, - BLKDEV_DEFAULT_RQ); + + et->nr_requests = nr_requests; et->nr_hw_queues = nr_hw_queues; if (blk_mq_is_shared_tags(set->flags)) { @@ -521,7 +516,8 @@ int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, * concurrently. */ if (q->elevator) { - et = blk_mq_alloc_sched_tags(set, nr_hw_queues); + et = blk_mq_alloc_sched_tags(set, nr_hw_queues, + blk_mq_default_nr_requests(set)); if (!et) goto out_unwind; if (xa_insert(et_table, q->id, et, gfp)) diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index fe83187f41db..8e21a6b1415d 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -24,7 +24,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set, - unsigned int nr_hw_queues); + unsigned int nr_hw_queues, unsigned int nr_requests); int blk_mq_alloc_sched_tags_batch(struct xarray *et_table, struct blk_mq_tag_set *set, unsigned int nr_hw_queues); void blk_mq_free_sched_tags(struct elevator_tags *et, diff --git a/block/blk-mq.h b/block/blk-mq.h index 5d42c7d3a952..3a1d4c37d1bc 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -109,6 +109,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(blk_opf_t opf, return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } +/* + * Default to double of smaller one between hw queue_depth and + * 128, since we don't split into sync/async like the old code + * did. Additionally, this is a per-hw queue depth. 
+ */ +static inline unsigned int blk_mq_default_nr_requests( + struct blk_mq_tag_set *set) +{ + return 2 * min_t(unsigned int, set->queue_depth, BLKDEV_DEFAULT_RQ); +} + /* * sysfs helpers */ diff --git a/block/elevator.c b/block/elevator.c index fe96c6f4753c..e2ebfbf107b3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -669,7 +669,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx) lockdep_assert_held(&set->update_nr_hwq_lock); if (strncmp(ctx->name, "none", 4)) { - ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues); + ctx->et = blk_mq_alloc_sched_tags(set, set->nr_hw_queues, + blk_mq_default_nr_requests(set)); if (!ctx->et) return -ENOMEM; } From b86433721f46d934940528f28d49c1dedb690df1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:43 +0800 Subject: [PATCH 114/164] blk-mq: fix potential deadlock while nr_requests grown Allocate and free sched_tags while queue is freezed can deadlock[1], this is a long term problem, hence allocate memory before freezing queue and free memory after queue is unfreezed. [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/ Fixes: e3a2b3f931f5 ("blk-mq: allow changing of queue depth through sysfs") Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 +++++++++------------- block/blk-mq.h | 5 ++++- block/blk-sysfs.c | 29 +++++++++++++++++++++-------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 299aac458185..4ccf11cadf8c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4926,11 +4926,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_free_tag_set); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, + struct elevator_tags *et, + unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; + struct elevator_tags *old_et = NULL; struct blk_mq_hw_ctx *hctx; - int ret = 0; unsigned long i; blk_mq_quiesce_queue(q); @@ -4965,24 +4967,18 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } } else { /* Non-shared sched tags, and tags grow */ - queue_for_each_hw_ctx(q, hctx, i) { - if (!hctx->sched_tags) - continue; - ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr); - if (ret) - goto out; - } + queue_for_each_hw_ctx(q, hctx, i) + hctx->sched_tags = et->tags[i]; + old_et = q->elevator->et; + q->elevator->et = et; } q->nr_requests = nr; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(q); -out: blk_mq_unquiesce_queue(q); - - return ret; + return old_et; } /* diff --git a/block/blk-mq.h b/block/blk-mq.h index 3a1d4c37d1bc..0ef96f510477 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -6,6 +6,7 @@ #include "blk-stat.h" struct blk_mq_tag_set; +struct elevator_tags; struct blk_mq_ctxs { struct kobject kobj; @@ -45,7 +46,9 @@ void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); -int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); +struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, + struct elevator_tags *tags, + unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); diff --git 
a/block/blk-sysfs.c b/block/blk-sysfs.c index ff66d1b47169..76c47fe9b8d6 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -64,11 +64,12 @@ static ssize_t queue_requests_show(struct gendisk *disk, char *page) static ssize_t queue_requests_store(struct gendisk *disk, const char *page, size_t count) { - unsigned long nr; - int ret, err; - unsigned int memflags; struct request_queue *q = disk->queue; struct blk_mq_tag_set *set = q->tag_set; + struct elevator_tags *et = NULL; + unsigned int memflags; + unsigned long nr; + int ret; ret = queue_var_store(&nr, page, count); if (ret < 0) @@ -99,16 +100,28 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count) goto unlock; } + if (!blk_mq_is_shared_tags(set->flags) && q->elevator && + nr > q->elevator->et->nr_requests) { + /* + * Tags will grow, allocate memory before freezing queue to + * prevent deadlock. + */ + et = blk_mq_alloc_sched_tags(set, q->nr_hw_queues, nr); + if (!et) { + ret = -ENOMEM; + goto unlock; + } + } + memflags = blk_mq_freeze_queue(q); mutex_lock(&q->elevator_lock); - - err = blk_mq_update_nr_requests(disk->queue, nr); - if (err) - ret = err; - + et = blk_mq_update_nr_requests(q, et, nr); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); + if (et) + blk_mq_free_sched_tags(et, set); + unlock: up_write(&set->update_nr_hwq_lock); return ret; From 9784041145796994f2b21f4c7e628d7c9db762f4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:44 +0800 Subject: [PATCH 115/164] blk-mq: remove blk_mq_tag_update_depth() This helper is not used now. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 32 -------------------------------- block/blk-mq.h | 2 -- 2 files changed, 34 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 936b68273cad..a63d21a4aab4 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -609,38 +609,6 @@ void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } -int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tagsptr, unsigned int tdepth) -{ - struct blk_mq_tags *tags = *tagsptr; - - /* - * If we are allowed to grow beyond the original size, allocate - * a new set of tags before freeing the old one. - */ - if (tdepth > tags->nr_tags) { - struct blk_mq_tag_set *set = hctx->queue->tag_set; - struct blk_mq_tags *new; - - new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); - if (!new) - return -ENOMEM; - - blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); - hctx->queue->elevator->et->tags[hctx->queue_num] = new; - *tagsptr = new; - } else { - /* - * Don't need (or can't) update reserved tags here, they - * remain static and should never need resizing. 
- */ - sbitmap_queue_resize(&tags->bitmap_tags, - tdepth - tags->nr_reserved_tags); - } - - return 0; -} - void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) { struct blk_mq_tags *tags = set->shared_tags; diff --git a/block/blk-mq.h b/block/blk-mq.h index 0ef96f510477..af42dc018808 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -184,8 +184,6 @@ unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); -int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tags, unsigned int depth); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); From a75fe12fa2e2f96b619f25b8cda1fdef6d616ab1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 16:04:45 +0800 Subject: [PATCH 116/164] blk-mq: fix stale nr_requests documentation The nr_requests documentation is still the removed single queue, remove it and update to current blk-mq. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 0ddffc9133d0..0ed10aeff86b 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -603,16 +603,10 @@ Date: July 2003 Contact: linux-block@vger.kernel.org Description: [RW] This controls how many requests may be allocated in the - block layer for read or write requests. Note that the total - allocated number may be twice this amount, since it applies only - to reads or writes (not the accumulated sum). - - To avoid priority inversion through request starvation, a - request queue maintains a separate request pool per each cgroup - when CONFIG_BLK_CGROUP is enabled, and this parameter applies to - each such per-block-cgroup request pool. IOW, if there are N - block cgroups, each request queue may have up to N request - pools, each independently regulated by nr_requests. + block layer. Noted this value only represents the quantity for a + single blk_mq_tags instance. The actual number for the entire + device depends on the hardware queue count, whether elevator is + enabled, and whether tags are shared. 
What: /sys/block//queue/nr_zones From 7935b843ce2184164f41c3b5c64e9f52994306f4 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 10 Sep 2025 13:47:26 -0700 Subject: [PATCH 117/164] md/md-llbitmap: Use DIV_ROUND_UP_SECTOR_T When building for 32-bit platforms, there are several link (if builtin) or modpost (if a module) errors due to dividends of type 'sector_t' in DIV_ROUND_UP: arm-linux-gnueabi-ld: drivers/md/md-llbitmap.o: in function `llbitmap_resize': drivers/md/md-llbitmap.c:1017:(.text+0xae8): undefined reference to `__aeabi_uldivmod' arm-linux-gnueabi-ld: drivers/md/md-llbitmap.c:1020:(.text+0xb10): undefined reference to `__aeabi_uldivmod' arm-linux-gnueabi-ld: drivers/md/md-llbitmap.o: in function `llbitmap_end_discard': drivers/md/md-llbitmap.c:1114:(.text+0xf14): undefined reference to `__aeabi_uldivmod' arm-linux-gnueabi-ld: drivers/md/md-llbitmap.o: in function `llbitmap_start_discard': drivers/md/md-llbitmap.c:1097:(.text+0x1808): undefined reference to `__aeabi_uldivmod' arm-linux-gnueabi-ld: drivers/md/md-llbitmap.o: in function `llbitmap_read_sb': drivers/md/md-llbitmap.c:867:(.text+0x2080): undefined reference to `__aeabi_uldivmod' arm-linux-gnueabi-ld: drivers/md/md-llbitmap.o:drivers/md/md-llbitmap.c:895: more undefined references to `__aeabi_uldivmod' follow Use DIV_ROUND_UP_SECTOR_T instead of DIV_ROUND_UP, which exists to handle this exact situation. Fixes: 5ab829f1971d ("md/md-llbitmap: introduce new lockless bitmap") Signed-off-by: Nathan Chancellor Signed-off-by: Jens Axboe --- drivers/md/md-llbitmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index 3337d5c7e7e5..1eb434306162 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -781,7 +781,7 @@ static int llbitmap_init(struct llbitmap *llbitmap) while (chunks > space) { chunksize = chunksize << 1; - chunks = DIV_ROUND_UP(blocks, chunksize); + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); } llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; @@ -864,8 +864,8 @@ static int llbitmap_read_sb(struct llbitmap *llbitmap) goto out_put_page; } - if (chunksize < DIV_ROUND_UP(mddev->resync_max_sectors, - mddev->bitmap_info.space << SECTOR_SHIFT)) { + if (chunksize < DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, + mddev->bitmap_info.space << SECTOR_SHIFT)) { pr_err("md/llbitmap: %s: chunksize too small %lu < %llu / %lu", mdname(mddev), chunksize, mddev->resync_max_sectors, mddev->bitmap_info.space); @@ -892,7 +892,7 @@ static int llbitmap_read_sb(struct llbitmap *llbitmap) llbitmap->barrier_idle = DEFAULT_BARRIER_IDLE; llbitmap->chunksize = chunksize; - llbitmap->chunks = DIV_ROUND_UP(mddev->resync_max_sectors, chunksize); + llbitmap->chunks = DIV_ROUND_UP_SECTOR_T(mddev->resync_max_sectors, chunksize); llbitmap->chunkshift = ffz(~chunksize); ret = llbitmap_cache_pages(llbitmap); @@ -1014,10 +1014,10 @@ static int llbitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize) chunksize = llbitmap->chunksize; /* If there is enough space, leave the chunksize unchanged. 
*/ - chunks = DIV_ROUND_UP(blocks, chunksize); + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); while (chunks > mddev->bitmap_info.space << SECTOR_SHIFT) { chunksize = chunksize << 1; - chunks = DIV_ROUND_UP(blocks, chunksize); + chunks = DIV_ROUND_UP_SECTOR_T(blocks, chunksize); } llbitmap->chunkshift = ffz(~chunksize); @@ -1094,7 +1094,7 @@ static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct llbitmap *llbitmap = mddev->bitmap; - unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize); + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; @@ -1111,7 +1111,7 @@ static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, unsigned long sectors) { struct llbitmap *llbitmap = mddev->bitmap; - unsigned long start = DIV_ROUND_UP(offset, llbitmap->chunksize); + unsigned long start = DIV_ROUND_UP_SECTOR_T(offset, llbitmap->chunksize); unsigned long end = (offset + sectors - 1) >> llbitmap->chunkshift; int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; From 6ff1bd7846680dfdaafc68d7fcd0ab7e3bcbc4a0 Mon Sep 17 00:00:00 2001 From: Martin George Date: Mon, 15 Sep 2025 17:19:21 +0530 Subject: [PATCH 118/164] nvme-auth: update bi_directional flag While setting chap->s2 to zero as part of secure channel concatenation, the host missed out to disable the bi_directional flag to indicate that controller authentication is not requested. Fix the same. Fixes: e88a7595b57f ("nvme-tcp: request secure channel concatenation") Signed-off-by: Martin George Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 201fc8809a62..012fcfc79a73 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -331,9 +331,10 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, } else { memset(chap->c2, 0, chap->hash_len); } - if (ctrl->opts->concat) + if (ctrl->opts->concat) { chap->s2 = 0; - else + chap->bi_directional = false; + } else chap->s2 = nvme_auth_get_seqnum(); data->seqnum = cpu_to_le32(chap->s2); if (chap->host_key_len) { From db5a5406fb7e5337a074385c7a3e53c77f2c1bd3 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 2 Sep 2025 12:22:00 +0200 Subject: [PATCH 119/164] nvmet-fc: move lsop put work to nvmet_fc_ls_req_op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It’s possible for more than one async command to be in flight from __nvmet_fc_send_ls_req. For each command, a tgtport reference is taken. In the current code, only one put work item is queued at a time, which results in a leaked reference. To fix this, move the work item to the nvmet_fc_ls_req_op struct, which already tracks all resources related to the command. 
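The reference imbalance is easiest to see with a toy model (plain userspace C with invented names; the real code uses nvmet_wq, kernel work items and tgtport reference counting). The only kernel behaviour it models is that queue_work() is a no-op when the same work item is already pending:

  /* toy_put_work.c - standalone illustration of the leak, not kernel code */
  #include <stdbool.h>
  #include <stdio.h>

  struct work { bool pending; };

  static int refs;

  static bool toy_queue_work(struct work *w)
  {
      if (w->pending)
          return false;   /* already queued: nothing new happens */
      w->pending = true;
      return true;
  }

  static void run_pending(struct work *w, int nr_works)
  {
      for (int i = 0; i < nr_works; i++) {
          if (w[i].pending) {
              w[i].pending = false;
              refs--;       /* the deferred "put" */
          }
      }
  }

  int main(void)
  {
      struct work shared = { false };
      struct work per_op[3] = { { false }, { false }, { false } };

      refs = 3;                          /* three LS requests in flight */
      for (int i = 0; i < 3; i++)
          toy_queue_work(&shared);       /* 2nd and 3rd calls are dropped */
      run_pending(&shared, 1);
      printf("shared work item: %d reference(s) leaked\n", refs);

      refs = 3;
      for (int i = 0; i < 3; i++)
          toy_queue_work(&per_op[i]);    /* one work item per request op */
      run_pending(per_op, 3);
      printf("per-op work item: %d reference(s) leaked\n", refs);
      return 0;
  }

One work item per nvmet_fc_ls_req_op yields exactly one put per get, which is what embedding the work in the request op achieves.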
Fixes: 710c69dbaccd ("nvmet-fc: avoid deadlock on delete association path") Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a9b18c051f5b..6725c34dd7c9 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -54,6 +54,8 @@ struct nvmet_fc_ls_req_op { /* for an LS RQST XMT */ int ls_error; struct list_head lsreq_list; /* tgtport->ls_req_list */ bool req_queued; + + struct work_struct put_work; }; @@ -111,8 +113,6 @@ struct nvmet_fc_tgtport { struct nvmet_fc_port_entry *pe; struct kref ref; u32 max_sg_cnt; - - struct work_struct put_work; }; struct nvmet_fc_port_entry { @@ -235,12 +235,13 @@ static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue); static int nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue); static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); -static void nvmet_fc_put_tgtport_work(struct work_struct *work) +static void nvmet_fc_put_lsop_work(struct work_struct *work) { - struct nvmet_fc_tgtport *tgtport = - container_of(work, struct nvmet_fc_tgtport, put_work); + struct nvmet_fc_ls_req_op *lsop = + container_of(work, struct nvmet_fc_ls_req_op, put_work); - nvmet_fc_tgtport_put(tgtport); + nvmet_fc_tgtport_put(lsop->tgtport); + kfree(lsop); } static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, @@ -367,7 +368,7 @@ __nvmet_fc_finish_ls_req(struct nvmet_fc_ls_req_op *lsop) DMA_BIDIRECTIONAL); out_putwork: - queue_work(nvmet_wq, &tgtport->put_work); + queue_work(nvmet_wq, &lsop->put_work); } static int @@ -388,6 +389,7 @@ __nvmet_fc_send_ls_req(struct nvmet_fc_tgtport *tgtport, lsreq->done = done; lsop->req_queued = false; INIT_LIST_HEAD(&lsop->lsreq_list); + INIT_WORK(&lsop->put_work, nvmet_fc_put_lsop_work); lsreq->rqstdma = fc_dma_map_single(tgtport->dev, lsreq->rqstaddr, lsreq->rqstlen + lsreq->rsplen, @@ -447,8 +449,6 @@ nvmet_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status) __nvmet_fc_finish_ls_req(lsop); /* fc-nvme target doesn't care about success or failure of cmd */ - - kfree(lsop); } /* @@ -1410,7 +1410,6 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, kref_init(&newrec->ref); ida_init(&newrec->assoc_cnt); newrec->max_sg_cnt = template->max_sgl_segments; - INIT_WORK(&newrec->put_work, nvmet_fc_put_tgtport_work); ret = nvmet_fc_alloc_ls_iodlist(newrec); if (ret) { From f2537be4f8421f6495edfa0bc284d722f253841d Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 2 Sep 2025 12:22:01 +0200 Subject: [PATCH 120/164] nvmet-fc: avoid scheduling association deletion twice When forcefully shutting down a port via the configfs interface, nvmet_port_subsys_drop_link() first calls nvmet_port_del_ctrls() and then nvmet_disable_port(). Both functions will eventually schedule all remaining associations for deletion. The current implementation checks whether an association is about to be removed, but only after the work item has already been scheduled. As a result, it is possible for the first scheduled work item to free all resources, and then for the same work item to be scheduled again for deletion. Because the association list is an RCU list, it is not possible to take a lock and remove the list entry directly, so it cannot be looked up again. 
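As described next, the fix moves the existing 'terminating' test-and-set so that it guards the scheduling step itself. A minimal standalone sketch of that "schedule exactly once" pattern, with C11 atomics standing in for the kernel's atomic_xchg() (names are illustrative):

  /* toy_terminate_once.c - standalone sketch, not the driver code */
  #include <stdatomic.h>
  #include <stdio.h>

  struct assoc { atomic_int terminating; };

  static int deletions_scheduled;

  static void schedule_delete(struct assoc *a)
  {
      /* returns the previous value; only the first caller sees 0 */
      if (atomic_exchange(&a->terminating, 1))
          return;           /* already terminating: do nothing */
      deletions_scheduled++;
  }

  int main(void)
  {
      struct assoc a = { 0 };

      /* both teardown paths can end up here for the same association */
      schedule_delete(&a);
      schedule_delete(&a);
      printf("deletion scheduled %d time(s)\n", deletions_scheduled);
      return 0;
  }

Whoever loses the exchange simply returns, so the association can only be queued for deletion once.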
Instead, a flag (terminating) must be used to determine whether the association is already in the process of being deleted. Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/all/rsdinhafrtlguauhesmrrzkybpnvwantwmyfq2ih5aregghax5@mhr7v3eryci3/ Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 6725c34dd7c9..7d84527d5a43 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1075,6 +1075,14 @@ nvmet_fc_delete_assoc_work(struct work_struct *work) static void nvmet_fc_schedule_delete_assoc(struct nvmet_fc_tgt_assoc *assoc) { + int terminating; + + terminating = atomic_xchg(&assoc->terminating, 1); + + /* if already terminating, do nothing */ + if (terminating) + return; + nvmet_fc_tgtport_get(assoc->tgtport); if (!queue_work(nvmet_wq, &assoc->del_work)) nvmet_fc_tgtport_put(assoc->tgtport); @@ -1202,13 +1210,7 @@ nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc) { struct nvmet_fc_tgtport *tgtport = assoc->tgtport; unsigned long flags; - int i, terminating; - - terminating = atomic_xchg(&assoc->terminating, 1); - - /* if already terminating, do nothing */ - if (terminating) - return; + int i; spin_lock_irqsave(&tgtport->lock, flags); list_del_rcu(&assoc->a_list); From 10c165af35d225eb033f4edc7fcc699a8d2d533d Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 2 Sep 2025 12:22:02 +0200 Subject: [PATCH 121/164] nvmet-fcloop: call done callback even when remote port is gone When the target port is gone, it's not possible to access any of the request resources. The function should just silently drop the response. The comment is misleading in this regard. Though it's still necessary to call the driver via the ->done callback so the driver is able to release all resources. Reported-by: Yi Zhang Closes: https://lore.kernel.org/all/CAHj4cs-OBA0WMt5f7R0dz+rR4HcEz19YLhnyGsj-MRV3jWDsPg@mail.gmail.com/ Fixes: 84eedced1c5b ("nvmet-fcloop: drop response if targetport is gone") Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Signed-off-by: Keith Busch --- drivers/nvme/target/fcloop.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 257b497d515a..5dffcc5becae 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -496,13 +496,15 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport, if (!targetport) { /* * The target port is gone. The target doesn't expect any - * response anymore and the ->done call is not valid - * because the resources have been freed by - * nvmet_fc_free_pending_reqs. + * response anymore and thus lsreq can't be accessed anymore. * * We end up here from delete association exchange: * nvmet_fc_xmt_disconnect_assoc sends an async request. + * + * Return success because this is what LLDDs do; silently + * drop the response. */ + lsrsp->done(lsrsp); kmem_cache_free(lsreq_cache, tls_req); return 0; } From 891cdbb162ccdb079cd5228ae43bdeebce8597ad Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 2 Sep 2025 12:22:03 +0200 Subject: [PATCH 122/164] nvme-fc: use lock accessing port_state and rport state nvme_fc_unregister_remote removes the remote port on a lport object at any point in time when there is no active association. 
This races with with the reconnect logic, because nvme_fc_create_association is not taking a lock to check the port_state and atomically increase the active count on the rport. Reported-by: Shinichiro Kawasaki Closes: https://lore.kernel.org/all/u4ttvhnn7lark5w3sgrbuy2rxupcvosp4qmvj46nwzgeo5ausc@uyrkdls2muwx Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 3e12d4683ac7..03987f497a5b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3032,11 +3032,17 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ++ctrl->ctrl.nr_reconnects; - if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + spin_lock_irqsave(&ctrl->rport->lock, flags); + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&ctrl->rport->lock, flags); return -ENODEV; + } - if (nvme_fc_ctlr_active_on_rport(ctrl)) + if (nvme_fc_ctlr_active_on_rport(ctrl)) { + spin_unlock_irqrestore(&ctrl->rport->lock, flags); return -ENOTUNIQ; + } + spin_unlock_irqrestore(&ctrl->rport->lock, flags); dev_info(ctrl->ctrl.device, "NVME-FC{%d}: create association : host wwpn 0x%016llx " From df4666a4908a6d883f628f93a3e6c80981332035 Mon Sep 17 00:00:00 2001 From: Martin George Date: Tue, 9 Sep 2025 16:05:09 +0530 Subject: [PATCH 123/164] nvme-tcp: send only permitted commands for secure concat In addition to sending permitted commands such as connect/auth over the initial unencrypted admin connection as part of secure channel concatenation, the host also sends commands such as Property Get and Identify on the same. This is a spec violation leading to secure concat failures. Fix this by ensuring these additional commands are avoided on this connection. Fixes: 104d0e2f6222 ("nvme-fabrics: reset admin connection for secure concatenation") Signed-off-by: Martin George Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c0fe8cfb7229..1413788ca7d5 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2250,6 +2250,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_cleanup_tagset; + if (ctrl->opts->concat && !ctrl->tls_pskid) + return 0; + error = nvme_enable_ctrl(ctrl); if (error) goto out_stop_queue; From 74b1db86847cce1c0fb54d362f8f5fde3adfd41b Mon Sep 17 00:00:00 2001 From: chengkaitao Date: Mon, 15 Sep 2025 20:33:07 +0800 Subject: [PATCH 124/164] block/mq-deadline: Remove the redundant rb_entry_rq in the deadline_from_pos(). In commit(fde02699c242), the "if (blk_rq_is_seq_zoned_write(rq))" was removed, but the "rb_entry_rq(node)" and some other code were inadvertently left behind. This patch fixed it. 
Signed-off-by: chengkaitao Reviewed-by: Bart Van Assche Reviewed-by: Li Nan Signed-off-by: Jens Axboe --- block/mq-deadline.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 2e689b2c4021..3e741d33142d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -136,10 +136,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; - if (!node) - return NULL; - - rq = rb_entry_rq(node); while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { From 1cab50da62aa810e532396fcaeb96cf9c3fdf87a Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Thu, 21 Aug 2025 13:48:15 -0700 Subject: [PATCH 125/164] nvme-auth: add hkdf_expand_label() Provide an implementation of RFC 8446 (TLS 1.3) HKDF-Expand-Label Signed-off-by: Chris Leech Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 53 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 91e273b89fea..c6eae8e6b6f9 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -683,6 +683,59 @@ out_free_enc: } EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); +/** + * hkdf_expand_label - HKDF-Expand-Label (RFC 8846 section 7.1) + * @hmac_tfm: hash context keyed with pseudorandom key + * @label: ASCII label without "tls13 " prefix + * @labellen: length of @label + * @context: context bytes + * @contextlen: length of @context + * @okm: output keying material + * @okmlen: length of @okm + * + * Build the TLS 1.3 HkdfLabel structure and invoke hkdf_expand(). + * + * Returns 0 on success with output keying material stored in @okm, + * or a negative errno value otherwise. + */ +static int hkdf_expand_label(struct crypto_shash *hmac_tfm, + const u8 *label, unsigned int labellen, + const u8 *context, unsigned int contextlen, + u8 *okm, unsigned int okmlen) +{ + int err; + u8 *info; + unsigned int infolen; + const char *tls13_prefix = "tls13 "; + unsigned int prefixlen = strlen(tls13_prefix); + + if (WARN_ON(labellen > (255 - prefixlen))) + return -EINVAL; + if (WARN_ON(contextlen > 255)) + return -EINVAL; + + infolen = 2 + (1 + prefixlen + labellen) + (1 + contextlen); + info = kzalloc(infolen, GFP_KERNEL); + if (!info) + return -ENOMEM; + + /* HkdfLabel.Length */ + put_unaligned_be16(okmlen, info); + + /* HkdfLabel.Label */ + info[2] = prefixlen + labellen; + memcpy(info + 3, tls13_prefix, prefixlen); + memcpy(info + 3 + prefixlen, label, labellen); + + /* HkdfLabel.Context */ + info[3 + prefixlen + labellen] = contextlen; + memcpy(info + 4 + prefixlen + labellen, context, contextlen); + + err = hkdf_expand(hmac_tfm, info, infolen, okm, okmlen); + kfree_sensitive(info); + return err; +} + /** * nvme_auth_derive_tls_psk - Derive TLS PSK * @hmac_id: Hash function identifier From c5931d590e793c0291c0ba9fd1247567786612ea Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Thu, 21 Aug 2025 13:48:16 -0700 Subject: [PATCH 126/164] nvme-auth: use hkdf_expand_label() When generating keying material during an authentication transaction (secure channel concatenation), the HKDF-Expand-Label function is part of the specified key derivation process. The current open-coded implementation misses the length prefix requirements on the HkdfLabel label and context variable-length vectors (RFC 8446 Section 3.4). Instead, use the hkdf_expand_label() function. 
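For reference, a small standalone sketch of the HkdfLabel byte layout (RFC 8446 section 7.1) that hkdf_expand_label() produces; the label/context strings and output length below are placeholders, but the two one-byte length prefixes are exactly what the open-coded construction omitted:

  /* toy_hkdflabel.c - dump the HkdfLabel encoding, illustration only */
  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>

  static size_t build_hkdf_label(uint8_t *out, uint16_t okmlen,
                                 const char *label, const char *context)
  {
      const char *prefix = "tls13 ";
      size_t pl = strlen(prefix), ll = strlen(label), cl = strlen(context);
      size_t n = 0;

      out[n++] = okmlen >> 8;      /* HkdfLabel.Length, big endian */
      out[n++] = okmlen & 0xff;
      out[n++] = pl + ll;          /* length prefix of Label<7..255> */
      memcpy(out + n, prefix, pl); n += pl;
      memcpy(out + n, label, ll);  n += ll;
      out[n++] = cl;               /* length prefix of Context<0..255> */
      memcpy(out + n, context, cl); n += cl;
      return n;
  }

  int main(void)
  {
      uint8_t buf[512];
      /* context shaped like "<hmac id> <psk digest>", value is a placeholder */
      size_t n = build_hkdf_label(buf, 32, "nvme-tls-psk", "01 ...digest...");

      for (size_t i = 0; i < n; i++)
          printf("%02x%c", buf[i], (i + 1) % 16 ? ' ' : '\n');
      printf("\n");
      return 0;
  }

The previous construction emitted the two-byte output length but left out the one-byte length prefixes on the label and context vectors; hkdf_expand_label() now produces the full structure.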
Signed-off-by: Chris Leech Signed-off-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index c6eae8e6b6f9..1f51fbebd9fa 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -768,10 +768,10 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, { struct crypto_shash *hmac_tfm; const char *hmac_name; - const char *psk_prefix = "tls13 nvme-tls-psk"; + const char *label = "nvme-tls-psk"; static const char default_salt[HKDF_MAX_HASHLEN]; - size_t info_len, prk_len; - char *info; + size_t prk_len; + const char *ctx; unsigned char *prk, *tls_key; int ret; @@ -811,36 +811,29 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, if (ret) goto out_free_prk; - /* - * 2 additional bytes for the length field from HDKF-Expand-Label, - * 2 additional bytes for the HMAC ID, and one byte for the space - * separator. - */ - info_len = strlen(psk_digest) + strlen(psk_prefix) + 5; - info = kzalloc(info_len + 1, GFP_KERNEL); - if (!info) { + ctx = kasprintf(GFP_KERNEL, "%02d %s", hmac_id, psk_digest); + if (!ctx) { ret = -ENOMEM; goto out_free_prk; } - put_unaligned_be16(psk_len, info); - memcpy(info + 2, psk_prefix, strlen(psk_prefix)); - sprintf(info + 2 + strlen(psk_prefix), "%02d %s", hmac_id, psk_digest); - tls_key = kzalloc(psk_len, GFP_KERNEL); if (!tls_key) { ret = -ENOMEM; - goto out_free_info; + goto out_free_ctx; } - ret = hkdf_expand(hmac_tfm, info, info_len, tls_key, psk_len); + ret = hkdf_expand_label(hmac_tfm, + label, strlen(label), + ctx, strlen(ctx), + tls_key, psk_len); if (ret) { kfree(tls_key); - goto out_free_info; + goto out_free_ctx; } *ret_psk = tls_key; -out_free_info: - kfree(info); +out_free_ctx: + kfree(ctx); out_free_prk: kfree(prk); out_free_shash: From eeaed48980a7aeb0d3d8b438185d4b5a66154ff9 Mon Sep 17 00:00:00 2001 From: Georg Gottleuber Date: Tue, 1 Jul 2025 22:55:49 +0200 Subject: [PATCH 127/164] nvme-pci: Add TUXEDO IBS Gen8 to Samsung sleep quirk On the TUXEDO InfinityBook S Gen8, a Samsung 990 Evo NVMe leads to a high power consumption in s2idle sleep (3.5 watts). This patch applies 'Force No Simple Suspend' quirk to achieve a sleep with a lower power consumption, typically around 1 watts. Signed-off-by: Georg Gottleuber Signed-off-by: Werner Sembach Cc: stable@vger.kernel.org Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d8a9dee55de3..4c6ee8f9f681 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3343,10 +3343,12 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev) * Exclude Samsung 990 Evo from NVME_QUIRK_SIMPLE_SUSPEND * because of high power consumption (> 2 Watt) in s2idle * sleep. Only some boards with Intel CPU are affected. 
+ * (Note for testing: Samsung 990 Evo Plus has same PCI ID) */ if (dmi_match(DMI_BOARD_NAME, "DN50Z-140HC-YD") || dmi_match(DMI_BOARD_NAME, "GMxPXxx") || dmi_match(DMI_BOARD_NAME, "GXxMRXx") || + dmi_match(DMI_BOARD_NAME, "NS5X_NS7XAU") || dmi_match(DMI_BOARD_NAME, "PH4PG31") || dmi_match(DMI_BOARD_NAME, "PH4PRX1_PH6PRX1") || dmi_match(DMI_BOARD_NAME, "PH6PG01_PH6PG71")) From bfd4037296bd7e1f95394a2e3daf8e3c1796c3b3 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 15 Sep 2025 10:34:58 +0000 Subject: [PATCH 128/164] block: update validation of atomic writes boundary for stacked devices In commit 63d092d1c1b1 ("block: use chunk_sectors when evaluating stacked atomic write limits"), it was missed to use a chunk sectors limit check in blk_stack_atomic_writes_boundary_head(), so update that function to do the proper check. Fixes: 63d092d1c1b1 ("block: use chunk_sectors when evaluating stacked atomic write limits") Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 693bc8d20acf..6760dbf130b2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -643,18 +643,24 @@ static bool blk_stack_atomic_writes_tail(struct queue_limits *t, static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, struct queue_limits *b) { + unsigned int boundary_sectors; + + if (!b->atomic_write_hw_boundary || !t->chunk_sectors) + return true; + + boundary_sectors = b->atomic_write_hw_boundary >> SECTOR_SHIFT; + /* * Ensure atomic write boundary is aligned with chunk sectors. Stacked - * devices store chunk sectors in t->io_min. + * devices store any stripe size in t->chunk_sectors. */ - if (b->atomic_write_hw_boundary > t->io_min && - b->atomic_write_hw_boundary % t->io_min) + if (boundary_sectors > t->chunk_sectors && + boundary_sectors % t->chunk_sectors) return false; - if (t->io_min > b->atomic_write_hw_boundary && - t->io_min % b->atomic_write_hw_boundary) + if (t->chunk_sectors > boundary_sectors && + t->chunk_sectors % boundary_sectors) return false; - t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; return true; } @@ -695,13 +701,13 @@ static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) static bool blk_stack_atomic_writes_head(struct queue_limits *t, struct queue_limits *b) { - if (b->atomic_write_hw_boundary && - !blk_stack_atomic_writes_boundary_head(t, b)) + if (!blk_stack_atomic_writes_boundary_head(t, b)) return false; t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; t->atomic_write_hw_max = b->atomic_write_hw_max; + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; return true; } From f2d8c5a2f79c28569edf4948b611052253b5e99a Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 15 Sep 2025 10:34:59 +0000 Subject: [PATCH 129/164] block: fix stacking of atomic writes when atomics are not supported Atomic writes support may not always be possible when stacking devices which support atomic writes. Such as case is a different atomic write boundary between stacked devices (which is not supported). In the case that atomic writes cannot supported, the top device queue HW limits are set to 0. However, in blk_stack_atomic_writes_limits(), we detect that we are stacking the first bottom device by checking the top device atomic_write_hw_max value == 0. 
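The ambiguity is visible in a trivial standalone snippet (field names simplified; the real field is atomic_write_hw_max in struct queue_limits):

  /* toy_stack_sentinel.c - illustration only, not the limit-stacking code */
  #include <stdio.h>
  #include <limits.h>

  int main(void)
  {
      unsigned int fresh = 0;        /* old init: nothing stacked yet        */
      unsigned int disabled = 0;     /* atomics turned off after device #1   */

      /* device #2 cannot tell the two states apart with 0 as the sentinel */
      printf("0 sentinel:        fresh %s disabled\n",
             fresh == disabled ? "==" : "!=");

      fresh = UINT_MAX;              /* new initial value for stacked limits */
      printf("UINT_MAX sentinel: fresh %s disabled\n",
             fresh == disabled ? "==" : "!=");
      return 0;
  }
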
This get confused with the case of atomic writes not supported, above. Make the distinction between stacking the first bottom device and no atomics supported by initializing stacked device atomic_write_hw_max = UINT_MAX and checking that for stacking the first bottom device. Fixes: d7f36dc446e8 ("block: Support atomic writes limits for stacked devices") Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 6760dbf130b2..8fa52914e16b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -56,6 +56,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_user_wzeroes_unmap_sectors = UINT_MAX; lim->max_hw_zone_append_sectors = UINT_MAX; lim->max_user_discard_sectors = UINT_MAX; + lim->atomic_write_hw_max = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -232,6 +233,10 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) if (!(lim->features & BLK_FEAT_ATOMIC_WRITES)) goto unsupported; + /* UINT_MAX indicates stacked limits in initial state */ + if (lim->atomic_write_hw_max == UINT_MAX) + goto unsupported; + if (!lim->atomic_write_hw_max) goto unsupported; @@ -723,18 +728,14 @@ static void blk_stack_atomic_writes_limits(struct queue_limits *t, if (!blk_atomic_write_start_sect_aligned(start, b)) goto unsupported; - /* - * If atomic_write_hw_max is set, we have already stacked 1x bottom - * device, so check for compliance. - */ - if (t->atomic_write_hw_max) { + /* UINT_MAX indicates no stacking of bottom devices yet */ + if (t->atomic_write_hw_max == UINT_MAX) { + if (!blk_stack_atomic_writes_head(t, b)) + goto unsupported; + } else { if (!blk_stack_atomic_writes_tail(t, b)) goto unsupported; - return; } - - if (!blk_stack_atomic_writes_head(t, b)) - goto unsupported; blk_stack_atomic_writes_chunk_sectors(t); return; From da7b97ba0d219a14a83e9cc93f98b53939f12944 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 15 Sep 2025 10:35:00 +0000 Subject: [PATCH 130/164] block: relax atomic write boundary vs chunk size check blk_validate_atomic_write_limits() ensures that any boundary fits into and is aligned to any chunk size. However, it should also be possible to fit the chunk size into any boundary. That check is already made in blk_stack_atomic_writes_boundary_head(). Relax the check in blk_validate_atomic_write_limits() by reusing (and renaming) blk_stack_atomic_writes_boundary_head(). Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 66 +++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 8fa52914e16b..54cffaae4df4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -224,6 +224,27 @@ static void blk_atomic_writes_update_limits(struct queue_limits *lim) lim->atomic_write_hw_boundary >> SECTOR_SHIFT; } +/* + * Test whether any boundary is aligned with any chunk size. Stacked + * devices store any stripe size in t->chunk_sectors. 
+ */ +static bool blk_valid_atomic_writes_boundary(unsigned int chunk_sectors, + unsigned int boundary_sectors) +{ + if (!chunk_sectors || !boundary_sectors) + return true; + + if (boundary_sectors > chunk_sectors && + boundary_sectors % chunk_sectors) + return false; + + if (chunk_sectors > boundary_sectors && + chunk_sectors % boundary_sectors) + return false; + + return true; +} + static void blk_validate_atomic_write_limits(struct queue_limits *lim) { unsigned int boundary_sectors; @@ -264,20 +285,9 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) if (WARN_ON_ONCE(lim->atomic_write_hw_max > lim->atomic_write_hw_boundary)) goto unsupported; - /* - * A feature of boundary support is that it disallows bios to - * be merged which would result in a merged request which - * crosses either a chunk sector or atomic write HW boundary, - * even though chunk sectors may be just set for performance. - * For simplicity, disallow atomic writes for a chunk sector - * which is non-zero and smaller than atomic write HW boundary. - * Furthermore, chunk sectors must be a multiple of atomic - * write HW boundary. Otherwise boundary support becomes - * complicated. - * Devices which do not conform to these rules can be dealt - * with if and when they show up. - */ - if (WARN_ON_ONCE(lim->chunk_sectors % boundary_sectors)) + + if (WARN_ON_ONCE(!blk_valid_atomic_writes_boundary( + lim->chunk_sectors, boundary_sectors))) goto unsupported; /* @@ -644,31 +654,6 @@ static bool blk_stack_atomic_writes_tail(struct queue_limits *t, return true; } -/* Check for valid boundary of first bottom device */ -static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, - struct queue_limits *b) -{ - unsigned int boundary_sectors; - - if (!b->atomic_write_hw_boundary || !t->chunk_sectors) - return true; - - boundary_sectors = b->atomic_write_hw_boundary >> SECTOR_SHIFT; - - /* - * Ensure atomic write boundary is aligned with chunk sectors. Stacked - * devices store any stripe size in t->chunk_sectors. - */ - if (boundary_sectors > t->chunk_sectors && - boundary_sectors % t->chunk_sectors) - return false; - if (t->chunk_sectors > boundary_sectors && - t->chunk_sectors % boundary_sectors) - return false; - - return true; -} - static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) { unsigned int chunk_bytes; @@ -706,7 +691,8 @@ static void blk_stack_atomic_writes_chunk_sectors(struct queue_limits *t) static bool blk_stack_atomic_writes_head(struct queue_limits *t, struct queue_limits *b) { - if (!blk_stack_atomic_writes_boundary_head(t, b)) + if (!blk_valid_atomic_writes_boundary(t->chunk_sectors, + b->atomic_write_hw_boundary >> SECTOR_SHIFT)) return false; t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; From 0b507305a08c134722f363de6fe6f1ba84e313b7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Sep 2025 13:40:43 -0700 Subject: [PATCH 131/164] blk-mq: Fix the blk_mq_tagset_busy_iter() documentation Commit 2dd6532e9591 ("blk-mq: Drop 'reserved' arg of busy_tag_iter_fn") removed the 'reserved' argument from tag iteration callback functions. Bring the blk_mq_tagset_busy_iter() documentation in sync with that change. Cc: Jens Axboe Cc: Christoph Hellwig Cc: Ming Lei Cc: John Garry Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index a63d21a4aab4..0602ca7f1e37 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -424,10 +424,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started - * request. @fn will be called as follows: @fn(rq, @priv, - * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. Return - * true to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @priv) where + * rq is a pointer to a request. Return true to continue iterating + * tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after From 336aec7b06be860477be80a4299263a2e9355789 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 17 Sep 2025 15:55:39 +0800 Subject: [PATCH 132/164] blk-throttle: fix throtl_data leak during disk release Tightening the throttle activation check in blk_throtl_activated() to require both q->td presence and the policy bit being set introduced a memory leak during disk release: blkg_destroy_all() clears the policy bit first during queue deactivation, causing the subsequent blk_throtl_exit() to skip throtl_data cleanup when blk_throtl_activated() fails the policy check. Ideally we should avoid modifying the blk_throtl_exit() activation check, because it is intuitive that blk-throtl starts in blk_throtl_init() and ends in blk_throtl_exit(). However, calling blk_throtl_exit() before blkg_destroy_all() would make a long-standing deadlock problem easier to trigger[1]; hence, fix this problem by checking whether q->td is NULL in blk_throtl_exit(), and remove the policy deactivation as well since it is now useless. [1] https://lore.kernel.org/all/CAHj4cs9p9H5yx+ywsb3CMUdbqGPhM+8tuBvhW=9ADiCjAqza9w@mail.gmail.com/#t Fixes: bd9fd5be6bc0 ("blk-throttle: fix access race during throttle policy activation") Reported-by: Yi Zhang Closes: https://lore.kernel.org/all/CAHj4cs-p-ZwBEKigBj7T6hQCOo-H68-kVwCrV6ZvRovrr9Z+HA@mail.gmail.com/ Signed-off-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f510ae072868..2c5b64b1a724 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1842,12 +1842,15 @@ void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; - if (!blk_throtl_activated(q)) + /* + * blkg_destroy_all() already deactivate throtl policy, just check and + * free throtl data. + */ + if (!q->td) return; timer_delete_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); - blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } From 1f924cf781de47432f220185bb2beeb12c666aa1 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Thu, 18 Sep 2025 13:34:07 -0600 Subject: [PATCH 133/164] selftests: ublk: kublk: simplify feat_map definition Simplify the definition of feat_map by introducing a helper macro FEAT_NAME to avoid having to type the feature name twice. As a side effect, this changes the names in the feature list to be the full macro name instead of the abbreviated names that were used before, but this is a good change for clarity.
Using the full feature macro names ruins the alignment of the output, so change the output format to put each feature's hex value before its name, as this is easier to align nicely. The output now looks as follows: root# ./kublk features ublk_drv features: 0x7fff 0x1 : UBLK_F_SUPPORT_ZERO_COPY 0x2 : UBLK_F_URING_CMD_COMP_IN_TASK 0x4 : UBLK_F_NEED_GET_DATA 0x8 : UBLK_F_USER_RECOVERY 0x10 : UBLK_F_USER_RECOVERY_REISSUE 0x20 : UBLK_F_UNPRIVILEGED_DEV 0x40 : UBLK_F_CMD_IOCTL_ENCODE 0x80 : UBLK_F_USER_COPY 0x100 : UBLK_F_ZONED 0x200 : UBLK_F_USER_RECOVERY_FAIL_IO 0x400 : UBLK_F_UPDATE_SIZE 0x800 : UBLK_F_AUTO_BUF_REG 0x1000 : UBLK_F_QUIESCE 0x2000 : UBLK_F_PER_IO_DAEMON 0x4000 : unknown Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 31 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 6512dfbdbce3..4e5d82f2a14a 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1363,21 +1363,22 @@ static int cmd_dev_list(struct dev_ctx *ctx) static int cmd_dev_get_features(void) { #define const_ilog2(x) (63 - __builtin_clzll(x)) +#define FEAT_NAME(f) [const_ilog2(f)] = #f static const char *feat_map[] = { - [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY", - [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK", - [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA", - [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY", - [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE", - [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV", - [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE", - [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", - [const_ilog2(UBLK_F_ZONED)] = "ZONED", - [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", - [const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE", - [const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG", - [const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE", - [const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON", + FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY), + FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK), + FEAT_NAME(UBLK_F_NEED_GET_DATA), + FEAT_NAME(UBLK_F_USER_RECOVERY), + FEAT_NAME(UBLK_F_USER_RECOVERY_REISSUE), + FEAT_NAME(UBLK_F_UNPRIVILEGED_DEV), + FEAT_NAME(UBLK_F_CMD_IOCTL_ENCODE), + FEAT_NAME(UBLK_F_USER_COPY), + FEAT_NAME(UBLK_F_ZONED), + FEAT_NAME(UBLK_F_USER_RECOVERY_FAIL_IO), + FEAT_NAME(UBLK_F_UPDATE_SIZE), + FEAT_NAME(UBLK_F_AUTO_BUF_REG), + FEAT_NAME(UBLK_F_QUIESCE), + FEAT_NAME(UBLK_F_PER_IO_DAEMON), }; struct ublk_dev *dev; __u64 features = 0; @@ -1404,7 +1405,7 @@ static int cmd_dev_get_features(void) feat = feat_map[i]; else feat = "unknown"; - printf("\t%-20s: 0x%llx\n", feat, 1ULL << i); + printf("0x%-16llx: %s\n", 1ULL << i, feat); } } From 742bcc1101bcaca92901fe3fe434e4b1a467b5e8 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Thu, 18 Sep 2025 13:34:08 -0600 Subject: [PATCH 134/164] selftests: ublk: kublk: add UBLK_F_BUF_REG_OFF_DAEMON to feat_map When UBLK_F_BUF_REG_OFF_DAEMON was added, we missed updating kublk's feat_map, which results in the feature being reported as "unknown." Add UBLK_F_BUF_REG_OFF_DAEMON to feat_map to fix this. 
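As an aside, the FEAT_NAME/const_ilog2 table trick above can be sketched as a small standalone C program. The flag values and the 0x7 mask below are placeholders, not the real <linux/ublk_cmd.h> definitions, so read this only as an illustration of indexing a name table by bit position:

#include <stdio.h>

/* Placeholder feature bits; the real values live in <linux/ublk_cmd.h>. */
#define UBLK_F_SUPPORT_ZERO_COPY      (1ULL << 0)
#define UBLK_F_URING_CMD_COMP_IN_TASK (1ULL << 1)
#define UBLK_F_NEED_GET_DATA          (1ULL << 2)

#define const_ilog2(x) (63 - __builtin_clzll(x))
#define FEAT_NAME(f) [const_ilog2(f)] = #f

static const char *feat_map[] = {
    FEAT_NAME(UBLK_F_SUPPORT_ZERO_COPY),
    FEAT_NAME(UBLK_F_URING_CMD_COMP_IN_TASK),
    FEAT_NAME(UBLK_F_NEED_GET_DATA),
};

int main(void)
{
    unsigned long long features = 0x7; /* pretend driver-reported mask */
    int known = sizeof(feat_map) / sizeof(feat_map[0]);

    for (int i = 0; i < 64; i++) {
        const char *feat = "unknown";

        if (!(features & (1ULL << i)))
            continue;
        if (i < known && feat_map[i])
            feat = feat_map[i];
        printf("0x%-16llx: %s\n", 1ULL << i, feat);
    }
    return 0;
}

Printing the hex value first, as the patch does, keeps the columns aligned no matter how long the feature names get.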
Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 4e5d82f2a14a..b636d40b4889 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1379,6 +1379,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_AUTO_BUF_REG), FEAT_NAME(UBLK_F_QUIESCE), FEAT_NAME(UBLK_F_PER_IO_DAEMON), + FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), }; struct ublk_dev *dev; __u64 features = 0; From a755da0dd0530e53aa026fd4d08b3097e1be6455 Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Thu, 18 Sep 2025 13:34:09 -0600 Subject: [PATCH 135/164] selftests: ublk: add test to verify that feat_map is complete Add a test that verifies that the currently running kernel does not report support for any features that are unrecognized by kublk. This should catch cases where features are added without updating kublk's feat_map accordingly, which has happened multiple times in the past (see [1], [2]). Note that this new test may fail if the test suite is older than the kernel, and the newer kernel contains a newly introduced feature. I believe this is not a use case we currently care about - we only care about newer test suites passing on older kernels. [1] https://lore.kernel.org/linux-block/20250606214011.2576398-1-csander@purestorage.com/t/#u [2] https://lore.kernel.org/linux-block/2a370ab1-d85b-409d-b762-f9f3f6bdf705@nvidia.com/t/#m1c520a058448d594fd877f07804e69b28908533f Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + .../testing/selftests/ublk/test_generic_13.sh | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_generic_13.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 5d7f4ecfb816..770269efe42a 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -20,6 +20,7 @@ TEST_PROGS += test_generic_09.sh TEST_PROGS += test_generic_10.sh TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh +TEST_PROGS += test_generic_13.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh new file mode 100755 index 000000000000..b7aa90b1cb74 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_13.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_13" +ERR_CODE=0 + +_prep_test "null" "check that feature list is complete" + +if ${UBLK_PROG} features | grep -q unknown; then + echo "# unknown feature detected!" + echo "# did you add a feature and forget to update feat_map in kublk?" + echo "# this failure is expected if running an older test suite against" + echo "# a newer kernel with new features added" + ERR_CODE=255 +fi + +_cleanup_test "null" +_show_result $TID $ERR_CODE From 163f80dabf4f0c4d9e6c39e0dba474814dac78f8 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:37 -0600 Subject: [PATCH 136/164] ublk: remove ubq check in ublk_check_and_get_req() ublk_get_queue() never returns a NULL pointer, so there's no need to check its return value in ublk_check_and_get_req(). Drop the check. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index aa64f530d5e9..9f2db91af481 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2495,9 +2495,6 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, return ERR_PTR(-EINVAL); ubq = ublk_get_queue(ub, q_id); - if (!ubq) - return ERR_PTR(-EINVAL); - if (!ublk_support_user_copy(ubq)) return ERR_PTR(-EACCES); From b7e255b0340b5319fca4fe076119d0f929a24305 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:38 -0600 Subject: [PATCH 137/164] ublk: don't pass q_id to ublk_queue_cmd_buf_size() ublk_queue_cmd_buf_size() only needs the queue depth, which is the same for all queues. Get the queue depth from the ublk_device instead so the q_id parameter can be dropped. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9f2db91af481..bac16ec3151c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -762,11 +762,9 @@ static inline int __ublk_queue_cmd_buf_size(int depth) return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE); } -static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) +static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub) { - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - - return __ublk_queue_cmd_buf_size(ubq->q_depth); + return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth); } static int ublk_max_cmd_buf_size(void) @@ -1703,7 +1701,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) __func__, q_id, current->pid, vma->vm_start, phys_off, (unsigned long)sz); - if (sz != ublk_queue_cmd_buf_size(ub, q_id)) + if (sz != ublk_queue_cmd_buf_size(ub)) return -EINVAL; pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; @@ -2565,7 +2563,7 @@ static const struct file_operations ublk_ch_fops = { static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { - int size = ublk_queue_cmd_buf_size(ub, q_id); + int size = ublk_queue_cmd_buf_size(ub); struct ublk_queue *ubq = ublk_get_queue(ub, q_id); int i; @@ -2592,7 +2590,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) ubq->flags = ub->dev_info.flags; ubq->q_id = q_id; ubq->q_depth = ub->dev_info.queue_depth; - size = ublk_queue_cmd_buf_size(ub, q_id); + size = ublk_queue_cmd_buf_size(ub); ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); if (!ptr) From 0265595002b989db8e0c32dc33624fa61a974b20 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:39 -0600 Subject: [PATCH 138/164] ublk: don't pass ublk_queue to __ublk_fail_req() __ublk_fail_req() only uses the ublk_queue to get the ublk_device, which its caller already has. So just pass the ublk_device directly. 
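For reference, the per-device command buffer sizing that the ublk_queue_cmd_buf_size() change above relies on (queue depth being a device-wide property) can be modelled roughly in userspace C; the descriptor layout, PAGE_SIZE and round_up() here are stand-ins for the kernel definitions, not the driver's actual code:

#include <stdio.h>

#define PAGE_SIZE 4096u
/* power-of-two round up, mirroring the kernel helper of the same name */
#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))

/* illustrative stand-in for struct ublksrv_io_desc */
struct io_desc {
    unsigned int op_flags;
    unsigned int nr_sectors;
    unsigned long long start_sector;
    unsigned long long addr;
};

static unsigned int cmd_buf_size(unsigned int queue_depth)
{
    /* depth is the same for every queue, so only the device needs consulting */
    return round_up(queue_depth * (unsigned int)sizeof(struct io_desc), PAGE_SIZE);
}

int main(void)
{
    printf("depth 128 -> %u bytes\n", cmd_buf_size(128));
    return 0;
}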
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index bac16ec3151c..4cb023d26593 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1708,12 +1708,12 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } -static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, +static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, struct request *req) { WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); - if (ublk_nosrv_should_reissue_outstanding(ubq->dev)) + if (ublk_nosrv_should_reissue_outstanding(ub)) blk_mq_requeue_request(req, false); else { io->res = -EIO; @@ -1737,7 +1737,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) struct ublk_io *io = &ubq->ios[i]; if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) - __ublk_fail_req(ubq, io, io->req); + __ublk_fail_req(ub, io, io->req); } } From d74a383ec70de33ae6577af889556747d6693269 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:40 -0600 Subject: [PATCH 139/164] ublk: add helpers to check ublk_device flags Introduce ublk_device analogues of the ublk_queue flag helpers: - ublk_support_zero_copy() -> ublk_dev_support_zero_copy() - ublk_support_auto_buf_reg() -> ublk_dev_support_auto_buf_reg() - ublk_support_user_copy() -> ublk_dev_support_user_copy() - ublk_need_map_io() -> ublk_dev_need_map_io() - ublk_need_req_ref() -> ublk_dev_need_req_ref() - ublk_need_get_data() -> ublk_dev_need_get_data() These will be used in subsequent changes to avoid accessing the ublk_queue just for the flags, and instead use the ublk_device.
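One way to read the helper pairs above is that each device-level function is its queue-level twin fed from ub->dev_info.flags instead of ubq->flags. The sketch below funnels both flavors through a single predicate; the struct and flag names are invented for illustration, and the driver itself simply duplicates the checks, as the diff shows:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder bits; not the uapi UBLK_F_* values. */
#define F_USER_COPY    (1u << 0)
#define F_ZERO_COPY    (1u << 1)
#define F_AUTO_BUF_REG (1u << 2)

struct queue_model  { unsigned int flags; };
struct device_model { unsigned int flags; };

/* One shared predicate keeps the two flavors from drifting apart. */
static inline bool flags_need_map_io(unsigned int flags)
{
    return !(flags & (F_USER_COPY | F_ZERO_COPY | F_AUTO_BUF_REG));
}

static inline bool queue_need_map_io(const struct queue_model *q)
{
    return flags_need_map_io(q->flags);
}

static inline bool dev_need_map_io(const struct device_model *d)
{
    return flags_need_map_io(d->flags);
}

int main(void)
{
    struct queue_model q = { .flags = 0 };
    struct device_model d = { .flags = F_ZERO_COPY };

    printf("queue needs map: %d, device needs map: %d\n",
           queue_need_map_io(&q), dev_need_map_io(&d));
    return 0;
}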
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4cb023d26593..04b8613ce623 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -662,22 +662,44 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; } +static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; +} + static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_AUTO_BUF_REG; } +static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; +} + static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_USER_COPY; } +static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && !ublk_support_auto_buf_reg(ubq); } +static inline bool ublk_dev_need_map_io(const struct ublk_device *ub) +{ + return !ublk_dev_support_user_copy(ub) && + !ublk_dev_support_zero_copy(ub) && + !ublk_dev_support_auto_buf_reg(ub); +} + static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) { /* @@ -695,6 +717,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) ublk_support_auto_buf_reg(ubq); } +static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub) +{ + return ublk_dev_support_user_copy(ub) || + ublk_dev_support_zero_copy(ub) || + ublk_dev_support_auto_buf_reg(ub); +} + static inline void ublk_init_req_ref(const struct ublk_queue *ubq, struct ublk_io *io) { @@ -726,6 +755,11 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_NEED_GET_DATA; } +static inline bool ublk_dev_need_get_data(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_NEED_GET_DATA; +} + /* Called in slow path only, keep it noinline for trace purpose */ static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) { From 5125535f90564117506d926d0de92c4c2622b720 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:41 -0600 Subject: [PATCH 140/164] ublk: don't dereference ublk_queue in ublk_ch_uring_cmd_local() For ublk servers with many ublk queues, accessing the ublk_queue to handle a ublk command is a frequent cache miss. Get the queue depth from the ublk_device instead, which is accessed just before. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 04b8613ce623..58f688eac742 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2333,7 +2333,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, ubq = ublk_get_queue(ub, q_id); - if (tag >= ubq->q_depth) + if (tag >= ub->dev_info.queue_depth) goto out; io = &ubq->ios[tag]; From b40dcdf8235d536072b9f61eb6d291f0f3720768 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:42 -0600 Subject: [PATCH 141/164] ublk: don't dereference ublk_queue in ublk_check_and_get_req() For ublk servers with many ublk queues, accessing the ublk_queue in ublk_ch_{read,write}_iter() is a frequent cache miss. Get the flags and queue depth from the ublk_device instead, which is accessed just before. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 58f688eac742..d6d8dcb72e4b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2527,10 +2527,10 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, return ERR_PTR(-EINVAL); ubq = ublk_get_queue(ub, q_id); - if (!ublk_support_user_copy(ubq)) + if (!ublk_dev_support_user_copy(ub)) return ERR_PTR(-EACCES); - if (tag >= ubq->q_depth) + if (tag >= ub->dev_info.queue_depth) return ERR_PTR(-EINVAL); *io = &ubq->ios[tag]; From 8a81926e45670c6d9b6e73e0482485d5c9a627e6 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:43 -0600 Subject: [PATCH 142/164] ublk: pass ublk_device to ublk_register_io_buf() Avoid repeating the 2 dereferences to get the ublk_device from the io_uring_cmd by passing it from ublk_ch_uring_cmd_local() to ublk_register_io_buf(). 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d6d8dcb72e4b..cb51f3f3cd33 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2125,11 +2125,11 @@ static void ublk_io_release(void *priv) } static int ublk_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_device *ub, const struct ublk_queue *ubq, struct ublk_io *io, unsigned int index, unsigned int issue_flags) { - struct ublk_device *ub = cmd->file->private_data; struct request *req; int ret; @@ -2152,6 +2152,7 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_device *ub, const struct ublk_queue *ubq, struct ublk_io *io, unsigned index, unsigned issue_flags) { @@ -2165,7 +2166,8 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, */ new_registered_buffers = io->task_registered_buffers + 1; if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) - return ublk_register_io_buf(cmd, ubq, io, index, issue_flags); + return ublk_register_io_buf(cmd, ub, ubq, io, index, + issue_flags); if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) return -EINVAL; @@ -2356,7 +2358,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, * so can be handled on any task */ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) - return ublk_register_io_buf(cmd, ubq, io, addr, + return ublk_register_io_buf(cmd, ub, ubq, io, addr, issue_flags); goto out; @@ -2378,7 +2380,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return ublk_daemon_register_io_buf(cmd, ubq, io, addr, + return ublk_daemon_register_io_buf(cmd, ub, ubq, io, addr, issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: ret = ublk_check_commit_and_fetch(ubq, io, addr); From 692cf47e1af39f86f28069db5ca6b00a7d2daddc Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:44 -0600 Subject: [PATCH 143/164] ublk: don't access ublk_queue in ublk_register_io_buf() For ublk servers with many ublk queues, accessing the ublk_queue in ublk_register_io_buf() is a frequent cache miss. Get the flags from the ublk_device instead, which is accessed earlier in ublk_ch_uring_cmd_local(). Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index cb51f3f3cd33..751ec62655f8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2133,7 +2133,7 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, struct request *req; int ret; - if (!ublk_support_zero_copy(ubq)) + if (!ublk_dev_support_zero_copy(ub)) return -EINVAL; req = __ublk_check_and_get_req(ub, ubq, io, 0); From ce88e3ef33d35c740d26342be5d8f65972fd5597 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:45 -0600 Subject: [PATCH 144/164] ublk: don't access ublk_queue in ublk_daemon_register_io_buf() For ublk servers with many ublk queues, accessing the ublk_queue in ublk_daemon_register_io_buf() is a frequent cache miss. Get the flags from the ublk_device instead, which is accessed earlier in ublk_ch_uring_cmd_local(). 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 751ec62655f8..266b46d40886 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2169,7 +2169,7 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, return ublk_register_io_buf(cmd, ub, ubq, io, index, issue_flags); - if (!ublk_support_zero_copy(ubq) || !ublk_rq_has_data(req)) + if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req)) return -EINVAL; ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, From 25c028aa791503fe0876c20bfd67b2676e6e24d0 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:46 -0600 Subject: [PATCH 145/164] ublk: pass q_id and tag to __ublk_check_and_get_req() __ublk_check_and_get_req() only uses its ublk_queue argument to get the q_id and tag. Pass those arguments explicitly to save an access to the ublk_queue. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 266b46d40886..cb61f6213962 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -250,8 +250,7 @@ static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, struct ublk_io *io, - size_t offset); + u16 q_id, u16 tag, struct ublk_io *io, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc * @@ -2126,7 +2125,7 @@ static void ublk_io_release(void *priv) static int ublk_register_io_buf(struct io_uring_cmd *cmd, struct ublk_device *ub, - const struct ublk_queue *ubq, + u16 q_id, u16 tag, struct ublk_io *io, unsigned int index, unsigned int issue_flags) { @@ -2136,7 +2135,7 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, if (!ublk_dev_support_zero_copy(ub)) return -EINVAL; - req = __ublk_check_and_get_req(ub, ubq, io, 0); + req = __ublk_check_and_get_req(ub, q_id, tag, io, 0); if (!req) return -EINVAL; @@ -2153,7 +2152,7 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, static int ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, struct ublk_device *ub, - const struct ublk_queue *ubq, struct ublk_io *io, + u16 q_id, u16 tag, struct ublk_io *io, unsigned index, unsigned issue_flags) { unsigned new_registered_buffers; @@ -2166,7 +2165,7 @@ ublk_daemon_register_io_buf(struct io_uring_cmd *cmd, */ new_registered_buffers = io->task_registered_buffers + 1; if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT)) - return ublk_register_io_buf(cmd, ub, ubq, io, index, + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, issue_flags); if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req)) @@ -2358,8 +2357,8 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, * so can be handled on any task */ if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF) - return ublk_register_io_buf(cmd, ub, ubq, io, addr, - issue_flags); + return ublk_register_io_buf(cmd, ub, q_id, tag, io, + addr, issue_flags); goto out; } @@ -2380,7 +2379,7 @@ static int 
ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, switch (_IOC_NR(cmd_op)) { case UBLK_IO_REGISTER_IO_BUF: - return ublk_daemon_register_io_buf(cmd, ub, ubq, io, addr, + return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr, issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: ret = ublk_check_commit_and_fetch(ubq, io, addr); @@ -2429,16 +2428,15 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - const struct ublk_queue *ubq, struct ublk_io *io, size_t offset) + u16 q_id, u16 tag, struct ublk_io *io, size_t offset) { - unsigned tag = io - ubq->ios; struct request *req; /* * can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ, * which would overwrite it with io->cmd */ - req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); if (!req) return NULL; @@ -2536,7 +2534,7 @@ static struct request *ublk_check_and_get_req(struct kiocb *iocb, return ERR_PTR(-EINVAL); *io = &ubq->ios[tag]; - req = __ublk_check_and_get_req(ub, ubq, *io, buf_off); + req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off); if (!req) return ERR_PTR(-EINVAL); From a689efd5fde7b39cfbcf43267bccf0e56295cc16 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:47 -0600 Subject: [PATCH 146/164] ublk: don't access ublk_queue in ublk_check_fetch_buf() Obtain the ublk device flags from ublk_device to avoid needing to access the ublk_queue, which may be a cache miss. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index cb61f6213962..9c6045e6d03b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2190,14 +2190,14 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, return io_buffer_unregister_bvec(cmd, index, issue_flags); } -static int ublk_check_fetch_buf(const struct ublk_queue *ubq, __u64 buf_addr) +static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) { - if (ublk_need_map_io(ubq)) { + if (ublk_dev_need_map_io(ub)) { /* * FETCH_RQ has to provide IO buffer if NEED GET * DATA is not enabled */ - if (!buf_addr && !ublk_need_get_data(ubq)) + if (!buf_addr && !ublk_dev_need_get_data(ub)) return -EINVAL; } else if (buf_addr) { /* User copy requires addr to be unset */ @@ -2340,7 +2340,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, io = &ubq->ios[tag]; /* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */ if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) { - ret = ublk_check_fetch_buf(ubq, addr); + ret = ublk_check_fetch_buf(ub, addr); if (ret) goto out; ret = ublk_fetch(cmd, ubq, io, addr); From 23c014448e97d4b59c54816f545ab963bf8dd644 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:48 -0600 Subject: [PATCH 147/164] ublk: don't access ublk_queue in ublk_config_io_buf() For ublk servers with many ublk queues, accessing the ublk_queue in ublk_config_io_buf() is a frequent cache miss. Get the flags from the ublk_device instead, which is accessed earlier in ublk_ch_uring_cmd_local(). 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9c6045e6d03b..9535382f9f8e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2081,11 +2081,11 @@ ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd) } static inline int -ublk_config_io_buf(const struct ublk_queue *ubq, struct ublk_io *io, +ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io, struct io_uring_cmd *cmd, unsigned long buf_addr, u16 *buf_idx) { - if (ublk_support_auto_buf_reg(ubq)) + if (ublk_dev_support_auto_buf_reg(ub)) return ublk_handle_auto_buf_reg(io, cmd, buf_idx); io->addr = buf_addr; @@ -2233,7 +2233,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV); ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, buf_addr, NULL); + ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); if (ret) goto out; @@ -2387,7 +2387,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, goto out; io->res = result; req = ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, addr, &buf_idx); + ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); compl = ublk_need_complete_req(ubq, io); /* can't touch 'ublk_io' any more */ @@ -2408,7 +2408,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, * request */ req = ublk_fill_io_cmd(io, cmd); - ret = ublk_config_io_buf(ubq, io, cmd, addr, NULL); + ret = ublk_config_io_buf(ub, io, cmd, addr, NULL); WARN_ON_ONCE(ret); if (likely(ublk_get_data(ubq, io, req))) { __ublk_prep_compl_io_cmd(io, req); From 3576e60a33c7f6be024b80f8c87312032fd27892 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:49 -0600 Subject: [PATCH 148/164] ublk: don't pass ublk_queue to ublk_fetch() ublk_fetch() only uses the ublk_queue to get the ublk_device, which its caller already has. So just pass the ublk_device directly. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9535382f9f8e..9a726d048703 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2206,10 +2206,9 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) return 0; } -static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq, +static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, struct ublk_io *io, __u64 buf_addr) { - struct ublk_device *ub = ubq->dev; int ret = 0; /* @@ -2343,7 +2342,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, ret = ublk_check_fetch_buf(ub, addr); if (ret) goto out; - ret = ublk_fetch(cmd, ubq, io, addr); + ret = ublk_fetch(cmd, ub, io, addr); if (ret) goto out; From be7962d7e3d9dd9ff5b6bcd3faccb3b0f76a9734 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:50 -0600 Subject: [PATCH 149/164] ublk: don't access ublk_queue in ublk_check_commit_and_fetch() For ublk servers with many ublk queues, accessing the ublk_queue in ublk_check_commit_and_fetch() is a frequent cache miss. Get the flags from the ublk_device instead, which is accessed earlier in ublk_ch_uring_cmd_local(). 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9a726d048703..b92b7823005d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2243,17 +2243,17 @@ out: return ret; } -static int ublk_check_commit_and_fetch(const struct ublk_queue *ubq, +static int ublk_check_commit_and_fetch(const struct ublk_device *ub, struct ublk_io *io, __u64 buf_addr) { struct request *req = io->req; - if (ublk_need_map_io(ubq)) { + if (ublk_dev_need_map_io(ub)) { /* * COMMIT_AND_FETCH_REQ has to provide IO buffer if * NEED GET DATA is not enabled or it is Read IO. */ - if (!buf_addr && (!ublk_need_get_data(ubq) || + if (!buf_addr && (!ublk_dev_need_get_data(ub) || req_op(req) == REQ_OP_READ)) return -EINVAL; } else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) { @@ -2381,7 +2381,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr, issue_flags); case UBLK_IO_COMMIT_AND_FETCH_REQ: - ret = ublk_check_commit_and_fetch(ubq, io, addr); + ret = ublk_check_commit_and_fetch(ub, io, addr); if (ret) goto out; io->res = result; From 122f6387e845dfbfcf1ed795734a1ec779e987f0 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:51 -0600 Subject: [PATCH 150/164] ublk: don't access ublk_queue in ublk_need_complete_req() For ublk servers with many ublk queues, accessing the ublk_queue in ublk_need_complete_req() is a frequent cache miss. Get the flags from the ublk_device instead, which is accessed earlier in ublk_ch_uring_cmd_local(). Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b92b7823005d..750d0a332685 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2267,10 +2267,10 @@ static int ublk_check_commit_and_fetch(const struct ublk_device *ub, return 0; } -static bool ublk_need_complete_req(const struct ublk_queue *ubq, +static bool ublk_need_complete_req(const struct ublk_device *ub, struct ublk_io *io) { - if (ublk_need_req_ref(ubq)) + if (ublk_dev_need_req_ref(ub)) return ublk_sub_req_ref(io); return true; } @@ -2387,7 +2387,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, io->res = result; req = ublk_fill_io_cmd(io, cmd); ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); - compl = ublk_need_complete_req(ubq, io); + compl = ublk_need_complete_req(ub, io); /* can't touch 'ublk_io' any more */ if (buf_idx != UBLK_INVALID_BUF_IDX) From 97a02be6303646e19e9092042928b0e13543305c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:52 -0600 Subject: [PATCH 151/164] ublk: pass ublk_io to __ublk_complete_rq() All callers of __ublk_complete_rq() already know the ublk_io. Pass it in to avoid looking it up again. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 750d0a332685..a677eca1ee86 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -529,7 +529,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif -static inline void __ublk_complete_rq(struct request *req); +static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -738,7 +738,7 @@ static inline bool ublk_get_req_ref(struct ublk_io *io) static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) { if (refcount_dec_and_test(&io->ref)) - __ublk_complete_rq(req); + __ublk_complete_rq(req, io); } static inline bool ublk_sub_req_ref(struct ublk_io *io) @@ -1146,10 +1146,9 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( } /* todo: handle partial completion */ -static inline void __ublk_complete_rq(struct request *req) +static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io) { struct ublk_queue *ubq = req->mq_hctx->driver_data; - struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; @@ -1750,7 +1749,7 @@ static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, blk_mq_requeue_request(req, false); else { io->res = -EIO; - __ublk_complete_rq(req); + __ublk_complete_rq(req, io); } } @@ -2395,7 +2394,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = addr; if (compl) - __ublk_complete_rq(req); + __ublk_complete_rq(req, io); if (ret) goto out; From 755a18469ca4eca3b5bb4f52704b7708a9106db9 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 17 Sep 2025 19:49:53 -0600 Subject: [PATCH 152/164] ublk: don't access ublk_queue in ublk_unmap_io() For ublk servers with many ublk queues, accessing the ublk_queue in ublk_unmap_io() is a frequent cache miss. Pass to __ublk_complete_rq() whether the ublk server's data buffer needs to be copied to the request. In the callers __ublk_fail_req() and ublk_ch_uring_cmd_local(), get the flags from the ublk_device instead, as its flags have just been read. In ublk_put_req_ref(), pass false since all the features that require reference counting disable copying of the data buffer upon completion. 
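The "pass false" argument above can be checked by brute force: every flag that enables request reference counting also disables kernel mapping of the data buffer, so the two predicates can never both be true. A small userspace sketch, with placeholder bits standing in for UBLK_F_USER_COPY, UBLK_F_SUPPORT_ZERO_COPY and UBLK_F_AUTO_BUF_REG:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Placeholder bits standing in for the three buffer-handling features. */
#define F_USER_COPY    (1u << 0)
#define F_ZERO_COPY    (1u << 1)
#define F_AUTO_BUF_REG (1u << 2)

static bool need_map_io(unsigned int flags)
{
    return !(flags & (F_USER_COPY | F_ZERO_COPY | F_AUTO_BUF_REG));
}

static bool need_req_ref(unsigned int flags)
{
    return flags & (F_USER_COPY | F_ZERO_COPY | F_AUTO_BUF_REG);
}

int main(void)
{
    /* For every combination, a refcounted completion never needs an unmap. */
    for (unsigned int flags = 0; flags < 8; flags++)
        assert(!(need_req_ref(flags) && need_map_io(flags)));

    puts("need_req_ref and need_map_io are mutually exclusive");
    return 0;
}

So a completion driven from the refcount-drop path loses nothing by skipping the unmap step.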
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a677eca1ee86..5ab7ff5f03f4 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -529,7 +529,8 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif -static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io); +static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, + bool need_map); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -737,8 +738,11 @@ static inline bool ublk_get_req_ref(struct ublk_io *io) static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) { - if (refcount_dec_and_test(&io->ref)) - __ublk_complete_rq(req, io); + if (!refcount_dec_and_test(&io->ref)) + return; + + /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */ + __ublk_complete_rq(req, io, false); } static inline bool ublk_sub_req_ref(struct ublk_io *io) @@ -1048,13 +1052,13 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, return rq_bytes; } -static int ublk_unmap_io(const struct ublk_queue *ubq, +static int ublk_unmap_io(bool need_map, const struct request *req, const struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); - if (!ublk_need_map_io(ubq)) + if (!need_map) return rq_bytes; if (ublk_need_unmap_req(req)) { @@ -1146,9 +1150,9 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( } /* todo: handle partial completion */ -static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io) +static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, + bool need_map) { - struct ublk_queue *ubq = req->mq_hctx->driver_data; unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; @@ -1172,7 +1176,7 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io) goto exit; /* for READ request, writing data in iod->addr to rq buffers */ - unmapped_bytes = ublk_unmap_io(ubq, req, io); + unmapped_bytes = ublk_unmap_io(need_map, req, io); /* * Extremely impossible since we got data filled in just before @@ -1749,7 +1753,7 @@ static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, blk_mq_requeue_request(req, false); else { io->res = -EIO; - __ublk_complete_rq(req, io); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); } } @@ -2394,7 +2398,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = addr; if (compl) - __ublk_complete_rq(req, io); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); if (ret) goto out; From a3835a44107fcbf05f183b5e8b60a8e4605b15ea Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Tue, 16 Sep 2025 18:42:52 -0600 Subject: [PATCH 153/164] selftests: ublk: fix behavior when fio is not installed Some ublk selftests have strange behavior when fio is not installed. While most tests behave correctly (run if they don't need fio, or skip if they need fio), the following tests have different behavior: - test_null_01, test_null_02, test_generic_01, test_generic_02, and test_generic_12 try to run fio without checking if it exists first, and fail on any failure of the fio command (including "fio command not found"). So these tests fail when they should skip. 
- test_stress_05 runs fio without checking if it exists first, but doesn't fail on fio command failure. This test passes, but that pass is misleading as the test doesn't do anything useful without fio installed. So this test passes when it should skip. Fix these issues by adding _have_program fio checks to the top of all of these tests. Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_generic_01.sh | 4 ++++ tools/testing/selftests/ublk/test_generic_02.sh | 4 ++++ tools/testing/selftests/ublk/test_generic_12.sh | 4 ++++ tools/testing/selftests/ublk/test_null_01.sh | 4 ++++ tools/testing/selftests/ublk/test_null_02.sh | 4 ++++ tools/testing/selftests/ublk/test_stress_05.sh | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh index 9227a208ba53..21a31cd5491a 100755 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -10,6 +10,10 @@ if ! _have_program bpftrace; then exit "$UBLK_SKIP_CODE" fi +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "sequential io order" dev_id=$(_add_ublk_dev -t null) diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 3e80121e3bf5..12920768b1a0 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -10,6 +10,10 @@ if ! _have_program bpftrace; then exit "$UBLK_SKIP_CODE" fi +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "sequential io order for MQ" dev_id=$(_add_ublk_dev -t null -q 2) diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh index 7abbb00d251d..b4046201b4d9 100755 --- a/tools/testing/selftests/ublk/test_generic_12.sh +++ b/tools/testing/selftests/ublk/test_generic_12.sh @@ -10,6 +10,10 @@ if ! _have_program bpftrace; then exit "$UBLK_SKIP_CODE" fi +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "do imbalanced load, it should be balanced over I/O threads" NTHREADS=6 diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index a34203f72668..c2cb8f7a09fe 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -6,6 +6,10 @@ TID="null_01" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "basic IO test" dev_id=$(_add_ublk_dev -t null) diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh index 5633ca876655..8accd35beb55 100755 --- a/tools/testing/selftests/ublk/test_null_02.sh +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -6,6 +6,10 @@ TID="null_02" ERR_CODE=0 +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + _prep_test "null" "basic IO test with zero copy" dev_id=$(_add_ublk_dev -t null -z) diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh index 566cfd90d192..274295061042 100755 --- a/tools/testing/selftests/ublk/test_stress_05.sh +++ b/tools/testing/selftests/ublk/test_stress_05.sh @@ -5,6 +5,10 @@ TID="stress_05" ERR_CODE=0 +if ! 
_have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + run_io_and_remove() { local size=$1 From fea55691aca1edf2fe612fd75536ec26354a8c4a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Sep 2025 13:13:24 -0700 Subject: [PATCH 154/164] blk-mq: Fix more tag iteration function documentation Commit 8ab30a331946 ("blk-mq: Drop busy_iter_fn blk_mq_hw_ctx argument") removed the hctx argument from the callback functions called by bt_for_each() and blk_mq_queue_tag_busy_iter(). Commit 2dd6532e9591 ("blk-mq: Drop 'reserved' arg of busy_tag_iter_fn") removed the 'reserved' argument of the busy_tag_iter_fn function pointer type. Bring the documentation of the tag iteration functions in sync with these changes. Cc: John Garry Cc: Ming Lei Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 0602ca7f1e37..271fa005c51e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -297,15 +297,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. - * @q: Request queue to examine. + * @q: Request queue @hctx is associated with (@hctx->queue). * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. - * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) - * where rq is a pointer to a request. Return true to continue - * iterating tags, false to stop. - * @data: Will be passed as third argument to @fn. + * @fn will be called as follows: @fn(rq, @data) where rq is a + * pointer to a request. Return %true to continue iterating tags; + * %false to stop. + * @data: Will be passed as second argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ @@ -371,9 +371,9 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started - * request. @fn will be called as follows: @fn(rq, @data, - * @reserved) where rq is a pointer to a request. Return true - * to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @data) where rq + * is a pointer to a request. Return %true to continue iterating + * tags; %false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ @@ -406,10 +406,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each - * request. @fn will be called as follows: @fn(rq, @priv, - * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. Return - * true to continue iterating tags, false to stop. + * request. @fn will be called as follows: @fn(rq, @priv) where rq + * is a pointer to a request. Return %true to continue iterating + * tags; %false to stop. 
* @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. @@ -485,11 +484,10 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request - * on @q. @fn will be called as follows: @fn(hctx, rq, @priv, - * reserved) where rq is a pointer to a request and hctx points - * to the hardware queue associated with the request. 'reserved' - * indicates whether or not @rq is a reserved request. - * @priv: Will be passed as third argument to @fn. + * on @q. @fn will be called as follows: @fn(rq, @priv) where rq + * is a pointer to a request and hctx points to the hardware queue + * associated with the request. + * @priv: Will be passed as second argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only From 670bfe683850cb29957a9d71f997e9774eb24de6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 Sep 2025 15:01:01 +0800 Subject: [PATCH 155/164] blk-mq: fix null-ptr-deref in blk_mq_free_tags() from error path blk_mq_free_tags() can be called after blk_mq_init_tags(), while tags->page_list is still not initialized, causing null-ptr-deref. Fix this problem by initializing tags->page_list at blk_mq_init_tags(), meanwhile, also free tags directly from error path because there is no srcu barrier. Fixes: ad0d05dbddc1 ("blk-mq: Defer freeing of tags page_list to SRCU callback") Reported-by: syzbot+5c5d41e80248d610221f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68d1b079.a70a0220.1b52b.0000.GAE@google.com/ Signed-off-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 9 +++++++++ block/blk-mq.c | 2 -- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 271fa005c51e..c7a4d4b9cc87 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -566,6 +566,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); + INIT_LIST_HEAD(&tags->page_list); + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) goto out_free_tags; if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) @@ -603,6 +605,13 @@ void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); + + /* if tags pages is not allocated yet, free tags directly */ + if (list_empty(&tags->page_list)) { + kfree(tags); + return; + } + call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 4ccf11cadf8c..09f579414161 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3582,8 +3582,6 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - INIT_LIST_HEAD(&tags->page_list); - /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size From 5d726c4dbeeddef612e6bed27edd29733f4d13af Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 Sep 2025 15:55:20 +0800 Subject: [PATCH 156/164] blk-cgroup: fix possible deadlock while configuring policy Following deadlock can be triggered easily by lockdep: WARNING: possible circular locking dependency detected 
6.17.0-rc3-00124-ga12c2658ced0 #1665 Not tainted ------------------------------------------------------ check/1334 is trying to acquire lock: ff1100011d9d0678 (&q->sysfs_lock){+.+.}-{4:4}, at: blk_unregister_queue+0x53/0x180 but task is already holding lock: ff1100011d9d00e0 (&q->q_usage_counter(queue)#3){++++}-{0:0}, at: del_gendisk+0xba/0x110 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&q->q_usage_counter(queue)#3){++++}-{0:0}: blk_queue_enter+0x40b/0x470 blkg_conf_prep+0x7b/0x3c0 tg_set_limit+0x10a/0x3e0 cgroup_file_write+0xc6/0x420 kernfs_fop_write_iter+0x189/0x280 vfs_write+0x256/0x490 ksys_write+0x83/0x190 __x64_sys_write+0x21/0x30 x64_sys_call+0x4608/0x4630 do_syscall_64+0xdb/0x6b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #1 (&q->rq_qos_mutex){+.+.}-{4:4}: __mutex_lock+0xd8/0xf50 mutex_lock_nested+0x2b/0x40 wbt_init+0x17e/0x280 wbt_enable_default+0xe9/0x140 blk_register_queue+0x1da/0x2e0 __add_disk+0x38c/0x5d0 add_disk_fwnode+0x89/0x250 device_add_disk+0x18/0x30 virtblk_probe+0x13a3/0x1800 virtio_dev_probe+0x389/0x610 really_probe+0x136/0x620 __driver_probe_device+0xb3/0x230 driver_probe_device+0x2f/0xe0 __driver_attach+0x158/0x250 bus_for_each_dev+0xa9/0x130 driver_attach+0x26/0x40 bus_add_driver+0x178/0x3d0 driver_register+0x7d/0x1c0 __register_virtio_driver+0x2c/0x60 virtio_blk_init+0x6f/0xe0 do_one_initcall+0x94/0x540 kernel_init_freeable+0x56a/0x7b0 kernel_init+0x2b/0x270 ret_from_fork+0x268/0x4c0 ret_from_fork_asm+0x1a/0x30 -> #0 (&q->sysfs_lock){+.+.}-{4:4}: __lock_acquire+0x1835/0x2940 lock_acquire+0xf9/0x450 __mutex_lock+0xd8/0xf50 mutex_lock_nested+0x2b/0x40 blk_unregister_queue+0x53/0x180 __del_gendisk+0x226/0x690 del_gendisk+0xba/0x110 sd_remove+0x49/0xb0 [sd_mod] device_remove+0x87/0xb0 device_release_driver_internal+0x11e/0x230 device_release_driver+0x1a/0x30 bus_remove_device+0x14d/0x220 device_del+0x1e1/0x5a0 __scsi_remove_device+0x1ff/0x2f0 scsi_remove_device+0x37/0x60 sdev_store_delete+0x77/0x100 dev_attr_store+0x1f/0x40 sysfs_kf_write+0x65/0x90 kernfs_fop_write_iter+0x189/0x280 vfs_write+0x256/0x490 ksys_write+0x83/0x190 __x64_sys_write+0x21/0x30 x64_sys_call+0x4608/0x4630 do_syscall_64+0xdb/0x6b0 entry_SYSCALL_64_after_hwframe+0x76/0x7e other info that might help us debug this: Chain exists of: &q->sysfs_lock --> &q->rq_qos_mutex --> &q->q_usage_counter(queue)#3 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&q->q_usage_counter(queue)#3); lock(&q->rq_qos_mutex); lock(&q->q_usage_counter(queue)#3); lock(&q->sysfs_lock); Root cause is that queue_usage_counter is grabbed with rq_qos_mutex held in blkg_conf_prep(), while queue should be freezed before rq_qos_mutex from other context. The blk_queue_enter() from blkg_conf_prep() is used to protect against policy deactivation, which is already protected with blkcg_mutex, hence convert blk_queue_enter() to blkcg_mutex to fix this problem. Meanwhile, consider that blkcg_mutex is held after queue is freezed from policy deactivation, also convert blkg_alloc() to use GFP_NOIO. 
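Stripped of the block-layer specifics, the report above is the usual AB/BA inversion: one path freezes the queue and then takes a configuration-side lock, the other takes a configuration-side lock and then freezes the queue. A toy pthread model of the two orders (lock names are illustrative only, not the real kernel locks):

#include <pthread.h>
#include <stdio.h>

/* Illustrative locks only; these are not the real kernel locks. */
static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ q_usage_counter */
static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ rq_qos_mutex/sysfs_lock */

static void teardown_path(void)      /* e.g. policy deactivation on disk release */
{
    pthread_mutex_lock(&freeze_lock);
    pthread_mutex_lock(&config_lock);
    pthread_mutex_unlock(&config_lock);
    pthread_mutex_unlock(&freeze_lock);
}

static void config_path_buggy(void)  /* pre-fix: freeze nested inside the config lock */
{
    pthread_mutex_lock(&config_lock);
    pthread_mutex_lock(&freeze_lock); /* opposite order -> AB/BA deadlock risk */
    pthread_mutex_unlock(&freeze_lock);
    pthread_mutex_unlock(&config_lock);
}

static void config_path_fixed(void)  /* post-fix: a single lock, no nesting */
{
    pthread_mutex_lock(&config_lock);
    /* ... look up or allocate state under config_lock only ... */
    pthread_mutex_unlock(&config_lock);
}

int main(void)
{
    /* Single-threaded this cannot hang; run the first two concurrently and it can. */
    teardown_path();
    config_path_buggy();
    config_path_fixed();
    printf("lock ordering demo done\n");
    return 0;
}

Keeping the configuration path on a single lock, as the blkcg_mutex conversion below does, removes one edge of the cycle.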
Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7246fc256315..f93de34fe87d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -877,14 +877,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, disk = ctx->bdev->bd_disk; q = disk->queue; - /* - * blkcg_deactivate_policy() requires queue to be frozen, we can grab - * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). - */ - ret = blk_queue_enter(q, 0); - if (ret) - goto fail; - + /* Prevent concurrent with blkcg_deactivate_policy() */ + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { @@ -914,16 +908,16 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); - new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_NOIO); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail_exit_queue; + goto fail_exit; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; - goto fail_exit_queue; + goto fail_exit; } spin_lock_irq(&q->queue_lock); @@ -951,7 +945,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - blk_queue_exit(q); + mutex_unlock(&q->blkcg_mutex); ctx->blkg = blkg; return 0; @@ -959,9 +953,8 @@ fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); -fail_exit_queue: - blk_queue_exit(q); -fail: +fail_exit: + mutex_unlock(&q->blkcg_mutex); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue From a5b852797411a65b99b467766a03be8f24284ddc Mon Sep 17 00:00:00 2001 From: Kamaljit Singh Date: Fri, 5 Sep 2025 16:25:49 -0700 Subject: [PATCH 157/164] nvme-core: add method to check for an I/O controller Add nvme_is_io_ctrl() to check if the controller is of type I/O controller. Uses negative logic by excluding an administrative controller and a discovery controller. Signed-off-by: Kamaljit Singh Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 812c1565114f..0f20adcebcf1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3163,6 +3163,11 @@ static inline bool nvme_admin_ctrl(struct nvme_ctrl *ctrl) return ctrl->cntrltype == NVME_CTRL_ADMIN; } +static inline bool nvme_is_io_ctrl(struct nvme_ctrl *ctrl) +{ + return !nvme_discovery_ctrl(ctrl) && !nvme_admin_ctrl(ctrl); +} + static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { From f7e9a615302fec524445da213745609a06b9914d Mon Sep 17 00:00:00 2001 From: Kamaljit Singh Date: Fri, 5 Sep 2025 16:25:50 -0700 Subject: [PATCH 158/164] nvme-core: do ioccsz/iorcsz validation only for I/O controllers An administrative controller does not support I/O queues, hence it should ignore existing checks for IOCCSZ/IORCSZ. Currently, these checks only exclude a discovery controller but need to also exclude an administrative controller. 
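The I/O-controller gating above can be modelled in a few lines of userspace C; the enum values and struct fields are illustrative, not the nvme driver's real types:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative controller types; the real values come from the NVMe spec. */
enum ctrl_type { CTRL_IO = 1, CTRL_DISCOVERY = 2, CTRL_ADMIN = 3 };

struct ctrl {
    enum ctrl_type type;
    unsigned int ioccsz;
    unsigned int iorcsz;
};

static bool is_io_ctrl(const struct ctrl *c)
{
    /* negative logic, as in nvme_is_io_ctrl(): neither discovery nor admin */
    return c->type != CTRL_DISCOVERY && c->type != CTRL_ADMIN;
}

static int check_fabrics_info(const struct ctrl *c)
{
    if (is_io_ctrl(c) && c->ioccsz < 4) {
        fprintf(stderr, "ioccsz %u < 4\n", c->ioccsz);
        return -1;
    }
    if (is_io_ctrl(c) && c->iorcsz < 1) {
        fprintf(stderr, "iorcsz %u < 1\n", c->iorcsz);
        return -1;
    }
    return 0;
}

int main(void)
{
    struct ctrl admin = { .type = CTRL_ADMIN };          /* capsule sizes stay 0 */
    struct ctrl io = { .type = CTRL_IO, .ioccsz = 4, .iorcsz = 1 };

    printf("admin ok: %d, io ok: %d\n",
           check_fabrics_info(&admin) == 0, check_fabrics_info(&io) == 0);
    return 0;
}

An administrative or discovery controller has no I/O queues and may legitimately report zero capsule sizes, which is why the checks are skipped for it.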
Signed-off-by: Kamaljit Singh Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0f20adcebcf1..edfe7a267a4b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3492,14 +3492,14 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct return -EINVAL; } - if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) { + if (nvme_is_io_ctrl(ctrl) && ctrl->ioccsz < 4) { dev_err(ctrl->device, "I/O queue command capsule supported size %d < 4\n", ctrl->ioccsz); return -EINVAL; } - if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) { + if (nvme_is_io_ctrl(ctrl) && ctrl->iorcsz < 1) { dev_err(ctrl->device, "I/O queue response capsule supported size %d < 1\n", ctrl->iorcsz); From 80e653fab6676fdcdfe2710cab5fc4312ac47edd Mon Sep 17 00:00:00 2001 From: Martin George Date: Mon, 22 Sep 2025 17:35:32 +0530 Subject: [PATCH 159/164] nvme-core: use nvme_is_io_ctrl() for I/O controller check Replace the current I/O controller check in nvme_init_non_mdts_limits() with the helper nvme_is_io_ctrl() function to maintain consistency with similar checks in other parts of the code and improve code readability. Signed-off-by: Martin George Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index edfe7a267a4b..a46f0f2d1c37 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3370,7 +3370,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) else ctrl->max_zeroes_sectors = 0; - if (ctrl->subsys->subtype != NVME_NQN_NVME || + if (!nvme_is_io_ctrl(ctrl) || !nvme_id_cns_ok(ctrl, NVME_ID_CNS_CS_CTRL) || test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags)) return 0; From 20015410fbdff3633418484165c694f6ceeb855b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 21 Sep 2025 15:44:05 +0300 Subject: [PATCH 160/164] nvmet: add safety check for subsys lock Replace comment about required lock with a lockdep_assert_held() check in the following functions: - nvmet_p2pmem_ns_add_p2p() - nvmet_setup_p2p_ns_map() - nvmet_release_p2p_ns_map() This ensures the subsystem lock is held at runtime. 
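The resulting pattern is sketched below with a made-up function name; only lockdep_assert_held() itself is the real interface. With CONFIG_LOCKDEP enabled it warns at runtime if the caller does not hold the lock, and on non-lockdep builds it compiles away, so the rule is enforced where a comment previously only documented it:

    static void nvmet_touch_p2p_map(struct nvmet_ctrl *ctrl)
    {
            /* was: "Note: ctrl->subsys->lock should be held when calling this function" */
            lockdep_assert_held(&ctrl->subsys->lock);

            /* ... code that relies on subsys->lock being held ... */
    }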
Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 0dd7bd99afa3..5d7d483bfbe3 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -513,9 +513,6 @@ static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns) return 0; } -/* - * Note: ctrl->subsys->lock should be held when calling this function - */ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl, struct nvmet_ns *ns) { @@ -523,6 +520,8 @@ static void nvmet_p2pmem_ns_add_p2p(struct nvmet_ctrl *ctrl, struct pci_dev *p2p_dev; int ret; + lockdep_assert_held(&ctrl->subsys->lock); + if (!ctrl->p2p_client || !ns->use_p2pmem) return; @@ -1539,15 +1538,14 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) return false; } -/* - * Note: ctrl->subsys->lock should be held when calling this function - */ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, struct device *p2p_client) { struct nvmet_ns *ns; unsigned long idx; + lockdep_assert_held(&ctrl->subsys->lock); + if (!p2p_client) return; @@ -1557,14 +1555,13 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, nvmet_p2pmem_ns_add_p2p(ctrl, ns); } -/* - * Note: ctrl->subsys->lock should be held when calling this function - */ static void nvmet_release_p2p_ns_map(struct nvmet_ctrl *ctrl) { struct radix_tree_iter iter; void __rcu **slot; + lockdep_assert_held(&ctrl->subsys->lock); + radix_tree_for_each_slot(slot, &ctrl->p2p_ns_map, &iter, 0) pci_dev_put(radix_tree_deref_slot(slot)); From 2e482655019ab6fcfe8865b62432c6d03f0b5f80 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Tue, 2 Sep 2025 13:52:11 +1000 Subject: [PATCH 161/164] nvme: Use non zero KATO for persistent discovery connections The NVMe Base Specification 2.1 states that: """ A host requests an explicit persistent connection ... by specifying a non-zero Keep Alive Timer value in the Connect command. """ As such if we are starting a persistent connection to a discovery controller and the KATO is currently 0 we need to update KATO to a non zero value to avoid continuous timeouts on the target. Signed-off-by: Alistair Francis Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a46f0f2d1c37..42c5f05680f1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4991,8 +4991,14 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) * checking that they started once before, hence are reconnecting back. */ if (test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) && - nvme_discovery_ctrl(ctrl)) + nvme_discovery_ctrl(ctrl)) { + if (!ctrl->kato) { + nvme_stop_keep_alive(ctrl); + ctrl->kato = NVME_DEFAULT_KATO; + nvme_start_keep_alive(ctrl); + } nvme_change_uevent(ctrl, "NVME_EVENT=rediscover"); + } if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); From f85e254b51aeadf8dc367aaf2fbd2c20378f75c2 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 23 Sep 2025 09:52:48 -0600 Subject: [PATCH 162/164] ublk: remove redundant zone op check in ublk_setup_iod() ublk_setup_iod() checks first whether the request is a zoned operation issued to a device without zoned support and returns BLK_STS_IOERR if so. 
However, such a request would already hit the default case in the subsequent switch statement and fail the ublk_queue_is_zoned() check, which also results in a return of BLK_STS_IOERR. So remove the redundant early check for unsupported zone ops. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5ab7ff5f03f4..fcc8b3868137 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1105,13 +1105,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) { struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); struct ublk_io *io = &ubq->ios[req->tag]; - enum req_op op = req_op(req); u32 ublk_op; - if (!ublk_queue_is_zoned(ubq) && - (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND)) - return BLK_STS_IOERR; - switch (req_op(req)) { case REQ_OP_READ: ublk_op = UBLK_IO_OP_READ; From 8f4ed0ce4857ceb444174503fc9058720d4faaa1 Mon Sep 17 00:00:00 2001 From: Jaehoon Kim Date: Thu, 25 Sep 2025 17:47:07 +0200 Subject: [PATCH 163/164] s390/dasd: Return BLK_STS_INVAL for EINVAL from do_dasd_request Currently, if CCW request creation fails with -EINVAL, the DASD driver returns BLK_STS_IOERR to the block layer. This can happen, for example, when a user-space application such as QEMU passes a misaligned buffer, but the original cause of the error is masked as a generic I/O error. This patch changes the behavior so that -EINVAL is returned as BLK_STS_INVAL, allowing user space to properly detect alignment issues instead of interpreting them as I/O errors. Reviewed-by: Stefan Haberland Cc: stable@vger.kernel.org #6.11+ Signed-off-by: Jaehoon Kim Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 582ac7e61b24..1d624388075f 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3114,12 +3114,14 @@ static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx, PTR_ERR(cqr) == -ENOMEM || PTR_ERR(cqr) == -EAGAIN) { rc = BLK_STS_RESOURCE; - goto out; + } else if (PTR_ERR(cqr) == -EINVAL) { + rc = BLK_STS_INVAL; + } else { + DBF_DEV_EVENT(DBF_ERR, basedev, + "CCW creation failed (rc=%ld) on request %p", + PTR_ERR(cqr), req); + rc = BLK_STS_IOERR; } - DBF_DEV_EVENT(DBF_ERR, basedev, - "CCW creation failed (rc=%ld) on request %p", - PTR_ERR(cqr), req); - rc = BLK_STS_IOERR; goto out; } /* From 130e6de62107116eba124647116276266be0f84c Mon Sep 17 00:00:00 2001 From: Jaehoon Kim Date: Thu, 25 Sep 2025 17:47:08 +0200 Subject: [PATCH 164/164] s390/dasd: enforce dma_alignment to ensure proper buffer validation The block layer validates buffer alignment using the device's dma_alignment value. If dma_alignment is smaller than logical_block_size (bp_block) - 1, misaligned buffers incorrectly pass validation and propagate to the lower-level driver. This patch adjusts dma_alignment to be at least logical_block_size - 1, ensuring that misaligned buffers are properly rejected at the block layer and do not reach the DASD driver unnecessarily.
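The block layer applies dma_alignment as a bitmask to the user buffer before the request ever reaches the driver; a rough sketch of that test is shown below as a simplified, hypothetical helper, not the exact upstream check, to show why raising the mask to logical_block_size - 1 rejects the misaligned buffers early:

    /*
     * Hypothetical illustration: with lim.dma_alignment = logical_block_size - 1,
     * a buffer that is only 512-byte aligned on a 4096-byte-block DASD now fails
     * this mask test in the block layer instead of reaching the CCW builder.
     */
    static inline bool dasd_user_buffer_aligned(struct request_queue *q,
                                                unsigned long uaddr)
    {
            return (uaddr & queue_dma_alignment(q)) == 0;
    }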
Fixes: 2a07bb64d801 ("s390/dasd: Remove DMA alignment") Reviewed-by: Stefan Haberland Cc: stable@vger.kernel.org #6.11+ Signed-off-by: Jaehoon Kim Signed-off-by: Stefan Haberland Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 1d624388075f..7765e40f7cea 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -334,6 +334,11 @@ static int dasd_state_basic_to_ready(struct dasd_device *device) lim.max_dev_sectors = device->discipline->max_sectors(block); lim.max_hw_sectors = lim.max_dev_sectors; lim.logical_block_size = block->bp_block; + /* + * Adjust dma_alignment to match block_size - 1 + * to ensure proper buffer alignment checks in the block layer. + */ + lim.dma_alignment = lim.logical_block_size - 1; if (device->discipline->has_discard) { unsigned int max_bytes;