From 43bc0aa19a2174d8eec711d9f3bf941a7b81373c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Oct 2025 17:38:41 +0200 Subject: [PATCH 1/5] nvdimm: allow exposing RAM carveouts as NVDIMM DIMM devices There are use cases, for example virtual machine hosts, that create "persistent" memory regions using memmap= option on x86 or dummy pmem-region device tree nodes on DT based systems. Both these options are inflexible because they create static regions and the layout of the "persistent" memory cannot be adjusted without reboot and sometimes they even require firmware update. Add a ramdax driver that allows creation of DIMM devices on top of E820_TYPE_PRAM regions and devicetree pmem-region nodes. The DIMMs support label space management on the "device" and provide a flexible way to access RAM using fsdax and devdax. Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Dan Williams Link: https://patch.msgid.link/20251026153841.752061-2-rppt@kernel.org Signed-off-by: Ira Weiny --- drivers/nvdimm/Kconfig | 19 +++ drivers/nvdimm/Makefile | 1 + drivers/nvdimm/ramdax.c | 282 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 302 insertions(+) create mode 100644 drivers/nvdimm/ramdax.c diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index fde3e17c836c..44ab929a1ad5 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -97,6 +97,25 @@ config OF_PMEM Select Y if unsure. +config RAMDAX + tristate "Support persistent memory interfaces on RAM carveouts" + depends on X86_PMEM_LEGACY || OF || COMPILE_TEST + default LIBNVDIMM + help + Allows creation of DAX devices on RAM carveouts. + + Memory ranges that are manually specified by the + 'memmap=nn[KMG]!ss[KMG]' kernel command line or defined by dummy + pmem-region device tree nodes would be managed by this driver as DIMM + devices with support for dynamic layout of namespaces. + The driver steals 128K in the end of the memmap range for the + namespace management. 
This allows supporting up to 509 namespaces + (see 'ndctl create-namespace --help'). + The driver should be force bound to e820_pmem or pmem-region platform + devices using 'driver_override' device attribute. + + Select N if unsure. + config NVDIMM_KEYS def_bool y depends on ENCRYPTED_KEYS diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index ba0296dca9db..8c268814936c 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_OF_PMEM) += of_pmem.o obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o +obj-$(CONFIG_RAMDAX) += ramdax.o nd_pmem-y := pmem.o diff --git a/drivers/nvdimm/ramdax.c b/drivers/nvdimm/ramdax.c new file mode 100644 index 000000000000..63cf05791829 --- /dev/null +++ b/drivers/nvdimm/ramdax.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Mike Rapoport, Microsoft + * + * Based on e820 pmem driver: + * Copyright (c) 2015, Christoph Hellwig. + * Copyright (c) 2015, Intel Corporation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define LABEL_AREA_SIZE SZ_128K + +struct ramdax_dimm { + struct nvdimm *nvdimm; + void *label_area; +}; + +static void ramdax_remove(struct platform_device *pdev) +{ + struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(nvdimm_bus); +} + +static int ramdax_register_region(struct resource *res, + struct nvdimm *nvdimm, + struct nvdimm_bus *nvdimm_bus) +{ + struct nd_mapping_desc mapping; + struct nd_region_desc ndr_desc; + struct nd_interleave_set *nd_set; + int nid = phys_to_target_node(res->start); + + nd_set = kzalloc(sizeof(*nd_set), GFP_KERNEL); + if (!nd_set) + return -ENOMEM; + + nd_set->cookie1 = 0xcafebeefcafebeef; + nd_set->cookie2 = nd_set->cookie1; + nd_set->altcookie = nd_set->cookie1; + + memset(&mapping, 0, sizeof(mapping)); + mapping.nvdimm = nvdimm; + mapping.start = 0; + mapping.size = resource_size(res) - LABEL_AREA_SIZE; + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = res; + ndr_desc.numa_node = numa_map_to_online_node(nid); + ndr_desc.target_node = nid; + ndr_desc.num_mappings = 1; + ndr_desc.mapping = &mapping; + ndr_desc.nd_set = nd_set; + + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + goto err_free_nd_set; + + return 0; + +err_free_nd_set: + kfree(nd_set); + return -ENXIO; +} + +static int ramdax_register_dimm(struct resource *res, void *data) +{ + resource_size_t start = res->start; + resource_size_t size = resource_size(res); + unsigned long flags = 0, cmd_mask = 0; + struct nvdimm_bus *nvdimm_bus = data; + struct ramdax_dimm *dimm; + int err; + + dimm = kzalloc(sizeof(*dimm), GFP_KERNEL); + if (!dimm) + return -ENOMEM; + + dimm->label_area = memremap(start + size - LABEL_AREA_SIZE, + LABEL_AREA_SIZE, MEMREMAP_WB); + if (!dimm->label_area) { + err = -ENOMEM; + goto err_free_dimm; + } + + set_bit(NDD_LABELING, &flags); + set_bit(NDD_REGISTER_SYNC, &flags); + 
set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask); + set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask); + set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask); + dimm->nvdimm = nvdimm_create(nvdimm_bus, dimm, + /* dimm_attribute_groups */ NULL, + flags, cmd_mask, 0, NULL); + if (!dimm->nvdimm) { + err = -ENOMEM; + goto err_unmap_label; + } + + err = ramdax_register_region(res, dimm->nvdimm, nvdimm_bus); + if (err) + goto err_remove_nvdimm; + + return 0; + +err_remove_nvdimm: + nvdimm_delete(dimm->nvdimm); +err_unmap_label: + memunmap(dimm->label_area); +err_free_dimm: + kfree(dimm); + return err; +} + +static int ramdax_get_config_size(struct nvdimm *nvdimm, int buf_len, + struct nd_cmd_get_config_size *cmd) +{ + if (sizeof(*cmd) > buf_len) + return -EINVAL; + + *cmd = (struct nd_cmd_get_config_size){ + .status = 0, + .config_size = LABEL_AREA_SIZE, + .max_xfer = 8, + }; + + return 0; +} + +static int ramdax_get_config_data(struct nvdimm *nvdimm, int buf_len, + struct nd_cmd_get_config_data_hdr *cmd) +{ + struct ramdax_dimm *dimm = nvdimm_provider_data(nvdimm); + + if (sizeof(*cmd) > buf_len) + return -EINVAL; + if (struct_size(cmd, out_buf, cmd->in_length) > buf_len) + return -EINVAL; + if (cmd->in_offset + cmd->in_length > LABEL_AREA_SIZE) + return -EINVAL; + + memcpy(cmd->out_buf, dimm->label_area + cmd->in_offset, cmd->in_length); + + return 0; +} + +static int ramdax_set_config_data(struct nvdimm *nvdimm, int buf_len, + struct nd_cmd_set_config_hdr *cmd) +{ + struct ramdax_dimm *dimm = nvdimm_provider_data(nvdimm); + + if (sizeof(*cmd) > buf_len) + return -EINVAL; + if (struct_size(cmd, in_buf, cmd->in_length) > buf_len) + return -EINVAL; + if (cmd->in_offset + cmd->in_length > LABEL_AREA_SIZE) + return -EINVAL; + + memcpy(dimm->label_area + cmd->in_offset, cmd->in_buf, cmd->in_length); + + return 0; +} + +static int ramdax_nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd, + void *buf, unsigned int buf_len) +{ + unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm); + + if 
(!test_bit(cmd, &cmd_mask)) + return -ENOTTY; + + switch (cmd) { + case ND_CMD_GET_CONFIG_SIZE: + return ramdax_get_config_size(nvdimm, buf_len, buf); + case ND_CMD_GET_CONFIG_DATA: + return ramdax_get_config_data(nvdimm, buf_len, buf); + case ND_CMD_SET_CONFIG_DATA: + return ramdax_set_config_data(nvdimm, buf_len, buf); + default: + return -ENOTTY; + } +} + +static int ramdax_ctl(struct nvdimm_bus_descriptor *nd_desc, + struct nvdimm *nvdimm, unsigned int cmd, void *buf, + unsigned int buf_len, int *cmd_rc) +{ + /* + * No firmware response to translate, let the transport error + * code take precedence. + */ + *cmd_rc = 0; + + if (!nvdimm) + return -ENOTTY; + return ramdax_nvdimm_ctl(nvdimm, cmd, buf, buf_len); +} + +#ifdef CONFIG_OF +static const struct of_device_id ramdax_of_matches[] = { + { .compatible = "pmem-region", }, + { }, +}; +#endif + +static int ramdax_probe_of(struct platform_device *pdev, + struct nvdimm_bus *bus, struct device_node *np) +{ + int err; + + if (!of_match_node(ramdax_of_matches, np)) + return -ENODEV; + + for (int i = 0; i < pdev->num_resources; i++) { + err = ramdax_register_dimm(&pdev->resource[i], bus); + if (err) + goto err_unregister; + } + + return 0; + +err_unregister: + /* + * FIXME: should we unregister the dimms that were registered + * successfully + */ + return err; +} + +static int ramdax_probe(struct platform_device *pdev) +{ + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &pdev->dev; + struct nvdimm_bus *nvdimm_bus; + struct device_node *np; + int rc = -ENXIO; + + nd_desc.provider_name = "ramdax"; + nd_desc.module = THIS_MODULE; + nd_desc.ndctl = ramdax_ctl; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + + np = dev_of_node(&pdev->dev); + if (np) + rc = ramdax_probe_of(pdev, nvdimm_bus, np); + else + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, nvdimm_bus, + ramdax_register_dimm); + if (rc) + goto err; + + 
platform_set_drvdata(pdev, nvdimm_bus); + + return 0; +err: + nvdimm_bus_unregister(nvdimm_bus); + return rc; +} + +static struct platform_driver ramdax_driver = { + .probe = ramdax_probe, + .remove = ramdax_remove, + .driver = { + .name = "ramdax", + }, +}; + +module_platform_driver(ramdax_driver); + +MODULE_DESCRIPTION("NVDIMM support for e820 type-12 memory and OF pmem-region"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Microsoft Corporation"); From f59b701b4674f7955170b54c4167c5590f4714eb Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 31 Oct 2025 16:42:20 -0700 Subject: [PATCH 2/5] tools/testing/nvdimm: Use per-DIMM device handle KASAN reports a global-out-of-bounds access when running these nfit tests: clear.sh, pmem-errors.sh, pfn-meta-errors.sh, btt-errors.sh, daxdev-errors.sh, and inject-error.sh. [] BUG: KASAN: global-out-of-bounds in nfit_test_ctl+0x769f/0x7840 [nfit_test] [] Read of size 4 at addr ffffffffc03ea01c by task ndctl/1215 [] The buggy address belongs to the variable: [] handle+0x1c/0x1df4 [nfit_test] nfit_test_search_spa() uses handle[nvdimm->id] to retrieve a device handle and triggers a KASAN error when it reads past the end of the handle array. It should not be indexing the handle array at all. The correct device handle is stored in per-DIMM test data. Each DIMM has a struct nfit_mem that embeds a struct acpi_nfit_memdev that describes the NFIT device handle. Use that device handle here. 
Fixes: 10246dc84dfc ("acpi nfit: nfit_test supports translate SPA") Cc: stable@vger.kernel.org Signed-off-by: Alison Schofield Reviewed-by: Dave Jiang --- Link: https://patch.msgid.link/20251031234227.1303113-1-alison.schofield@intel.com Signed-off-by: Ira Weiny --- tools/testing/nvdimm/test/nfit.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index cfd4378e2129..f87e9f251d13 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -670,6 +670,7 @@ static int nfit_test_search_spa(struct nvdimm_bus *bus, .addr = spa->spa, .region = NULL, }; + struct nfit_mem *nfit_mem; u64 dpa; ret = device_for_each_child(&bus->dev, &ctx, @@ -687,8 +688,12 @@ static int nfit_test_search_spa(struct nvdimm_bus *bus, */ nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1]; nvdimm = nd_mapping->nvdimm; + nfit_mem = nvdimm_provider_data(nvdimm); + if (!nfit_mem) + return -EINVAL; - spa->devices[0].nfit_device_handle = handle[nvdimm->id]; + spa->devices[0].nfit_device_handle = + __to_nfit_memdev(nfit_mem)->device_handle; spa->num_nvdimms = 1; spa->devices[0].dpa = dpa; From 7e898a9a992293eea8df11c8bb6fe120df6f8e6f Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Wed, 5 Nov 2025 16:08:26 +0100 Subject: [PATCH 3/5] nvdimm: replace use of system_wq with system_percpu_wq Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API.
This patch continues the effort to refactor workqueue APIs, which began with the change introducing new workqueues and a new alloc_workqueue flag: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") Replace system_wq with system_percpu_wq, keeping the same old behavior. The old wq (system_wq) will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Reviewed-by: Dave Jiang --- Link: https://patch.msgid.link/20251105150826.248673-1-marco.crivellari@suse.com Signed-off-by: Ira Weiny --- drivers/nvdimm/security.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/security.c b/drivers/nvdimm/security.c index 4adce8c38870..e41f6951ca0f 100644 --- a/drivers/nvdimm/security.c +++ b/drivers/nvdimm/security.c @@ -424,7 +424,7 @@ static int security_overwrite(struct nvdimm *nvdimm, unsigned int keyid) * query. */ get_device(dev); - queue_delayed_work(system_wq, &nvdimm->dwork, 0); + queue_delayed_work(system_percpu_wq, &nvdimm->dwork, 0); } return rc; @@ -457,7 +457,7 @@ static void __nvdimm_security_overwrite_query(struct nvdimm *nvdimm) /* setup delayed work again */ tmo += 10; - queue_delayed_work(system_wq, &nvdimm->dwork, tmo * HZ); + queue_delayed_work(system_percpu_wq, &nvdimm->dwork, tmo * HZ); nvdimm->sec.overwrite_tmo = min(15U * 60U, tmo); return; } From acd9ea1714bbe910753bf6f3ce0e861a7fed6b56 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 5 Nov 2025 19:47:08 +0700 Subject: [PATCH 4/5] Documentation: btt: Unwrap bit 31-30 nested table Bit 31-30 usage table is already formatted as reST simple table, but it is wrapped in literal code block instead. Unwrap it.
Signed-off-by: Bagas Sanjaya Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20251105124707.44736-2-bagasdotme@gmail.com Signed-off-by: Ira Weiny --- Documentation/driver-api/nvdimm/btt.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/nvdimm/btt.rst b/Documentation/driver-api/nvdimm/btt.rst index 107395c042ae..2d8269f834bd 100644 --- a/Documentation/driver-api/nvdimm/btt.rst +++ b/Documentation/driver-api/nvdimm/btt.rst @@ -83,7 +83,7 @@ flags, and the remaining form the internal block number. ======== ============================================================= Bit Description ======== ============================================================= -31 - 30 Error and Zero flags - Used in the following way:: +31 - 30 Error and Zero flags - Used in the following way: == == ==================================================== 31 30 Description From 30065e73d7c018cf2e1bec68e2d6ffafc17b3c25 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 26 Nov 2025 15:11:53 +0300 Subject: [PATCH 5/5] nvdimm: Prevent integer overflow in ramdax_get_config_data() The "cmd->in_offset" variable comes from the user via the __nd_ioctl() function. The problem is that the "cmd->in_offset + cmd->in_length" addition could have an integer wrapping issue if cmd->in_offset is close to UINT_MAX . Both "cmd->in_offset" and "cmd->in_length" are u32 variables. 
Fixes: 43bc0aa19a21 ("nvdimm: allow exposing RAM carveouts as NVDIMM DIMM devices") Signed-off-by: Dan Carpenter Acked-by: Mike Rapoport (Microsoft) Link: https://patch.msgid.link/aSbuiYCznEIZDa02@stanley.mountain Signed-off-by: Ira Weiny --- drivers/nvdimm/ramdax.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/ramdax.c b/drivers/nvdimm/ramdax.c index 63cf05791829..954cb7919807 100644 --- a/drivers/nvdimm/ramdax.c +++ b/drivers/nvdimm/ramdax.c @@ -143,7 +143,7 @@ static int ramdax_get_config_data(struct nvdimm *nvdimm, int buf_len, return -EINVAL; if (struct_size(cmd, out_buf, cmd->in_length) > buf_len) return -EINVAL; - if (cmd->in_offset + cmd->in_length > LABEL_AREA_SIZE) + if (size_add(cmd->in_offset, cmd->in_length) > LABEL_AREA_SIZE) return -EINVAL; memcpy(cmd->out_buf, dimm->label_area + cmd->in_offset, cmd->in_length); @@ -160,7 +160,7 @@ static int ramdax_set_config_data(struct nvdimm *nvdimm, int buf_len, return -EINVAL; if (struct_size(cmd, in_buf, cmd->in_length) > buf_len) return -EINVAL; - if (cmd->in_offset + cmd->in_length > LABEL_AREA_SIZE) + if (size_add(cmd->in_offset, cmd->in_length) > LABEL_AREA_SIZE) return -EINVAL; memcpy(dimm->label_area + cmd->in_offset, cmd->in_buf, cmd->in_length);