From fa9681ed5c6ae980af18545690e31d8f9c088c33 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Tue, 3 Mar 2026 01:08:57 +0000 Subject: [PATCH 01/24] RISC-V: KVM: Validate SBI STA shmem alignment in kvm_sbi_ext_sta_set_reg() The RISC-V SBI Steal-Time Accounting (STA) extension requires the shared memory physical address to be 64-byte aligned, or set to all-ones to explicitly disable steal-time accounting. KVM exposes the SBI STA shared memory configuration to userspace via KVM_SET_ONE_REG. However, the current implementation of kvm_sbi_ext_sta_set_reg() does not validate the alignment of the configured shared memory address. As a result, userspace can install a misaligned shared memory address that violates the SBI specification. Such an invalid configuration may later reach runtime code paths that assume a valid and properly aligned shared memory region. In particular, KVM_RUN can trigger the following WARN_ON in kvm_riscv_vcpu_record_steal_time(): WARNING: arch/riscv/kvm/vcpu_sbi_sta.c:49 at kvm_riscv_vcpu_record_steal_time WARN_ON paths are not expected to be reachable during normal runtime execution, and may result in a kernel panic when panic_on_warn is enabled. Fix this by validating the computed shared memory GPA at the KVM_SET_ONE_REG boundary. A temporary GPA is constructed and checked before committing it to vcpu->arch.sta.shmem. The validation allows either a 64-byte aligned GPA or INVALID_GPA (all-ones), which disables STA as defined by the SBI specification. This prevents invalid userspace state from reaching runtime code paths that assume SBI STA invariants and avoids unexpected WARN_ON behavior. Fixes: f61ce890b1f074 ("RISC-V: KVM: Add support for SBI STA registers") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20260303010859.1763177-2-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_sta.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_sta.c b/arch/riscv/kvm/vcpu_sbi_sta.c index afa0545c3bcf..3b834709b429 100644 --- a/arch/riscv/kvm/vcpu_sbi_sta.c +++ b/arch/riscv/kvm/vcpu_sbi_sta.c @@ -181,6 +181,7 @@ static int kvm_sbi_ext_sta_set_reg(struct kvm_vcpu *vcpu, unsigned long reg_num, unsigned long reg_size, const void *reg_val) { unsigned long value; + gpa_t new_shmem = INVALID_GPA; if (reg_size != sizeof(unsigned long)) return -EINVAL; @@ -191,18 +192,18 @@ static int kvm_sbi_ext_sta_set_reg(struct kvm_vcpu *vcpu, unsigned long reg_num, if (IS_ENABLED(CONFIG_32BIT)) { gpa_t hi = upper_32_bits(vcpu->arch.sta.shmem); - vcpu->arch.sta.shmem = value; - vcpu->arch.sta.shmem |= hi << 32; + new_shmem = value; + new_shmem |= hi << 32; } else { - vcpu->arch.sta.shmem = value; + new_shmem = value; } break; case KVM_REG_RISCV_SBI_STA_REG(shmem_hi): if (IS_ENABLED(CONFIG_32BIT)) { gpa_t lo = lower_32_bits(vcpu->arch.sta.shmem); - vcpu->arch.sta.shmem = ((gpa_t)value << 32); - vcpu->arch.sta.shmem |= lo; + new_shmem = ((gpa_t)value << 32); + new_shmem |= lo; } else if (value != 0) { return -EINVAL; } @@ -211,6 +212,11 @@ static int kvm_sbi_ext_sta_set_reg(struct kvm_vcpu *vcpu, unsigned long reg_num, return -ENOENT; } + if (new_shmem != INVALID_GPA && !IS_ALIGNED(new_shmem, 64)) + return -EINVAL; + + vcpu->arch.sta.shmem = new_shmem; + return 0; } From 40351ed924dd30ded1b43c7333ce695a4a835f7b Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Tue, 3 Mar 2026 01:08:58 +0000 Subject: [PATCH 02/24] KVM: selftests: Refactor UAPI tests into dedicated function Move steal time UAPI tests from steal_time_init() into a separate check_steal_time_uapi() function for better code organization and maintainability. Previously, x86 and ARM64 architectures performed UAPI validation tests within steal_time_init(), mixing initialization logic with uapi tests. Changes by architecture: x86_64: - Extract MSR reserved bits test from steal_time_init() - Move to check_steal_time_uapi() which tests that setting MSR_KVM_STEAL_TIME with KVM_STEAL_RESERVED_MASK fails ARM64: - Extract three UAPI tests from steal_time_init(): Device attribute support check Misaligned IPA rejection (EINVAL) Duplicate IPA setting rejection (EEXIST) - Move all tests to check_steal_time_uapi() RISC-V: - Add empty check_steal_time_uapi() stub for future use - No changes to steal_time_init() (had no tests to extract) The new check_steal_time_uapi() function: - Is called once before the per-VCPU test loop No functional change intended. Suggested-by: Andrew Jones Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20260303010859.1763177-3-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/steal_time.c | 67 ++++++++++++++++++------ 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 7be8adfe5dd3..8e4d7c13b598 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -69,16 +69,10 @@ static bool is_steal_time_supported(struct kvm_vcpu *vcpu) static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) { - int ret; - /* ST_GPA_BASE is identity mapped */ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); sync_global_to_guest(vcpu->vm, st_gva[i]); - ret = _vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, - (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK); - TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); - vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED); } @@ -99,6 +93,21 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) st->pad[8], st->pad[9], st->pad[10]); } +static void check_steal_time_uapi(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + int ret; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + ret = _vcpu_set_msr(vcpu, MSR_KVM_STEAL_TIME, + (ulong)ST_GPA_BASE | KVM_STEAL_RESERVED_MASK); + TEST_ASSERT(ret == 0, "Bad GPA didn't fail"); + + kvm_vm_free(vm); +} + #elif defined(__aarch64__) /* PV_TIME_ST must have 64-byte alignment */ @@ -170,7 +179,6 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) { struct kvm_vm *vm = vcpu->vm; uint64_t st_ipa; - int ret; struct kvm_device_attr dev = { .group = KVM_ARM_VCPU_PVTIME_CTRL, @@ -178,21 +186,12 @@ static void steal_time_init(struct kvm_vcpu *vcpu, uint32_t i) .addr = (uint64_t)&st_ipa, }; - vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); - /* ST_GPA_BASE is identity mapped */ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE); sync_global_to_guest(vm, st_gva[i]); - st_ipa = (ulong)st_gva[i] | 1; - ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); - TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); - st_ipa = (ulong)st_gva[i]; vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); - - ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); - TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); } static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) @@ -205,6 +204,36 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) ksft_print_msg(" st_time: %ld\n", st->st_time); } +static void check_steal_time_uapi(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint64_t st_ipa; + int ret; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + struct kvm_device_attr dev = { + .group = KVM_ARM_VCPU_PVTIME_CTRL, + .attr = KVM_ARM_VCPU_PVTIME_IPA, + .addr = (uint64_t)&st_ipa, + }; + + vcpu_ioctl(vcpu, KVM_HAS_DEVICE_ATTR, &dev); + + st_ipa = (ulong)ST_GPA_BASE | 1; + ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL"); + + st_ipa = (ulong)ST_GPA_BASE; + vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); + + ret = __vcpu_ioctl(vcpu, KVM_SET_DEVICE_ATTR, &dev); + TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST"); + + kvm_vm_free(vm); +} + #elif defined(__riscv) /* SBI STA shmem must have 64-byte alignment */ @@ -301,6 +330,10 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) pr_info("\n"); } +static void check_steal_time_uapi(void) +{ +} + #elif defined(__loongarch__) /* steal_time must have 64-byte alignment */ @@ -465,6 +498,8 @@ int main(int ac, char **av) TEST_REQUIRE(is_steal_time_supported(vcpus[0])); ksft_set_plan(NR_VCPUS); + check_steal_time_uapi(); + /* Run test on each VCPU */ for (i = 0; i < NR_VCPUS; ++i) { steal_time_init(vcpus[i], i); From 7c61e7433b49ca948dc8cc2b70a20b3dbc36363d Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Tue, 3 Mar 2026 01:08:59 +0000 Subject: [PATCH 03/24] RISC-V: KVM: selftests: Add RISC-V SBI STA shmem alignment tests Add RISC-V KVM selftests to verify the SBI Steal-Time Accounting (STA) shared memory alignment requirements. The SBI specification requires the STA shared memory GPA to be 64-byte aligned, or set to all-ones to explicitly disable steal-time accounting. This test verifies that KVM enforces the expected behavior when configuring the SBI STA shared memory via KVM_SET_ONE_REG. Specifically, the test checks that: - misaligned GPAs are rejected with -EINVAL - 64-byte aligned GPAs are accepted - all-ones GPA is accepted Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20260303010859.1763177-4-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- .../selftests/kvm/include/kvm_util_types.h | 2 ++ tools/testing/selftests/kvm/steal_time.c | 31 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util_types.h b/tools/testing/selftests/kvm/include/kvm_util_types.h index ec787b97cf18..0366e9bce7f9 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_types.h +++ b/tools/testing/selftests/kvm/include/kvm_util_types.h @@ -17,4 +17,6 @@ typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ +#define INVALID_GPA (~(uint64_t)0) + #endif /* SELFTEST_KVM_UTIL_TYPES_H */ diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c index 8e4d7c13b598..efe56a10d13e 100644 --- a/tools/testing/selftests/kvm/steal_time.c +++ b/tools/testing/selftests/kvm/steal_time.c @@ -332,6 +332,37 @@ static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpu_idx) static void check_steal_time_uapi(void) { + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct kvm_one_reg reg; + uint64_t shmem; + int ret; + + vm = vm_create_with_one_vcpu(&vcpu, NULL); + + reg.id = KVM_REG_RISCV | + KVM_REG_SIZE_ULONG | + KVM_REG_RISCV_SBI_STATE | + KVM_REG_RISCV_SBI_STA | + KVM_REG_RISCV_SBI_STA_REG(shmem_lo); + reg.addr = (uint64_t)&shmem; + + shmem = ST_GPA_BASE + 1; + ret = __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "misaligned STA shmem returns -EINVAL"); + + shmem = ST_GPA_BASE; + ret = __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); + TEST_ASSERT(ret == 0, + "aligned STA shmem succeeds"); + + shmem = INVALID_GPA; + ret = __vcpu_ioctl(vcpu, KVM_SET_ONE_REG, ®); + TEST_ASSERT(ret == 0, + "all-ones for STA shmem succeeds"); + + kvm_vm_free(vm); } #elif defined(__loongarch__) From aa35bcf2e76234fef7bbca9bf364039692a27661 Mon Sep 17 00:00:00 2001 From: Osama Abdelkader Date: Thu, 12 Mar 2026 00:18:32 +0100 Subject: [PATCH 04/24] RISC-V: KVM: fix PMU snapshot_set_shmem on 32-bit hosts When saddr_high != 0 on RV32, the goto out was unconditional, causing valid 64-bit addresses to be rejected. Only goto out when the address is invalid (64-bit host with saddr_high != 0). Fixes: c2f41ddbcdd7 ("RISC-V: KVM: Implement SBI PMU Snapshot feature") Signed-off-by: Osama Abdelkader Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20260311231833.13189-1-osama.abdelkader@gmail.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_pmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index e873430e596b..f3bf985dcf43 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -427,11 +427,12 @@ int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long s saddr = saddr_low; if (saddr_high != 0) { - if (IS_ENABLED(CONFIG_32BIT)) + if (IS_ENABLED(CONFIG_32BIT)) { saddr |= ((gpa_t)saddr_high << 32); - else + } else { sbiret = SBI_ERR_INVALID_ADDRESS; - goto out; + goto out; + } } kvpmu->sdata = kzalloc(snapshot_area_size, GFP_ATOMIC); From b7c958d7c1eb1cb9b2be7b5ee4129fcd66cec978 Mon Sep 17 00:00:00 2001 From: Osama Abdelkader Date: Mon, 16 Mar 2026 16:16:11 +0100 Subject: [PATCH 05/24] riscv: kvm: fix vector context allocation leak When the second kzalloc (host_context.vector.datap) fails in kvm_riscv_vcpu_alloc_vector_context, the first allocation (guest_context.vector.datap) is leaked. Free it before returning. Fixes: 0f4b82579716 ("riscv: KVM: Add vector lazy save/restore support") Cc: stable@vger.kernel.org Signed-off-by: Osama Abdelkader Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20260316151612.13305-1-osama.abdelkader@gmail.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_vector.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c index 05f3cc2d8e31..5b6ad82d47be 100644 --- a/arch/riscv/kvm/vcpu_vector.c +++ b/arch/riscv/kvm/vcpu_vector.c @@ -80,8 +80,11 @@ int kvm_riscv_vcpu_alloc_vector_context(struct kvm_vcpu *vcpu) return -ENOMEM; vcpu->arch.host_context.vector.datap = kzalloc(riscv_v_vsize, GFP_KERNEL); - if (!vcpu->arch.host_context.vector.datap) + if (!vcpu->arch.host_context.vector.datap) { + kfree(vcpu->arch.guest_context.vector.datap); + vcpu->arch.guest_context.vector.datap = NULL; return -ENOMEM; + } return 0; } From 99594f75b49edc7046057bca06d892c16967a9b3 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Mon, 16 Mar 2026 01:45:32 +0000 Subject: [PATCH 06/24] RISC-V: KVM: Fix array out-of-bounds in pmu_ctr_read() and pmu_fw_ctr_read_hi() When a guest invokes SBI_EXT_PMU_COUNTER_FW_READ or SBI_EXT_PMU_COUNTER_FW_READ_HI on a firmware counter that has not been configured via SBI_EXT_PMU_COUNTER_CFG_MATCH, the pmc->event_idx remains SBI_PMU_EVENT_IDX_INVALID (0xFFFFFFFF). get_event_code() extracts the lower 16 bits, yielding 0xFFFF (65535), which is then used to index into kvpmu->fw_event[]. Since fw_event is only RISCV_KVM_MAX_FW_CTRS (32) entries, this triggers an array-index-out-of-bounds: UBSAN: array-index-out-of-bounds in arch/riscv/kvm/vcpu_pmu.c:255:37 index 65535 is out of range for type 'kvm_fw_event [32]' Add a check for the known unconfigured case (SBI_PMU_EVENT_IDX_INVALID) and a WARN_ONCE guard for any unexpected out-of-bounds event codes, returning -EINVAL in both cases. Fixes: badc386869e2c ("RISC-V: KVM: Support firmware events") Fixes: 08fb07d6dcf71 ("RISC-V: KVM: Support 64 bit firmware counters on RV32") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20260316014533.2312254-2-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_pmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index f3bf985dcf43..9e9f3302cef8 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -226,7 +226,14 @@ static int pmu_fw_ctr_read_hi(struct kvm_vcpu *vcpu, unsigned long cidx, if (pmc->cinfo.type != SBI_PMU_CTR_TYPE_FW) return -EINVAL; + if (pmc->event_idx == SBI_PMU_EVENT_IDX_INVALID) + return -EINVAL; + fevent_code = get_event_code(pmc->event_idx); + if (WARN_ONCE(fevent_code >= SBI_PMU_FW_MAX, + "Invalid firmware event code: %d\n", fevent_code)) + return -EINVAL; + pmc->counter_val = kvpmu->fw_event[fevent_code].value; *out_val = pmc->counter_val >> 32; @@ -251,7 +258,14 @@ static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, pmc = &kvpmu->pmc[cidx]; if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) { + if (pmc->event_idx == SBI_PMU_EVENT_IDX_INVALID) + return -EINVAL; + fevent_code = get_event_code(pmc->event_idx); + if (WARN_ONCE(fevent_code >= SBI_PMU_FW_MAX, + "Invalid firmware event code: %d\n", fevent_code)) + return -EINVAL; + pmc->counter_val = kvpmu->fw_event[fevent_code].value; } else if (pmc->perf_event) { pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running); From 198c7ce9801abd63c44176c3f034577b887c0070 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Mon, 16 Mar 2026 01:45:33 +0000 Subject: [PATCH 07/24] RISC-V: KVM: selftests: Fix firmware counter read in sbi_pmu_test The current sbi_pmu_test attempts to read firmware counters without configuring them first with SBI_EXT_PMU_COUNTER_CFG_MATCH. Previously this did not fail because KVM incorrectly allowed the read and accessed fw_event[] with an out-of-bounds index when the counter was unconfigured. After fixing that bug, the read now correctly returns SBI_ERR_INVALID_PARAM, causing the selftest to fail. Update the test to configure a firmware event before reading the counter. Also add a negative test to ensure that attempting to read an unconfigured firmware counter fails gracefully. Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Andrew Jones Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20260316014533.2312254-3-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- .../testing/selftests/kvm/include/riscv/sbi.h | 37 +++++++++++++++++++ .../selftests/kvm/riscv/sbi_pmu_test.c | 20 +++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/include/riscv/sbi.h b/tools/testing/selftests/kvm/include/riscv/sbi.h index 046b432ae896..16f1815ac48f 100644 --- a/tools/testing/selftests/kvm/include/riscv/sbi.h +++ b/tools/testing/selftests/kvm/include/riscv/sbi.h @@ -97,6 +97,43 @@ enum sbi_pmu_hw_generic_events_t { SBI_PMU_HW_GENERAL_MAX, }; +enum sbi_pmu_fw_generic_events_t { + SBI_PMU_FW_MISALIGNED_LOAD = 0, + SBI_PMU_FW_MISALIGNED_STORE = 1, + SBI_PMU_FW_ACCESS_LOAD = 2, + SBI_PMU_FW_ACCESS_STORE = 3, + SBI_PMU_FW_ILLEGAL_INSN = 4, + SBI_PMU_FW_SET_TIMER = 5, + SBI_PMU_FW_IPI_SENT = 6, + SBI_PMU_FW_IPI_RCVD = 7, + SBI_PMU_FW_FENCE_I_SENT = 8, + SBI_PMU_FW_FENCE_I_RCVD = 9, + SBI_PMU_FW_SFENCE_VMA_SENT = 10, + SBI_PMU_FW_SFENCE_VMA_RCVD = 11, + SBI_PMU_FW_SFENCE_VMA_ASID_SENT = 12, + SBI_PMU_FW_SFENCE_VMA_ASID_RCVD = 13, + + SBI_PMU_FW_HFENCE_GVMA_SENT = 14, + SBI_PMU_FW_HFENCE_GVMA_RCVD = 15, + SBI_PMU_FW_HFENCE_GVMA_VMID_SENT = 16, + SBI_PMU_FW_HFENCE_GVMA_VMID_RCVD = 17, + + SBI_PMU_FW_HFENCE_VVMA_SENT = 18, + SBI_PMU_FW_HFENCE_VVMA_RCVD = 19, + SBI_PMU_FW_HFENCE_VVMA_ASID_SENT = 20, + SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD = 21, + SBI_PMU_FW_MAX, +}; + +/* SBI PMU event types */ +enum sbi_pmu_event_type { + SBI_PMU_EVENT_TYPE_HW = 0x0, + SBI_PMU_EVENT_TYPE_CACHE = 0x1, + SBI_PMU_EVENT_TYPE_RAW = 0x2, + SBI_PMU_EVENT_TYPE_RAW_V2 = 0x3, + SBI_PMU_EVENT_TYPE_FW = 0xf, +}; + /* SBI PMU counter types */ enum sbi_pmu_ctr_type { SBI_PMU_CTR_TYPE_HW = 0x0, diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index 924a335d2262..cec1621ace23 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -436,6 +436,7 @@ static void test_pmu_basic_sanity(void) struct sbiret ret; int num_counters = 0, i; union sbi_pmu_ctr_info ctrinfo; + unsigned long fw_eidx; probe = guest_sbi_probe_extension(SBI_EXT_PMU, &out_val); GUEST_ASSERT(probe && out_val == 1); @@ -461,7 +462,24 @@ static void test_pmu_basic_sanity(void) pmu_csr_read_num(ctrinfo.csr); GUEST_ASSERT(illegal_handler_invoked); } else if (ctrinfo.type == SBI_PMU_CTR_TYPE_FW) { - read_fw_counter(i, ctrinfo); + /* Read without configure should fail */ + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ, + i, 0, 0, 0, 0, 0); + GUEST_ASSERT(ret.error == SBI_ERR_INVALID_PARAM); + + /* + * Try to configure with a common firmware event. + * If configuration succeeds, verify we can read it. + */ + fw_eidx = ((unsigned long)SBI_PMU_EVENT_TYPE_FW << 16) | + SBI_PMU_FW_ACCESS_LOAD; + + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, + i, 1, 0, fw_eidx, 0, 0); + if (ret.error == 0) { + GUEST_ASSERT(ret.value == i); + read_fw_counter(i, ctrinfo); + } } } From 9b33ab1e8c189bd0aced0d07545b0f31a459d40e Mon Sep 17 00:00:00 2001 From: Yufeng Wang Date: Tue, 17 Mar 2026 19:47:59 +0800 Subject: [PATCH 08/24] riscv: kvm: add null pointer check for vector datap Add WARN_ON check before accessing cntx->vector.datap in kvm_riscv_vcpu_vreg_addr() to detect potential null pointer dereferences early, consistent with the pattern used in kvm_riscv_vcpu_vector_reset(). This helps catch initialization issues where vector context allocation may have failed. Signed-off-by: Yufeng Wang Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20260317114759.53165-1-r4o5m6e8o@163.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_vector.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c index 5b6ad82d47be..f3f5fb665cf6 100644 --- a/arch/riscv/kvm/vcpu_vector.c +++ b/arch/riscv/kvm/vcpu_vector.c @@ -130,6 +130,7 @@ static int kvm_riscv_vcpu_vreg_addr(struct kvm_vcpu *vcpu, } else if (reg_num <= KVM_REG_RISCV_VECTOR_REG(31)) { if (reg_size != vlenb) return -EINVAL; + WARN_ON(!cntx->vector.datap); *reg_addr = cntx->vector.datap + (reg_num - KVM_REG_RISCV_VECTOR_REG(0)) * vlenb; } else { From 310e2096b082ae0010e7afef666073b966ac2fa7 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Wed, 18 Mar 2026 09:29:56 +0000 Subject: [PATCH 09/24] RISC-V: KVM: Fix double-free of sdata in kvm_pmu_clear_snapshot_area() In kvm_riscv_vcpu_pmu_snapshot_set_shmem(), when kvm_vcpu_write_guest() fails, kvpmu->sdata is freed but not set to NULL. This leaves a dangling pointer that will be freed again when kvm_pmu_clear_snapshot_area() is called during vcpu teardown, triggering a KASAN double-free report. First free occurs in kvm_riscv_vcpu_pmu_snapshot_set_shmem(): kvm_riscv_vcpu_pmu_snapshot_set_shmem arch/riscv/kvm/vcpu_pmu.c:443 kvm_sbi_ext_pmu_handler arch/riscv/kvm/vcpu_sbi_pmu.c:74 kvm_riscv_vcpu_sbi_ecall arch/riscv/kvm/vcpu_sbi.c:608 kvm_riscv_vcpu_exit arch/riscv/kvm/vcpu_exit.c:240 kvm_arch_vcpu_ioctl_run arch/riscv/kvm/vcpu.c:1008 kvm_vcpu_ioctl virt/kvm/kvm_main.c:4476 Second free (double-free) occurs in kvm_pmu_clear_snapshot_area(): kvm_pmu_clear_snapshot_area arch/riscv/kvm/vcpu_pmu.c:403 [inline] kvm_riscv_vcpu_pmu_deinit.part arch/riscv/kvm/vcpu_pmu.c:905 kvm_riscv_vcpu_pmu_deinit arch/riscv/kvm/vcpu_pmu.c:893 kvm_arch_vcpu_destroy arch/riscv/kvm/vcpu.c:199 kvm_vcpu_destroy virt/kvm/kvm_main.c:469 [inline] kvm_destroy_vcpus virt/kvm/kvm_main.c:489 kvm_arch_destroy_vm arch/riscv/kvm/vm.c:54 kvm_destroy_vm virt/kvm/kvm_main.c:1301 [inline] kvm_put_kvm virt/kvm/kvm_main.c:1338 kvm_vm_release virt/kvm/kvm_main.c:1361 Fix it by setting kvpmu->sdata to NULL after kfree() in kvm_riscv_vcpu_pmu_snapshot_set_shmem(), so that the subsequent kfree(NULL) in kvm_pmu_clear_snapshot_area() becomes a safe no-op. This bug was found by fuzzing the KVM RISC-V PMU interface. Fixes: c2f41ddbcdd756 ("RISC-V: KVM: Implement SBI PMU Snapshot feature") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Nutty Liu Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20260318092956.708246-1-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_pmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index 9e9f3302cef8..bb6380ec7fc4 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -456,6 +456,7 @@ int kvm_riscv_vcpu_pmu_snapshot_set_shmem(struct kvm_vcpu *vcpu, unsigned long s /* No need to check writable slot explicitly as kvm_vcpu_write_guest does it internally */ if (kvm_vcpu_write_guest(vcpu, saddr, kvpmu->sdata, snapshot_area_size)) { kfree(kvpmu->sdata); + kvpmu->sdata = NULL; sbiret = SBI_ERR_INVALID_ADDRESS; goto out; } From 1762ac42eed653557d2feb9e37f45995ac238ce6 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Thu, 19 Mar 2026 03:59:02 +0000 Subject: [PATCH 10/24] RISC-V: KVM: Fix integer overflow in kvm_pmu_validate_counter_mask() When a guest initiates an SBI_EXT_PMU_COUNTER_CFG_MATCH call with ctr_base=0xfffffffffffffffe, ctr_mask=0xeb5f and flags=0x1 (SBI_PMU_CFG_FLAG_SKIP_MATCH), kvm_riscv_vcpu_pmu_ctr_cfg_match() first invokes kvm_pmu_validate_counter_mask() to verify whether ctr_base and ctr_mask are valid, by evaluating: !ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu)) With the above inputs, __fls(0xeb5f) equals 15, and adding 15 to 0xfffffffffffffffe causes an integer overflow, wrapping around to 13. Since 13 is less than kvm_pmu_num_counters(), the validation wrongly succeeds. Thereafter, since flags & SBI_PMU_CFG_FLAG_SKIP_MATCH is satisfied, the code evaluates: !test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use) Here __ffs(0xeb5f) equals 0, so test_bit() receives 0xfffffffffffffffe as the bit index and attempts to access the corresponding element of the kvpmu->pmc_in_use, which results in an invalid memory access. This triggers the following Oops: Unable to handle kernel paging request at virtual address e3ebffff12abba89 generic_test_bit include/asm-generic/bitops/generic-non-atomic.h:128 kvm_riscv_vcpu_pmu_ctr_cfg_match arch/riscv/kvm/vcpu_pmu.c:758 kvm_sbi_ext_pmu_handler arch/riscv/kvm/vcpu_sbi_pmu.c:49 kvm_riscv_vcpu_sbi_ecall arch/riscv/kvm/vcpu_sbi.c:608 kvm_riscv_vcpu_exit arch/riscv/kvm/vcpu_exit.c:240 The root cause is that kvm_pmu_validate_counter_mask() does not account for the case where ctr_base itself is out of range, allowing the subsequent addition to silently overflow and bypass the check. Fix this by explicitly validating ctr_base against kvm_pmu_num_counters() before performing the addition. This bug was found by fuzzing the KVM RISC-V PMU interface. Fixes: 0cb74b65d2e5e6 ("RISC-V: KVM: Implement perf support without sampling") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Nutty Liu Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20260319035902.924661-1-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_pmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index bb6380ec7fc4..49427094a079 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -280,8 +280,10 @@ static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx, static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base, unsigned long ctr_mask) { - /* Make sure the we have a valid counter mask requested from the caller */ - if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu))) + unsigned long num_ctrs = kvm_pmu_num_counters(kvpmu); + + /* Make sure we have a valid counter mask requested from the caller */ + if (!ctr_mask || ctr_base >= num_ctrs || (ctr_base + __fls(ctr_mask) >= num_ctrs)) return -EINVAL; return 0; From a216e24fc947573bfbd56471bd7c1f1d8c7a2b19 Mon Sep 17 00:00:00 2001 From: Wang Yechao Date: Mon, 30 Mar 2026 16:10:52 +0800 Subject: [PATCH 11/24] RISC-V: KVM: Fix lost write protection on huge pages during dirty logging When enabling dirty log in small chunks (e.g., QEMU default chunk size of 256K), the chunk size is always smaller than the page size of huge pages (1G or 2M) used in the gstage page tables. This caused the write protection to be incorrectly skipped for huge PTEs because the condition `(end - addr) >= page_size` was not satisfied. Remove the size check in `kvm_riscv_gstage_wp_range()` to ensure huge PTEs are always write-protected regardless of the chunk size. Additionally, explicitly align the address down to the page size before invoking `kvm_riscv_gstage_op_pte()` to guarantee that the address passed to the operation function is page-aligned. This fixes the issue where dirty pages might not be tracked correctly when using huge pages. Fixes: 9d05c1fee837 ("RISC-V: KVM: Implement stage2 page table programming") Signed-off-by: Wang Yechao Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/202603301610527120YZ-pAJY6x9SBpSRo1Wg4@zte.com.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/gstage.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index b67d60d722c2..d2001d508046 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -304,10 +304,9 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end if (!found_leaf) goto next; - if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - kvm_riscv_gstage_op_pte(gstage, addr, ptep, - ptep_level, GSTAGE_OP_WP); - + addr = ALIGN_DOWN(addr, page_size); + kvm_riscv_gstage_op_pte(gstage, addr, ptep, + ptep_level, GSTAGE_OP_WP); next: addr += page_size; } From 6ad36f39a7691bb59d2486efd467710fcbebee62 Mon Sep 17 00:00:00 2001 From: Wang Yechao Date: Mon, 30 Mar 2026 16:12:58 +0800 Subject: [PATCH 12/24] RISC-V: KVM: Split huge pages during fault handling for dirty logging During dirty logging, all huge pages are write-protected. When the guest writes to a write-protected huge page, a page fault is triggered. Before recovering the write permission, the huge page must be split into smaller pages (e.g., 4K). After splitting, the normal mapping process proceeds, allowing write permission to be restored at the smaller page granularity. If dirty logging is disabled because migration failed or was cancelled, only recover the write permission at the 4K level, and skip recovering the huge page mapping at this time to avoid the overhead of freeing page tables. The huge page mapping can be recovered in the ioctl context, similar to x86, in a later patch. Signed-off-by: Wang Yechao Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/202603301612587174XZ6QMCrymBqv30S6BN50@zte.com.cn Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_gstage.h | 4 + arch/riscv/kvm/gstage.c | 126 ++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index 595e2183173e..a89d1422cc84 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, bool page_rdonly, bool page_exec, struct kvm_gstage_mapping *out_map); +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + gpa_t addr, u32 target_level, bool flush); + enum kvm_riscv_gstage_op { GSTAGE_OP_NOP = 0, /* Nothing */ GSTAGE_OP_CLEAR, /* Clear/Unmap */ diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index d2001d508046..ffec3e5ddcaf 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -163,13 +163,32 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, return 0; } +static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 level, + gpa_t addr, pte_t *ptep, pgprot_t prot) +{ + pte_t new_pte; + + if (pgprot_val(pte_pgprot(ptep_get(ptep))) == pgprot_val(prot)) + return; + + new_pte = pfn_pte(pte_pfn(ptep_get(ptep)), prot); + new_pte = pte_mkdirty(new_pte); + + set_pte(ptep, new_pte); + + gstage_tlb_flush(gstage, level, addr); +} + int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, gpa_t gpa, phys_addr_t hpa, unsigned long page_size, bool page_rdonly, bool page_exec, struct kvm_gstage_mapping *out_map) { + bool found_leaf; + u32 ptep_level; pgprot_t prot; + pte_t *ptep; int ret; out_map->addr = gpa; @@ -203,12 +222,119 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, else prot = PAGE_WRITE; } + + found_leaf = kvm_riscv_gstage_get_leaf(gstage, gpa, &ptep, &ptep_level); + if (found_leaf) { + /* + * ptep_level is the current gstage mapping level of addr, out_map->level + * is the required mapping level during fault handling. + * + * 1) ptep_level > out_map->level + * This happens when dirty logging is enabled and huge pages are used. + * KVM must track the pages at 4K level, and split the huge mapping + * into 4K mappings. + * + * 2) ptep_level < out_map->level + * This happens when dirty logging is disabled and huge pages are used. + * The gstage is split into 4K mappings, but the out_map level is now + * back to the huge page level. Ignore the out_map level this time, and + * just update the pte prot here. Otherwise, we would fall back to mapping + * the gstage at huge page level in `kvm_riscv_gstage_set_pte`, with the + * overhead of freeing the page tables(not support now), which would slow + * down the vCPUs' performance. + * + * It is better to recover the huge page mapping in the ioctl context when + * disabling dirty logging. + * + * 3) ptep_level == out_map->level + * We already have the ptep, just update the pte prot if the pfn not change. + * There is no need to invoke `kvm_riscv_gstage_set_pte` again. + */ + if (ptep_level > out_map->level) { + kvm_riscv_gstage_split_huge(gstage, pcache, gpa, + out_map->level, true); + } else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) { + kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot); + return 0; + } + } + out_map->pte = pfn_pte(PFN_DOWN(hpa), prot); out_map->pte = pte_mkdirty(out_map->pte); return kvm_riscv_gstage_set_pte(gstage, pcache, out_map); } +static inline unsigned long make_child_pte(unsigned long huge_pte, int index, + unsigned long child_page_size) +{ + unsigned long child_pte = huge_pte; + unsigned long child_pfn_offset; + + /* + * The child_pte already has the base address of the huge page being + * split. So we just have to OR in the offset to the page at the next + * lower level for the given index. + */ + child_pfn_offset = index * (child_page_size / PAGE_SIZE); + child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0))); + + return child_pte; +} + +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + gpa_t addr, u32 target_level, bool flush) +{ + u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + pte_t *next_ptep = (pte_t *)gstage->pgd; + unsigned long huge_pte, child_pte; + unsigned long child_page_size; + pte_t *ptep; + int i, ret; + + if (!pcache) + return -ENOMEM; + + while(current_level > target_level) { + ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)]; + + if (!pte_val(ptep_get(ptep))) + break; + + if (!gstage_pte_leaf(ptep)) { + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + current_level--; + continue; + } + + huge_pte = pte_val(ptep_get(ptep)); + + ret = gstage_level_to_page_size(current_level - 1, &child_page_size); + if (ret) + return ret; + + next_ptep = kvm_mmu_memory_cache_alloc(pcache); + if (!next_ptep) + return -ENOMEM; + + for (i = 0; i < PTRS_PER_PTE; i++) { + child_pte = make_child_pte(huge_pte, i, child_page_size); + set_pte((pte_t *)&next_ptep[i], __pte(child_pte)); + } + + set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)), + __pgprot(_PAGE_TABLE))); + + if (flush) + gstage_tlb_flush(gstage, current_level, addr); + + current_level--; + } + + return 0; +} + void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op) { From 1323a5cfe52c7937ea3cd7a75e0355cacd805da4 Mon Sep 17 00:00:00 2001 From: Jinyu Tang Date: Fri, 27 Feb 2026 20:10:08 +0800 Subject: [PATCH 13/24] KVM: riscv: Skip CSR restore if VCPU is reloaded on the same core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, kvm_arch_vcpu_load() unconditionally restores guest CSRs, HGATP, and AIA state. However, when a VCPU is loaded back on the same physical CPU, and no other KVM VCPU has run on this CPU since it was last put, the hardware CSRs and AIA registers are still valid. This patch optimizes the vcpu_load path by skipping the expensive CSR and AIA writes if all the following conditions are met: 1. It is being reloaded on the same CPU (vcpu->arch.last_exit_cpu == cpu). 2. The CSRs are not dirty (!vcpu->arch.csr_dirty). 3. No other VCPU used this CPU (vcpu == __this_cpu_read(kvm_former_vcpu)). To ensure this fast-path doesn't break corner cases: - Live migration and VCPU reset are naturally safe. KVM initializes last_exit_cpu to -1, which guarantees the fast-path won't trigger. - The 'csr_dirty' flag tracks runtime userspace interventions. If userspace modifies guest configurations (e.g., hedeleg via KVM_SET_GUEST_DEBUG, or CSRs including AIA via KVM_SET_ONE_REG), the flag is set to skip the fast path. With the 'csr_dirty' safeguard proven effective, it is safe to include kvm_riscv_vcpu_aia_load() inside the skip logic now. Signed-off-by: Jinyu Tang Reviewed-by: Nutty Liu Reviewed-by: Andrew Jones Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20260227121008.442241-1-tjytimi@163.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 3 +++ arch/riscv/kvm/vcpu.c | 24 ++++++++++++++++++++++-- arch/riscv/kvm/vcpu_onereg.c | 2 ++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 24585304c02b..7ee47b83c80d 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -273,6 +273,9 @@ struct kvm_vcpu_arch { /* 'static' configurations which are set only once */ struct kvm_vcpu_config cfg; + /* Indicates modified guest CSRs */ + bool csr_dirty; + /* SBI steal-time accounting */ struct { gpa_t shmem; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index fdd99ac1e714..1d5c777eba80 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -24,6 +24,8 @@ #define CREATE_TRACE_POINTS #include "trace.h" +static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_former_vcpu); + const struct kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), STATS_DESC_COUNTER(VCPU, ecall_exit_stat), @@ -537,6 +539,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, vcpu->arch.cfg.hedeleg |= BIT(EXC_BREAKPOINT); } + vcpu->arch.csr_dirty = true; + return 0; } @@ -581,6 +585,21 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + /* + * If VCPU is being reloaded on the same physical CPU and no + * other KVM VCPU has run on this CPU since it was last put, + * we can skip the expensive CSR and HGATP writes. + * + * Note: If a new CSR is added to this fast-path skip block, + * make sure that 'csr_dirty' is set to true in any + * ioctl (e.g., KVM_SET_ONE_REG) that modifies it. + */ + if (vcpu != __this_cpu_read(kvm_former_vcpu)) + __this_cpu_write(kvm_former_vcpu, vcpu); + else if (vcpu->arch.last_exit_cpu == cpu && !vcpu->arch.csr_dirty) + goto csr_restore_done; + + vcpu->arch.csr_dirty = false; if (kvm_riscv_nacl_sync_csr_available()) { nsh = nacl_shmem(); nacl_csr_write(nsh, CSR_VSSTATUS, csr->vsstatus); @@ -624,6 +643,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_riscv_mmu_update_hgatp(vcpu); + kvm_riscv_vcpu_aia_load(vcpu, cpu); + +csr_restore_done: kvm_riscv_vcpu_timer_restore(vcpu); kvm_riscv_vcpu_host_fp_save(&vcpu->arch.host_context); @@ -633,8 +655,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_riscv_vcpu_guest_vector_restore(&vcpu->arch.guest_context, vcpu->arch.isa); - kvm_riscv_vcpu_aia_load(vcpu, cpu); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); vcpu->cpu = cpu; diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 45ecc0082e90..97fa72ba47c1 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -670,6 +670,8 @@ static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, if (rc) return rc; + vcpu->arch.csr_dirty = true; + return 0; } From 1ec8bea903f439acca927144570e1e419d2f9c9a Mon Sep 17 00:00:00 2001 From: Mayuresh Chitale Date: Thu, 2 Apr 2026 15:48:14 +0530 Subject: [PATCH 14/24] KVM: riscv: selftests: Implement kvm_arch_has_default_irqchip kvm_arch_has_default_irqchip is required for irqfd_test and returns true if an in-kernel interrupt controller is supported. Fixes: a133052666bed ("KVM: selftests: Fix irqfd_test for non-x86 architectures") Signed-off-by: Mayuresh Chitale Link: https://lore.kernel.org/r/20260402101818.2982071-1-mayuresh.chitale@oss.qualcomm.com Signed-off-by: Anup Patel --- tools/testing/selftests/kvm/lib/riscv/processor.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/riscv/processor.c b/tools/testing/selftests/kvm/lib/riscv/processor.c index 51dd455ff52c..067c6b2c15b0 100644 --- a/tools/testing/selftests/kvm/lib/riscv/processor.c +++ b/tools/testing/selftests/kvm/lib/riscv/processor.c @@ -566,3 +566,8 @@ unsigned long riscv64_get_satp_mode(void) return val; } + +bool kvm_arch_has_default_irqchip(void) +{ + return kvm_check_cap(KVM_CAP_IRQCHIP); +} From cf05b059d59fff0207d93d7d21b34dad9b460b20 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 20 Jan 2026 13:29:50 +0530 Subject: [PATCH 15/24] RISC-V: KVM: Introduce common kvm_riscv_isa_check_host() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename kvm_riscv_vcpu_isa_check_host() to kvm_riscv_isa_check_host() and use it as common function with KVM RISC-V to check isa extensions supported by host. Signed-off-by: Anup Patel Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20260120080013.2153519-5-anup.patel@oss.qualcomm.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 4 ++++ arch/riscv/kvm/aia_device.c | 2 +- arch/riscv/kvm/vcpu_fp.c | 8 +++---- arch/riscv/kvm/vcpu_onereg.c | 38 ++++++++++++++++--------------- arch/riscv/kvm/vcpu_pmu.c | 2 +- arch/riscv/kvm/vcpu_timer.c | 2 +- arch/riscv/kvm/vcpu_vector.c | 4 ++-- 7 files changed, 33 insertions(+), 27 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 7ee47b83c80d..cf8da3ec6392 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -311,6 +311,10 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, void __kvm_riscv_switch_to(struct kvm_vcpu_arch *vcpu_arch); +int __kvm_riscv_isa_check_host(unsigned long kvm_ext, unsigned long *guest_ext); +#define kvm_riscv_isa_check_host(ext) \ + __kvm_riscv_isa_check_host(KVM_RISCV_ISA_EXT_##ext, NULL) + void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu); unsigned long kvm_riscv_vcpu_num_regs(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_copy_reg_indices(struct kvm_vcpu *vcpu, diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 49c71d3cdb00..f3010dd2030a 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -23,7 +23,7 @@ static int aia_create(struct kvm_device *dev, u32 type) if (irqchip_in_kernel(kvm)) return -EEXIST; - if (!riscv_isa_extension_available(NULL, SSAIA)) + if (kvm_riscv_isa_check_host(SSAIA)) return -ENODEV; ret = -EBUSY; diff --git a/arch/riscv/kvm/vcpu_fp.c b/arch/riscv/kvm/vcpu_fp.c index bd5a9e7e7165..2faa0cd37b69 100644 --- a/arch/riscv/kvm/vcpu_fp.c +++ b/arch/riscv/kvm/vcpu_fp.c @@ -60,17 +60,17 @@ void kvm_riscv_vcpu_guest_fp_restore(struct kvm_cpu_context *cntx, void kvm_riscv_vcpu_host_fp_save(struct kvm_cpu_context *cntx) { /* No need to check host sstatus as it can be modified outside */ - if (riscv_isa_extension_available(NULL, d)) + if (!kvm_riscv_isa_check_host(D)) __kvm_riscv_fp_d_save(cntx); - else if (riscv_isa_extension_available(NULL, f)) + else if (!kvm_riscv_isa_check_host(F)) __kvm_riscv_fp_f_save(cntx); } void kvm_riscv_vcpu_host_fp_restore(struct kvm_cpu_context *cntx) { - if (riscv_isa_extension_available(NULL, d)) + if (!kvm_riscv_isa_check_host(D)) __kvm_riscv_fp_d_restore(cntx); - else if (riscv_isa_extension_available(NULL, f)) + else if (!kvm_riscv_isa_check_host(F)) __kvm_riscv_fp_f_restore(cntx); } #endif diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index 97fa72ba47c1..a92351b78bf8 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -120,7 +120,7 @@ static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) return KVM_RISCV_ISA_EXT_MAX; } -static int kvm_riscv_vcpu_isa_check_host(unsigned long kvm_ext, unsigned long *guest_ext) +int __kvm_riscv_isa_check_host(unsigned long kvm_ext, unsigned long *base_ext) { unsigned long host_ext; @@ -129,8 +129,7 @@ static int kvm_riscv_vcpu_isa_check_host(unsigned long kvm_ext, unsigned long *g return -ENOENT; kvm_ext = array_index_nospec(kvm_ext, ARRAY_SIZE(kvm_isa_ext_arr)); - *guest_ext = kvm_isa_ext_arr[kvm_ext]; - switch (*guest_ext) { + switch (kvm_isa_ext_arr[kvm_ext]) { case RISCV_ISA_EXT_SMNPM: /* * Pointer masking effective in (H)S-mode is provided by the @@ -141,13 +140,16 @@ static int kvm_riscv_vcpu_isa_check_host(unsigned long kvm_ext, unsigned long *g host_ext = RISCV_ISA_EXT_SSNPM; break; default: - host_ext = *guest_ext; + host_ext = kvm_isa_ext_arr[kvm_ext]; break; } if (!__riscv_isa_extension_available(NULL, host_ext)) return -ENOENT; + if (base_ext) + *base_ext = kvm_isa_ext_arr[kvm_ext]; + return 0; } @@ -158,7 +160,7 @@ static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) return false; case KVM_RISCV_ISA_EXT_SSCOFPMF: /* Sscofpmf depends on interrupt filtering defined in ssaia */ - return __riscv_isa_extension_available(NULL, RISCV_ISA_EXT_SSAIA); + return !kvm_riscv_isa_check_host(SSAIA); case KVM_RISCV_ISA_EXT_SVADU: /* * The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero. @@ -265,7 +267,7 @@ void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu) unsigned long guest_ext, i; for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) { - if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext)) + if (__kvm_riscv_isa_check_host(i, &guest_ext)) continue; if (kvm_riscv_vcpu_isa_enable_allowed(i)) set_bit(guest_ext, vcpu->arch.isa); @@ -290,17 +292,17 @@ static int kvm_riscv_vcpu_get_reg_config(struct kvm_vcpu *vcpu, reg_val = vcpu->arch.isa[0] & KVM_RISCV_BASE_ISA_MASK; break; case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): - if (!riscv_isa_extension_available(NULL, ZICBOM)) + if (kvm_riscv_isa_check_host(ZICBOM)) return -ENOENT; reg_val = riscv_cbom_block_size; break; case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size): - if (!riscv_isa_extension_available(NULL, ZICBOZ)) + if (kvm_riscv_isa_check_host(ZICBOZ)) return -ENOENT; reg_val = riscv_cboz_block_size; break; case KVM_REG_RISCV_CONFIG_REG(zicbop_block_size): - if (!riscv_isa_extension_available(NULL, ZICBOP)) + if (kvm_riscv_isa_check_host(ZICBOP)) return -ENOENT; reg_val = riscv_cbop_block_size; break; @@ -384,19 +386,19 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, } break; case KVM_REG_RISCV_CONFIG_REG(zicbom_block_size): - if (!riscv_isa_extension_available(NULL, ZICBOM)) + if (kvm_riscv_isa_check_host(ZICBOM)) return -ENOENT; if (reg_val != riscv_cbom_block_size) return -EINVAL; break; case KVM_REG_RISCV_CONFIG_REG(zicboz_block_size): - if (!riscv_isa_extension_available(NULL, ZICBOZ)) + if (kvm_riscv_isa_check_host(ZICBOZ)) return -ENOENT; if (reg_val != riscv_cboz_block_size) return -EINVAL; break; case KVM_REG_RISCV_CONFIG_REG(zicbop_block_size): - if (!riscv_isa_extension_available(NULL, ZICBOP)) + if (kvm_riscv_isa_check_host(ZICBOP)) return -ENOENT; if (reg_val != riscv_cbop_block_size) return -EINVAL; @@ -682,7 +684,7 @@ static int riscv_vcpu_get_isa_ext_single(struct kvm_vcpu *vcpu, unsigned long guest_ext; int ret; - ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext); + ret = __kvm_riscv_isa_check_host(reg_num, &guest_ext); if (ret) return ret; @@ -700,7 +702,7 @@ static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu, unsigned long guest_ext; int ret; - ret = kvm_riscv_vcpu_isa_check_host(reg_num, &guest_ext); + ret = __kvm_riscv_isa_check_host(reg_num, &guest_ext); if (ret) return ret; @@ -859,13 +861,13 @@ static int copy_config_reg_indices(const struct kvm_vcpu *vcpu, * was not available. */ if (i == KVM_REG_RISCV_CONFIG_REG(zicbom_block_size) && - !riscv_isa_extension_available(NULL, ZICBOM)) + kvm_riscv_isa_check_host(ZICBOM)) continue; else if (i == KVM_REG_RISCV_CONFIG_REG(zicboz_block_size) && - !riscv_isa_extension_available(NULL, ZICBOZ)) + kvm_riscv_isa_check_host(ZICBOZ)) continue; else if (i == KVM_REG_RISCV_CONFIG_REG(zicbop_block_size) && - !riscv_isa_extension_available(NULL, ZICBOP)) + kvm_riscv_isa_check_host(ZICBOP)) continue; size = IS_ENABLED(CONFIG_32BIT) ? KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; @@ -1086,7 +1088,7 @@ static int copy_isa_ext_reg_indices(const struct kvm_vcpu *vcpu, KVM_REG_SIZE_U32 : KVM_REG_SIZE_U64; u64 reg = KVM_REG_RISCV | size | KVM_REG_RISCV_ISA_EXT | i; - if (kvm_riscv_vcpu_isa_check_host(i, &guest_ext)) + if (__kvm_riscv_isa_check_host(i, &guest_ext)) continue; if (uindices) { diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index 49427094a079..a8ca1e65734a 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -845,7 +845,7 @@ void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) * filtering is available in the host. Otherwise, guest will always count * events while the execution is in hypervisor mode. */ - if (!riscv_isa_extension_available(NULL, SSCOFPMF)) + if (kvm_riscv_isa_check_host(SSCOFPMF)) return; ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs); diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index f36247e4c783..cac4f3a5f213 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -253,7 +253,7 @@ int kvm_riscv_vcpu_timer_init(struct kvm_vcpu *vcpu) t->next_set = false; /* Enable sstc for every vcpu if available in hardware */ - if (riscv_isa_extension_available(NULL, SSTC)) { + if (!kvm_riscv_isa_check_host(SSTC)) { t->sstc_enabled = true; hrtimer_setup(&t->hrt, kvm_riscv_vcpu_vstimer_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL); diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c index f3f5fb665cf6..42ffb489c447 100644 --- a/arch/riscv/kvm/vcpu_vector.c +++ b/arch/riscv/kvm/vcpu_vector.c @@ -63,13 +63,13 @@ void kvm_riscv_vcpu_guest_vector_restore(struct kvm_cpu_context *cntx, void kvm_riscv_vcpu_host_vector_save(struct kvm_cpu_context *cntx) { /* No need to check host sstatus as it can be modified outside */ - if (riscv_isa_extension_available(NULL, v)) + if (!kvm_riscv_isa_check_host(V)) __kvm_riscv_vector_save(cntx); } void kvm_riscv_vcpu_host_vector_restore(struct kvm_cpu_context *cntx) { - if (riscv_isa_extension_available(NULL, v)) + if (!kvm_riscv_isa_check_host(V)) __kvm_riscv_vector_restore(cntx); } From e0b5cfc316f1d36e22b0745d20360b4719b89209 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 20 Jan 2026 13:29:51 +0530 Subject: [PATCH 16/24] RISC-V: KVM: Factor-out ISA checks into separate sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KVM ISA extension related checks are not VCPU specific and should be factored out of vcpu_onereg.c into separate sources. Signed-off-by: Anup Patel Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20260120080013.2153519-6-anup.patel@oss.qualcomm.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 4 - arch/riscv/include/asm/kvm_isa.h | 20 +++ arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/aia_device.c | 2 +- arch/riscv/kvm/isa.c | 253 +++++++++++++++++++++++++++++ arch/riscv/kvm/vcpu_fp.c | 1 + arch/riscv/kvm/vcpu_onereg.c | 258 +----------------------------- arch/riscv/kvm/vcpu_pmu.c | 3 +- arch/riscv/kvm/vcpu_timer.c | 1 + arch/riscv/kvm/vcpu_vector.c | 1 + 10 files changed, 288 insertions(+), 256 deletions(-) create mode 100644 arch/riscv/include/asm/kvm_isa.h create mode 100644 arch/riscv/kvm/isa.c diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index cf8da3ec6392..7ee47b83c80d 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -311,10 +311,6 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, void __kvm_riscv_switch_to(struct kvm_vcpu_arch *vcpu_arch); -int __kvm_riscv_isa_check_host(unsigned long kvm_ext, unsigned long *guest_ext); -#define kvm_riscv_isa_check_host(ext) \ - __kvm_riscv_isa_check_host(KVM_RISCV_ISA_EXT_##ext, NULL) - void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu); unsigned long kvm_riscv_vcpu_num_regs(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_copy_reg_indices(struct kvm_vcpu *vcpu, diff --git a/arch/riscv/include/asm/kvm_isa.h b/arch/riscv/include/asm/kvm_isa.h new file mode 100644 index 000000000000..bc4b956d5f17 --- /dev/null +++ b/arch/riscv/include/asm/kvm_isa.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2026 Qualcomm Technologies, Inc. + */ + +#ifndef __KVM_RISCV_ISA_H +#define __KVM_RISCV_ISA_H + +#include + +unsigned long kvm_riscv_base2isa_ext(unsigned long base_ext); + +int __kvm_riscv_isa_check_host(unsigned long ext, unsigned long *base_ext); +#define kvm_riscv_isa_check_host(ext) \ + __kvm_riscv_isa_check_host(KVM_RISCV_ISA_EXT_##ext, NULL) + +bool kvm_riscv_isa_enable_allowed(unsigned long ext); +bool kvm_riscv_isa_disable_allowed(unsigned long ext); + +#endif diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 3b8afb038b35..07eab96189e7 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -15,6 +15,7 @@ kvm-y += aia_aplic.o kvm-y += aia_device.o kvm-y += aia_imsic.o kvm-y += gstage.o +kvm-y += isa.o kvm-y += main.o kvm-y += mmu.o kvm-y += nacl.o diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index f3010dd2030a..3d1e81e2a36b 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include static int aia_create(struct kvm_device *dev, u32 type) { diff --git a/arch/riscv/kvm/isa.c b/arch/riscv/kvm/isa.c new file mode 100644 index 000000000000..1132d909cc25 --- /dev/null +++ b/arch/riscv/kvm/isa.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Qualcomm Technologies, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define KVM_ISA_EXT_ARR(ext) \ +[KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext + +/* Mapping between KVM ISA Extension ID & guest ISA extension ID */ +static const unsigned long kvm_isa_ext_arr[] = { + /* Single letter extensions (alphabetically sorted) */ + [KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a, + [KVM_RISCV_ISA_EXT_C] = RISCV_ISA_EXT_c, + [KVM_RISCV_ISA_EXT_D] = RISCV_ISA_EXT_d, + [KVM_RISCV_ISA_EXT_F] = RISCV_ISA_EXT_f, + [KVM_RISCV_ISA_EXT_H] = RISCV_ISA_EXT_h, + [KVM_RISCV_ISA_EXT_I] = RISCV_ISA_EXT_i, + [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m, + [KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v, + /* Multi letter extensions (alphabetically sorted) */ + KVM_ISA_EXT_ARR(SMNPM), + KVM_ISA_EXT_ARR(SMSTATEEN), + KVM_ISA_EXT_ARR(SSAIA), + KVM_ISA_EXT_ARR(SSCOFPMF), + KVM_ISA_EXT_ARR(SSNPM), + KVM_ISA_EXT_ARR(SSTC), + KVM_ISA_EXT_ARR(SVADE), + KVM_ISA_EXT_ARR(SVADU), + KVM_ISA_EXT_ARR(SVINVAL), + KVM_ISA_EXT_ARR(SVNAPOT), + KVM_ISA_EXT_ARR(SVPBMT), + KVM_ISA_EXT_ARR(SVVPTC), + KVM_ISA_EXT_ARR(ZAAMO), + KVM_ISA_EXT_ARR(ZABHA), + KVM_ISA_EXT_ARR(ZACAS), + KVM_ISA_EXT_ARR(ZALASR), + KVM_ISA_EXT_ARR(ZALRSC), + KVM_ISA_EXT_ARR(ZAWRS), + KVM_ISA_EXT_ARR(ZBA), + KVM_ISA_EXT_ARR(ZBB), + KVM_ISA_EXT_ARR(ZBC), + KVM_ISA_EXT_ARR(ZBKB), + KVM_ISA_EXT_ARR(ZBKC), + KVM_ISA_EXT_ARR(ZBKX), + KVM_ISA_EXT_ARR(ZBS), + KVM_ISA_EXT_ARR(ZCA), + KVM_ISA_EXT_ARR(ZCB), + KVM_ISA_EXT_ARR(ZCD), + KVM_ISA_EXT_ARR(ZCF), + KVM_ISA_EXT_ARR(ZCLSD), + KVM_ISA_EXT_ARR(ZCMOP), + KVM_ISA_EXT_ARR(ZFA), + KVM_ISA_EXT_ARR(ZFBFMIN), + KVM_ISA_EXT_ARR(ZFH), + KVM_ISA_EXT_ARR(ZFHMIN), + KVM_ISA_EXT_ARR(ZICBOM), + KVM_ISA_EXT_ARR(ZICBOP), + KVM_ISA_EXT_ARR(ZICBOZ), + KVM_ISA_EXT_ARR(ZICCRSE), + KVM_ISA_EXT_ARR(ZICNTR), + KVM_ISA_EXT_ARR(ZICOND), + KVM_ISA_EXT_ARR(ZICSR), + KVM_ISA_EXT_ARR(ZIFENCEI), + KVM_ISA_EXT_ARR(ZIHINTNTL), + KVM_ISA_EXT_ARR(ZIHINTPAUSE), + KVM_ISA_EXT_ARR(ZIHPM), + KVM_ISA_EXT_ARR(ZILSD), + KVM_ISA_EXT_ARR(ZIMOP), + KVM_ISA_EXT_ARR(ZKND), + KVM_ISA_EXT_ARR(ZKNE), + KVM_ISA_EXT_ARR(ZKNH), + KVM_ISA_EXT_ARR(ZKR), + KVM_ISA_EXT_ARR(ZKSED), + KVM_ISA_EXT_ARR(ZKSH), + KVM_ISA_EXT_ARR(ZKT), + KVM_ISA_EXT_ARR(ZTSO), + KVM_ISA_EXT_ARR(ZVBB), + KVM_ISA_EXT_ARR(ZVBC), + KVM_ISA_EXT_ARR(ZVFBFMIN), + KVM_ISA_EXT_ARR(ZVFBFWMA), + KVM_ISA_EXT_ARR(ZVFH), + KVM_ISA_EXT_ARR(ZVFHMIN), + KVM_ISA_EXT_ARR(ZVKB), + KVM_ISA_EXT_ARR(ZVKG), + KVM_ISA_EXT_ARR(ZVKNED), + KVM_ISA_EXT_ARR(ZVKNHA), + KVM_ISA_EXT_ARR(ZVKNHB), + KVM_ISA_EXT_ARR(ZVKSED), + KVM_ISA_EXT_ARR(ZVKSH), + KVM_ISA_EXT_ARR(ZVKT), +}; + +unsigned long kvm_riscv_base2isa_ext(unsigned long base_ext) +{ + unsigned long i; + + for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { + if (kvm_isa_ext_arr[i] == base_ext) + return i; + } + + return KVM_RISCV_ISA_EXT_MAX; +} + +int __kvm_riscv_isa_check_host(unsigned long kvm_ext, unsigned long *base_ext) +{ + unsigned long host_ext; + + if (kvm_ext >= KVM_RISCV_ISA_EXT_MAX || + kvm_ext >= ARRAY_SIZE(kvm_isa_ext_arr)) + return -ENOENT; + + kvm_ext = array_index_nospec(kvm_ext, ARRAY_SIZE(kvm_isa_ext_arr)); + switch (kvm_isa_ext_arr[kvm_ext]) { + case RISCV_ISA_EXT_SMNPM: + /* + * Pointer masking effective in (H)S-mode is provided by the + * Smnpm extension, so that extension is reported to the guest, + * even though the CSR bits for configuring VS-mode pointer + * masking on the host side are part of the Ssnpm extension. + */ + host_ext = RISCV_ISA_EXT_SSNPM; + break; + default: + host_ext = kvm_isa_ext_arr[kvm_ext]; + break; + } + + if (!__riscv_isa_extension_available(NULL, host_ext)) + return -ENOENT; + + if (base_ext) + *base_ext = kvm_isa_ext_arr[kvm_ext]; + + return 0; +} + +bool kvm_riscv_isa_enable_allowed(unsigned long ext) +{ + switch (ext) { + case KVM_RISCV_ISA_EXT_H: + return false; + case KVM_RISCV_ISA_EXT_SSCOFPMF: + /* Sscofpmf depends on interrupt filtering defined in ssaia */ + return !kvm_riscv_isa_check_host(SSAIA); + case KVM_RISCV_ISA_EXT_SVADU: + /* + * The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero. + * Guest OS can use Svadu only when host OS enable Svadu. + */ + return arch_has_hw_pte_young(); + case KVM_RISCV_ISA_EXT_V: + return riscv_v_vstate_ctrl_user_allowed(); + default: + break; + } + + return true; +} + +bool kvm_riscv_isa_disable_allowed(unsigned long ext) +{ + switch (ext) { + /* Extensions which don't have any mechanism to disable */ + case KVM_RISCV_ISA_EXT_A: + case KVM_RISCV_ISA_EXT_C: + case KVM_RISCV_ISA_EXT_I: + case KVM_RISCV_ISA_EXT_M: + /* There is not architectural config bit to disable sscofpmf completely */ + case KVM_RISCV_ISA_EXT_SSCOFPMF: + case KVM_RISCV_ISA_EXT_SSNPM: + case KVM_RISCV_ISA_EXT_SSTC: + case KVM_RISCV_ISA_EXT_SVINVAL: + case KVM_RISCV_ISA_EXT_SVNAPOT: + case KVM_RISCV_ISA_EXT_SVVPTC: + case KVM_RISCV_ISA_EXT_ZAAMO: + case KVM_RISCV_ISA_EXT_ZABHA: + case KVM_RISCV_ISA_EXT_ZACAS: + case KVM_RISCV_ISA_EXT_ZALASR: + case KVM_RISCV_ISA_EXT_ZALRSC: + case KVM_RISCV_ISA_EXT_ZAWRS: + case KVM_RISCV_ISA_EXT_ZBA: + case KVM_RISCV_ISA_EXT_ZBB: + case KVM_RISCV_ISA_EXT_ZBC: + case KVM_RISCV_ISA_EXT_ZBKB: + case KVM_RISCV_ISA_EXT_ZBKC: + case KVM_RISCV_ISA_EXT_ZBKX: + case KVM_RISCV_ISA_EXT_ZBS: + case KVM_RISCV_ISA_EXT_ZCA: + case KVM_RISCV_ISA_EXT_ZCB: + case KVM_RISCV_ISA_EXT_ZCD: + case KVM_RISCV_ISA_EXT_ZCF: + case KVM_RISCV_ISA_EXT_ZCMOP: + case KVM_RISCV_ISA_EXT_ZFA: + case KVM_RISCV_ISA_EXT_ZFBFMIN: + case KVM_RISCV_ISA_EXT_ZFH: + case KVM_RISCV_ISA_EXT_ZFHMIN: + case KVM_RISCV_ISA_EXT_ZICBOP: + case KVM_RISCV_ISA_EXT_ZICCRSE: + case KVM_RISCV_ISA_EXT_ZICNTR: + case KVM_RISCV_ISA_EXT_ZICOND: + case KVM_RISCV_ISA_EXT_ZICSR: + case KVM_RISCV_ISA_EXT_ZIFENCEI: + case KVM_RISCV_ISA_EXT_ZIHINTNTL: + case KVM_RISCV_ISA_EXT_ZIHINTPAUSE: + case KVM_RISCV_ISA_EXT_ZIHPM: + case KVM_RISCV_ISA_EXT_ZIMOP: + case KVM_RISCV_ISA_EXT_ZKND: + case KVM_RISCV_ISA_EXT_ZKNE: + case KVM_RISCV_ISA_EXT_ZKNH: + case KVM_RISCV_ISA_EXT_ZKR: + case KVM_RISCV_ISA_EXT_ZKSED: + case KVM_RISCV_ISA_EXT_ZKSH: + case KVM_RISCV_ISA_EXT_ZKT: + case KVM_RISCV_ISA_EXT_ZTSO: + case KVM_RISCV_ISA_EXT_ZVBB: + case KVM_RISCV_ISA_EXT_ZVBC: + case KVM_RISCV_ISA_EXT_ZVFBFMIN: + case KVM_RISCV_ISA_EXT_ZVFBFWMA: + case KVM_RISCV_ISA_EXT_ZVFH: + case KVM_RISCV_ISA_EXT_ZVFHMIN: + case KVM_RISCV_ISA_EXT_ZVKB: + case KVM_RISCV_ISA_EXT_ZVKG: + case KVM_RISCV_ISA_EXT_ZVKNED: + case KVM_RISCV_ISA_EXT_ZVKNHA: + case KVM_RISCV_ISA_EXT_ZVKNHB: + case KVM_RISCV_ISA_EXT_ZVKSED: + case KVM_RISCV_ISA_EXT_ZVKSH: + case KVM_RISCV_ISA_EXT_ZVKT: + return false; + /* Extensions which can be disabled using Smstateen */ + case KVM_RISCV_ISA_EXT_SSAIA: + return riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN); + case KVM_RISCV_ISA_EXT_SVADE: + /* + * The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero. + * Svade can't be disabled unless we support Svadu. + */ + return arch_has_hw_pte_young(); + default: + break; + } + + return true; +} diff --git a/arch/riscv/kvm/vcpu_fp.c b/arch/riscv/kvm/vcpu_fp.c index 2faa0cd37b69..6ad6df26a2fd 100644 --- a/arch/riscv/kvm/vcpu_fp.c +++ b/arch/riscv/kvm/vcpu_fp.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_FPU void kvm_riscv_vcpu_fp_reset(struct kvm_vcpu *vcpu) diff --git a/arch/riscv/kvm/vcpu_onereg.c b/arch/riscv/kvm/vcpu_onereg.c index a92351b78bf8..bb920e8923c9 100644 --- a/arch/riscv/kvm/vcpu_onereg.c +++ b/arch/riscv/kvm/vcpu_onereg.c @@ -15,261 +15,19 @@ #include #include #include +#include #include -#include -#include #define KVM_RISCV_BASE_ISA_MASK GENMASK(25, 0) -#define KVM_ISA_EXT_ARR(ext) \ -[KVM_RISCV_ISA_EXT_##ext] = RISCV_ISA_EXT_##ext - -/* Mapping between KVM ISA Extension ID & guest ISA extension ID */ -static const unsigned long kvm_isa_ext_arr[] = { - /* Single letter extensions (alphabetically sorted) */ - [KVM_RISCV_ISA_EXT_A] = RISCV_ISA_EXT_a, - [KVM_RISCV_ISA_EXT_C] = RISCV_ISA_EXT_c, - [KVM_RISCV_ISA_EXT_D] = RISCV_ISA_EXT_d, - [KVM_RISCV_ISA_EXT_F] = RISCV_ISA_EXT_f, - [KVM_RISCV_ISA_EXT_H] = RISCV_ISA_EXT_h, - [KVM_RISCV_ISA_EXT_I] = RISCV_ISA_EXT_i, - [KVM_RISCV_ISA_EXT_M] = RISCV_ISA_EXT_m, - [KVM_RISCV_ISA_EXT_V] = RISCV_ISA_EXT_v, - /* Multi letter extensions (alphabetically sorted) */ - KVM_ISA_EXT_ARR(SMNPM), - KVM_ISA_EXT_ARR(SMSTATEEN), - KVM_ISA_EXT_ARR(SSAIA), - KVM_ISA_EXT_ARR(SSCOFPMF), - KVM_ISA_EXT_ARR(SSNPM), - KVM_ISA_EXT_ARR(SSTC), - KVM_ISA_EXT_ARR(SVADE), - KVM_ISA_EXT_ARR(SVADU), - KVM_ISA_EXT_ARR(SVINVAL), - KVM_ISA_EXT_ARR(SVNAPOT), - KVM_ISA_EXT_ARR(SVPBMT), - KVM_ISA_EXT_ARR(SVVPTC), - KVM_ISA_EXT_ARR(ZAAMO), - KVM_ISA_EXT_ARR(ZABHA), - KVM_ISA_EXT_ARR(ZACAS), - KVM_ISA_EXT_ARR(ZALASR), - KVM_ISA_EXT_ARR(ZALRSC), - KVM_ISA_EXT_ARR(ZAWRS), - KVM_ISA_EXT_ARR(ZBA), - KVM_ISA_EXT_ARR(ZBB), - KVM_ISA_EXT_ARR(ZBC), - KVM_ISA_EXT_ARR(ZBKB), - KVM_ISA_EXT_ARR(ZBKC), - KVM_ISA_EXT_ARR(ZBKX), - KVM_ISA_EXT_ARR(ZBS), - KVM_ISA_EXT_ARR(ZCA), - KVM_ISA_EXT_ARR(ZCB), - KVM_ISA_EXT_ARR(ZCD), - KVM_ISA_EXT_ARR(ZCF), - KVM_ISA_EXT_ARR(ZCLSD), - KVM_ISA_EXT_ARR(ZCMOP), - KVM_ISA_EXT_ARR(ZFA), - KVM_ISA_EXT_ARR(ZFBFMIN), - KVM_ISA_EXT_ARR(ZFH), - KVM_ISA_EXT_ARR(ZFHMIN), - KVM_ISA_EXT_ARR(ZICBOM), - KVM_ISA_EXT_ARR(ZICBOP), - KVM_ISA_EXT_ARR(ZICBOZ), - KVM_ISA_EXT_ARR(ZICCRSE), - KVM_ISA_EXT_ARR(ZICNTR), - KVM_ISA_EXT_ARR(ZICOND), - KVM_ISA_EXT_ARR(ZICSR), - KVM_ISA_EXT_ARR(ZIFENCEI), - KVM_ISA_EXT_ARR(ZIHINTNTL), - KVM_ISA_EXT_ARR(ZIHINTPAUSE), - KVM_ISA_EXT_ARR(ZIHPM), - KVM_ISA_EXT_ARR(ZILSD), - KVM_ISA_EXT_ARR(ZIMOP), - KVM_ISA_EXT_ARR(ZKND), - KVM_ISA_EXT_ARR(ZKNE), - KVM_ISA_EXT_ARR(ZKNH), - KVM_ISA_EXT_ARR(ZKR), - KVM_ISA_EXT_ARR(ZKSED), - KVM_ISA_EXT_ARR(ZKSH), - KVM_ISA_EXT_ARR(ZKT), - KVM_ISA_EXT_ARR(ZTSO), - KVM_ISA_EXT_ARR(ZVBB), - KVM_ISA_EXT_ARR(ZVBC), - KVM_ISA_EXT_ARR(ZVFBFMIN), - KVM_ISA_EXT_ARR(ZVFBFWMA), - KVM_ISA_EXT_ARR(ZVFH), - KVM_ISA_EXT_ARR(ZVFHMIN), - KVM_ISA_EXT_ARR(ZVKB), - KVM_ISA_EXT_ARR(ZVKG), - KVM_ISA_EXT_ARR(ZVKNED), - KVM_ISA_EXT_ARR(ZVKNHA), - KVM_ISA_EXT_ARR(ZVKNHB), - KVM_ISA_EXT_ARR(ZVKSED), - KVM_ISA_EXT_ARR(ZVKSH), - KVM_ISA_EXT_ARR(ZVKT), -}; - -static unsigned long kvm_riscv_vcpu_base2isa_ext(unsigned long base_ext) -{ - unsigned long i; - - for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { - if (kvm_isa_ext_arr[i] == base_ext) - return i; - } - - return KVM_RISCV_ISA_EXT_MAX; -} - -int __kvm_riscv_isa_check_host(unsigned long kvm_ext, unsigned long *base_ext) -{ - unsigned long host_ext; - - if (kvm_ext >= KVM_RISCV_ISA_EXT_MAX || - kvm_ext >= ARRAY_SIZE(kvm_isa_ext_arr)) - return -ENOENT; - - kvm_ext = array_index_nospec(kvm_ext, ARRAY_SIZE(kvm_isa_ext_arr)); - switch (kvm_isa_ext_arr[kvm_ext]) { - case RISCV_ISA_EXT_SMNPM: - /* - * Pointer masking effective in (H)S-mode is provided by the - * Smnpm extension, so that extension is reported to the guest, - * even though the CSR bits for configuring VS-mode pointer - * masking on the host side are part of the Ssnpm extension. - */ - host_ext = RISCV_ISA_EXT_SSNPM; - break; - default: - host_ext = kvm_isa_ext_arr[kvm_ext]; - break; - } - - if (!__riscv_isa_extension_available(NULL, host_ext)) - return -ENOENT; - - if (base_ext) - *base_ext = kvm_isa_ext_arr[kvm_ext]; - - return 0; -} - -static bool kvm_riscv_vcpu_isa_enable_allowed(unsigned long ext) -{ - switch (ext) { - case KVM_RISCV_ISA_EXT_H: - return false; - case KVM_RISCV_ISA_EXT_SSCOFPMF: - /* Sscofpmf depends on interrupt filtering defined in ssaia */ - return !kvm_riscv_isa_check_host(SSAIA); - case KVM_RISCV_ISA_EXT_SVADU: - /* - * The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero. - * Guest OS can use Svadu only when host OS enable Svadu. - */ - return arch_has_hw_pte_young(); - case KVM_RISCV_ISA_EXT_V: - return riscv_v_vstate_ctrl_user_allowed(); - default: - break; - } - - return true; -} - -static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) -{ - switch (ext) { - /* Extensions which don't have any mechanism to disable */ - case KVM_RISCV_ISA_EXT_A: - case KVM_RISCV_ISA_EXT_C: - case KVM_RISCV_ISA_EXT_I: - case KVM_RISCV_ISA_EXT_M: - /* There is not architectural config bit to disable sscofpmf completely */ - case KVM_RISCV_ISA_EXT_SSCOFPMF: - case KVM_RISCV_ISA_EXT_SSNPM: - case KVM_RISCV_ISA_EXT_SSTC: - case KVM_RISCV_ISA_EXT_SVINVAL: - case KVM_RISCV_ISA_EXT_SVNAPOT: - case KVM_RISCV_ISA_EXT_SVVPTC: - case KVM_RISCV_ISA_EXT_ZAAMO: - case KVM_RISCV_ISA_EXT_ZABHA: - case KVM_RISCV_ISA_EXT_ZACAS: - case KVM_RISCV_ISA_EXT_ZALASR: - case KVM_RISCV_ISA_EXT_ZALRSC: - case KVM_RISCV_ISA_EXT_ZAWRS: - case KVM_RISCV_ISA_EXT_ZBA: - case KVM_RISCV_ISA_EXT_ZBB: - case KVM_RISCV_ISA_EXT_ZBC: - case KVM_RISCV_ISA_EXT_ZBKB: - case KVM_RISCV_ISA_EXT_ZBKC: - case KVM_RISCV_ISA_EXT_ZBKX: - case KVM_RISCV_ISA_EXT_ZBS: - case KVM_RISCV_ISA_EXT_ZCA: - case KVM_RISCV_ISA_EXT_ZCB: - case KVM_RISCV_ISA_EXT_ZCD: - case KVM_RISCV_ISA_EXT_ZCF: - case KVM_RISCV_ISA_EXT_ZCMOP: - case KVM_RISCV_ISA_EXT_ZFA: - case KVM_RISCV_ISA_EXT_ZFBFMIN: - case KVM_RISCV_ISA_EXT_ZFH: - case KVM_RISCV_ISA_EXT_ZFHMIN: - case KVM_RISCV_ISA_EXT_ZICBOP: - case KVM_RISCV_ISA_EXT_ZICCRSE: - case KVM_RISCV_ISA_EXT_ZICNTR: - case KVM_RISCV_ISA_EXT_ZICOND: - case KVM_RISCV_ISA_EXT_ZICSR: - case KVM_RISCV_ISA_EXT_ZIFENCEI: - case KVM_RISCV_ISA_EXT_ZIHINTNTL: - case KVM_RISCV_ISA_EXT_ZIHINTPAUSE: - case KVM_RISCV_ISA_EXT_ZIHPM: - case KVM_RISCV_ISA_EXT_ZIMOP: - case KVM_RISCV_ISA_EXT_ZKND: - case KVM_RISCV_ISA_EXT_ZKNE: - case KVM_RISCV_ISA_EXT_ZKNH: - case KVM_RISCV_ISA_EXT_ZKR: - case KVM_RISCV_ISA_EXT_ZKSED: - case KVM_RISCV_ISA_EXT_ZKSH: - case KVM_RISCV_ISA_EXT_ZKT: - case KVM_RISCV_ISA_EXT_ZTSO: - case KVM_RISCV_ISA_EXT_ZVBB: - case KVM_RISCV_ISA_EXT_ZVBC: - case KVM_RISCV_ISA_EXT_ZVFBFMIN: - case KVM_RISCV_ISA_EXT_ZVFBFWMA: - case KVM_RISCV_ISA_EXT_ZVFH: - case KVM_RISCV_ISA_EXT_ZVFHMIN: - case KVM_RISCV_ISA_EXT_ZVKB: - case KVM_RISCV_ISA_EXT_ZVKG: - case KVM_RISCV_ISA_EXT_ZVKNED: - case KVM_RISCV_ISA_EXT_ZVKNHA: - case KVM_RISCV_ISA_EXT_ZVKNHB: - case KVM_RISCV_ISA_EXT_ZVKSED: - case KVM_RISCV_ISA_EXT_ZVKSH: - case KVM_RISCV_ISA_EXT_ZVKT: - return false; - /* Extensions which can be disabled using Smstateen */ - case KVM_RISCV_ISA_EXT_SSAIA: - return riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN); - case KVM_RISCV_ISA_EXT_SVADE: - /* - * The henvcfg.ADUE is read-only zero if menvcfg.ADUE is zero. - * Svade can't be disabled unless we support Svadu. - */ - return arch_has_hw_pte_young(); - default: - break; - } - - return true; -} - void kvm_riscv_vcpu_setup_isa(struct kvm_vcpu *vcpu) { unsigned long guest_ext, i; - for (i = 0; i < ARRAY_SIZE(kvm_isa_ext_arr); i++) { + for (i = 0; i < KVM_RISCV_ISA_EXT_MAX; i++) { if (__kvm_riscv_isa_check_host(i, &guest_ext)) continue; - if (kvm_riscv_vcpu_isa_enable_allowed(i)) + if (kvm_riscv_isa_enable_allowed(i)) set_bit(guest_ext, vcpu->arch.isa); } } @@ -363,15 +121,15 @@ static int kvm_riscv_vcpu_set_reg_config(struct kvm_vcpu *vcpu, if (!vcpu->arch.ran_atleast_once) { /* Ignore the enable/disable request for certain extensions */ for (i = 0; i < RISCV_ISA_EXT_BASE; i++) { - isa_ext = kvm_riscv_vcpu_base2isa_ext(i); + isa_ext = kvm_riscv_base2isa_ext(i); if (isa_ext >= KVM_RISCV_ISA_EXT_MAX) { reg_val &= ~BIT(i); continue; } - if (!kvm_riscv_vcpu_isa_enable_allowed(isa_ext)) + if (!kvm_riscv_isa_enable_allowed(isa_ext)) if (reg_val & BIT(i)) reg_val &= ~BIT(i); - if (!kvm_riscv_vcpu_isa_disable_allowed(isa_ext)) + if (!kvm_riscv_isa_disable_allowed(isa_ext)) if (!(reg_val & BIT(i))) reg_val |= BIT(i); } @@ -715,10 +473,10 @@ static int riscv_vcpu_set_isa_ext_single(struct kvm_vcpu *vcpu, * extension can be disabled */ if (reg_val == 1 && - kvm_riscv_vcpu_isa_enable_allowed(reg_num)) + kvm_riscv_isa_enable_allowed(reg_num)) set_bit(guest_ext, vcpu->arch.isa); else if (!reg_val && - kvm_riscv_vcpu_isa_disable_allowed(reg_num)) + kvm_riscv_isa_disable_allowed(reg_num)) clear_bit(guest_ext, vcpu->arch.isa); else return -EINVAL; diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c index a8ca1e65734a..18ef07b3f13a 100644 --- a/arch/riscv/kvm/vcpu_pmu.c +++ b/arch/riscv/kvm/vcpu_pmu.c @@ -7,16 +7,17 @@ */ #define pr_fmt(fmt) "riscv-kvm-pmu: " fmt +#include #include #include #include #include #include #include +#include #include #include #include -#include #define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs) #define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16) diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index cac4f3a5f213..9817ff802821 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/riscv/kvm/vcpu_vector.c b/arch/riscv/kvm/vcpu_vector.c index 42ffb489c447..62d2fb77bb9b 100644 --- a/arch/riscv/kvm/vcpu_vector.c +++ b/arch/riscv/kvm/vcpu_vector.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include From b1834e5afbcd463c83f99dc12e1863803d5a1803 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 20 Jan 2026 13:29:52 +0530 Subject: [PATCH 17/24] RISC-V: KVM: Move timer state defines closer to struct in UAPI header The KVM_RISCV_TIMER_STATE_xyz defines specify possible values of the "state" member in struct kvm_riscv_timer so move these defines closer to struct kvm_riscv_timer in uapi/asm/kvm.h. Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20260120080013.2153519-7-anup.patel@oss.qualcomm.com Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 6a89c1d00a72..504e73305343 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -110,6 +110,10 @@ struct kvm_riscv_timer { __u64 state; }; +/* Possible states for kvm_riscv_timer */ +#define KVM_RISCV_TIMER_STATE_OFF 0 +#define KVM_RISCV_TIMER_STATE_ON 1 + /* * ISA extension IDs specific to KVM. This is not the same as the host ISA * extension IDs as that is internal to the host and should not be exposed @@ -238,10 +242,6 @@ struct kvm_riscv_sbi_fwft { struct kvm_riscv_sbi_fwft_feature pointer_masking; }; -/* Possible states for kvm_riscv_timer */ -#define KVM_RISCV_TIMER_STATE_OFF 0 -#define KVM_RISCV_TIMER_STATE_ON 1 - /* If you need to interpret the index values, here is the key: */ #define KVM_REG_RISCV_TYPE_MASK 0x00000000FF000000 #define KVM_REG_RISCV_TYPE_SHIFT 24 From 0ee5501441cc7536271073063f13390164148e25 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 20 Jan 2026 13:29:53 +0530 Subject: [PATCH 18/24] RISC-V: KVM: Add hideleg to struct kvm_vcpu_config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hideleg CSR state when VCPU is running in guest VS/VU-mode will be different from when it is running in guest HS-mode. To achieve this, add hideleg to struct kvm_vcpu_config and re-program hideleg CSR upon every kvm_arch_vcpu_load(). Signed-off-by: Anup Patel Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20260120080013.2153519-8-anup.patel@oss.qualcomm.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 1 + arch/riscv/kvm/vcpu.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 7ee47b83c80d..abf4f388eac5 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -171,6 +171,7 @@ struct kvm_vcpu_config { u64 henvcfg; u64 hstateen0; unsigned long hedeleg; + unsigned long hideleg; }; struct kvm_vcpu_smstateen_csr { diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 1d5c777eba80..77a9400d7293 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -136,6 +136,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.ran_atleast_once = false; vcpu->arch.cfg.hedeleg = KVM_HEDELEG_DEFAULT; + vcpu->arch.cfg.hideleg = KVM_HIDELEG_DEFAULT; vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; bitmap_zero(vcpu->arch.isa, RISCV_ISA_EXT_MAX); @@ -610,6 +611,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) nacl_csr_write(nsh, CSR_VSCAUSE, csr->vscause); nacl_csr_write(nsh, CSR_VSTVAL, csr->vstval); nacl_csr_write(nsh, CSR_HEDELEG, cfg->hedeleg); + nacl_csr_write(nsh, CSR_HIDELEG, cfg->hideleg); nacl_csr_write(nsh, CSR_HVIP, csr->hvip); nacl_csr_write(nsh, CSR_VSATP, csr->vsatp); nacl_csr_write(nsh, CSR_HENVCFG, cfg->henvcfg); @@ -629,6 +631,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) csr_write(CSR_VSCAUSE, csr->vscause); csr_write(CSR_VSTVAL, csr->vstval); csr_write(CSR_HEDELEG, cfg->hedeleg); + csr_write(CSR_HIDELEG, cfg->hideleg); csr_write(CSR_HVIP, csr->hvip); csr_write(CSR_VSATP, csr->vsatp); csr_write(CSR_HENVCFG, cfg->henvcfg); From 6ed523e2b6129d7dd4aab801a610930d85b2d3f8 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 20 Jan 2026 13:29:54 +0530 Subject: [PATCH 19/24] RISC-V: KVM: Factor-out VCPU config into separate sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VCPU config deals with hideleg, hedeleg, henvcfg, and hstateenX CSR configuration for each VCPU. Factor-out VCPU config into separate sources so that VCPU config can do things differently for guest HS-mode and guest VS/VU-mode. Signed-off-by: Anup Patel Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20260120080013.2153519-9-anup.patel@oss.qualcomm.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_host.h | 20 +---- arch/riscv/include/asm/kvm_vcpu_config.h | 25 ++++++ arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/main.c | 4 +- arch/riscv/kvm/vcpu.c | 80 +++--------------- arch/riscv/kvm/vcpu_config.c | 103 +++++++++++++++++++++++ 6 files changed, 146 insertions(+), 87 deletions(-) create mode 100644 arch/riscv/include/asm/kvm_vcpu_config.h create mode 100644 arch/riscv/kvm/vcpu_config.c diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index abf4f388eac5..85e1bb5b4d7e 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -47,18 +48,6 @@ #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE -#define KVM_HEDELEG_DEFAULT (BIT(EXC_INST_MISALIGNED) | \ - BIT(EXC_INST_ILLEGAL) | \ - BIT(EXC_BREAKPOINT) | \ - BIT(EXC_SYSCALL) | \ - BIT(EXC_INST_PAGE_FAULT) | \ - BIT(EXC_LOAD_PAGE_FAULT) | \ - BIT(EXC_STORE_PAGE_FAULT)) - -#define KVM_HIDELEG_DEFAULT (BIT(IRQ_VS_SOFT) | \ - BIT(IRQ_VS_TIMER) | \ - BIT(IRQ_VS_EXT)) - #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -167,13 +156,6 @@ struct kvm_vcpu_csr { unsigned long senvcfg; }; -struct kvm_vcpu_config { - u64 henvcfg; - u64 hstateen0; - unsigned long hedeleg; - unsigned long hideleg; -}; - struct kvm_vcpu_smstateen_csr { unsigned long sstateen0; }; diff --git a/arch/riscv/include/asm/kvm_vcpu_config.h b/arch/riscv/include/asm/kvm_vcpu_config.h new file mode 100644 index 000000000000..fcc15a0296b3 --- /dev/null +++ b/arch/riscv/include/asm/kvm_vcpu_config.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2026 Qualcomm Technologies, Inc. + */ + +#ifndef __KVM_VCPU_RISCV_CONFIG_H +#define __KVM_VCPU_RISCV_CONFIG_H + +#include + +struct kvm_vcpu; + +struct kvm_vcpu_config { + u64 henvcfg; + u64 hstateen0; + unsigned long hedeleg; + unsigned long hideleg; +}; + +void kvm_riscv_vcpu_config_init(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_config_guest_debug(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_config_ran_once(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_config_load(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 07eab96189e7..296c2ba05089 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -21,6 +21,7 @@ kvm-y += mmu.o kvm-y += nacl.o kvm-y += tlb.o kvm-y += vcpu.o +kvm-y += vcpu_config.o kvm-y += vcpu_exit.o kvm-y += vcpu_fp.o kvm-y += vcpu_insn.o diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 0f3fe3986fc0..5399c3b4071d 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -41,8 +41,8 @@ int kvm_arch_enable_virtualization_cpu(void) if (rc) return rc; - csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT); - csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT); + csr_write(CSR_HEDELEG, 0); + csr_write(CSR_HIDELEG, 0); /* VS should access only the time counter directly. Everything else should trap */ csr_write(CSR_HCOUNTEREN, 0x02); diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 77a9400d7293..6929f7ce5948 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -135,11 +135,12 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Mark this VCPU never ran */ vcpu->arch.ran_atleast_once = false; - vcpu->arch.cfg.hedeleg = KVM_HEDELEG_DEFAULT; - vcpu->arch.cfg.hideleg = KVM_HIDELEG_DEFAULT; vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; bitmap_zero(vcpu->arch.isa, RISCV_ISA_EXT_MAX); + /* Setup VCPU config */ + kvm_riscv_vcpu_config_init(vcpu); + /* Setup ISA features available to VCPU */ kvm_riscv_vcpu_setup_isa(vcpu); @@ -532,59 +533,18 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - if (dbg->control & KVM_GUESTDBG_ENABLE) { + if (dbg->control & KVM_GUESTDBG_ENABLE) vcpu->guest_debug = dbg->control; - vcpu->arch.cfg.hedeleg &= ~BIT(EXC_BREAKPOINT); - } else { + else vcpu->guest_debug = 0; - vcpu->arch.cfg.hedeleg |= BIT(EXC_BREAKPOINT); - } - - vcpu->arch.csr_dirty = true; return 0; } -static void kvm_riscv_vcpu_setup_config(struct kvm_vcpu *vcpu) -{ - const unsigned long *isa = vcpu->arch.isa; - struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; - - if (riscv_isa_extension_available(isa, SVPBMT)) - cfg->henvcfg |= ENVCFG_PBMTE; - - if (riscv_isa_extension_available(isa, SSTC)) - cfg->henvcfg |= ENVCFG_STCE; - - if (riscv_isa_extension_available(isa, ZICBOM)) - cfg->henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); - - if (riscv_isa_extension_available(isa, ZICBOZ)) - cfg->henvcfg |= ENVCFG_CBZE; - - if (riscv_isa_extension_available(isa, SVADU) && - !riscv_isa_extension_available(isa, SVADE)) - cfg->henvcfg |= ENVCFG_ADUE; - - if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { - cfg->hstateen0 |= SMSTATEEN0_HSENVCFG; - if (riscv_isa_extension_available(isa, SSAIA)) - cfg->hstateen0 |= SMSTATEEN0_AIA_IMSIC | - SMSTATEEN0_AIA | - SMSTATEEN0_AIA_ISEL; - if (riscv_isa_extension_available(isa, SMSTATEEN)) - cfg->hstateen0 |= SMSTATEEN0_SSTATEEN0; - } - - if (vcpu->guest_debug) - cfg->hedeleg &= ~BIT(EXC_BREAKPOINT); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { void *nsh; struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; /* * If VCPU is being reloaded on the same physical CPU and no @@ -601,6 +561,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) goto csr_restore_done; vcpu->arch.csr_dirty = false; + + /* + * Load VCPU config CSRs before other CSRs because + * the read/write behaviour of certain CSRs change + * based on VCPU config CSRs. + */ + kvm_riscv_vcpu_config_load(vcpu); + if (kvm_riscv_nacl_sync_csr_available()) { nsh = nacl_shmem(); nacl_csr_write(nsh, CSR_VSSTATUS, csr->vsstatus); @@ -610,18 +578,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) nacl_csr_write(nsh, CSR_VSEPC, csr->vsepc); nacl_csr_write(nsh, CSR_VSCAUSE, csr->vscause); nacl_csr_write(nsh, CSR_VSTVAL, csr->vstval); - nacl_csr_write(nsh, CSR_HEDELEG, cfg->hedeleg); - nacl_csr_write(nsh, CSR_HIDELEG, cfg->hideleg); nacl_csr_write(nsh, CSR_HVIP, csr->hvip); nacl_csr_write(nsh, CSR_VSATP, csr->vsatp); - nacl_csr_write(nsh, CSR_HENVCFG, cfg->henvcfg); - if (IS_ENABLED(CONFIG_32BIT)) - nacl_csr_write(nsh, CSR_HENVCFGH, cfg->henvcfg >> 32); - if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { - nacl_csr_write(nsh, CSR_HSTATEEN0, cfg->hstateen0); - if (IS_ENABLED(CONFIG_32BIT)) - nacl_csr_write(nsh, CSR_HSTATEEN0H, cfg->hstateen0 >> 32); - } } else { csr_write(CSR_VSSTATUS, csr->vsstatus); csr_write(CSR_VSIE, csr->vsie); @@ -630,18 +588,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) csr_write(CSR_VSEPC, csr->vsepc); csr_write(CSR_VSCAUSE, csr->vscause); csr_write(CSR_VSTVAL, csr->vstval); - csr_write(CSR_HEDELEG, cfg->hedeleg); - csr_write(CSR_HIDELEG, cfg->hideleg); csr_write(CSR_HVIP, csr->hvip); csr_write(CSR_VSATP, csr->vsatp); - csr_write(CSR_HENVCFG, cfg->henvcfg); - if (IS_ENABLED(CONFIG_32BIT)) - csr_write(CSR_HENVCFGH, cfg->henvcfg >> 32); - if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { - csr_write(CSR_HSTATEEN0, cfg->hstateen0); - if (IS_ENABLED(CONFIG_32BIT)) - csr_write(CSR_HSTATEEN0H, cfg->hstateen0 >> 32); - } } kvm_riscv_mmu_update_hgatp(vcpu); @@ -891,7 +839,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; if (!vcpu->arch.ran_atleast_once) - kvm_riscv_vcpu_setup_config(vcpu); + kvm_riscv_vcpu_config_ran_once(vcpu); /* Mark this VCPU ran at least once */ vcpu->arch.ran_atleast_once = true; diff --git a/arch/riscv/kvm/vcpu_config.c b/arch/riscv/kvm/vcpu_config.c new file mode 100644 index 000000000000..238418fed2b9 --- /dev/null +++ b/arch/riscv/kvm/vcpu_config.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2026 Qualcomm Technologies, Inc. + */ + +#include +#include + +#define KVM_HEDELEG_DEFAULT (BIT(EXC_INST_MISALIGNED) | \ + BIT(EXC_INST_ILLEGAL) | \ + BIT(EXC_BREAKPOINT) | \ + BIT(EXC_SYSCALL) | \ + BIT(EXC_INST_PAGE_FAULT) | \ + BIT(EXC_LOAD_PAGE_FAULT) | \ + BIT(EXC_STORE_PAGE_FAULT)) + +#define KVM_HIDELEG_DEFAULT (BIT(IRQ_VS_SOFT) | \ + BIT(IRQ_VS_TIMER) | \ + BIT(IRQ_VS_EXT)) + +void kvm_riscv_vcpu_config_init(struct kvm_vcpu *vcpu) +{ + vcpu->arch.cfg.hedeleg = KVM_HEDELEG_DEFAULT; + vcpu->arch.cfg.hideleg = KVM_HIDELEG_DEFAULT; +} + +void kvm_riscv_vcpu_config_guest_debug(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + if (vcpu->guest_debug) + cfg->hedeleg &= ~BIT(EXC_BREAKPOINT); + else + cfg->hedeleg |= BIT(EXC_BREAKPOINT); + + vcpu->arch.csr_dirty = true; +} + +void kvm_riscv_vcpu_config_ran_once(struct kvm_vcpu *vcpu) +{ + const unsigned long *isa = vcpu->arch.isa; + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + + if (riscv_isa_extension_available(isa, SVPBMT)) + cfg->henvcfg |= ENVCFG_PBMTE; + + if (riscv_isa_extension_available(isa, SSTC)) + cfg->henvcfg |= ENVCFG_STCE; + + if (riscv_isa_extension_available(isa, ZICBOM)) + cfg->henvcfg |= (ENVCFG_CBIE | ENVCFG_CBCFE); + + if (riscv_isa_extension_available(isa, ZICBOZ)) + cfg->henvcfg |= ENVCFG_CBZE; + + if (riscv_isa_extension_available(isa, SVADU) && + !riscv_isa_extension_available(isa, SVADE)) + cfg->henvcfg |= ENVCFG_ADUE; + + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + cfg->hstateen0 |= SMSTATEEN0_HSENVCFG; + if (riscv_isa_extension_available(isa, SSAIA)) + cfg->hstateen0 |= SMSTATEEN0_AIA_IMSIC | + SMSTATEEN0_AIA | + SMSTATEEN0_AIA_ISEL; + if (riscv_isa_extension_available(isa, SMSTATEEN)) + cfg->hstateen0 |= SMSTATEEN0_SSTATEEN0; + } + + if (vcpu->guest_debug) + cfg->hedeleg &= ~BIT(EXC_BREAKPOINT); +} + +void kvm_riscv_vcpu_config_load(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; + void *nsh; + + if (kvm_riscv_nacl_sync_csr_available()) { + nsh = nacl_shmem(); + nacl_csr_write(nsh, CSR_HEDELEG, cfg->hedeleg); + nacl_csr_write(nsh, CSR_HIDELEG, cfg->hideleg); + nacl_csr_write(nsh, CSR_HENVCFG, cfg->henvcfg); + if (IS_ENABLED(CONFIG_32BIT)) + nacl_csr_write(nsh, CSR_HENVCFGH, cfg->henvcfg >> 32); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + nacl_csr_write(nsh, CSR_HSTATEEN0, cfg->hstateen0); + if (IS_ENABLED(CONFIG_32BIT)) + nacl_csr_write(nsh, CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + } + } else { + csr_write(CSR_HEDELEG, cfg->hedeleg); + csr_write(CSR_HIDELEG, cfg->hideleg); + csr_write(CSR_HENVCFG, cfg->henvcfg); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HENVCFGH, cfg->henvcfg >> 32); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) { + csr_write(CSR_HSTATEEN0, cfg->hstateen0); + if (IS_ENABLED(CONFIG_32BIT)) + csr_write(CSR_HSTATEEN0H, cfg->hstateen0 >> 32); + } + } +} From e2494f83f9d717a7f607cfaefb4e69e55b8e024d Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Tue, 20 Jan 2026 13:29:55 +0530 Subject: [PATCH 20/24] RISC-V: KVM: Don't check hstateen0 when updating sstateen0 CSR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hstateen0 will be programmed differently for guest HS-mode and guest VS/VU-mode so don't check hstateen0.SSTATEEN0 bit when updating sstateen0 CSR in kvm_riscv_vcpu_swap_in_guest_state() and kvm_riscv_vcpu_swap_in_host_state(). Signed-off-by: Anup Patel Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20260120080013.2153519-10-anup.patel@oss.qualcomm.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 6929f7ce5948..a73690eda84b 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -721,28 +721,22 @@ static __always_inline void kvm_riscv_vcpu_swap_in_guest_state(struct kvm_vcpu * { struct kvm_vcpu_smstateen_csr *smcsr = &vcpu->arch.smstateen_csr; struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; vcpu->arch.host_scounteren = csr_swap(CSR_SCOUNTEREN, csr->scounteren); vcpu->arch.host_senvcfg = csr_swap(CSR_SENVCFG, csr->senvcfg); - if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && - (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) - vcpu->arch.host_sstateen0 = csr_swap(CSR_SSTATEEN0, - smcsr->sstateen0); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) + vcpu->arch.host_sstateen0 = csr_swap(CSR_SSTATEEN0, smcsr->sstateen0); } static __always_inline void kvm_riscv_vcpu_swap_in_host_state(struct kvm_vcpu *vcpu) { struct kvm_vcpu_smstateen_csr *smcsr = &vcpu->arch.smstateen_csr; struct kvm_vcpu_csr *csr = &vcpu->arch.guest_csr; - struct kvm_vcpu_config *cfg = &vcpu->arch.cfg; csr->scounteren = csr_swap(CSR_SCOUNTEREN, vcpu->arch.host_scounteren); csr->senvcfg = csr_swap(CSR_SENVCFG, vcpu->arch.host_senvcfg); - if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN) && - (cfg->hstateen0 & SMSTATEEN0_SSTATEEN0)) - smcsr->sstateen0 = csr_swap(CSR_SSTATEEN0, - vcpu->arch.host_sstateen0); + if (riscv_has_extension_unlikely(RISCV_ISA_EXT_SMSTATEEN)) + smcsr->sstateen0 = csr_swap(CSR_SSTATEEN0, vcpu->arch.host_sstateen0); } /* From efcac8424ba6ab75f2e16be9b0ccfdf60b13b294 Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Fri, 3 Apr 2026 23:30:16 +0800 Subject: [PATCH 21/24] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode Introduces one per-VM architecture-specific fields to support runtime configuration of the G-stage page table format: - kvm->arch.pgd_levels: the corresponding number of page table levels for the selected mode. These fields replace the previous global variables kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different virtual machines to independently select their G-stage page table format instead of being forced to share the maximum mode detected by the kernel at boot time. Signed-off-by: Fangyu Yu Reviewed-by: Andrew Jones Reviewed-by: Anup Patel Reviewed-by: Guo Ren Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20260403153019.9916-2-fangyu.yu@linux.alibaba.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_gstage.h | 37 ++++++++++++--- arch/riscv/include/asm/kvm_host.h | 1 + arch/riscv/kvm/gstage.c | 71 ++++++++++++++--------------- arch/riscv/kvm/main.c | 12 ++--- arch/riscv/kvm/mmu.c | 20 ++++---- arch/riscv/kvm/vm.c | 5 +- arch/riscv/kvm/vmid.c | 3 +- 7 files changed, 89 insertions(+), 60 deletions(-) diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index a89d1422cc84..c35874768641 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -29,16 +29,22 @@ struct kvm_gstage_mapping { #define kvm_riscv_gstage_index_bits 10 #endif -extern unsigned long kvm_riscv_gstage_mode; -extern unsigned long kvm_riscv_gstage_pgd_levels; +extern unsigned long kvm_riscv_gstage_max_pgd_levels; #define kvm_riscv_gstage_pgd_xbits 2 #define kvm_riscv_gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits)) -#define kvm_riscv_gstage_gpa_bits (HGATP_PAGE_SHIFT + \ - (kvm_riscv_gstage_pgd_levels * \ - kvm_riscv_gstage_index_bits) + \ - kvm_riscv_gstage_pgd_xbits) -#define kvm_riscv_gstage_gpa_size ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits)) + +static inline unsigned long kvm_riscv_gstage_gpa_bits(unsigned long pgd_levels) +{ + return (HGATP_PAGE_SHIFT + + pgd_levels * kvm_riscv_gstage_index_bits + + kvm_riscv_gstage_pgd_xbits); +} + +static inline gpa_t kvm_riscv_gstage_gpa_size(unsigned long pgd_levels) +{ + return BIT_ULL(kvm_riscv_gstage_gpa_bits(pgd_levels)); +} bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, pte_t **ptepp, u32 *ptep_level); @@ -73,4 +79,21 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end void kvm_riscv_gstage_mode_detect(void); +static inline unsigned long kvm_riscv_gstage_mode(unsigned long pgd_levels) +{ + switch (pgd_levels) { + case 2: + return HGATP_MODE_SV32X4; + case 3: + return HGATP_MODE_SV39X4; + case 4: + return HGATP_MODE_SV48X4; + case 5: + return HGATP_MODE_SV57X4; + default: + WARN_ON_ONCE(1); + return HGATP_MODE_OFF; + } +} + #endif diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 85e1bb5b4d7e..75b0a951c1bc 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -83,6 +83,7 @@ struct kvm_arch { /* G-stage page table */ pgd_t *pgd; phys_addr_t pgd_phys; + unsigned long pgd_levels; /* Guest Timer */ struct kvm_guest_timer timer; diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index ffec3e5ddcaf..bf7e54af2c7c 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -12,22 +12,21 @@ #include #ifdef CONFIG_64BIT -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4; -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3; +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3; #else -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4; -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2; +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2; #endif #define gstage_pte_leaf(__ptep) \ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) +static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage, + gpa_t addr, u32 level) { unsigned long mask; unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level); - if (level == (kvm_riscv_gstage_pgd_levels - 1)) + if (level == gstage->kvm->arch.pgd_levels - 1) mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1; else mask = PTRS_PER_PTE - 1; @@ -40,12 +39,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte) return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte))); } -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) +static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size, + u32 *out_level) { u32 i; unsigned long psz = 1UL << 12; - for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) { + for (i = 0; i < gstage->kvm->arch.pgd_levels; i++) { if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) { *out_level = i; return 0; @@ -55,21 +55,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) return -EINVAL; } -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) +static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level, + unsigned long *out_pgorder) { - if (kvm_riscv_gstage_pgd_levels < level) + if (gstage->kvm->arch.pgd_levels < level) return -EINVAL; *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits); return 0; } -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) +static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level, + unsigned long *out_pgsize) { int rc; unsigned long page_order = PAGE_SHIFT; - rc = gstage_level_to_page_order(level, &page_order); + rc = gstage_level_to_page_order(gstage, level, &page_order); if (rc) return rc; @@ -81,11 +83,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, pte_t **ptepp, u32 *ptep_level) { pte_t *ptep; - u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + u32 current_level = gstage->kvm->arch.pgd_levels - 1; *ptep_level = current_level; ptep = (pte_t *)gstage->pgd; - ptep = &ptep[gstage_pte_index(addr, current_level)]; + ptep = &ptep[gstage_pte_index(gstage, addr, current_level)]; while (ptep && pte_val(ptep_get(ptep))) { if (gstage_pte_leaf(ptep)) { *ptep_level = current_level; @@ -97,7 +99,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, current_level--; *ptep_level = current_level; ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); - ptep = &ptep[gstage_pte_index(addr, current_level)]; + ptep = &ptep[gstage_pte_index(gstage, addr, current_level)]; } else { ptep = NULL; } @@ -110,7 +112,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr) { unsigned long order = PAGE_SHIFT; - if (gstage_level_to_page_order(level, &order)) + if (gstage_level_to_page_order(gstage, level, &order)) return; addr &= ~(BIT(order) - 1); @@ -125,9 +127,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, const struct kvm_gstage_mapping *map) { - u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + u32 current_level = gstage->kvm->arch.pgd_levels - 1; pte_t *next_ptep = (pte_t *)gstage->pgd; - pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)]; if (current_level < map->level) return -EINVAL; @@ -151,7 +153,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, } current_level--; - ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)]; } if (pte_val(*ptep) != pte_val(map->pte)) { @@ -194,7 +196,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, out_map->addr = gpa; out_map->level = 0; - ret = gstage_page_size_to_level(page_size, &out_map->level); + ret = gstage_page_size_to_level(gstage, page_size, &out_map->level); if (ret) return ret; @@ -286,7 +288,7 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, gpa_t addr, u32 target_level, bool flush) { - u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + u32 current_level = gstage->kvm->arch.pgd_levels - 1; pte_t *next_ptep = (pte_t *)gstage->pgd; unsigned long huge_pte, child_pte; unsigned long child_page_size; @@ -297,7 +299,7 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, return -ENOMEM; while(current_level > target_level) { - ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)]; + ptep = (pte_t *)&next_ptep[gstage_pte_index(gstage, addr, current_level)]; if (!pte_val(ptep_get(ptep))) break; @@ -310,7 +312,7 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, huge_pte = pte_val(ptep_get(ptep)); - ret = gstage_level_to_page_size(current_level - 1, &child_page_size); + ret = gstage_level_to_page_size(gstage, current_level - 1, &child_page_size); if (ret) return ret; @@ -343,7 +345,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, u32 next_ptep_level; unsigned long next_page_size, page_size; - ret = gstage_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size); if (ret) return; @@ -355,7 +357,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, if (ptep_level && !gstage_pte_leaf(ptep)) { next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); next_ptep_level = ptep_level - 1; - ret = gstage_level_to_page_size(next_ptep_level, &next_page_size); + ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size); if (ret) return; @@ -389,7 +391,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage, while (addr < end) { found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size); if (ret) break; @@ -423,7 +425,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end while (addr < end) { found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(gstage, ptep_level, &page_size); if (ret) break; @@ -444,39 +446,34 @@ void __init kvm_riscv_gstage_mode_detect(void) /* Try Sv57x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV57X4; - kvm_riscv_gstage_pgd_levels = 5; + kvm_riscv_gstage_max_pgd_levels = 5; goto done; } /* Try Sv48x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV48X4; - kvm_riscv_gstage_pgd_levels = 4; + kvm_riscv_gstage_max_pgd_levels = 4; goto done; } /* Try Sv39x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV39X4; - kvm_riscv_gstage_pgd_levels = 3; + kvm_riscv_gstage_max_pgd_levels = 3; goto done; } #else /* CONFIG_32BIT */ /* Try Sv32x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) { - kvm_riscv_gstage_mode = HGATP_MODE_SV32X4; - kvm_riscv_gstage_pgd_levels = 2; + kvm_riscv_gstage_max_pgd_levels = 2; goto done; } #endif /* KVM depends on !HGATP_MODE_OFF */ - kvm_riscv_gstage_mode = HGATP_MODE_OFF; - kvm_riscv_gstage_pgd_levels = 0; + kvm_riscv_gstage_max_pgd_levels = 0; done: csr_write(CSR_HGATP, 0); diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 5399c3b4071d..cb8a65273c1f 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -105,17 +105,17 @@ static int __init riscv_kvm_init(void) return rc; kvm_riscv_gstage_mode_detect(); - switch (kvm_riscv_gstage_mode) { - case HGATP_MODE_SV32X4: + switch (kvm_riscv_gstage_max_pgd_levels) { + case 2: str = "Sv32x4"; break; - case HGATP_MODE_SV39X4: + case 3: str = "Sv39x4"; break; - case HGATP_MODE_SV48X4: + case 4: str = "Sv48x4"; break; - case HGATP_MODE_SV57X4: + case 5: str = "Sv57x4"; break; default: @@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void) (rc) ? slist : "no features"); } - kvm_info("using %s G-stage page table format\n", str); + kvm_info("highest G-stage page table mode is %s\n", str); kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 088d33ba90ed..fbcdd75cb9af 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, if (!writable) map.pte = pte_wrprotect(map.pte); - ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels); + ret = kvm_mmu_topup_memory_cache(&pcache, kvm->arch.pgd_levels); if (ret) goto out; @@ -186,7 +186,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest GPA space. */ if ((new->base_gfn + new->npages) >= - (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT)) + kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels) >> PAGE_SHIFT) return -EFAULT; hva = new->userspace_addr; @@ -472,7 +472,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, memset(out_map, 0, sizeof(*out_map)); /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels); + ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.pgd_levels); if (ret) { kvm_err("Failed to topup G-stage cache\n"); return ret; @@ -575,6 +575,7 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm) return -ENOMEM; kvm->arch.pgd = page_to_virt(pgd_page); kvm->arch.pgd_phys = page_to_phys(pgd_page); + kvm->arch.pgd_levels = kvm_riscv_gstage_max_pgd_levels; return 0; } @@ -590,10 +591,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm) gstage.flags = 0; gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); gstage.pgd = kvm->arch.pgd; - kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false); + kvm_riscv_gstage_unmap_range(&gstage, 0UL, + kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; kvm->arch.pgd_phys = 0; + kvm->arch.pgd_levels = 0; } spin_unlock(&kvm->mmu_lock); @@ -603,11 +606,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm) void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu) { - unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT; - struct kvm_arch *k = &vcpu->kvm->arch; + struct kvm_arch *ka = &vcpu->kvm->arch; + unsigned long hgatp = kvm_riscv_gstage_mode(ka->pgd_levels) + << HGATP_MODE_SHIFT; - hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID; - hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN; + hgatp |= (READ_ONCE(ka->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID; + hgatp |= (ka->pgd_phys >> PAGE_SHIFT) & HGATP_PPN; ncsr_write(CSR_HGATP, hgatp); diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 13c63ae1a78b..fb7c4e07961f 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -199,7 +199,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_VM_GPA_BITS: - r = kvm_riscv_gstage_gpa_bits; + if (!kvm) + r = kvm_riscv_gstage_gpa_bits(kvm_riscv_gstage_max_pgd_levels); + else + r = kvm_riscv_gstage_gpa_bits(kvm->arch.pgd_levels); break; default: r = 0; diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index cf34d448289d..c15bdb1dd8be 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -26,7 +26,8 @@ static DEFINE_SPINLOCK(vmid_lock); void __init kvm_riscv_gstage_vmid_detect(void) { /* Figure-out number of VMID bits in HW */ - csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID); + csr_write(CSR_HGATP, (kvm_riscv_gstage_mode(kvm_riscv_gstage_max_pgd_levels) << + HGATP_MODE_SHIFT) | HGATP_VMID); vmid_bits = csr_read(CSR_HGATP); vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT; vmid_bits = fls_long(vmid_bits); From ec92248431be7ad08742e0d1dff5109cec5ef905 Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Fri, 3 Apr 2026 23:30:17 +0800 Subject: [PATCH 22/24] RISC-V: KVM: Cache gstage pgd_levels in struct kvm_gstage Gstage page-table helpers frequently chase gstage->kvm->arch to fetch pgd_levels. This adds noise and repeats the same dereference chain in hot paths. Add pgd_levels to struct kvm_gstage and initialize it from kvm->arch when setting up a gstage instance. Introduce kvm_riscv_gstage_init() to centralize initialization and switch gstage code to use gstage->pgd_levels. Suggested-by: Anup Patel Signed-off-by: Fangyu Yu Reviewed-by: Anup Patel Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20260403153019.9916-3-fangyu.yu@linux.alibaba.com Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_gstage.h | 10 ++++++ arch/riscv/kvm/gstage.c | 12 +++---- arch/riscv/kvm/mmu.c | 50 ++++++----------------------- 3 files changed, 26 insertions(+), 46 deletions(-) diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index c35874768641..9c908432bc17 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -15,6 +15,7 @@ struct kvm_gstage { #define KVM_GSTAGE_FLAGS_LOCAL BIT(0) unsigned long vmid; pgd_t *pgd; + unsigned long pgd_levels; }; struct kvm_gstage_mapping { @@ -96,4 +97,13 @@ static inline unsigned long kvm_riscv_gstage_mode(unsigned long pgd_levels) } } +static inline void kvm_riscv_gstage_init(struct kvm_gstage *gstage, struct kvm *kvm) +{ + gstage->kvm = kvm; + gstage->flags = 0; + gstage->vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage->pgd = kvm->arch.pgd; + gstage->pgd_levels = kvm->arch.pgd_levels; +} + #endif diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index bf7e54af2c7c..d9fe8be2a151 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -26,7 +26,7 @@ static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage, unsigned long mask; unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level); - if (level == gstage->kvm->arch.pgd_levels - 1) + if (level == gstage->pgd_levels - 1) mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1; else mask = PTRS_PER_PTE - 1; @@ -45,7 +45,7 @@ static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long pa u32 i; unsigned long psz = 1UL << 12; - for (i = 0; i < gstage->kvm->arch.pgd_levels; i++) { + for (i = 0; i < gstage->pgd_levels; i++) { if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) { *out_level = i; return 0; @@ -58,7 +58,7 @@ static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long pa static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level, unsigned long *out_pgorder) { - if (gstage->kvm->arch.pgd_levels < level) + if (gstage->pgd_levels < level) return -EINVAL; *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits); @@ -83,7 +83,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, pte_t **ptepp, u32 *ptep_level) { pte_t *ptep; - u32 current_level = gstage->kvm->arch.pgd_levels - 1; + u32 current_level = gstage->pgd_levels - 1; *ptep_level = current_level; ptep = (pte_t *)gstage->pgd; @@ -127,7 +127,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, const struct kvm_gstage_mapping *map) { - u32 current_level = gstage->kvm->arch.pgd_levels - 1; + u32 current_level = gstage->pgd_levels - 1; pte_t *next_ptep = (pte_t *)gstage->pgd; pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)]; @@ -288,7 +288,7 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, gpa_t addr, u32 target_level, bool flush) { - u32 current_level = gstage->kvm->arch.pgd_levels - 1; + u32 current_level = gstage->pgd_levels - 1; pte_t *next_ptep = (pte_t *)gstage->pgd; unsigned long huge_pte, child_pte; unsigned long child_page_size; diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index fbcdd75cb9af..2d3def024270 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -24,10 +24,7 @@ static void mmu_wp_memory_region(struct kvm *kvm, int slot) phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); spin_lock(&kvm->mmu_lock); kvm_riscv_gstage_wp_range(&gstage, start, end); @@ -49,10 +46,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, struct kvm_gstage_mapping map; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(hpa); @@ -89,10 +83,7 @@ void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size) { struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); spin_lock(&kvm->mmu_lock); kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); @@ -109,10 +100,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); kvm_riscv_gstage_wp_range(&gstage, start, end); } @@ -141,10 +129,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; struct kvm_gstage gstage; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); spin_lock(&kvm->mmu_lock); kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); @@ -250,10 +235,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.pgd) return false; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); mmu_locked = spin_trylock(&kvm->mmu_lock); kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, @@ -275,10 +257,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; @@ -298,10 +277,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; @@ -463,10 +439,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, struct kvm_gstage gstage; struct page *page; - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); /* Setup initial state of output mapping */ memset(out_map, 0, sizeof(*out_map)); @@ -587,10 +560,7 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm) spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - gstage.kvm = kvm; - gstage.flags = 0; - gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); - gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_init(&gstage, kvm); kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(kvm->arch.pgd_levels), false); pgd = READ_ONCE(kvm->arch.pgd); From 7263b4fdb0b240e67e3ebd802e0df761d35a7fdf Mon Sep 17 00:00:00 2001 From: Fangyu Yu Date: Fri, 3 Apr 2026 23:30:18 +0800 Subject: [PATCH 23/24] RISC-V: KVM: Reuse KVM_CAP_VM_GPA_BITS to select HGATP.MODE Reuse KVM_CAP_VM_GPA_BITS to advertise and select the effective G-stage GPA width for a VM. KVM_CHECK_EXTENSION(KVM_CAP_VM_GPA_BITS) returns the effective GPA bits for a VM, KVM_ENABLE_CAP(KVM_CAP_VM_GPA_BITS) allows userspace to downsize the effective GPA width by selecting a smaller G-stage page table format: - gpa_bits <= 41 selects Sv39x4 (pgd_levels=3) - gpa_bits <= 50 selects Sv48x4 (pgd_levels=4) - gpa_bits <= 59 selects Sv57x4 (pgd_levels=5) Reject the request with -EINVAL for unsupported values and with -EBUSY if vCPUs have been created or any memslot is populated. Signed-off-by: Fangyu Yu Reviewed-by: Andrew Jones Reviewed-by: Guo Ren Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20260403153019.9916-4-fangyu.yu@linux.alibaba.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vm.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index fb7c4e07961f..a9f083feeb76 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -214,12 +214,52 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { + if (cap->flags) + return -EINVAL; + switch (cap->cap) { case KVM_CAP_RISCV_MP_STATE_RESET: - if (cap->flags) - return -EINVAL; kvm->arch.mp_state_reset = true; return 0; + case KVM_CAP_VM_GPA_BITS: { + unsigned long gpa_bits = cap->args[0]; + unsigned long new_levels; + int r = 0; + + /* Decide target pgd levels from requested gpa_bits */ +#ifdef CONFIG_64BIT + if (gpa_bits <= 41) + new_levels = 3; /* Sv39x4 */ + else if (gpa_bits <= 50) + new_levels = 4; /* Sv48x4 */ + else if (gpa_bits <= 59) + new_levels = 5; /* Sv57x4 */ + else + return -EINVAL; +#else + /* 32-bit: only Sv32x4*/ + if (gpa_bits <= 34) + new_levels = 2; + else + return -EINVAL; +#endif + if (new_levels > kvm_riscv_gstage_max_pgd_levels) + return -EINVAL; + + /* Follow KVM's lock ordering: kvm->lock -> kvm->slots_lock. */ + mutex_lock(&kvm->lock); + mutex_lock(&kvm->slots_lock); + + if (kvm->created_vcpus || !kvm_are_all_memslots_empty(kvm)) + r = -EBUSY; + else + kvm->arch.pgd_levels = new_levels; + + mutex_unlock(&kvm->slots_lock); + mutex_unlock(&kvm->lock); + + return r; + } default: return -EINVAL; } From ddbf9c76c4020bf63a0799b00faad40caa3de6c2 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Fri, 3 Apr 2026 23:20:11 +0000 Subject: [PATCH 24/24] RISC-V: KVM: Fix shift-out-of-bounds in make_xfence_request() The make_xfence_request() function uses a shift operation to check if a vCPU is in the hart mask: if (!(hmask & (1UL << (vcpu->vcpu_id - hbase)))) However, when the difference between vcpu_id and hbase is >= BITS_PER_LONG, the shift operation causes undefined behavior. This was detected by UBSAN: UBSAN: shift-out-of-bounds in arch/riscv/kvm/tlb.c:343:23 shift exponent 256 is too large for 64-bit type 'long unsigned int' Fix this by adding a bounds check before the shift operation. This bug was found by fuzzing the KVM RISC-V interface. Fixes: 13acfec2dbcc ("RISC-V: KVM: Add remote HFENCE functions based on VCPU requests") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20260403232011.2394966-1-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/tlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index ff1aeac4eb8e..439c20c2775a 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -338,7 +338,8 @@ static void make_xfence_request(struct kvm *kvm, bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (hbase != -1UL) { - if (vcpu->vcpu_id < hbase) + if (vcpu->vcpu_id < hbase || + vcpu->vcpu_id >= hbase + BITS_PER_LONG) continue; if (!(hmask & (1UL << (vcpu->vcpu_id - hbase)))) continue;