From 12a94953c37e834c3eabb839ce057094946fe67a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Nov 2025 16:39:54 +0000 Subject: [PATCH 1/6] perf/arm-cmn: Support CMN-600AE The functional safety features of CMN-600AE have little to no impact on the PMU relative to the base CMN-600 design, so for simplicity we can reasonably just treat it as the same thing. The only obvious difference is that the revision numbers aren't aligned, so we may hide some aliases for events which do actually exist, but those can still be specified via the underlying "type,eventid" format so it's not too big a deal. Signed-off-by: Robin Murphy Reviewed-by: Ilkka Koskinen Tested-by: Michal Simek Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 23245352a3fc..651edd73bfcb 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -210,6 +210,7 @@ enum cmn_model { enum cmn_part { PART_CMN600 = 0x434, PART_CMN650 = 0x436, + PART_CMN600AE = 0x438, PART_CMN700 = 0x43c, PART_CI700 = 0x43a, PART_CMN_S3 = 0x43e, @@ -2266,6 +2267,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) reg = readq_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_01); part = FIELD_GET(CMN_CFGM_PID0_PART_0, reg); part |= FIELD_GET(CMN_CFGM_PID1_PART_1, reg) << 8; + /* 600AE is close enough that it's not really worth more complexity */ + if (part == PART_CMN600AE) + part = PART_CMN600; if (cmn->part && cmn->part != part) dev_warn(cmn->dev, "Firmware binding mismatch: expected part number 0x%x, found 0x%x\n", From 0c7c64146f707ffe7abd5d11c9828d8129903ab5 Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Mon, 15 Dec 2025 20:20:03 -0500 Subject: [PATCH 2/6] drivers: perf: use bitmap_empty() where appropriate bitmap_empty() is more expressive and efficient, as it stops traversing the bitmap as soon as the first set bit is found. 
Switch perf code to using bitmap_empty() where appropriate, and correspondingly use boolean types. Signed-off-by: Yury Norov (NVIDIA) Signed-off-by: Will Deacon --- drivers/perf/riscv_pmu_sbi.c | 2 +- drivers/perf/starfive_starlink_pmu.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 7dd282da67ce..9dcc22fd48ef 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -1244,7 +1244,7 @@ static int riscv_pm_pmu_notify(struct notifier_block *b, unsigned long cmd, { struct riscv_pmu *rvpmu = container_of(b, struct riscv_pmu, riscv_pm_nb); struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events); - int enabled = bitmap_weight(cpuc->used_hw_ctrs, RISCV_MAX_COUNTERS); + bool enabled = !bitmap_empty(cpuc->used_hw_ctrs, RISCV_MAX_COUNTERS); struct perf_event *event; int idx; diff --git a/drivers/perf/starfive_starlink_pmu.c b/drivers/perf/starfive_starlink_pmu.c index 5e5a672b4229..964897c2baa9 100644 --- a/drivers/perf/starfive_starlink_pmu.c +++ b/drivers/perf/starfive_starlink_pmu.c @@ -450,8 +450,7 @@ static int starlink_pmu_pm_notify(struct notifier_block *b, starlink_pmu_pm_nb); struct starlink_hw_events *hw_events = this_cpu_ptr(starlink_pmu->hw_events); - int enabled = bitmap_weight(hw_events->used_mask, - STARLINK_PMU_MAX_COUNTERS); + bool enabled = !bitmap_empty(hw_events->used_mask, STARLINK_PMU_MAX_COUNTERS); struct perf_event *event; int idx; From 0113affc91014a14251890c3af8f2bade1c20222 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Dec 2025 13:04:58 +0000 Subject: [PATCH 3/6] perf/arm_dsu: Support DSU-110 DSU-110 sneakily made all the event counters 64-bit, perhaps related to no longer having AArch32 EL1 to worry about. While the DSU version itself is not easily discoverable, the size of a counter certainly is. 
Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index cb4fb59fe04b..8663721ee018 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -66,13 +66,6 @@ */ #define DSU_PMU_IDX_CYCLE_COUNTER 31 -/* All event counters are 32bit, with a 64bit Cycle counter */ -#define DSU_PMU_COUNTER_WIDTH(idx) \ - (((idx) == DSU_PMU_IDX_CYCLE_COUNTER) ? 64 : 32) - -#define DSU_PMU_COUNTER_MASK(idx) \ - GENMASK_ULL((DSU_PMU_COUNTER_WIDTH((idx)) - 1), 0) - #define DSU_EXT_ATTR(_name, _func, _config) \ (&((struct dev_ext_attribute[]) { \ { \ @@ -107,6 +100,7 @@ struct dsu_hw_events { * @num_counters : Number of event counters implemented by the PMU, * excluding the cycle counter. * @irq : Interrupt line for counter overflow. + * @has_32b_pmevcntr : Are the non-cycle counters only 32-bit? * @cpmceid_bitmap : Bitmap for the availability of architected common * events (event_code < 0x40). */ @@ -120,6 +114,7 @@ struct dsu_pmu { struct hlist_node cpuhp_node; s8 num_counters; int irq; + bool has_32b_pmevcntr; DECLARE_BITMAP(cpmceid_bitmap, DSU_PMU_MAX_COMMON_EVENTS); }; @@ -328,6 +323,11 @@ static inline void dsu_pmu_set_event(struct dsu_pmu *dsu_pmu, raw_spin_unlock_irqrestore(&dsu_pmu->pmu_lock, flags); } +static u64 dsu_pmu_counter_mask(struct hw_perf_event *hw) +{ + return (hw->flags && hw->idx != DSU_PMU_IDX_CYCLE_COUNTER) ? 
U32_MAX : U64_MAX; +} + static void dsu_pmu_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -339,7 +339,7 @@ static void dsu_pmu_event_update(struct perf_event *event) new_count = dsu_pmu_read_counter(event); } while (local64_cmpxchg(&hwc->prev_count, prev_count, new_count) != prev_count); - delta = (new_count - prev_count) & DSU_PMU_COUNTER_MASK(hwc->idx); + delta = (new_count - prev_count) & dsu_pmu_counter_mask(hwc); local64_add(delta, &event->count); } @@ -362,8 +362,7 @@ static inline u32 dsu_pmu_get_reset_overflow(void) */ static void dsu_pmu_set_event_period(struct perf_event *event) { - int idx = event->hw.idx; - u64 val = DSU_PMU_COUNTER_MASK(idx) >> 1; + u64 val = dsu_pmu_counter_mask(&event->hw) >> 1; local64_set(&event->hw.prev_count, val); dsu_pmu_write_counter(event, val); @@ -564,6 +563,7 @@ static int dsu_pmu_event_init(struct perf_event *event) return -EINVAL; event->hw.config_base = event->attr.config; + event->hw.flags = dsu_pmu->has_32b_pmevcntr; return 0; } @@ -664,6 +664,10 @@ static void dsu_pmu_probe_pmu(struct dsu_pmu *dsu_pmu) cpmceid[1] = __dsu_pmu_read_pmceid(1); bitmap_from_arr32(dsu_pmu->cpmceid_bitmap, cpmceid, DSU_PMU_MAX_COMMON_EVENTS); + /* Newer DSUs have 64-bit counters */ + __dsu_pmu_write_counter(0, U64_MAX); + if (__dsu_pmu_read_counter(0) != U64_MAX) + dsu_pmu->has_32b_pmevcntr = true; } static void dsu_pmu_set_active_cpu(int cpu, struct dsu_pmu *dsu_pmu) From 85c0dbd8b6e2ca5e672560c7cd86801bffa0d884 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Dec 2025 13:04:59 +0000 Subject: [PATCH 4/6] perf/arm_dsu: Support DSU-120 DSU-120 has the same system register interface as previous DSUs, but no longer offers a dedicated cycle counter. While this is not directly discoverable via PMCR, the PMCCNTR register is still defined to exist with RAZ/WI behaviour, allowing for a straightforward heuristic. 
Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 8663721ee018..56c592f0dae3 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -101,6 +101,7 @@ struct dsu_hw_events { * excluding the cycle counter. * @irq : Interrupt line for counter overflow. * @has_32b_pmevcntr : Are the non-cycle counters only 32-bit? + * @has_pmccntr : Do we even have a dedicated cycle counter? * @cpmceid_bitmap : Bitmap for the availability of architected common * events (event_code < 0x40). */ @@ -115,6 +116,7 @@ struct dsu_pmu { s8 num_counters; int irq; bool has_32b_pmevcntr; + bool has_pmccntr; DECLARE_BITMAP(cpmceid_bitmap, DSU_PMU_MAX_COMMON_EVENTS); }; @@ -281,7 +283,7 @@ static int dsu_pmu_get_event_idx(struct dsu_hw_events *hw_events, struct dsu_pmu *dsu_pmu = to_dsu_pmu(event->pmu); unsigned long *used_mask = hw_events->used_mask; - if (evtype == DSU_PMU_EVT_CYCLES) { + if (evtype == DSU_PMU_EVT_CYCLES && dsu_pmu->has_pmccntr) { if (test_and_set_bit(DSU_PMU_IDX_CYCLE_COUNTER, used_mask)) return -EAGAIN; return DSU_PMU_IDX_CYCLE_COUNTER; @@ -668,6 +670,10 @@ static void dsu_pmu_probe_pmu(struct dsu_pmu *dsu_pmu) __dsu_pmu_write_counter(0, U64_MAX); if (__dsu_pmu_read_counter(0) != U64_MAX) dsu_pmu->has_32b_pmevcntr = true; + /* On even newer DSUs, PMCCNTR is RAZ/WI */ + __dsu_pmu_write_pmccntr(U64_MAX); + if (__dsu_pmu_read_pmccntr() == U64_MAX) + dsu_pmu->has_pmccntr = true; } static void dsu_pmu_set_active_cpu(int cpu, struct dsu_pmu *dsu_pmu) From 79448fa1f495c3e3b7119e53bedc3cce273aa95f Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 15 Dec 2025 13:05:00 +0000 Subject: [PATCH 5/6] perf/arm_dsu: Allow standard cycles events Since we do not use the divide-by-64 option, there should be no significant difference between the dedicated cycle counter and the standard cycles event. 
Since using the latter on DSU-120 now has the side-effect of allowing multiple cycles events to be scheduled simultaneously (beneficial for multiple cycle-based metrics), there seems little reason not to allow the same on older DSUs as well. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 56c592f0dae3..32b0dd7c693b 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -284,9 +284,8 @@ static int dsu_pmu_get_event_idx(struct dsu_hw_events *hw_events, unsigned long *used_mask = hw_events->used_mask; if (evtype == DSU_PMU_EVT_CYCLES && dsu_pmu->has_pmccntr) { - if (test_and_set_bit(DSU_PMU_IDX_CYCLE_COUNTER, used_mask)) - return -EAGAIN; - return DSU_PMU_IDX_CYCLE_COUNTER; + if (!test_and_set_bit(DSU_PMU_IDX_CYCLE_COUNTER, used_mask)) + return DSU_PMU_IDX_CYCLE_COUNTER; } idx = find_first_zero_bit(used_mask, dsu_pmu->num_counters); From ab26d9c85554c4ff1d95ca8341522880ed9219d6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 28 Jan 2026 10:55:34 +0100 Subject: [PATCH 6/6] perf/cxlpmu: Replace IRQF_ONESHOT with IRQF_NO_THREAD Passing IRQF_ONESHOT ensures that the interrupt source is masked until the secondary (threaded) handler is done. If only a primary handler is used then the flag makes no sense because the interrupt cannot fire (again) while its handler is running. The flag also disallows force-threading of the primary handler and the irq-core will warn about this. The intention here was probably to disallow forced-threading. Replace IRQF_ONESHOT with IRQF_NO_THREAD. 
Reviewed-by: Jonathan Cameron Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Will Deacon --- drivers/perf/cxl_pmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/cxl_pmu.c b/drivers/perf/cxl_pmu.c index d094030220bf..68a54d97d2a8 100644 --- a/drivers/perf/cxl_pmu.c +++ b/drivers/perf/cxl_pmu.c @@ -877,7 +877,7 @@ static int cxl_pmu_probe(struct device *dev) if (!irq_name) return -ENOMEM; - rc = devm_request_irq(dev, irq, cxl_pmu_irq, IRQF_SHARED | IRQF_ONESHOT, + rc = devm_request_irq(dev, irq, cxl_pmu_irq, IRQF_SHARED | IRQF_NO_THREAD, irq_name, info); if (rc) return rc;